/* Data References Analysis and Manipulation Utilities for Vectorization.
   Copyright (C) 2003-2018 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>
   and Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "target.h"
#include "rtl.h"
#include "tree.h"
#include "gimple.h"
#include "predict.h"
#include "memmodel.h"
#include "tm_p.h"
#include "ssa.h"
#include "optabs-tree.h"
#include "cgraph.h"
#include "dumpfile.h"
#include "alias.h"
#include "fold-const.h"
#include "stor-layout.h"
#include "tree-eh.h"
#include "gimplify.h"
#include "gimple-iterator.h"
#include "gimplify-me.h"
#include "tree-ssa-loop-ivopts.h"
#include "tree-ssa-loop-manip.h"
#include "tree-ssa-loop.h"
#include "cfgloop.h"
#include "tree-scalar-evolution.h"
#include "tree-vectorizer.h"
#include "expr.h"
#include "builtins.h"
#include "params.h"
#include "tree-cfg.h"
#include "tree-hash-traits.h"
#include "vec-perm-indices.h"
#include "internal-fn.h"

/* Return true if load- or store-lanes optab OPTAB is implemented for
   COUNT vectors of type VECTYPE.  NAME is the name of OPTAB.  */

static bool
vect_lanes_optab_supported_p (const char *name, convert_optab optab,
                              tree vectype, unsigned HOST_WIDE_INT count)
{
  machine_mode mode, array_mode;
  bool limit_p;

  mode = TYPE_MODE (vectype);
  if (!targetm.array_mode (mode, count).exists (&array_mode))
    {
      poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
      limit_p = !targetm.array_mode_supported_p (mode, count);
      if (!int_mode_for_size (bits, limit_p).exists (&array_mode))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "no array mode for %s["
                             HOST_WIDE_INT_PRINT_DEC "]\n",
                             GET_MODE_NAME (mode), count);
          return false;
        }
    }

  if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "cannot use %s<%s><%s>\n", name,
                         GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
      return false;
    }

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
                     GET_MODE_NAME (mode));

  return true;
}

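/* Purely illustrative example of the check above: with VECTYPE V4SI and
   COUNT 2 this asks whether the target provides a vec_load_lanes /
   vec_store_lanes pattern operating on an array of two V4SI vectors, as
   AArch64 does with its LD2/ST2 instructions.  If no dedicated array mode
   exists, an integer mode of 2 * 128 bits is tried as a fallback.  */
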
/* Return the smallest scalar part of STMT.
   This is used to determine the vectype of the stmt.  We generally set the
   vectype according to the type of the result (lhs).  For stmts whose
   result-type is different than the type of the arguments (e.g., demotion,
   promotion), vectype will be reset appropriately (later).  Note that we have
   to visit the smallest datatype in this function, because that determines the
   VF.  If the smallest datatype in the loop is present only as the rhs of a
   promotion operation - we'd miss it.
   Such a case, where a variable of this datatype does not appear in the lhs
   anywhere in the loop, can only occur if it's an invariant: e.g.:
   'int_x = (int) short_inv', which we'd expect to have been optimized away by
   invariant motion.  However, we cannot rely on invariant motion to always
   take invariants out of the loop, and so in the case of promotion we also
   have to check the rhs.
   LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
   types.  */

tree
vect_get_smallest_scalar_type (gimple *stmt, HOST_WIDE_INT *lhs_size_unit,
                               HOST_WIDE_INT *rhs_size_unit)
{
  tree scalar_type = gimple_expr_type (stmt);
  HOST_WIDE_INT lhs, rhs;

  /* During the analysis phase, this function is called on arbitrary
     statements that might not have scalar results.  */
  if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
    return scalar_type;

  lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));

  if (is_gimple_assign (stmt)
      && (gimple_assign_cast_p (stmt)
          || gimple_assign_rhs_code (stmt) == DOT_PROD_EXPR
          || gimple_assign_rhs_code (stmt) == WIDEN_SUM_EXPR
          || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
          || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
          || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
    {
      tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));

      rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
      if (rhs < lhs)
        scalar_type = rhs_type;
    }

  *lhs_size_unit = lhs;
  *rhs_size_unit = rhs;
  return scalar_type;
}

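/* For example, for a hypothetical widening statement

       int_x = (int) short_y;

   the result type is int (4 bytes) but the smallest scalar type involved
   is short (2 bytes): *LHS_SIZE_UNIT is set to 4, *RHS_SIZE_UNIT to 2,
   and the short type is returned, since it is the smallest datatype that
   bounds the vectorization factor.  */
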
/* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
   tested at run-time.  Return TRUE if DDR was successfully inserted.
   Return false if versioning is not supported.  */

static bool
vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);

  if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
    return false;

  if (!runtime_alias_check_p (ddr, loop,
                              optimize_loop_nest_for_speed_p (loop)))
    return false;

  LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
  return true;
}

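/* For illustration: for a hypothetical loop

       for (i = 0; i < n; i++)
         p[i] = q[i] + 1;

   where p and q are unrelated pointers, the dependence between the two
   references is unknown at compile time.  Marking the DDR here means the
   loop gets versioned with a run-time check that the two accessed ranges
   do not overlap; the --param vect-max-version-for-alias-checks knob
   controls whether (and how much of) this versioning is allowed.  */
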
/* Record that loop LOOP_VINFO needs to check that VALUE is nonzero.  */

static void
vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value)
{
  vec<tree> checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo);
  for (unsigned int i = 0; i < checks.length(); ++i)
    if (checks[i] == value)
      return;

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location, "need run-time check that ");
      dump_generic_expr (MSG_NOTE, TDF_SLIM, value);
      dump_printf (MSG_NOTE, " is nonzero\n");
    }
  LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value);
}

/* Return true if we know that the order of vectorized STMT_A and
   vectorized STMT_B will be the same as the order of STMT_A and STMT_B.
   At least one of the statements is a write.  */

static bool
vect_preserves_scalar_order_p (gimple *stmt_a, gimple *stmt_b)
{
  stmt_vec_info stmtinfo_a = vinfo_for_stmt (stmt_a);
  stmt_vec_info stmtinfo_b = vinfo_for_stmt (stmt_b);

  /* Single statements are always kept in their original order.  */
  if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
      && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
    return true;

  /* STMT_A and STMT_B belong to overlapping groups.  All loads in a
     group are emitted at the position of the first scalar load and all
     stores in a group are emitted at the position of the last scalar store.
     Thus writes will happen no earlier than their current position
     (but could happen later) while reads will happen no later than their
     current position (but could happen earlier).  Reordering is therefore
     only possible if the first access is a write.  */
  gimple *earlier_stmt = get_earlier_stmt (stmt_a, stmt_b);
  return !DR_IS_WRITE (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt)));
}

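/* For example, in a hypothetical pair of groups

       a[i]   = ...;    <-- store group, emitted at the last scalar store
       ...    = a[i];   <-- load group, emitted at the first scalar load
       a[i+1] = ...;
       ...    = a[i+1];

   the scalar store to a[i] precedes the load of a[i], but after
   vectorization the store group is emitted at "a[i+1] = ..." while the
   load group is emitted at "... = a[i]", so the read crosses the write.
   Because the earlier access is a write, the check above returns false.  */
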
/* A subroutine of vect_analyze_data_ref_dependence.  Handle
   DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
   distances.  These distances are conservatively correct but they don't
   reflect a guaranteed dependence.

   Return true if this function does all the work necessary to avoid
   an alias or false if the caller should use the dependence distances
   to limit the vectorization factor in the usual way.  LOOP_DEPTH is
   the depth of the loop described by LOOP_VINFO and the other arguments
   are as for vect_analyze_data_ref_dependence.  */

static bool
vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
                                       loop_vec_info loop_vinfo,
                                       int loop_depth, unsigned int *max_vf)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  lambda_vector dist_v;
  unsigned int i;
  FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
    {
      int dist = dist_v[loop_depth];
      if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
        {
          /* If the user asserted safelen >= DIST consecutive iterations
             can be executed concurrently, assume independence.

             ??? An alternative would be to add the alias check even
             in this case, and vectorize the fallback loop with the
             maximum VF set to safelen.  However, if the user has
             explicitly given a length, it's less likely that that
             would be a win.  */
          if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
            {
              if ((unsigned int) loop->safelen < *max_vf)
                *max_vf = loop->safelen;
              LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
              continue;
            }

          /* For dependence distances of 2 or more, we have the option
             of limiting VF or checking for an alias at runtime.
             Prefer to check at runtime if we can, to avoid limiting
             the VF unnecessarily when the bases are in fact independent.

             Note that the alias checks will be removed if the VF ends up
             being small enough.  */
          return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
        }
    }
  return true;
}

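/* For example, if the user wrote a hypothetical

       #pragma omp simd safelen(8)
       for (i = 0; i < n; i++)
         a[i + k] = a[i] + 1;

   then loop->safelen is 8: any dependence whose distance is at most 8 is
   treated as harmless above, at the price of capping *MAX_VF at 8.  */
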
/* Function vect_analyze_data_ref_dependence.

   Return TRUE if there (might) exist a dependence between a memory-reference
   DRA and a memory-reference DRB.  When versioning for alias may check a
   dependence at run-time, return FALSE.  Adjust *MAX_VF according to
   the data dependence.  */

static bool
vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
                                  loop_vec_info loop_vinfo,
                                  unsigned int *max_vf)
{
  unsigned int i;
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  struct data_reference *dra = DDR_A (ddr);
  struct data_reference *drb = DDR_B (ddr);
  stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
  stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
  lambda_vector dist_v;
  unsigned int loop_depth;

  /* In loop analysis all data references should be vectorizable.  */
  if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
      || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
    gcc_unreachable ();

  /* Independent data accesses.  */
  if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
    return false;

  if (dra == drb
      || (DR_IS_READ (dra) && DR_IS_READ (drb)))
    return false;

  /* We do not have to consider dependences between accesses that belong
     to the same group, unless the stride could be smaller than the
     group size.  */
  if (GROUP_FIRST_ELEMENT (stmtinfo_a)
      && GROUP_FIRST_ELEMENT (stmtinfo_a) == GROUP_FIRST_ELEMENT (stmtinfo_b)
      && !STMT_VINFO_STRIDED_P (stmtinfo_a))
    return false;

  /* Even if we have an anti-dependence then, as the vectorized loop covers at
     least two scalar iterations, there is always also a true dependence.
     As the vectorizer does not re-order loads and stores we can ignore
     the anti-dependence if TBAA can disambiguate both DRs similar to the
     case with known negative distance anti-dependences (positive
     distance anti-dependences would violate TBAA constraints).  */
  if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
       || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
      && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
                                 get_alias_set (DR_REF (drb))))
    return false;

  /* Unknown data dependence.  */
  if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
    {
      /* If user asserted safelen consecutive iterations can be
         executed concurrently, assume independence.  */
      if (loop->safelen >= 2)
        {
          if ((unsigned int) loop->safelen < *max_vf)
            *max_vf = loop->safelen;
          LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
          return false;
        }

      if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
          || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "versioning for alias not supported for: "
                               "can't determine dependence between ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                 DR_REF (dra));
              dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                 DR_REF (drb));
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }
          return true;
        }

      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "versioning for alias required: "
                           "can't determine dependence between ");
          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                             DR_REF (dra));
          dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                             DR_REF (drb));
          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
        }

      /* Add to list of ddrs that need to be tested at run-time.  */
      return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
    }

  /* Known data dependence.  */
  if (DDR_NUM_DIST_VECTS (ddr) == 0)
    {
      /* If user asserted safelen consecutive iterations can be
         executed concurrently, assume independence.  */
      if (loop->safelen >= 2)
        {
          if ((unsigned int) loop->safelen < *max_vf)
            *max_vf = loop->safelen;
          LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
          return false;
        }

      if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
          || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "versioning for alias not supported for: "
                               "bad dist vector for ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                 DR_REF (dra));
              dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                 DR_REF (drb));
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }
          return true;
        }

      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "versioning for alias required: "
                           "bad dist vector for ");
          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
          dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
        }
      /* Add to list of ddrs that need to be tested at run-time.  */
      return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
    }

  loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));

  if (DDR_COULD_BE_INDEPENDENT_P (ddr)
      && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
                                                loop_depth, max_vf))
    return false;

  FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
    {
      int dist = dist_v[loop_depth];

      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "dependence distance = %d.\n", dist);

      if (dist == 0)
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_NOTE, vect_location,
                               "dependence distance == 0 between ");
              dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
              dump_printf (MSG_NOTE, " and ");
              dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }

          /* When we perform grouped accesses and perform implicit CSE
             by detecting equal accesses and doing disambiguation with
             runtime alias tests like for
                .. = a[i];
                .. = a[i+1];
                a[i] = ..;
                a[i+1] = ..;
                *p = ..;
                .. = a[i];
                .. = a[i+1];
             where we will end up loading { a[i], a[i+1] } once, make
             sure that inserting group loads before the first load and
             stores after the last store will do the right thing.
             Similar for groups like
                a[i] = ...;
                ... = a[i];
                a[i+1] = ...;
             where loads from the group interleave with the store.  */
          if (!vect_preserves_scalar_order_p (DR_STMT (dra), DR_STMT (drb)))
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "READ_WRITE dependence in interleaving.\n");
              return true;
            }

          if (loop->safelen < 2)
            {
              tree indicator = dr_zero_step_indicator (dra);
              if (TREE_CODE (indicator) != INTEGER_CST)
                vect_check_nonzero_value (loop_vinfo, indicator);
              else if (integer_zerop (indicator))
                {
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "access also has a zero step\n");
                  return true;
                }
            }
          continue;
        }

      if (dist > 0 && DDR_REVERSED_P (ddr))
        {
          /* If DDR_REVERSED_P the order of the data-refs in DDR was
             reversed (to make distance vector positive), and the actual
             distance is negative.  */
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "dependence distance negative.\n");
          /* Record a negative dependence distance to later limit the
             amount of stmt copying / unrolling we can perform.
             Only need to handle read-after-write dependence.  */
          if (DR_IS_READ (drb)
              && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
                  || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
            STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
          continue;
        }

      unsigned int abs_dist = abs (dist);
      if (abs_dist >= 2 && abs_dist < *max_vf)
        {
          /* The dependence distance requires reduction of the maximal
             vectorization factor.  */
          *max_vf = abs (dist);
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "adjusting maximal vectorization factor to %i\n",
                             *max_vf);
        }

      if (abs_dist >= *max_vf)
        {
          /* Dependence distance does not create dependence, as far as
             vectorization is concerned, in this case.  */
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "dependence distance >= VF.\n");
          continue;
        }

      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "not vectorized, possible dependence "
                           "between data-refs ");
          dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
          dump_printf (MSG_NOTE, " and ");
          dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
          dump_printf (MSG_NOTE, "\n");
        }

      return true;
    }

  return false;
}

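/* A worked example of the distance handling above: for a hypothetical

       for (i = 0; i < n; i++)
         a[i + 3] = a[i] * c;

   the distance between the read a[i] and the write a[i + 3] is 3, so the
   loop can still be vectorized provided *MAX_VF is reduced to 3.  A
   distance of 0 or a distance at least as large as the final VF is
   harmless, and an unknown dependence is handled by versioning the loop
   with a run-time alias check instead.  */
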
/* Function vect_analyze_data_ref_dependences.

   Examine all the data references in the loop, and make sure there do not
   exist any data dependences between them.  Set *MAX_VF according to
   the maximum vectorization factor the data dependences allow.  */

bool
vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
                                   unsigned int *max_vf)
{
  unsigned int i;
  struct data_dependence_relation *ddr;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "=== vect_analyze_data_ref_dependences ===\n");

  LOOP_VINFO_DDRS (loop_vinfo)
    .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
             * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
  LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
  /* We need read-read dependences to compute STMT_VINFO_SAME_ALIGN_REFS.  */
  if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
                                &LOOP_VINFO_DDRS (loop_vinfo),
                                LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
    return false;

  /* For epilogues we either have no aliases or alias versioning
     was applied to original loop.  Therefore we may just get max_vf
     using VF of original loop.  */
  if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
    *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
  else
    FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
      if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
        return false;

  return true;
}

/* Function vect_slp_analyze_data_ref_dependence.

   Return TRUE if there (might) exist a dependence between a memory-reference
   DRA and a memory-reference DRB that would prevent the SLP transform.
   Return FALSE if the accesses are known to be independent.  */

static bool
vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
{
  struct data_reference *dra = DDR_A (ddr);
  struct data_reference *drb = DDR_B (ddr);

  /* We need to check dependences of statements marked as unvectorizable
     as well, they still can prohibit vectorization.  */

  /* Independent data accesses.  */
  if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
    return false;

  if (dra == drb)
    return false;

  /* Read-read is OK.  */
  if (DR_IS_READ (dra) && DR_IS_READ (drb))
    return false;

  /* If dra and drb are part of the same interleaving chain consider
     them independent.  */
  if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
      && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
          == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
    return false;

  /* Unknown data dependence.  */
  if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
    {
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "can't determine dependence between ");
          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
          dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
        }
    }
  else if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location,
                       "determined dependence between ");
      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
      dump_printf (MSG_NOTE, " and ");
      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
      dump_printf (MSG_NOTE, "\n");
    }

  return true;
}

/* Analyze dependences involved in the transform of SLP NODE.  STORES
   contain the vector of scalar stores of this instance if we are
   disambiguating the loads.  */

static bool
vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
                                   vec<gimple *> stores, gimple *last_store)
{
  /* This walks over all stmts involved in the SLP load/store done
     in NODE verifying we can sink them up to the last stmt in the
     group.  */
  gimple *last_access = vect_find_last_scalar_stmt_in_slp (node);
  for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
    {
      gimple *access = SLP_TREE_SCALAR_STMTS (node)[k];
      if (access == last_access)
        continue;
      data_reference *dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (access));
      for (gimple_stmt_iterator gsi = gsi_for_stmt (access);
           gsi_stmt (gsi) != last_access; gsi_next (&gsi))
        {
          gimple *stmt = gsi_stmt (gsi);
          if (! gimple_vuse (stmt)
              || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
            continue;

          /* If we couldn't record a (single) data reference for this
             stmt we have to give up.  */
          /* ???  Here and below if dependence analysis fails we can resort
             to the alias oracle which can handle more kinds of stmts.  */
          data_reference *dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
          if (!dr_b)
            return false;

          bool dependent = false;
          /* If we run into a store of this same instance (we've just
             marked those) then delay dependence checking until we run
             into the last store because this is where it will have
             been sunk to (and we verify if we can do that as well).  */
          if (gimple_visited_p (stmt))
            {
              if (stmt != last_store)
                continue;
              unsigned i;
              gimple *store;
              FOR_EACH_VEC_ELT (stores, i, store)
                {
                  data_reference *store_dr
                    = STMT_VINFO_DATA_REF (vinfo_for_stmt (store));
                  ddr_p ddr = initialize_data_dependence_relation
                                (dr_a, store_dr, vNULL);
                  dependent = vect_slp_analyze_data_ref_dependence (ddr);
                  free_dependence_relation (ddr);
                  if (dependent)
                    break;
                }
            }
          else
            {
              ddr_p ddr = initialize_data_dependence_relation (dr_a,
                                                               dr_b, vNULL);
              dependent = vect_slp_analyze_data_ref_dependence (ddr);
              free_dependence_relation (ddr);
            }
          if (dependent)
            return false;
        }
    }
  return true;
}

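/* For illustration, for a hypothetical basic-block SLP group

       b[0] = a[0];
       *p = 1;
       b[1] = a[1];

   sinking the store group to the last scalar store b[1] requires walking
   the intermediate statements.  The walk above gives up (returns false)
   if *p may alias the accesses of the group, unless the intervening store
   belongs to the same SLP instance, in which case the check is deferred
   to that instance's last store.  */
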
/* Function vect_slp_analyze_instance_dependence.

   Examine all the data references involved in the SLP instance, and make
   sure there do not exist any data dependences between them that would
   prevent the instance from being vectorized.  */

bool
vect_slp_analyze_instance_dependence (slp_instance instance)
{
  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "=== vect_slp_analyze_instance_dependence ===\n");

  /* The stores of this instance are at the root of the SLP tree.  */
  slp_tree store = SLP_INSTANCE_TREE (instance);
  if (! STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (store)[0])))
    store = NULL;

  /* Verify we can sink stores to the vectorized stmt insert location.  */
  gimple *last_store = NULL;
  if (store)
    {
      if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
        return false;

      /* Mark stores in this instance and remember the last one.  */
      last_store = vect_find_last_scalar_stmt_in_slp (store);
      for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
        gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], true);
    }

  bool res = true;

  /* Verify we can sink loads to the vectorized stmt insert location,
     special-casing stores of this instance.  */
  slp_tree load;
  unsigned int i;
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
    if (! vect_slp_analyze_node_dependences (instance, load,
                                             store
                                             ? SLP_TREE_SCALAR_STMTS (store)
                                             : vNULL, last_store))
      {
        res = false;
        break;
      }

  /* Unset the visited flag.  */
  if (store)
    for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
      gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], false);

  return res;
}

/* Record in VINFO the base alignment guarantee given by DRB.  STMT is
   the statement that contains DRB, which is useful for recording in the
   dump file.  */

static void
vect_record_base_alignment (vec_info *vinfo, gimple *stmt,
                            innermost_loop_behavior *drb)
{
  bool existed;
  innermost_loop_behavior *&entry
    = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
  if (!existed || entry->base_alignment < drb->base_alignment)
    {
      entry = drb;
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_NOTE, vect_location,
                           "recording new base alignment for ");
          dump_generic_expr (MSG_NOTE, TDF_SLIM, drb->base_address);
          dump_printf (MSG_NOTE, "\n");
          dump_printf_loc (MSG_NOTE, vect_location,
                           "  alignment:    %d\n", drb->base_alignment);
          dump_printf_loc (MSG_NOTE, vect_location,
                           "  misalignment: %d\n", drb->base_misalignment);
          dump_printf_loc (MSG_NOTE, vect_location,
                           "  based on:     ");
          dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
        }
    }
}

/* If the region we're going to vectorize is reached, all unconditional
   data references occur at least once.  We can therefore pool the base
   alignment guarantees from each unconditional reference.  Do this by
   going through all the data references in VINFO and checking whether
   the containing statement makes the reference unconditionally.  If so,
   record the alignment of the base address in VINFO so that it can be
   used for all other references with the same base.  */

void
vect_record_base_alignments (vec_info *vinfo)
{
  loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
  struct loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
  data_reference *dr;
  unsigned int i;
  FOR_EACH_VEC_ELT (vinfo->datarefs, i, dr)
    if (!DR_IS_CONDITIONAL_IN_STMT (dr))
      {
        gimple *stmt = DR_STMT (dr);
        vect_record_base_alignment (vinfo, stmt, &DR_INNERMOST (dr));

        /* If DR is nested in the loop that is being vectorized, we can also
           record the alignment of the base wrt the outer loop.  */
        if (loop && nested_in_vect_loop_p (loop, stmt))
          {
            stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
            vect_record_base_alignment
              (vinfo, stmt, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
          }
      }
}

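/* For example (hypothetical): if the region contains both

       x = a->f[i];              <-- executed unconditionally
       if (c) a->f[j] = y;       <-- executed conditionally

   the unconditional read records the alignment guarantee for the base of
   a->f once, and the conditional store with the same base address can
   then reuse that pooled guarantee via the base_alignments map.  */
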
/* Return the target alignment for the vectorized form of DR.  */

static unsigned int
vect_calculate_target_alignment (struct data_reference *dr)
{
  gimple *stmt = DR_STMT (dr);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  return targetm.vectorize.preferred_vector_alignment (vectype);
}

/* Function vect_compute_data_ref_alignment

   Compute the misalignment of the data reference DR.

   Output:
   1. If during the misalignment computation it is found that the data
      reference cannot be vectorized then false is returned.
   2. DR_MISALIGNMENT (DR) is defined.

   FOR NOW: No analysis is actually performed.  Misalignment is calculated
   only for trivial cases.  TODO.  */

bool
vect_compute_data_ref_alignment (struct data_reference *dr)
{
  gimple *stmt = DR_STMT (dr);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  vec_base_alignments *base_alignments = &stmt_info->vinfo->base_alignments;
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct loop *loop = NULL;
  tree ref = DR_REF (dr);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "vect_compute_data_ref_alignment:\n");

  if (loop_vinfo)
    loop = LOOP_VINFO_LOOP (loop_vinfo);

  /* Initialize misalignment to unknown.  */
  SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);

  innermost_loop_behavior *drb = vect_dr_behavior (dr);
  bool step_preserves_misalignment_p;

  unsigned HOST_WIDE_INT vector_alignment
    = vect_calculate_target_alignment (dr) / BITS_PER_UNIT;
  DR_TARGET_ALIGNMENT (dr) = vector_alignment;

  /* No step for BB vectorization.  */
  if (!loop)
    {
      gcc_assert (integer_zerop (drb->step));
      step_preserves_misalignment_p = true;
    }

  /* In case the dataref is in an inner-loop of the loop that is being
     vectorized (LOOP), we use the base and misalignment information
     relative to the outer-loop (LOOP).  This is ok only if the misalignment
     stays the same throughout the execution of the inner-loop, which is why
     we have to check that the stride of the dataref in the inner-loop evenly
     divides by the vector alignment.  */
  else if (nested_in_vect_loop_p (loop, stmt))
    {
      step_preserves_misalignment_p
        = (DR_STEP_ALIGNMENT (dr) % vector_alignment) == 0;

      if (dump_enabled_p ())
        {
          if (step_preserves_misalignment_p)
            dump_printf_loc (MSG_NOTE, vect_location,
                             "inner step divides the vector alignment.\n");
          else
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "inner step doesn't divide the vector"
                             " alignment.\n");
        }
    }

  /* Similarly we can only use base and misalignment information relative to
     an innermost loop if the misalignment stays the same throughout the
     execution of the loop.  As above, this is the case if the stride of
     the dataref evenly divides by the alignment.  */
  else
    {
      poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
      step_preserves_misalignment_p
        = multiple_p (DR_STEP_ALIGNMENT (dr) * vf, vector_alignment);

      if (!step_preserves_misalignment_p && dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "step doesn't divide the vector alignment.\n");
    }

  unsigned int base_alignment = drb->base_alignment;
  unsigned int base_misalignment = drb->base_misalignment;

  /* Calculate the maximum of the pooled base address alignment and the
     alignment that we can compute for DR itself.  */
  innermost_loop_behavior **entry = base_alignments->get (drb->base_address);
  if (entry && base_alignment < (*entry)->base_alignment)
    {
      base_alignment = (*entry)->base_alignment;
      base_misalignment = (*entry)->base_misalignment;
    }

  if (drb->offset_alignment < vector_alignment
      || !step_preserves_misalignment_p
      /* We need to know whether the step wrt the vectorized loop is
         negative when computing the starting misalignment below.  */
      || TREE_CODE (drb->step) != INTEGER_CST)
    {
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "Unknown alignment for access: ");
          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
        }
      return true;
    }

  if (base_alignment < vector_alignment)
    {
      unsigned int max_alignment;
      tree base = get_base_for_alignment (drb->base_address, &max_alignment);
      if (max_alignment < vector_alignment
          || !vect_can_force_dr_alignment_p (base,
                                             vector_alignment * BITS_PER_UNIT))
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_NOTE, vect_location,
                               "can't force alignment of ref: ");
              dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
              dump_printf (MSG_NOTE, "\n");
            }
          return true;
        }

      /* Force the alignment of the decl.
         NOTE: This is the only change to the code we make during
         the analysis phase, before deciding to vectorize the loop.  */
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
          dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
          dump_printf (MSG_NOTE, "\n");
        }

      DR_VECT_AUX (dr)->base_decl = base;
      DR_VECT_AUX (dr)->base_misaligned = true;
      base_misalignment = 0;
    }
  poly_int64 misalignment
    = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();

  /* If this is a backward running DR then first access in the larger
     vectype actually is N-1 elements before the address in the DR.
     Adjust misalign accordingly.  */
  if (tree_int_cst_sgn (drb->step) < 0)
    /* PLUS because STEP is negative.  */
    misalignment += ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
                     * TREE_INT_CST_LOW (drb->step));

  unsigned int const_misalignment;
  if (!known_misalignment (misalignment, vector_alignment,
                           &const_misalignment))
    {
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "Non-constant misalignment for access: ");
          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
        }
      return true;
    }

  SET_DR_MISALIGNMENT (dr, const_misalignment);

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                       "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
    }

  return true;
}

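/* Worked example (hypothetical): assume the target prefers 16-byte vector
   alignment, DR is the access a[i] with 4-byte elements, the pooled base
   alignment of a is 16 bytes with base misalignment 0, and drb->init is
   12 (the access starts 12 bytes past the base).  Then the computed
   misalignment is 0 + 12 = 12 bytes, known_misalignment succeeds, and
   DR_MISALIGNMENT (dr) is set to 12, i.e. peeling one scalar iteration
   would make the access 16-byte aligned.  */
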
/* Function vect_update_misalignment_for_peel.
   Sets DR's misalignment
   - to 0 if it has the same alignment as DR_PEEL,
   - to the misalignment computed using NPEEL if DR's misalignment is known,
   - to -1 (unknown) otherwise.

   DR - the data reference whose misalignment is to be adjusted.
   DR_PEEL - the data reference whose misalignment is being made
             zero in the vector loop by the peel.
   NPEEL - the number of iterations in the peel loop if the misalignment
           of DR_PEEL is known at compile time.  */

static void
vect_update_misalignment_for_peel (struct data_reference *dr,
                                   struct data_reference *dr_peel, int npeel)
{
  unsigned int i;
  vec<dr_p> same_aligned_drs;
  struct data_reference *current_dr;
  int dr_size = vect_get_scalar_dr_size (dr);
  int dr_peel_size = vect_get_scalar_dr_size (dr_peel);
  stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
  stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));

  /* For interleaved data accesses the step in the loop must be multiplied by
     the size of the interleaving group.  */
  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
    dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
  if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
    dr_peel_size *= GROUP_SIZE (peel_stmt_info);

  /* It can be assumed that the data refs with the same alignment as dr_peel
     are aligned in the vector loop.  */
  same_aligned_drs
    = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
  FOR_EACH_VEC_ELT (same_aligned_drs, i, current_dr)
    {
      if (current_dr != dr)
        continue;
      gcc_assert (!known_alignment_for_access_p (dr)
                  || !known_alignment_for_access_p (dr_peel)
                  || (DR_MISALIGNMENT (dr) / dr_size
                      == DR_MISALIGNMENT (dr_peel) / dr_peel_size));
      SET_DR_MISALIGNMENT (dr, 0);
      return;
    }

  if (known_alignment_for_access_p (dr)
      && known_alignment_for_access_p (dr_peel))
    {
      bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
      int misal = DR_MISALIGNMENT (dr);
      misal += negative ? -npeel * dr_size : npeel * dr_size;
      misal &= DR_TARGET_ALIGNMENT (dr) - 1;
      SET_DR_MISALIGNMENT (dr, misal);
      return;
    }

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment "
                     "to unknown (-1).\n");
  SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
}

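/* Worked example (hypothetical): with DR_TARGET_ALIGNMENT 16, a positive
   step, element size 4 and a known DR_MISALIGNMENT of 8, peeling
   NPEEL = 2 iterations gives misal = (8 + 2 * 4) & 15 = 0, so DR becomes
   aligned; with NPEEL = 1 it would become 12 instead.  */
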
/* Function verify_data_ref_alignment

   Return TRUE if DR can be handled with respect to alignment.  */

static bool
verify_data_ref_alignment (data_reference_p dr)
{
  enum dr_alignment_support supportable_dr_alignment
    = vect_supportable_dr_alignment (dr, false);
  if (!supportable_dr_alignment)
    {
      if (dump_enabled_p ())
        {
          if (DR_IS_READ (dr))
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "not vectorized: unsupported unaligned load.");
          else
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "not vectorized: unsupported unaligned "
                             "store.");

          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                             DR_REF (dr));
          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
        }
      return false;
    }

  if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "Vectorizing an unaligned access.\n");

  return true;
}

/* Function vect_verify_datarefs_alignment

   Return TRUE if all data references in the loop can be
   handled with respect to alignment.  */

bool
vect_verify_datarefs_alignment (loop_vec_info vinfo)
{
  vec<data_reference_p> datarefs = vinfo->datarefs;
  struct data_reference *dr;
  unsigned int i;

  FOR_EACH_VEC_ELT (datarefs, i, dr)
    {
      gimple *stmt = DR_STMT (dr);
      stmt_vec_info stmt_info = vinfo_for_stmt (stmt);

      if (!STMT_VINFO_RELEVANT_P (stmt_info))
        continue;

      /* For interleaving, only the alignment of the first access matters.  */
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
          && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
        continue;

      /* Strided accesses perform only component accesses, alignment is
         irrelevant for them.  */
      if (STMT_VINFO_STRIDED_P (stmt_info)
          && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
        continue;

      if (! verify_data_ref_alignment (dr))
        return false;
    }

  return true;
}

/* Given a memory reference EXP return whether its alignment is less
   than its size.  */

static bool
not_size_aligned (tree exp)
{
  if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
    return true;

  return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
          > get_object_alignment (exp));
}

/* Function vector_alignment_reachable_p

   Return true if vector alignment for DR is reachable by peeling
   a few loop iterations.  Return false otherwise.  */

static bool
vector_alignment_reachable_p (struct data_reference *dr)
{
  gimple *stmt = DR_STMT (dr);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);

  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
    {
      /* For interleaved access we peel only if number of iterations in
         the prolog loop ({VF - misalignment}), is a multiple of the
         number of the interleaved accesses.  */
      int elem_size, mis_in_elements;

      /* FORNOW: handle only known alignment.  */
      if (!known_alignment_for_access_p (dr))
        return false;

      poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
      poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
      elem_size = vector_element_size (vector_size, nelements);
      mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;

      if (!multiple_p (nelements - mis_in_elements, GROUP_SIZE (stmt_info)))
        return false;
    }

  /* If misalignment is known at the compile time then allow peeling
     only if natural alignment is reachable through peeling.  */
  if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
    {
      HOST_WIDE_INT elmsize =
                int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_NOTE, vect_location,
                           "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
          dump_printf (MSG_NOTE,
                       ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
        }
      if (DR_MISALIGNMENT (dr) % elmsize)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "data size does not divide the misalignment.\n");
          return false;
        }
    }

  if (!known_alignment_for_access_p (dr))
    {
      tree type = TREE_TYPE (DR_REF (dr));
      bool is_packed = not_size_aligned (DR_REF (dr));
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "Unknown misalignment, %snaturally aligned\n",
                         is_packed ? "not " : "");
      return targetm.vectorize.vector_alignment_reachable (type, is_packed);
    }

  return true;
}

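/* For example (hypothetical): for a V4SI access with 4-byte elements and
   a known misalignment of 8 bytes, the misalignment is a multiple of the
   element size, so alignment is reachable by peeling two scalar
   iterations.  A misalignment of 6 bytes is not a multiple of the element
   size and peeling can never fix it, so false is returned.  */
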
/* Calculate the cost of the memory access represented by DR.  */

static void
vect_get_data_access_cost (struct data_reference *dr,
                           unsigned int *inside_cost,
                           unsigned int *outside_cost,
                           stmt_vector_for_cost *body_cost_vec,
                           stmt_vector_for_cost *prologue_cost_vec)
{
  gimple *stmt = DR_STMT (dr);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  int ncopies;

  if (PURE_SLP_STMT (stmt_info))
    ncopies = 1;
  else
    ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));

  if (DR_IS_READ (dr))
    vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
                        prologue_cost_vec, body_cost_vec, false);
  else
    vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "vect_get_data_access_cost: inside_cost = %d, "
                     "outside_cost = %d.\n", *inside_cost, *outside_cost);
}

typedef struct _vect_peel_info
{
  struct data_reference *dr;
  int npeel;
  unsigned int count;
} *vect_peel_info;

typedef struct _vect_peel_extended_info
{
  struct _vect_peel_info peel_info;
  unsigned int inside_cost;
  unsigned int outside_cost;
} *vect_peel_extended_info;

/* Peeling hashtable helpers.  */

struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
{
  static inline hashval_t hash (const _vect_peel_info *);
  static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
};

inline hashval_t
peel_info_hasher::hash (const _vect_peel_info *peel_info)
{
  return (hashval_t) peel_info->npeel;
}

inline bool
peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
{
  return (a->npeel == b->npeel);
}

/* Insert DR into peeling hash table with NPEEL as key.  */

static void
vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
                          loop_vec_info loop_vinfo, struct data_reference *dr,
                          int npeel)
{
  struct _vect_peel_info elem, *slot;
  _vect_peel_info **new_slot;
  bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);

  elem.npeel = npeel;
  slot = peeling_htab->find (&elem);
  if (slot)
    slot->count++;
  else
    {
      slot = XNEW (struct _vect_peel_info);
      slot->npeel = npeel;
      slot->dr = dr;
      slot->count = 1;
      new_slot = peeling_htab->find_slot (slot, INSERT);
      *new_slot = slot;
    }

  if (!supportable_dr_alignment
      && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
    slot->count += VECT_MAX_COST;
}

/* Traverse peeling hash table to find peeling option that aligns maximum
   number of data accesses.  */

int
vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
                                     _vect_peel_extended_info *max)
{
  vect_peel_info elem = *slot;

  if (elem->count > max->peel_info.count
      || (elem->count == max->peel_info.count
          && max->peel_info.npeel > elem->npeel))
    {
      max->peel_info.npeel = elem->npeel;
      max->peel_info.count = elem->count;
      max->peel_info.dr = elem->dr;
    }

  return 1;
}

/* Get the costs of peeling NPEEL iterations checking data access costs
   for all data refs.  If UNKNOWN_MISALIGNMENT is true, we assume DR0's
   misalignment will be zero after peeling.  */

static void
vect_get_peeling_costs_all_drs (vec<data_reference_p> datarefs,
                                struct data_reference *dr0,
                                unsigned int *inside_cost,
                                unsigned int *outside_cost,
                                stmt_vector_for_cost *body_cost_vec,
                                stmt_vector_for_cost *prologue_cost_vec,
                                unsigned int npeel,
                                bool unknown_misalignment)
{
  unsigned i;
  data_reference *dr;

  FOR_EACH_VEC_ELT (datarefs, i, dr)
    {
      gimple *stmt = DR_STMT (dr);
      stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
      if (!STMT_VINFO_RELEVANT_P (stmt_info))
        continue;

      /* For interleaving, only the alignment of the first access
         matters.  */
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
          && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
        continue;

      /* Strided accesses perform only component accesses, alignment is
         irrelevant for them.  */
      if (STMT_VINFO_STRIDED_P (stmt_info)
          && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
        continue;

      int save_misalignment;
      save_misalignment = DR_MISALIGNMENT (dr);
      if (npeel == 0)
        ;
      else if (unknown_misalignment && dr == dr0)
        SET_DR_MISALIGNMENT (dr, 0);
      else
        vect_update_misalignment_for_peel (dr, dr0, npeel);
      vect_get_data_access_cost (dr, inside_cost, outside_cost,
                                 body_cost_vec, prologue_cost_vec);
      SET_DR_MISALIGNMENT (dr, save_misalignment);
    }
}

/* Traverse peeling hash table and calculate cost for each peeling option.
   Find the one with the lowest cost.  */

int
vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
                                   _vect_peel_extended_info *min)
{
  vect_peel_info elem = *slot;
  int dummy;
  unsigned int inside_cost = 0, outside_cost = 0;
  gimple *stmt = DR_STMT (elem->dr);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
                       epilogue_cost_vec;

  prologue_cost_vec.create (2);
  body_cost_vec.create (2);
  epilogue_cost_vec.create (2);

  vect_get_peeling_costs_all_drs (LOOP_VINFO_DATAREFS (loop_vinfo),
                                  elem->dr, &inside_cost, &outside_cost,
                                  &body_cost_vec, &prologue_cost_vec,
                                  elem->npeel, false);

  body_cost_vec.release ();

  outside_cost += vect_get_known_peeling_cost
    (loop_vinfo, elem->npeel, &dummy,
     &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
     &prologue_cost_vec, &epilogue_cost_vec);

  /* Prologue and epilogue costs are added to the target model later.
     These costs depend only on the scalar iteration cost, the
     number of peeling iterations finally chosen, and the number of
     misaligned statements.  So discard the information found here.  */
  prologue_cost_vec.release ();
  epilogue_cost_vec.release ();

  if (inside_cost < min->inside_cost
      || (inside_cost == min->inside_cost
          && outside_cost < min->outside_cost))
    {
      min->inside_cost = inside_cost;
      min->outside_cost = outside_cost;
      min->peel_info.dr = elem->dr;
      min->peel_info.npeel = elem->npeel;
      min->peel_info.count = elem->count;
    }

  return 1;
}

/* Choose best peeling option by traversing peeling hash table and either
   choosing an option with the lowest cost (if cost model is enabled) or the
   option that aligns as many accesses as possible.  */

static struct _vect_peel_extended_info
vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
                                       loop_vec_info loop_vinfo)
{
  struct _vect_peel_extended_info res;

  res.peel_info.dr = NULL;

  if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
    {
      res.inside_cost = INT_MAX;
      res.outside_cost = INT_MAX;
      peeling_htab->traverse <_vect_peel_extended_info *,
                              vect_peeling_hash_get_lowest_cost> (&res);
    }
  else
    {
      res.peel_info.count = 0;
      peeling_htab->traverse <_vect_peel_extended_info *,
                              vect_peeling_hash_get_most_frequent> (&res);
      res.inside_cost = 0;
      res.outside_cost = 0;
    }

  return res;
}

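/* For illustration (hypothetical): if three data-refs would become
   aligned by peeling 2 iterations and one would become aligned by peeling
   5, the table contains the entries {npeel 2, count 3} and {npeel 5,
   count 1}.  With the cost model enabled the traversal above picks the
   npeel with the lowest estimated cost; with an unlimited cost model it
   simply picks the most frequent entry, i.e. npeel 2.  */
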
/* Return true if the new peeling NPEEL is supported.  */

static bool
vect_peeling_supportable (loop_vec_info loop_vinfo, struct data_reference *dr0,
                          unsigned npeel)
{
  unsigned i;
  struct data_reference *dr = NULL;
  vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
  gimple *stmt;
  stmt_vec_info stmt_info;
  enum dr_alignment_support supportable_dr_alignment;

  /* Ensure that all data refs can be vectorized after the peel.  */
  FOR_EACH_VEC_ELT (datarefs, i, dr)
    {
      int save_misalignment;

      if (dr == dr0)
        continue;

      stmt = DR_STMT (dr);
      stmt_info = vinfo_for_stmt (stmt);
      /* For interleaving, only the alignment of the first access
         matters.  */
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
          && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
        continue;

      /* Strided accesses perform only component accesses, alignment is
         irrelevant for them.  */
      if (STMT_VINFO_STRIDED_P (stmt_info)
          && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
        continue;

      save_misalignment = DR_MISALIGNMENT (dr);
      vect_update_misalignment_for_peel (dr, dr0, npeel);
      supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
      SET_DR_MISALIGNMENT (dr, save_misalignment);

      if (!supportable_dr_alignment)
        return false;
    }

  return true;
}

/* Function vect_enhance_data_refs_alignment

   This pass will use loop versioning and loop peeling in order to enhance
   the alignment of data references in the loop.

   FOR NOW: we assume that whatever versioning/peeling takes place, only the
   original loop is to be vectorized.  Any other loops that are created by
   the transformations performed in this pass are not supposed to be
   vectorized.  This restriction will be relaxed.

   This pass will require a cost model to guide it whether to apply peeling
   or versioning or a combination of the two.  For example, the scheme that
   intel uses when given a loop with several memory accesses, is as follows:
   choose one memory access ('p') whose alignment you want to force by doing
   peeling.  Then, either (1) generate a loop in which 'p' is aligned and all
   other accesses are not necessarily aligned, or (2) use loop versioning to
   generate one loop in which all accesses are aligned, and another loop in
   which only 'p' is necessarily aligned.

   ("Automatic Intra-Register Vectorization for the Intel Architecture",
   Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
   Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)

   Devising a cost model is the most critical aspect of this work.  It will
   guide us on which access to peel for, whether to use loop versioning, how
   many versions to create, etc.  The cost model will probably consist of
   generic considerations as well as target specific considerations (on
   powerpc for example, misaligned stores are more painful than misaligned
   loads).

   Here are the general steps involved in alignment enhancements:

     -- original loop, before alignment analysis:
        for (i=0; i<N; i++){
          x = q[i];                     # DR_MISALIGNMENT(q) = unknown
          p[i] = y;                     # DR_MISALIGNMENT(p) = unknown
        }

     -- After vect_compute_data_refs_alignment:
        for (i=0; i<N; i++){
          x = q[i];                     # DR_MISALIGNMENT(q) = 3
          p[i] = y;                     # DR_MISALIGNMENT(p) = unknown
        }

     -- Possibility 1: we do loop versioning:
     if (p is aligned) {
        for (i=0; i<N; i++){    # loop 1A
          x = q[i];                     # DR_MISALIGNMENT(q) = 3
          p[i] = y;                     # DR_MISALIGNMENT(p) = 0
        }
     }
     else {
        for (i=0; i<N; i++){    # loop 1B
          x = q[i];                     # DR_MISALIGNMENT(q) = 3
          p[i] = y;                     # DR_MISALIGNMENT(p) = unaligned
        }
     }

     -- Possibility 2: we do loop peeling:
     for (i = 0; i < 3; i++){   # (scalar loop, not to be vectorized).
        x = q[i];
        p[i] = y;
     }
     for (i = 3; i < N; i++){   # loop 2A
        x = q[i];                       # DR_MISALIGNMENT(q) = 0
        p[i] = y;                       # DR_MISALIGNMENT(p) = unknown
     }

     -- Possibility 3: combination of loop peeling and versioning:
     for (i = 0; i < 3; i++){   # (scalar loop, not to be vectorized).
        x = q[i];
        p[i] = y;
     }
     if (p is aligned) {
        for (i = 3; i<N; i++){  # loop 3A
          x = q[i];                     # DR_MISALIGNMENT(q) = 0
          p[i] = y;                     # DR_MISALIGNMENT(p) = 0
        }
     }
     else {
        for (i = 3; i<N; i++){  # loop 3B
          x = q[i];                     # DR_MISALIGNMENT(q) = 0
          p[i] = y;                     # DR_MISALIGNMENT(p) = unaligned
        }
     }

     These loops are later passed to loop_transform to be vectorized.  The
     vectorizer will use the alignment information to guide the transformation
     (whether to generate regular loads/stores, or with special handling for
     misalignment).  */

bool
vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
{
  vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  enum dr_alignment_support supportable_dr_alignment;
  struct data_reference *dr0 = NULL, *first_store = NULL;
  struct data_reference *dr;
  unsigned int i, j;
  bool do_peeling = false;
  bool do_versioning = false;
  bool stat;
  gimple *stmt;
  stmt_vec_info stmt_info;
  unsigned int npeel = 0;
  bool one_misalignment_known = false;
  bool one_misalignment_unknown = false;
  bool one_dr_unsupportable = false;
  struct data_reference *unsupportable_dr = NULL;
  poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  unsigned possible_npeel_number = 1;
  tree vectype;
  unsigned int mis, same_align_drs_max = 0;
  hash_table<peel_info_hasher> peeling_htab (1);

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "=== vect_enhance_data_refs_alignment ===\n");

  /* Reset data so we can safely be called multiple times.  */
  LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
  LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;

  /* While cost model enhancements are expected in the future, the high level
     view of the code at this time is as follows:

     A) If there is a misaligned access then see if peeling to align
        this access can make all data references satisfy
        vect_supportable_dr_alignment.  If so, update data structures
        as needed and return true.

     B) If peeling wasn't possible and there is a data reference with an
        unknown misalignment that does not satisfy vect_supportable_dr_alignment
        then see if loop versioning checks can be used to make all data
        references satisfy vect_supportable_dr_alignment.  If so, update
        data structures as needed and return true.

     C) If neither peeling nor versioning were successful then return false if
        any data reference does not satisfy vect_supportable_dr_alignment.

     D) Return true (all data references satisfy vect_supportable_dr_alignment).

     Note, Possibility 3 above (which is peeling and versioning together) is not
     being done at this time.  */

  /* (1) Peeling to force alignment.  */

  /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
     Considerations:
     + How many accesses will become aligned due to the peeling
     - How many accesses will become unaligned due to the peeling,
       and the cost of misaligned accesses.
     - The cost of peeling (the extra runtime checks, the increase
       in code size).  */

  FOR_EACH_VEC_ELT (datarefs, i, dr)
    {
      stmt = DR_STMT (dr);
      stmt_info = vinfo_for_stmt (stmt);

      if (!STMT_VINFO_RELEVANT_P (stmt_info))
        continue;

      /* For interleaving, only the alignment of the first access
         matters.  */
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
          && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
        continue;

      /* For invariant accesses there is nothing to enhance.  */
      if (integer_zerop (DR_STEP (dr)))
        continue;

      /* Strided accesses perform only component accesses, alignment is
         irrelevant for them.  */
      if (STMT_VINFO_STRIDED_P (stmt_info)
          && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
        continue;

      supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
      do_peeling = vector_alignment_reachable_p (dr);
      if (do_peeling)
        {
          if (known_alignment_for_access_p (dr))
            {
              unsigned int npeel_tmp = 0;
              bool negative = tree_int_cst_compare (DR_STEP (dr),
                                                    size_zero_node) < 0;

              vectype = STMT_VINFO_VECTYPE (stmt_info);
              unsigned int target_align = DR_TARGET_ALIGNMENT (dr);
              unsigned int dr_size = vect_get_scalar_dr_size (dr);
              mis = (negative ? DR_MISALIGNMENT (dr) : -DR_MISALIGNMENT (dr));
              if (DR_MISALIGNMENT (dr) != 0)
                npeel_tmp = (mis & (target_align - 1)) / dr_size;

1751 /* For multiple types, it is possible that the bigger type access
1752 will have more than one peeling option. E.g., a loop with two
1753 types: one of size (vector size / 4), and the other one of
1754 size (vector size / 8). The vectorization factor will be 8. If both
1755 accesses are misaligned by 3, the first one needs one scalar
1756 iteration to be aligned, and the second one needs 5. But the
1757 first one will be aligned also by peeling 5 scalar
1758 iterations, and in that case both accesses will be aligned.
1759 Hence, besides the immediate peeling amount, we also want
1760 to try adding a full vector's worth of elements, as long as
1761 we don't exceed the vectorization factor.
1762 We do this automatically under the unlimited cost model, since
1763 we then calculate the cost of every peeling option. */
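/* As a worked example with assumed numbers: for a 4-byte data-ref with
   DR_MISALIGNMENT of 12 bytes and a 16-byte DR_TARGET_ALIGNMENT,
   NPEEL_TMP starts at (-12 & 15) / 4 = 1, and each further candidate
   below adds target_align / dr_size = 4 scalar iterations, giving the
   peeling options 1, 5, 9, ... up to POSSIBLE_NPEEL_NUMBER entries. */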
1764 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1766 poly_uint64 nscalars = (STMT_SLP_TYPE (stmt_info)
1767 ? vf * GROUP_SIZE (stmt_info) : vf);
1768 possible_npeel_number
1769 = vect_get_num_vectors (nscalars, vectype);
1771 /* NPEEL_TMP is 0 when there is no misalignment, but in that case
1772 still allow peeling a full NELEMENTS as an extra option. */
1773 if (DR_MISALIGNMENT (dr) == 0)
1774 possible_npeel_number++;
1777 /* Save info about DR in the hash table. Also include peeling
1778 amounts according to the explanation above. */
1779 for (j = 0; j < possible_npeel_number; j++)
1781 vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1782 dr, npeel_tmp);
1783 npeel_tmp += target_align / dr_size;
1786 one_misalignment_known = true;
1788 else
1790 /* If we don't know any misalignment values, we prefer
1791 peeling for the data-ref that has the maximum number of data-refs
1792 with the same alignment, unless the target prefers aligning
1793 stores over loads. */
1794 unsigned same_align_drs
1795 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1796 if (!dr0
1797 || same_align_drs_max < same_align_drs)
1799 same_align_drs_max = same_align_drs;
1800 dr0 = dr;
1802 /* For data-refs with the same number of related
1803 accesses prefer the one where the misalign
1804 computation will be invariant in the outermost loop. */
1805 else if (same_align_drs_max == same_align_drs)
1807 struct loop *ivloop0, *ivloop;
1808 ivloop0 = outermost_invariant_loop_for_expr
1809 (loop, DR_BASE_ADDRESS (dr0));
1810 ivloop = outermost_invariant_loop_for_expr
1811 (loop, DR_BASE_ADDRESS (dr));
1812 if ((ivloop && !ivloop0)
1813 || (ivloop && ivloop0
1814 && flow_loop_nested_p (ivloop, ivloop0)))
1815 dr0 = dr;
1818 one_misalignment_unknown = true;
1820 /* Check for data refs with unsupportable alignment that
1821 can be peeled. */
1822 if (!supportable_dr_alignment)
1824 one_dr_unsupportable = true;
1825 unsupportable_dr = dr;
1828 if (!first_store && DR_IS_WRITE (dr))
1829 first_store = dr;
1832 else
1834 if (!aligned_access_p (dr))
1836 if (dump_enabled_p ())
1837 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1838 "vector alignment may not be reachable\n");
1839 break;
1844 /* Check if we can possibly peel the loop. */
1845 if (!vect_can_advance_ivs_p (loop_vinfo)
1846 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1847 || loop->inner)
1848 do_peeling = false;
1850 struct _vect_peel_extended_info peel_for_known_alignment;
1851 struct _vect_peel_extended_info peel_for_unknown_alignment;
1852 struct _vect_peel_extended_info best_peel;
1854 peel_for_unknown_alignment.inside_cost = INT_MAX;
1855 peel_for_unknown_alignment.outside_cost = INT_MAX;
1856 peel_for_unknown_alignment.peel_info.count = 0;
1858 if (do_peeling
1859 && one_misalignment_unknown)
1861 /* Check whether the target prefers aligning stores over loads, i.e.,
1862 whether misaligned stores are more expensive than misaligned loads
1863 (taking drs with the same alignment into account). */
1864 unsigned int load_inside_cost = 0;
1865 unsigned int load_outside_cost = 0;
1866 unsigned int store_inside_cost = 0;
1867 unsigned int store_outside_cost = 0;
1868 unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;
1870 stmt_vector_for_cost dummy;
1871 dummy.create (2);
1872 vect_get_peeling_costs_all_drs (datarefs, dr0,
1873 &load_inside_cost,
1874 &load_outside_cost,
1875 &dummy, &dummy, estimated_npeels, true);
1876 dummy.release ();
1878 if (first_store)
1880 dummy.create (2);
1881 vect_get_peeling_costs_all_drs (datarefs, first_store,
1882 &store_inside_cost,
1883 &store_outside_cost,
1884 &dummy, &dummy,
1885 estimated_npeels, true);
1886 dummy.release ();
1888 else
1890 store_inside_cost = INT_MAX;
1891 store_outside_cost = INT_MAX;
1894 if (load_inside_cost > store_inside_cost
1895 || (load_inside_cost == store_inside_cost
1896 && load_outside_cost > store_outside_cost))
1898 dr0 = first_store;
1899 peel_for_unknown_alignment.inside_cost = store_inside_cost;
1900 peel_for_unknown_alignment.outside_cost = store_outside_cost;
1902 else
1904 peel_for_unknown_alignment.inside_cost = load_inside_cost;
1905 peel_for_unknown_alignment.outside_cost = load_outside_cost;
1908 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1909 prologue_cost_vec.create (2);
1910 epilogue_cost_vec.create (2);
1912 int dummy2;
1913 peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
1914 (loop_vinfo, estimated_npeels, &dummy2,
1915 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1916 &prologue_cost_vec, &epilogue_cost_vec);
1918 prologue_cost_vec.release ();
1919 epilogue_cost_vec.release ();
1921 peel_for_unknown_alignment.peel_info.count = 1
1922 + STMT_VINFO_SAME_ALIGN_REFS
1923 (vinfo_for_stmt (DR_STMT (dr0))).length ();
1926 peel_for_unknown_alignment.peel_info.npeel = 0;
1927 peel_for_unknown_alignment.peel_info.dr = dr0;
1929 best_peel = peel_for_unknown_alignment;
1931 peel_for_known_alignment.inside_cost = INT_MAX;
1932 peel_for_known_alignment.outside_cost = INT_MAX;
1933 peel_for_known_alignment.peel_info.count = 0;
1934 peel_for_known_alignment.peel_info.dr = NULL;
1936 if (do_peeling && one_misalignment_known)
1938 /* Peeling is possible, and every data access is supported even when it
1939 stays misaligned. So we try to choose the best possible peeling from
1940 the hash table. */
1941 peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
1942 (&peeling_htab, loop_vinfo);
1945 /* Compare costs of peeling for known and unknown alignment. */
1946 if (peel_for_known_alignment.peel_info.dr != NULL
1947 && peel_for_unknown_alignment.inside_cost
1948 >= peel_for_known_alignment.inside_cost)
1950 best_peel = peel_for_known_alignment;
1952 /* If the best peeling for known alignment has NPEEL == 0, perform no
1953 peeling at all except if there is an unsupportable dr that we can
1954 align. */
1955 if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
1956 do_peeling = false;
1959 /* If there is an unsupportable data ref, prefer this over all choices so far
1960 since we'd have to discard a chosen peeling except when it accidentally
1961 aligned the unsupportable data ref. */
1962 if (one_dr_unsupportable)
1963 dr0 = unsupportable_dr;
1964 else if (do_peeling)
1966 /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
1967 TODO: Use nopeel_outside_cost or get rid of it? */
1968 unsigned nopeel_inside_cost = 0;
1969 unsigned nopeel_outside_cost = 0;
1971 stmt_vector_for_cost dummy;
1972 dummy.create (2);
1973 vect_get_peeling_costs_all_drs (datarefs, NULL, &nopeel_inside_cost,
1974 &nopeel_outside_cost, &dummy, &dummy,
1975 0, false);
1976 dummy.release ();
1978 /* Add epilogue costs. As we do not peel for alignment here, no prologue
1979 costs will be recorded. */
1980 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1981 prologue_cost_vec.create (2);
1982 epilogue_cost_vec.create (2);
1984 int dummy2;
1985 nopeel_outside_cost += vect_get_known_peeling_cost
1986 (loop_vinfo, 0, &dummy2,
1987 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1988 &prologue_cost_vec, &epilogue_cost_vec);
1990 prologue_cost_vec.release ();
1991 epilogue_cost_vec.release ();
1993 npeel = best_peel.peel_info.npeel;
1994 dr0 = best_peel.peel_info.dr;
1996 /* If doing no peeling is at most as expensive as the best peeling
1997 we have found so far, don't perform any peeling. */
1998 if (nopeel_inside_cost <= best_peel.inside_cost)
1999 do_peeling = false;
2002 if (do_peeling)
2004 stmt = DR_STMT (dr0);
2005 stmt_info = vinfo_for_stmt (stmt);
2006 vectype = STMT_VINFO_VECTYPE (stmt_info);
2008 if (known_alignment_for_access_p (dr0))
2010 bool negative = tree_int_cst_compare (DR_STEP (dr0),
2011 size_zero_node) < 0;
2012 if (!npeel)
2014 /* Since it's known at compile time, compute the number of
2015 iterations in the peeled loop (the peeling factor) for use in
2016 updating DR_MISALIGNMENT values. The peeling factor is the
2017 target alignment minus the misalignment, taken modulo the target
2018 alignment and expressed as an element count. */
2019 mis = negative ? DR_MISALIGNMENT (dr0) : -DR_MISALIGNMENT (dr0);
2020 unsigned int target_align = DR_TARGET_ALIGNMENT (dr0);
2021 npeel = ((mis & (target_align - 1))
2022 / vect_get_scalar_dr_size (dr0));
2025 /* For interleaved data access every iteration accesses all the
2026 members of the group, therefore we divide the number of iterations
2027 by the group size. */
2028 stmt_info = vinfo_for_stmt (DR_STMT (dr0));
2029 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2030 npeel /= GROUP_SIZE (stmt_info);
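/* For example (values assumed): if peeling by 12 scalar elements is
   needed and the access belongs to a group of size 4, each peeled loop
   iteration covers all four group members, so only 12 / 4 = 3 loop
   iterations are peeled. */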
2032 if (dump_enabled_p ())
2033 dump_printf_loc (MSG_NOTE, vect_location,
2034 "Try peeling by %d\n", npeel);
2037 /* Ensure that all datarefs can be vectorized after the peel. */
2038 if (!vect_peeling_supportable (loop_vinfo, dr0, npeel))
2039 do_peeling = false;
2041 /* Check if all datarefs are supportable and log. */
2042 if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
2044 stat = vect_verify_datarefs_alignment (loop_vinfo);
2045 if (!stat)
2046 do_peeling = false;
2047 else
2048 return stat;
2051 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
2052 if (do_peeling)
2054 unsigned max_allowed_peel
2055 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
2056 if (max_allowed_peel != (unsigned)-1)
2058 unsigned max_peel = npeel;
2059 if (max_peel == 0)
2061 unsigned int target_align = DR_TARGET_ALIGNMENT (dr0);
2062 max_peel = target_align / vect_get_scalar_dr_size (dr0) - 1;
2064 if (max_peel > max_allowed_peel)
2066 do_peeling = false;
2067 if (dump_enabled_p ())
2068 dump_printf_loc (MSG_NOTE, vect_location,
2069 "Disable peeling, max peels reached: %d\n", max_peel);
2074 /* Cost model #2 - if peeling may result in a remaining loop not
2075 iterating enough to be vectorized then do not peel. Since this
2076 is a cost heuristic rather than a correctness decision, use the
2077 most likely runtime value for variable vectorization factors. */
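/* For instance, with an assumed vectorization factor of 8 and NPEEL of 0,
   MAX_PEEL below is 7; a loop known to run 12 iterations fails the
   12 >= 8 + 7 test and peeling is disabled, since the remaining loop
   might not reach even one full vector iteration. */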
2078 if (do_peeling
2079 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2081 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2082 unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
2083 if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
2084 < assumed_vf + max_peel)
2085 do_peeling = false;
2088 if (do_peeling)
2090 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2091 If the misalignment of DR_i is identical to that of dr0 then set
2092 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
2093 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2094 by the peeling factor times the element size of DR_i (MOD the
2095 vectorization factor times the size). Otherwise, the
2096 misalignment of DR_i must be set to unknown. */
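/* Using assumed numbers: with 4-byte elements, NPEEL = 1 and a 16-byte
   bound (vectorization factor times element size), a DR_i whose known
   misalignment is 4 bytes is updated to (4 + 1 * 4) % 16 = 8 bytes,
   while DR0 itself is reset to 0 below. */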
2097 FOR_EACH_VEC_ELT (datarefs, i, dr)
2098 if (dr != dr0)
2100 /* Strided accesses perform only component accesses, alignment
2101 is irrelevant for them. */
2102 stmt_info = vinfo_for_stmt (DR_STMT (dr));
2103 if (STMT_VINFO_STRIDED_P (stmt_info)
2104 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2105 continue;
2107 vect_update_misalignment_for_peel (dr, dr0, npeel);
2110 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
2111 if (npeel)
2112 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2113 else
2114 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2115 = DR_MISALIGNMENT (dr0);
2116 SET_DR_MISALIGNMENT (dr0, 0);
2117 if (dump_enabled_p ())
2119 dump_printf_loc (MSG_NOTE, vect_location,
2120 "Alignment of access forced using peeling.\n");
2121 dump_printf_loc (MSG_NOTE, vect_location,
2122 "Peeling for alignment will be applied.\n");
2125 /* The inside-loop cost will be accounted for in vectorizable_load
2126 and vectorizable_store correctly with adjusted alignments.
2127 Drop the body_cst_vec on the floor here. */
2128 stat = vect_verify_datarefs_alignment (loop_vinfo);
2129 gcc_assert (stat);
2130 return stat;
2134 /* (2) Versioning to force alignment. */
2136 /* Try versioning if:
2137 1) the loop is optimized for speed, and
2138 2) there is at least one unsupported misaligned data ref with an unknown
2139 misalignment, and
2140 3) all misaligned data refs with a known misalignment are supported, and
2141 4) the number of runtime alignment checks is within reason. */
2143 do_versioning =
2144 optimize_loop_nest_for_speed_p (loop)
2145 && (!loop->inner); /* FORNOW */
2147 if (do_versioning)
2149 FOR_EACH_VEC_ELT (datarefs, i, dr)
2151 stmt = DR_STMT (dr);
2152 stmt_info = vinfo_for_stmt (stmt);
2154 /* For interleaving, only the alignment of the first access
2155 matters. */
2156 if (aligned_access_p (dr)
2157 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2158 && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
2159 continue;
2161 if (STMT_VINFO_STRIDED_P (stmt_info))
2163 /* Strided loads perform only component accesses, alignment is
2164 irrelevant for them. */
2165 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
2166 continue;
2167 do_versioning = false;
2168 break;
2171 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
2173 if (!supportable_dr_alignment)
2175 gimple *stmt;
2176 int mask;
2177 tree vectype;
2179 if (known_alignment_for_access_p (dr)
2180 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2181 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
2183 do_versioning = false;
2184 break;
2187 stmt = DR_STMT (dr);
2188 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
2189 gcc_assert (vectype);
2191 /* At present we don't support versioning for alignment
2192 with variable VF, since there's no guarantee that the
2193 VF is a power of two. We could relax this if we added
2194 a way of enforcing a power-of-two size. */
2195 unsigned HOST_WIDE_INT size;
2196 if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
2198 do_versioning = false;
2199 break;
2202 /* The rightmost bits of an aligned address must be zeros.
2203 Construct the mask needed for this test. For example,
2204 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2205 mask must be 15 = 0xf. */
2206 mask = size - 1;
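/* The versioning condition emitted later ORs together the addresses of
   all recorded may-misalign references and tests them against this mask
   in one go, roughly ((addr_1 | addr_2 | ...) & mask) == 0, so a single
   branch guards the aligned version of the loop. */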
2208 /* FORNOW: use the same mask to test all potentially unaligned
2209 references in the loop. The vectorizer currently supports
2210 a single vector size, see the reference to
2211 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
2212 vectorization factor is computed. */
2213 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
2214 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
2215 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2216 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
2217 DR_STMT (dr));
2221 /* Versioning requires at least one misaligned data reference. */
2222 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2223 do_versioning = false;
2224 else if (!do_versioning)
2225 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2228 if (do_versioning)
2230 vec<gimple *> may_misalign_stmts
2231 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2232 gimple *stmt;
2234 /* It can now be assumed that the data references in the statements
2235 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2236 of the loop being vectorized. */
2237 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
2239 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2240 dr = STMT_VINFO_DATA_REF (stmt_info);
2241 SET_DR_MISALIGNMENT (dr, 0);
2242 if (dump_enabled_p ())
2243 dump_printf_loc (MSG_NOTE, vect_location,
2244 "Alignment of access forced using versioning.\n");
2247 if (dump_enabled_p ())
2248 dump_printf_loc (MSG_NOTE, vect_location,
2249 "Versioning for alignment will be applied.\n");
2251 /* Peeling and versioning can't be done together at this time. */
2252 gcc_assert (! (do_peeling && do_versioning));
2254 stat = vect_verify_datarefs_alignment (loop_vinfo);
2255 gcc_assert (stat);
2256 return stat;
2259 /* This point is reached if neither peeling nor versioning is being done. */
2260 gcc_assert (! (do_peeling || do_versioning));
2262 stat = vect_verify_datarefs_alignment (loop_vinfo);
2263 return stat;
2267 /* Function vect_find_same_alignment_drs.
2269 Update group and alignment relations according to the chosen
2270 vectorization factor. */
2272 static void
2273 vect_find_same_alignment_drs (struct data_dependence_relation *ddr)
2275 struct data_reference *dra = DDR_A (ddr);
2276 struct data_reference *drb = DDR_B (ddr);
2277 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2278 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2280 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
2281 return;
2283 if (dra == drb)
2284 return;
2286 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)
2287 || !operand_equal_p (DR_OFFSET (dra), DR_OFFSET (drb), 0)
2288 || !operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2289 return;
2291 /* Two references with distance zero have the same alignment. */
2292 poly_offset_int diff = (wi::to_poly_offset (DR_INIT (dra))
2293 - wi::to_poly_offset (DR_INIT (drb)));
2294 if (maybe_ne (diff, 0))
2296 /* Get the wider of the two alignments. */
2297 unsigned int align_a = (vect_calculate_target_alignment (dra)
2298 / BITS_PER_UNIT);
2299 unsigned int align_b = (vect_calculate_target_alignment (drb)
2300 / BITS_PER_UNIT);
2301 unsigned int max_align = MAX (align_a, align_b);
2303 /* Require the gap to be a multiple of the larger vector alignment. */
2304 if (!multiple_p (diff, max_align))
2305 return;
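/* For example, with a 16-byte MAX_ALIGN, two refs whose DR_INITs are
   0 and 64 pass the check above (the 64-byte gap is a multiple of 16),
   while DR_INITs of 0 and 24 do not, since whatever peeling aligns one
   of them leaves the other 8 bytes off. */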
2308 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2309 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
2310 if (dump_enabled_p ())
2312 dump_printf_loc (MSG_NOTE, vect_location,
2313 "accesses have the same alignment: ");
2314 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2315 dump_printf (MSG_NOTE, " and ");
2316 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2317 dump_printf (MSG_NOTE, "\n");
2322 /* Function vect_analyze_data_refs_alignment
2324 Analyze the alignment of the data-references in the loop.
2325 Return FALSE if a data reference is found that cannot be vectorized. */
2327 bool
2328 vect_analyze_data_refs_alignment (loop_vec_info vinfo)
2330 if (dump_enabled_p ())
2331 dump_printf_loc (MSG_NOTE, vect_location,
2332 "=== vect_analyze_data_refs_alignment ===\n");
2334 /* Mark groups of data references with same alignment using
2335 data dependence information. */
2336 vec<ddr_p> ddrs = vinfo->ddrs;
2337 struct data_dependence_relation *ddr;
2338 unsigned int i;
2340 FOR_EACH_VEC_ELT (ddrs, i, ddr)
2341 vect_find_same_alignment_drs (ddr);
2343 vec<data_reference_p> datarefs = vinfo->datarefs;
2344 struct data_reference *dr;
2346 vect_record_base_alignments (vinfo);
2347 FOR_EACH_VEC_ELT (datarefs, i, dr)
2349 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
2350 if (STMT_VINFO_VECTORIZABLE (stmt_info)
2351 && !vect_compute_data_ref_alignment (dr))
2353 /* Strided accesses perform only component accesses, misalignment
2354 information is irrelevant for them. */
2355 if (STMT_VINFO_STRIDED_P (stmt_info)
2356 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2357 continue;
2359 if (dump_enabled_p ())
2360 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2361 "not vectorized: can't calculate alignment "
2362 "for data ref.\n");
2364 return false;
2368 return true;
2372 /* Analyze alignment of DRs of stmts in NODE. */
2374 static bool
2375 vect_slp_analyze_and_verify_node_alignment (slp_tree node)
2377 /* We vectorize from the first scalar stmt in the node unless
2378 the node is permuted, in which case we start from the first
2379 element in the group. */
2380 gimple *first_stmt = SLP_TREE_SCALAR_STMTS (node)[0];
2381 data_reference_p first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2382 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2383 first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_stmt));
2385 data_reference_p dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2386 if (! vect_compute_data_ref_alignment (dr)
2387 /* For creating the data-ref pointer we need alignment of the
2388 first element anyway. */
2389 || (dr != first_dr
2390 && ! vect_compute_data_ref_alignment (first_dr))
2391 || ! verify_data_ref_alignment (dr))
2393 if (dump_enabled_p ())
2394 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2395 "not vectorized: bad data alignment in basic "
2396 "block.\n");
2397 return false;
2400 return true;
2403 /* Function vect_slp_analyze_and_verify_instance_alignment
2405 Analyze the alignment of the data-references in the SLP instance.
2406 Return FALSE if a data reference is found that cannot be vectorized. */
2408 bool
2409 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
2411 if (dump_enabled_p ())
2412 dump_printf_loc (MSG_NOTE, vect_location,
2413 "=== vect_slp_analyze_and_verify_instance_alignment ===\n");
2415 slp_tree node;
2416 unsigned i;
2417 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2418 if (! vect_slp_analyze_and_verify_node_alignment (node))
2419 return false;
2421 node = SLP_INSTANCE_TREE (instance);
2422 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]))
2423 && ! vect_slp_analyze_and_verify_node_alignment
2424 (SLP_INSTANCE_TREE (instance)))
2425 return false;
2427 return true;
2431 /* Analyze groups of accesses: check that DR belongs to a group of
2432 accesses of legal size, step, etc. Detect gaps, single element
2433 interleaving, and other special cases. Set grouped access info.
2434 Collect groups of interleaved stores for further use in SLP analysis.
2435 Worker for vect_analyze_group_access. */
2437 static bool
2438 vect_analyze_group_access_1 (struct data_reference *dr)
2440 tree step = DR_STEP (dr);
2441 tree scalar_type = TREE_TYPE (DR_REF (dr));
2442 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2443 gimple *stmt = DR_STMT (dr);
2444 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2445 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2446 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2447 HOST_WIDE_INT dr_step = -1;
2448 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2449 bool slp_impossible = false;
2451 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2452 size of the interleaving group (including gaps). */
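/* For example (values assumed): 4-byte accesses with DR_STEP = 32 bytes
   give GROUPSIZE = 32 / 4 = 8 slots per iteration, whether or not all
   eight slots are actually touched by group members. */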
2453 if (tree_fits_shwi_p (step))
2455 dr_step = tree_to_shwi (step);
2456 /* Check that STEP is a multiple of type size. Otherwise there is
2457 a non-element-sized gap at the end of the group which we
2458 cannot represent in GROUP_GAP or GROUP_SIZE.
2459 ??? As we can handle non-constant step fine here we should
2460 simply remove uses of GROUP_GAP between the last and first
2461 element and instead rely on DR_STEP. GROUP_SIZE then would
2462 simply not include that gap. */
2463 if ((dr_step % type_size) != 0)
2465 if (dump_enabled_p ())
2467 dump_printf_loc (MSG_NOTE, vect_location,
2468 "Step ");
2469 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2470 dump_printf (MSG_NOTE,
2471 " is not a multiple of the element size for ");
2472 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2473 dump_printf (MSG_NOTE, "\n");
2475 return false;
2477 groupsize = absu_hwi (dr_step) / type_size;
2479 else
2480 groupsize = 0;
2482 /* A non-consecutive access is possible only if it is part of an interleaving group. */
2483 if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2485 /* Check if this DR is part of an interleaving group, and is a single
2486 element of the group that is accessed in the loop. */
2488 /* Gaps are supported only for loads. STEP must be a multiple of the type
2489 size. */
2490 if (DR_IS_READ (dr)
2491 && (dr_step % type_size) == 0
2492 && groupsize > 0)
2494 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2495 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2496 GROUP_GAP (stmt_info) = groupsize - 1;
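/* E.g. (illustrative), a load of a[4*i] from an array of 4-byte elements
   has DR_STEP = 16, so GROUP_SIZE = 4 and GROUP_GAP = 3: the group's
   single member is followed by three skipped slots. */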
2497 if (dump_enabled_p ())
2499 dump_printf_loc (MSG_NOTE, vect_location,
2500 "Detected single element interleaving ");
2501 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2502 dump_printf (MSG_NOTE, " step ");
2503 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2504 dump_printf (MSG_NOTE, "\n");
2507 return true;
2510 if (dump_enabled_p ())
2512 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2513 "not consecutive access ");
2514 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2517 if (bb_vinfo)
2519 /* Mark the statement as unvectorizable. */
2520 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2521 return true;
2524 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2525 STMT_VINFO_STRIDED_P (stmt_info) = true;
2526 return true;
2529 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2531 /* First stmt in the interleaving chain. Check the chain. */
2532 gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2533 struct data_reference *data_ref = dr;
2534 unsigned int count = 1;
2535 tree prev_init = DR_INIT (data_ref);
2536 gimple *prev = stmt;
2537 HOST_WIDE_INT diff, gaps = 0;
2539 /* By construction, all group members have INTEGER_CST DR_INITs. */
2540 while (next)
2542 /* Skip same data-refs. In case two or more stmts share a
2543 data-ref (supported only for loads), we vectorize only the first
2544 stmt, and the rest get their vectorized loads from the first
2545 one. */
2546 if (!tree_int_cst_compare (DR_INIT (data_ref),
2547 DR_INIT (STMT_VINFO_DATA_REF (
2548 vinfo_for_stmt (next)))))
2550 if (DR_IS_WRITE (data_ref))
2552 if (dump_enabled_p ())
2553 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2554 "Two store stmts share the same dr.\n");
2555 return false;
2558 if (dump_enabled_p ())
2559 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2560 "Two or more load stmts share the same dr.\n");
2562 /* For load use the same data-ref load. */
2563 GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2565 prev = next;
2566 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2567 continue;
2570 prev = next;
2571 data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2573 /* All group members have the same STEP by construction. */
2574 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2576 /* Check that the distance between two accesses is equal to the type
2577 size. Otherwise, we have gaps. */
2578 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2579 - TREE_INT_CST_LOW (prev_init)) / type_size;
2580 if (diff != 1)
2582 /* FORNOW: SLP of accesses with gaps is not supported. */
2583 slp_impossible = true;
2584 if (DR_IS_WRITE (data_ref))
2586 if (dump_enabled_p ())
2587 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2588 "interleaved store with gaps\n");
2589 return false;
2592 gaps += diff - 1;
2595 last_accessed_element += diff;
2597 /* Store the gap from the previous member of the group. If there is no
2598 gap in the access, GROUP_GAP is always 1. */
2599 GROUP_GAP (vinfo_for_stmt (next)) = diff;
2601 prev_init = DR_INIT (data_ref);
2602 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2603 /* Count the number of data-refs in the chain. */
2604 count++;
2607 if (groupsize == 0)
2608 groupsize = count + gaps;
2610 /* This could be UINT_MAX but as we are generating code in a very
2611 inefficient way we have to cap earlier. See PR78699 for example. */
2612 if (groupsize > 4096)
2614 if (dump_enabled_p ())
2615 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2616 "group is too large\n");
2617 return false;
2620 /* Check that the size of the interleaving is equal to count for stores,
2621 i.e., that there are no gaps. */
2622 if (groupsize != count
2623 && !DR_IS_READ (dr))
2625 if (dump_enabled_p ())
2626 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2627 "interleaved store with gaps\n");
2628 return false;
2631 /* If there is a gap after the last load in the group it is the
2632 difference between the groupsize and the last accessed
2633 element.
2634 When there is no gap, this difference should be 0. */
2635 GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - last_accessed_element;
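/* As a worked example with assumed accesses: loads of a[3*i] and
   a[3*i+2] give GROUPSIZE = 3, a GROUP_GAP of 2 on the second member
   and a trailing gap of 3 - 3 = 0 here; loads of a[3*i] and a[3*i+1]
   instead leave a trailing gap of 3 - 2 = 1. */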
2637 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2638 if (dump_enabled_p ())
2640 dump_printf_loc (MSG_NOTE, vect_location,
2641 "Detected interleaving ");
2642 if (DR_IS_READ (dr))
2643 dump_printf (MSG_NOTE, "load ");
2644 else
2645 dump_printf (MSG_NOTE, "store ");
2646 dump_printf (MSG_NOTE, "of size %u starting with ",
2647 (unsigned)groupsize);
2648 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
2649 if (GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
2650 dump_printf_loc (MSG_NOTE, vect_location,
2651 "There is a gap of %u elements after the group\n",
2652 GROUP_GAP (vinfo_for_stmt (stmt)));
2655 /* SLP: create an SLP data structure for every interleaving group of
2656 stores for further analysis in vect_analyze_slp. */
2657 if (DR_IS_WRITE (dr) && !slp_impossible)
2659 if (loop_vinfo)
2660 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2661 if (bb_vinfo)
2662 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2666 return true;
2669 /* Analyze groups of accesses: check that DR belongs to a group of
2670 accesses of legal size, step, etc. Detect gaps, single element
2671 interleaving, and other special cases. Set grouped access info.
2672 Collect groups of interleaved stores for further use in SLP analysis. */
2674 static bool
2675 vect_analyze_group_access (struct data_reference *dr)
2677 if (!vect_analyze_group_access_1 (dr))
2679 /* Dissolve the group if present. */
2680 gimple *next;
2681 gimple *stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dr)));
2682 while (stmt)
2684 stmt_vec_info vinfo = vinfo_for_stmt (stmt);
2685 next = GROUP_NEXT_ELEMENT (vinfo);
2686 GROUP_FIRST_ELEMENT (vinfo) = NULL;
2687 GROUP_NEXT_ELEMENT (vinfo) = NULL;
2688 stmt = next;
2690 return false;
2692 return true;
2695 /* Analyze the access pattern of the data-reference DR.
2696 In case of non-consecutive accesses call vect_analyze_group_access() to
2697 analyze groups of accesses. */
2699 static bool
2700 vect_analyze_data_ref_access (struct data_reference *dr)
2702 tree step = DR_STEP (dr);
2703 tree scalar_type = TREE_TYPE (DR_REF (dr));
2704 gimple *stmt = DR_STMT (dr);
2705 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2706 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2707 struct loop *loop = NULL;
2709 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
2710 return true;
2712 if (loop_vinfo)
2713 loop = LOOP_VINFO_LOOP (loop_vinfo);
2715 if (loop_vinfo && !step)
2717 if (dump_enabled_p ())
2718 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2719 "bad data-ref access in loop\n");
2720 return false;
2723 /* Allow loads with zero step in inner-loop vectorization. */
2724 if (loop_vinfo && integer_zerop (step))
2726 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2727 if (!nested_in_vect_loop_p (loop, stmt))
2728 return DR_IS_READ (dr);
2729 /* Allow references with zero step only for outer loops marked
2730 with pragma omp simd - the pragma guarantees absence of
2731 loop-carried dependencies between inner loop iterations. */
2732 if (loop->safelen < 2)
2734 if (dump_enabled_p ())
2735 dump_printf_loc (MSG_NOTE, vect_location,
2736 "zero step in inner loop of nest\n");
2737 return false;
2741 if (loop && nested_in_vect_loop_p (loop, stmt))
2743 /* Interleaved accesses are not yet supported within outer-loop
2744 vectorization for references in the inner-loop. */
2745 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2747 /* For the rest of the analysis we use the outer-loop step. */
2748 step = STMT_VINFO_DR_STEP (stmt_info);
2749 if (integer_zerop (step))
2751 if (dump_enabled_p ())
2752 dump_printf_loc (MSG_NOTE, vect_location,
2753 "zero step in outer loop.\n");
2754 return DR_IS_READ (dr);
2758 /* Consecutive? */
2759 if (TREE_CODE (step) == INTEGER_CST)
2761 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2762 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2763 || (dr_step < 0
2764 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2766 /* Mark that it is not interleaving. */
2767 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2768 return true;
2772 if (loop && nested_in_vect_loop_p (loop, stmt))
2774 if (dump_enabled_p ())
2775 dump_printf_loc (MSG_NOTE, vect_location,
2776 "grouped access in outer loop.\n");
2777 return false;
2781 /* Assume this is a DR handled by the non-constant strided access case. */
2782 if (TREE_CODE (step) != INTEGER_CST)
2783 return (STMT_VINFO_STRIDED_P (stmt_info)
2784 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2785 || vect_analyze_group_access (dr)));
2787 /* Not consecutive access - check if it's a part of interleaving group. */
2788 return vect_analyze_group_access (dr);
2791 /* Compare two data-references DRA and DRB so that they can be sorted
2792 into chunks suitable for grouping. */
2794 static int
2795 dr_group_sort_cmp (const void *dra_, const void *drb_)
2797 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2798 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2799 int cmp;
2801 /* Stabilize sort. */
2802 if (dra == drb)
2803 return 0;
2805 /* DRs in different loops never belong to the same group. */
2806 loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father;
2807 loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father;
2808 if (loopa != loopb)
2809 return loopa->num < loopb->num ? -1 : 1;
2811 /* Ordering of DRs according to base. */
2812 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2813 DR_BASE_ADDRESS (drb));
2814 if (cmp != 0)
2815 return cmp;
2817 /* And according to DR_OFFSET. */
2818 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2819 if (cmp != 0)
2820 return cmp;
2822 /* Put reads before writes. */
2823 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2824 return DR_IS_READ (dra) ? -1 : 1;
2826 /* Then sort by access size. */
2827 cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2828 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2829 if (cmp != 0)
2830 return cmp;
2832 /* And by step. */
2833 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2834 if (cmp != 0)
2835 return cmp;
2837 /* Then sort by DR_INIT. In case of identical DRs, sort by stmt UID. */
2838 cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
2839 if (cmp == 0)
2840 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2841 return cmp;
2844 /* If OP is the result of a conversion, return the unconverted value,
2845 otherwise return null. */
2847 static tree
2848 strip_conversion (tree op)
2850 if (TREE_CODE (op) != SSA_NAME)
2851 return NULL_TREE;
2852 gimple *stmt = SSA_NAME_DEF_STMT (op);
2853 if (!is_gimple_assign (stmt)
2854 || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt)))
2855 return NULL_TREE;
2856 return gimple_assign_rhs1 (stmt);
2859 /* Return true if vectorizable_* routines can handle statements STMT1
2860 and STMT2 being in a single group. */
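/* For example (illustrative gimple), two masked loads

     _5 = .MASK_LOAD (p_1, 32B, mask_7);
     _6 = .MASK_LOAD (q_2, 32B, _8);

   can be grouped when _8 is a conversion of mask_7 (or vice versa),
   since strip_conversion reduces both mask operands to the same
   SSA name. */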
2862 static bool
2863 can_group_stmts_p (gimple *stmt1, gimple *stmt2)
2865 if (gimple_assign_single_p (stmt1))
2866 return gimple_assign_single_p (stmt2);
2868 if (is_gimple_call (stmt1) && gimple_call_internal_p (stmt1))
2870 /* Check for two masked loads or two masked stores. */
2871 if (!is_gimple_call (stmt2) || !gimple_call_internal_p (stmt2))
2872 return false;
2873 internal_fn ifn = gimple_call_internal_fn (stmt1);
2874 if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE)
2875 return false;
2876 if (ifn != gimple_call_internal_fn (stmt2))
2877 return false;
2879 /* Check that the masks are the same. Cope with casts of masks,
2880 like those created by build_mask_conversion. */
2881 tree mask1 = gimple_call_arg (stmt1, 2);
2882 tree mask2 = gimple_call_arg (stmt2, 2);
2883 if (!operand_equal_p (mask1, mask2, 0))
2885 mask1 = strip_conversion (mask1);
2886 if (!mask1)
2887 return false;
2888 mask2 = strip_conversion (mask2);
2889 if (!mask2)
2890 return false;
2891 if (!operand_equal_p (mask1, mask2, 0))
2892 return false;
2894 return true;
2897 return false;
2900 /* Function vect_analyze_data_ref_accesses.
2902 Analyze the access pattern of all the data references in the loop.
2904 FORNOW: the only access pattern that is considered vectorizable is a
2905 simple step 1 (consecutive) access.
2907 FORNOW: handle only arrays and pointer accesses. */
2909 bool
2910 vect_analyze_data_ref_accesses (vec_info *vinfo)
2912 unsigned int i;
2913 vec<data_reference_p> datarefs = vinfo->datarefs;
2914 struct data_reference *dr;
2916 if (dump_enabled_p ())
2917 dump_printf_loc (MSG_NOTE, vect_location,
2918 "=== vect_analyze_data_ref_accesses ===\n");
2920 if (datarefs.is_empty ())
2921 return true;
2923 /* Sort the array of datarefs to make building the interleaving chains
2924 linear. Don't modify the original vector's order, as it is needed
2925 for determining what dependencies are reversed. */
2926 vec<data_reference_p> datarefs_copy = datarefs.copy ();
2927 datarefs_copy.qsort (dr_group_sort_cmp);
2929 /* Build the interleaving chains. */
2930 for (i = 0; i < datarefs_copy.length () - 1;)
2932 data_reference_p dra = datarefs_copy[i];
2933 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2934 stmt_vec_info lastinfo = NULL;
2935 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
2936 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a))
2938 ++i;
2939 continue;
2941 for (i = i + 1; i < datarefs_copy.length (); ++i)
2943 data_reference_p drb = datarefs_copy[i];
2944 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2945 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b)
2946 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
2947 break;
2949 /* ??? Imperfect sorting (non-compatible types, non-modulo
2950 accesses, same accesses) can lead to a group being artificially
2951 split here as we don't just skip over those. If it really
2952 matters we can push those to a worklist and re-iterate
2953 over them. Then we can just skip ahead to the next DR here. */
2955 /* DRs in a different loop should not be put into the same
2956 interleaving group. */
2957 if (gimple_bb (DR_STMT (dra))->loop_father
2958 != gimple_bb (DR_STMT (drb))->loop_father)
2959 break;
2961 /* Check that the data-refs have the same first location (except init)
2962 and that they are both either stores or loads (not a load and a
2963 store, and not a mix of masked and unmasked accesses). */
2964 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2965 || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2966 DR_BASE_ADDRESS (drb)) != 0
2967 || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
2968 || !can_group_stmts_p (DR_STMT (dra), DR_STMT (drb)))
2969 break;
2971 /* Check that the data-refs have the same constant size. */
2972 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2973 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2974 if (!tree_fits_uhwi_p (sza)
2975 || !tree_fits_uhwi_p (szb)
2976 || !tree_int_cst_equal (sza, szb))
2977 break;
2979 /* Check that the data-refs have the same step. */
2980 if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
2981 break;
2983 /* Check the types are compatible.
2984 ??? We don't distinguish this during sorting. */
2985 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2986 TREE_TYPE (DR_REF (drb))))
2987 break;
2989 /* Check that the DR_INITs are compile-time constants. */
2990 if (TREE_CODE (DR_INIT (dra)) != INTEGER_CST
2991 || TREE_CODE (DR_INIT (drb)) != INTEGER_CST)
2992 break;
2994 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
2995 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2996 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2997 HOST_WIDE_INT init_prev
2998 = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]));
2999 gcc_assert (init_a <= init_b
3000 && init_a <= init_prev
3001 && init_prev <= init_b);
3003 /* Do not place the same access in the interleaving chain twice. */
3004 if (init_b == init_prev)
3006 gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]))
3007 < gimple_uid (DR_STMT (drb)));
3008 /* ??? For now we simply "drop" the later reference which is
3009 otherwise the same rather than finishing off this group.
3010 In the end we'd want to re-process duplicates forming
3011 multiple groups from the refs, likely by just collecting
3012 all candidates (including duplicates and split points
3013 below) in a vector and then processing them together. */
3014 continue;
3017 /* If init_b == init_a + the size of the type * k, we have an
3018 interleaving, and DRA is accessed before DRB. */
3019 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
3020 if (type_size_a == 0
3021 || (init_b - init_a) % type_size_a != 0)
3022 break;
3024 /* If we have a store, the accesses must be adjacent. This splits
3025 groups into chunks we support (we don't support vectorization
3026 of stores with gaps). */
3027 if (!DR_IS_READ (dra) && init_b - init_prev != type_size_a)
3028 break;
3030 /* If the step (when constant and nonzero) is not greater than the
3031 difference between the data-refs' inits, split the group here;
3032 this keeps groups at suitable sizes. */
3033 if (tree_fits_shwi_p (DR_STEP (dra)))
3035 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
3036 if (step != 0 && step <= (init_b - init_a))
3037 break;
3040 if (dump_enabled_p ())
3042 dump_printf_loc (MSG_NOTE, vect_location,
3043 "Detected interleaving ");
3044 if (DR_IS_READ (dra))
3045 dump_printf (MSG_NOTE, "load ");
3046 else
3047 dump_printf (MSG_NOTE, "store ");
3048 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
3049 dump_printf (MSG_NOTE, " and ");
3050 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
3051 dump_printf (MSG_NOTE, "\n");
3054 /* Link the found element into the group list. */
3055 if (!GROUP_FIRST_ELEMENT (stmtinfo_a))
3057 GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
3058 lastinfo = stmtinfo_a;
3060 GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
3061 GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
3062 lastinfo = stmtinfo_b;
3066 FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
3067 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
3068 && !vect_analyze_data_ref_access (dr))
3070 if (dump_enabled_p ())
3071 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3072 "not vectorized: complicated access pattern.\n");
3074 if (is_a <bb_vec_info> (vinfo))
3076 /* Mark the statement as not vectorizable. */
3077 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
3078 continue;
3080 else
3082 datarefs_copy.release ();
3083 return false;
3087 datarefs_copy.release ();
3088 return true;
3091 /* Function vect_vfa_segment_size.
3093 Input:
3094 DR: The data reference.
3095 LENGTH_FACTOR: segment length to consider.
3097 Return a value suitable for the dr_with_seg_len::seg_len field.
3098 This is the "distance travelled" by the pointer from the first
3099 iteration in the segment to the last. Note that it does not include
3100 the size of the access; in effect it only describes the first byte. */
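/* For example, with DR_STEP = 4 bytes and LENGTH_FACTOR = 8 (say, the
   vectorization factor), the segment length is (8 - 1) * 4 = 28 bytes:
   the last of the eight accesses starts 28 bytes after the first. */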
3102 static tree
3103 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
3105 length_factor = size_binop (MINUS_EXPR,
3106 fold_convert (sizetype, length_factor),
3107 size_one_node);
3108 return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr)),
3109 length_factor);
3112 /* Return a value that, when added to abs (vect_vfa_segment_size (dr)),
3113 gives the worst-case number of bytes covered by the segment. */
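/* For example (values assumed), a group of 4-byte accesses with
   GROUP_SIZE = 4 and a trailing GROUP_GAP of 1 yields 4 * (4 - 1) = 12
   bytes; a dr_explicit_realign_optimized access additionally adds a
   full vector minus one scalar element's worth of bytes. */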
3115 static unsigned HOST_WIDE_INT
3116 vect_vfa_access_size (data_reference *dr)
3118 stmt_vec_info stmt_vinfo = vinfo_for_stmt (DR_STMT (dr));
3119 tree ref_type = TREE_TYPE (DR_REF (dr));
3120 unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type));
3121 unsigned HOST_WIDE_INT access_size = ref_size;
3122 if (GROUP_FIRST_ELEMENT (stmt_vinfo))
3124 gcc_assert (GROUP_FIRST_ELEMENT (stmt_vinfo) == DR_STMT (dr));
3125 access_size *= GROUP_SIZE (stmt_vinfo) - GROUP_GAP (stmt_vinfo);
3127 if (STMT_VINFO_VEC_STMT (stmt_vinfo)
3128 && (vect_supportable_dr_alignment (dr, false)
3129 == dr_explicit_realign_optimized))
3131 /* We might access a full vector's worth. */
3132 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
3133 access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size;
3135 return access_size;
3138 /* Get the minimum alignment for all the scalar accesses that DR describes. */
3140 static unsigned int
3141 vect_vfa_align (const data_reference *dr)
3143 return TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr)));
3146 /* Function vect_compile_time_alias.
3148 Given data references A and B with equal base and offset, see whether
3149 the alias relation can be decided at compilation time. Return 1 if
3150 it can and the references alias, 0 if it can and the references do
3151 not alias, and -1 if we cannot decide at compile time. SEGMENT_LENGTH_A,
3152 SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
3153 of dr_with_seg_len::{seg_len,access_size} for A and B. */
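/* For instance, with assumed values: DR_INITs of 0 and 32, positive
   steps, segment lengths of 28 bytes and access sizes of 4 give the
   byte ranges [0, 32) and [32, 64), which cannot overlap, so 0 is
   returned; had the second DR_INIT been 16, the ranges would be known
   to overlap and 1 would be returned. */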
3155 static int
3156 vect_compile_time_alias (struct data_reference *a, struct data_reference *b,
3157 tree segment_length_a, tree segment_length_b,
3158 unsigned HOST_WIDE_INT access_size_a,
3159 unsigned HOST_WIDE_INT access_size_b)
3161 poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a));
3162 poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b));
3163 poly_uint64 const_length_a;
3164 poly_uint64 const_length_b;
3166 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3167 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3168 [a, a+12) */
3169 if (tree_int_cst_compare (DR_STEP (a), size_zero_node) < 0)
3171 const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
3172 offset_a = (offset_a + access_size_a) - const_length_a;
3174 else
3175 const_length_a = tree_to_poly_uint64 (segment_length_a);
3176 if (tree_int_cst_compare (DR_STEP (b), size_zero_node) < 0)
3178 const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
3179 offset_b = (offset_b + access_size_b) - const_length_b;
3181 else
3182 const_length_b = tree_to_poly_uint64 (segment_length_b);
3184 const_length_a += access_size_a;
3185 const_length_b += access_size_b;
3187 if (ranges_known_overlap_p (offset_a, const_length_a,
3188 offset_b, const_length_b))
3189 return 1;
3191 if (!ranges_maybe_overlap_p (offset_a, const_length_a,
3192 offset_b, const_length_b))
3193 return 0;
3195 return -1;
3198 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3199 in DDR is >= VF. */
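/* For example, a single distance vector of 4 at LOOP_DEPTH allows a
   constant VF of 4 but not a VF of 8: in the latter case two dependent
   scalar accesses could land in the same vector iteration, so this test
   alone cannot drop the runtime alias check. */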
3201 static bool
3202 dependence_distance_ge_vf (data_dependence_relation *ddr,
3203 unsigned int loop_depth, poly_uint64 vf)
3205 if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3206 || DDR_NUM_DIST_VECTS (ddr) == 0)
3207 return false;
3209 /* If the dependence is exact, we should have limited the VF instead. */
3210 gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3212 unsigned int i;
3213 lambda_vector dist_v;
3214 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3216 HOST_WIDE_INT dist = dist_v[loop_depth];
3217 if (dist != 0
3218 && !(dist > 0 && DDR_REVERSED_P (ddr))
3219 && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
3220 return false;
3223 if (dump_enabled_p ())
3225 dump_printf_loc (MSG_NOTE, vect_location,
3226 "dependence distance between ");
3227 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
3228 dump_printf (MSG_NOTE, " and ");
3229 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
3230 dump_printf (MSG_NOTE, " is >= VF\n");
3233 return true;
3236 /* Dump LOWER_BOUND using flags DUMP_KIND. Dumps are known to be enabled. */
3238 static void
3239 dump_lower_bound (int dump_kind, const vec_lower_bound &lower_bound)
3241 dump_printf (dump_kind, "%s (", lower_bound.unsigned_p ? "unsigned" : "abs");
3242 dump_generic_expr (dump_kind, TDF_SLIM, lower_bound.expr);
3243 dump_printf (dump_kind, ") >= ");
3244 dump_dec (dump_kind, lower_bound.min_value);
3247 /* Record that the vectorized loop requires the vec_lower_bound described
3248 by EXPR, UNSIGNED_P and MIN_VALUE. */
3250 static void
3251 vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p,
3252 poly_uint64 min_value)
3254 vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3255 for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3256 if (operand_equal_p (lower_bounds[i].expr, expr, 0))
3258 unsigned_p &= lower_bounds[i].unsigned_p;
3259 min_value = upper_bound (lower_bounds[i].min_value, min_value);
3260 if (lower_bounds[i].unsigned_p != unsigned_p
3261 || maybe_lt (lower_bounds[i].min_value, min_value))
3263 lower_bounds[i].unsigned_p = unsigned_p;
3264 lower_bounds[i].min_value = min_value;
3265 if (dump_enabled_p ())
3267 dump_printf_loc (MSG_NOTE, vect_location,
3268 "updating run-time check to ");
3269 dump_lower_bound (MSG_NOTE, lower_bounds[i]);
3270 dump_printf (MSG_NOTE, "\n");
3273 return;
3276 vec_lower_bound lower_bound (expr, unsigned_p, min_value);
3277 if (dump_enabled_p ())
3279 dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that ");
3280 dump_lower_bound (MSG_NOTE, lower_bound);
3281 dump_printf (MSG_NOTE, "\n");
3283 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (lower_bound);
3286 /* Return true if it's unlikely that the step of the vectorized form of DR
3287 will span fewer than GAP bytes. */
3289 static bool
3290 vect_small_gap_p (loop_vec_info loop_vinfo, data_reference *dr, poly_int64 gap)
3292 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
3293 HOST_WIDE_INT count
3294 = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
3295 if (GROUP_FIRST_ELEMENT (stmt_info))
3296 count *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
3297 return estimated_poly_value (gap) <= count * vect_get_scalar_dr_size (dr);
3300 /* Return true if we know that there is no alias between DR_A and DR_B
3301 when abs (DR_STEP (DR_A)) >= N for some N. When returning true, set
3302 *LOWER_BOUND_OUT to this N. */
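/* For example (assumed values), two accesses off the same base with
   DR_INITs of 0 and 20 bytes and a 4-byte scalar size give
   *LOWER_BOUND_OUT = 20 + 4 - 0 = 24: there is no alias between the
   vectorized accesses once abs (DR_STEP) is at least 24 bytes. */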
3304 static bool
3305 vectorizable_with_step_bound_p (data_reference *dr_a, data_reference *dr_b,
3306 poly_uint64 *lower_bound_out)
3308 /* Check that there is a constant gap of known sign between DR_A
3309 and DR_B. */
3310 poly_int64 init_a, init_b;
3311 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0)
3312 || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0)
3313 || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)
3314 || !poly_int_tree_p (DR_INIT (dr_a), &init_a)
3315 || !poly_int_tree_p (DR_INIT (dr_b), &init_b)
3316 || !ordered_p (init_a, init_b))
3317 return false;
3319 /* Sort DR_A and DR_B by the address they access. */
3320 if (maybe_lt (init_b, init_a))
3322 std::swap (init_a, init_b);
3323 std::swap (dr_a, dr_b);
3326 /* If the two accesses could be dependent within a scalar iteration,
3327 make sure that we'd retain their order. */
3328 if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_a), init_b)
3329 && !vect_preserves_scalar_order_p (DR_STMT (dr_a), DR_STMT (dr_b)))
3330 return false;
3332 /* There is no alias if abs (DR_STEP) is greater than or equal to
3333 the bytes spanned by the combination of the two accesses. */
3334 *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_b) - init_a;
3335 return true;
3338 /* Function vect_prune_runtime_alias_test_list.
3340 Prune a list of ddrs to be tested at run-time by versioning for alias.
3341 Merge several alias checks into one if possible.
3342 Return FALSE if resulting list of ddrs is longer then allowed by
3343 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
3345 bool
3346 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3348 typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3349 hash_set <tree_pair_hash> compared_objects;
3351 vec<ddr_p> may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3352 vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3353 = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3354 vec<vec_object_pair> &check_unequal_addrs
3355 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3356 poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3357 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3359 ddr_p ddr;
3360 unsigned int i;
3361 tree length_factor;
3363 if (dump_enabled_p ())
3364 dump_printf_loc (MSG_NOTE, vect_location,
3365 "=== vect_prune_runtime_alias_test_list ===\n");
3367 /* Step values are irrelevant for aliasing if the number of vector
3368 iterations is equal to the number of scalar iterations (which can
3369 happen for fully-SLP loops). */
3370 bool ignore_step_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U);
3372 if (!ignore_step_p)
3374 /* Convert the checks for nonzero steps into bound tests. */
3375 tree value;
3376 FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value)
3377 vect_check_lower_bound (loop_vinfo, value, true, 1);
3380 if (may_alias_ddrs.is_empty ())
3381 return true;
3383 comp_alias_ddrs.create (may_alias_ddrs.length ());
3385 unsigned int loop_depth
3386 = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3387 LOOP_VINFO_LOOP_NEST (loop_vinfo));
3389 /* First, we collect all data ref pairs for aliasing checks. */
3390 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3392 int comp_res;
3393 poly_uint64 lower_bound;
3394 struct data_reference *dr_a, *dr_b;
3395 gimple *dr_group_first_a, *dr_group_first_b;
3396 tree segment_length_a, segment_length_b;
3397 unsigned HOST_WIDE_INT access_size_a, access_size_b;
3398 unsigned int align_a, align_b;
3399 gimple *stmt_a, *stmt_b;
3401 /* Ignore the alias if the VF we chose ended up being no greater
3402 than the dependence distance. */
3403 if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3404 continue;
3406 if (DDR_OBJECT_A (ddr))
3408 vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3409 if (!compared_objects.add (new_pair))
3411 if (dump_enabled_p ())
3413 dump_printf_loc (MSG_NOTE, vect_location, "checking that ");
3414 dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.first);
3415 dump_printf (MSG_NOTE, " and ");
3416 dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.second);
3417 dump_printf (MSG_NOTE, " have different addresses\n");
3419 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3421 continue;
3424 dr_a = DDR_A (ddr);
3425 stmt_a = DR_STMT (DDR_A (ddr));
3427 dr_b = DDR_B (ddr);
3428 stmt_b = DR_STMT (DDR_B (ddr));
3430 /* Skip the pair if inter-iteration dependencies are irrelevant
3431 and intra-iteration dependencies are guaranteed to be honored. */
3432 if (ignore_step_p
3433 && (vect_preserves_scalar_order_p (stmt_a, stmt_b)
3434 || vectorizable_with_step_bound_p (dr_a, dr_b, &lower_bound)))
3436 if (dump_enabled_p ())
3438 dump_printf_loc (MSG_NOTE, vect_location,
3439 "no need for alias check between ");
3440 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
3441 dump_printf (MSG_NOTE, " and ");
3442 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
3443 dump_printf (MSG_NOTE, " when VF is 1\n");
3445 continue;
3448 /* See whether we can handle the alias using a bounds check on
3449 the step, and whether that's likely to be the best approach.
3450 (It might not be, for example, if the minimum step is much larger
3451 than the number of bytes handled by one vector iteration.) */
3452 if (!ignore_step_p
3453 && TREE_CODE (DR_STEP (dr_a)) != INTEGER_CST
3454 && vectorizable_with_step_bound_p (dr_a, dr_b, &lower_bound)
3455 && (vect_small_gap_p (loop_vinfo, dr_a, lower_bound)
3456 || vect_small_gap_p (loop_vinfo, dr_b, lower_bound)))
3458 bool unsigned_p = dr_known_forward_stride_p (dr_a);
3459 if (dump_enabled_p ())
3461 dump_printf_loc (MSG_NOTE, vect_location, "no alias between ");
3462 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
3463 dump_printf (MSG_NOTE, " and ");
3464 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
3465 dump_printf (MSG_NOTE, " when the step ");
3466 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_STEP (dr_a));
3467 dump_printf (MSG_NOTE, " is outside ");
3468 if (unsigned_p)
3469 dump_printf (MSG_NOTE, "[0");
3470 else
3472 dump_printf (MSG_NOTE, "(");
3473 dump_dec (MSG_NOTE, poly_int64 (-lower_bound));
3475 dump_printf (MSG_NOTE, ", ");
3476 dump_dec (MSG_NOTE, lower_bound);
3477 dump_printf (MSG_NOTE, ")\n");
3479 vect_check_lower_bound (loop_vinfo, DR_STEP (dr_a), unsigned_p,
3480 lower_bound);
3481 continue;
3484 dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
3485 if (dr_group_first_a)
3487 stmt_a = dr_group_first_a;
3488 dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
3491 dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
3492 if (dr_group_first_b)
3494 stmt_b = dr_group_first_b;
3495 dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
3498 if (ignore_step_p)
3500 segment_length_a = size_zero_node;
3501 segment_length_b = size_zero_node;
3503 else
3505 if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
3506 length_factor = scalar_loop_iters;
3507 else
3508 length_factor = size_int (vect_factor);
3509 segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
3510 segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
3512 access_size_a = vect_vfa_access_size (dr_a);
3513 access_size_b = vect_vfa_access_size (dr_b);
3514 align_a = vect_vfa_align (dr_a);
3515 align_b = vect_vfa_align (dr_b);
3517 comp_res = data_ref_compare_tree (DR_BASE_ADDRESS (dr_a),
3518 DR_BASE_ADDRESS (dr_b));
3519 if (comp_res == 0)
3520 comp_res = data_ref_compare_tree (DR_OFFSET (dr_a),
3521 DR_OFFSET (dr_b));
3523 /* See whether the alias is known at compilation time. */
3524 if (comp_res == 0
3525 && TREE_CODE (DR_STEP (dr_a)) == INTEGER_CST
3526 && TREE_CODE (DR_STEP (dr_b)) == INTEGER_CST
3527 && poly_int_tree_p (segment_length_a)
3528 && poly_int_tree_p (segment_length_b))
3530 int res = vect_compile_time_alias (dr_a, dr_b,
3531 segment_length_a,
3532 segment_length_b,
3533 access_size_a,
3534 access_size_b);
3535 if (res >= 0 && dump_enabled_p ())
3537 dump_printf_loc (MSG_NOTE, vect_location,
3538 "can tell at compile time that ");
3539 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
3540 dump_printf (MSG_NOTE, " and ");
3541 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
3542 if (res == 0)
3543 dump_printf (MSG_NOTE, " do not alias\n");
3544 else
3545 dump_printf (MSG_NOTE, " alias\n");
3548 if (res == 0)
3549 continue;
3551 if (res == 1)
3553 if (dump_enabled_p ())
3554 dump_printf_loc (MSG_NOTE, vect_location,
3555 "not vectorized: compilation time alias.\n");
3556 return false;
3560 dr_with_seg_len_pair_t dr_with_seg_len_pair
3561 (dr_with_seg_len (dr_a, segment_length_a, access_size_a, align_a),
3562 dr_with_seg_len (dr_b, segment_length_b, access_size_b, align_b));
3564 /* Canonicalize pairs by sorting the two DR members. */
3565 if (comp_res > 0)
3566 std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
3568 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3571 prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);
3573 unsigned int count = (comp_alias_ddrs.length ()
3574 + check_unequal_addrs.length ());
3576 dump_printf_loc (MSG_NOTE, vect_location,
3577 "improved number of alias checks from %d to %d\n",
3578 may_alias_ddrs.length (), count);
3579 if ((int) count > PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
3581 if (dump_enabled_p ())
3582 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3583 "number of versioning for alias "
3584 "run-time tests exceeds %d "
3585 "(--param vect-max-version-for-alias-checks)\n",
3586 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
3587 return false;
3590 return true;
3593 /* Check whether we can use an internal function for a gather load
3594 or scatter store. READ_P is true for loads and false for stores.
3595 MASKED_P is true if the load or store is conditional. MEMORY_TYPE is
3596 the type of the memory elements being loaded or stored. OFFSET_BITS
3597 is the number of bits in each scalar offset and OFFSET_SIGN is the
3598 sign of the offset. SCALE is the amount by which the offset should
3599 be multiplied *after* it has been converted to address width.
3601 Return true if the function is supported, storing the function
3602 id in *IFN_OUT and the type of a vector element in *ELEMENT_TYPE_OUT. */
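/* Illustrative example, not part of the original sources: an unmasked
   gather of 32-bit elements addressed with 32-bit signed offsets and a
   SCALE of 4 boils down to asking whether the target provides
   IFN_GATHER_LOAD for that vectype/memory-type/sign/scale combination,
   whereas 64-bit offsets with 32-bit vector elements are rejected up
   front because truncating the offset would not be safe.  */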
3604 bool
3605 vect_gather_scatter_fn_p (bool read_p, bool masked_p, tree vectype,
3606 tree memory_type, unsigned int offset_bits,
3607 signop offset_sign, int scale,
3608 internal_fn *ifn_out, tree *element_type_out)
3610 unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
3611 unsigned int element_bits = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (vectype)));
3612 if (offset_bits > element_bits)
3613 /* Internal functions require the offset to be the same width as
3614 the vector elements. We can extend narrower offsets, but it isn't
3615 safe to truncate wider offsets. */
3616 return false;
3618 if (element_bits != memory_bits)
3619 /* For now the vector elements must be the same width as the
3620 memory elements. */
3621 return false;
3623 /* Work out which function we need. */
3624 internal_fn ifn;
3625 if (read_p)
3626 ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
3627 else
3628 ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
3630 /* Test whether the target supports this combination. */
3631 if (!internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
3632 offset_sign, scale))
3633 return false;
3635 *ifn_out = ifn;
3636 *element_type_out = TREE_TYPE (vectype);
3637 return true;
3640 /* CALL is a call to an internal gather load or scatter store function.
3641 Describe the operation in INFO. */
3643 static void
3644 vect_describe_gather_scatter_call (gcall *call, gather_scatter_info *info)
3646 stmt_vec_info stmt_info = vinfo_for_stmt (call);
3647 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3648 data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3650 info->ifn = gimple_call_internal_fn (call);
3651 info->decl = NULL_TREE;
3652 info->base = gimple_call_arg (call, 0);
3653 info->offset = gimple_call_arg (call, 1);
3654 info->offset_dt = vect_unknown_def_type;
3655 info->offset_vectype = NULL_TREE;
3656 info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
3657 info->element_type = TREE_TYPE (vectype);
3658 info->memory_type = TREE_TYPE (DR_REF (dr));
3661 /* Return true if a non-affine read or write in STMT is suitable for a
3662 gather load or scatter store. Describe the operation in *INFO if so. */
3664 bool
3665 vect_check_gather_scatter (gimple *stmt, loop_vec_info loop_vinfo,
3666 gather_scatter_info *info)
3668 HOST_WIDE_INT scale = 1;
3669 poly_int64 pbitpos, pbitsize;
3670 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3671 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3672 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3673 tree offtype = NULL_TREE;
3674 tree decl = NULL_TREE, base, off;
3675 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3676 tree memory_type = TREE_TYPE (DR_REF (dr));
3677 machine_mode pmode;
3678 int punsignedp, reversep, pvolatilep = 0;
3679 internal_fn ifn;
3680 tree element_type;
3681 bool masked_p = false;
3683 /* See whether this is already a call to a gather/scatter internal function.
3684 If not, see whether it's a masked load or store. */
3685 gcall *call = dyn_cast <gcall *> (stmt);
3686 if (call && gimple_call_internal_p (call))
3688 ifn = gimple_call_internal_fn (stmt);
3689 if (internal_gather_scatter_fn_p (ifn))
3691 vect_describe_gather_scatter_call (call, info);
3692 return true;
3694 masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE);
3697 /* True if we should aim to use internal functions rather than
3698 built-in functions. */
3699 bool use_ifn_p = (DR_IS_READ (dr)
3700 ? supports_vec_gather_load_p ()
3701 : supports_vec_scatter_store_p ());
3703 base = DR_REF (dr);
3704 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3705 see if we can use the def stmt of the address. */
3706 if (masked_p
3707 && TREE_CODE (base) == MEM_REF
3708 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3709 && integer_zerop (TREE_OPERAND (base, 1))
3710 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3712 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3713 if (is_gimple_assign (def_stmt)
3714 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3715 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3718 /* The gather and scatter builtins need address of the form
3719 loop_invariant + vector * {1, 2, 4, 8}, or
3721 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3722 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3723 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3724 multiplications and additions in it. To get a vector, we need
3725 a single SSA_NAME that will be defined in the loop and will
3726 contain everything that is not loop invariant and that can be
3727 vectorized. The following code attempts to find such a preexisting
3728 SSA_NAME OFF and put the loop invariants into a tree BASE
3729 that can be gimplified before the loop. */
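/* Purely for illustration (an assumed example, not in the original
   comment): in a loop like

     for (i = 0; i < n; i++)
       sum += a[b[i]];

   the load a[b[i]] has no affine evolution, but assuming sizeof (a[0])
   is 1, 2, 4 or 8 it matches the form above, with BASE = &a[0] (loop
   invariant), OFF = the SSA_NAME holding b[i] (possibly sign-extended)
   and SCALE = sizeof (a[0]).  */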
3730 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3731 &punsignedp, &reversep, &pvolatilep);
3732 gcc_assert (base && !reversep);
3733 poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
3735 if (TREE_CODE (base) == MEM_REF)
3737 if (!integer_zerop (TREE_OPERAND (base, 1)))
3739 if (off == NULL_TREE)
3740 off = wide_int_to_tree (sizetype, mem_ref_offset (base));
3741 else
3742 off = size_binop (PLUS_EXPR, off,
3743 fold_convert (sizetype, TREE_OPERAND (base, 1)));
3745 base = TREE_OPERAND (base, 0);
3747 else
3748 base = build_fold_addr_expr (base);
3750 if (off == NULL_TREE)
3751 off = size_zero_node;
3753 /* If base is not loop invariant, then if off is 0 we start with just
3754 the constant offset in the loop invariant BASE and continue with base
3755 as OFF; otherwise give up.
3756 We could handle that case by gimplifying the addition of base + off
3757 into some SSA_NAME and using that as OFF, but for now punt. */
3758 if (!expr_invariant_in_loop_p (loop, base))
3760 if (!integer_zerop (off))
3761 return false;
3762 off = base;
3763 base = size_int (pbytepos);
3765 /* Otherwise put base + constant offset into the loop invariant BASE
3766 and continue with OFF. */
3767 else
3769 base = fold_convert (sizetype, base);
3770 base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
3773 /* OFF at this point may be either a SSA_NAME or some tree expression
3774 from get_inner_reference. Try to peel off loop invariants from it
3775 into BASE as long as possible. */
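/* A hedged sketch of the peeling below (not from the original sources):
   if OFF is currently  idx * 4 + n  with n loop invariant, the PLUS_EXPR
   case moves n (multiplied by the current SCALE) into BASE and continues
   with idx * 4; the MULT_EXPR case can then fold the constant 4 into
   SCALE, leaving OFF as the bare SSA_NAME idx defined in the loop.  */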
3776 STRIP_NOPS (off);
3777 while (offtype == NULL_TREE)
3779 enum tree_code code;
3780 tree op0, op1, add = NULL_TREE;
3782 if (TREE_CODE (off) == SSA_NAME)
3784 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3786 if (expr_invariant_in_loop_p (loop, off))
3787 return false;
3789 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3790 break;
3792 op0 = gimple_assign_rhs1 (def_stmt);
3793 code = gimple_assign_rhs_code (def_stmt);
3794 op1 = gimple_assign_rhs2 (def_stmt);
3796 else
3798 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3799 return false;
3800 code = TREE_CODE (off);
3801 extract_ops_from_tree (off, &code, &op0, &op1);
3803 switch (code)
3805 case POINTER_PLUS_EXPR:
3806 case PLUS_EXPR:
3807 if (expr_invariant_in_loop_p (loop, op0))
3809 add = op0;
3810 off = op1;
3811 do_add:
3812 add = fold_convert (sizetype, add);
3813 if (scale != 1)
3814 add = size_binop (MULT_EXPR, add, size_int (scale));
3815 base = size_binop (PLUS_EXPR, base, add);
3816 continue;
3818 if (expr_invariant_in_loop_p (loop, op1))
3820 add = op1;
3821 off = op0;
3822 goto do_add;
3824 break;
3825 case MINUS_EXPR:
3826 if (expr_invariant_in_loop_p (loop, op1))
3828 add = fold_convert (sizetype, op1);
3829 add = size_binop (MINUS_EXPR, size_zero_node, add);
3830 off = op0;
3831 goto do_add;
3833 break;
3834 case MULT_EXPR:
3835 if (scale == 1 && tree_fits_shwi_p (op1))
3837 int new_scale = tree_to_shwi (op1);
3838 /* Only treat this as a scaling operation if the target
3839 supports it. */
3840 if (use_ifn_p
3841 && !vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p,
3842 vectype, memory_type, 1,
3843 TYPE_SIGN (TREE_TYPE (op0)),
3844 new_scale, &ifn,
3845 &element_type))
3846 break;
3847 scale = new_scale;
3848 off = op0;
3849 continue;
3851 break;
3852 case SSA_NAME:
3853 off = op0;
3854 continue;
3855 CASE_CONVERT:
3856 if (!POINTER_TYPE_P (TREE_TYPE (op0))
3857 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3858 break;
3859 if (TYPE_PRECISION (TREE_TYPE (op0))
3860 == TYPE_PRECISION (TREE_TYPE (off)))
3862 off = op0;
3863 continue;
3866 /* The internal functions need the offset to be the same width
3867 as the elements of VECTYPE. Don't include operations that
3868 cast the offset from that width to a different width. */
3869 if (use_ifn_p
3870 && (int_size_in_bytes (TREE_TYPE (vectype))
3871 == int_size_in_bytes (TREE_TYPE (off))))
3872 break;
3874 if (TYPE_PRECISION (TREE_TYPE (op0))
3875 < TYPE_PRECISION (TREE_TYPE (off)))
3877 off = op0;
3878 offtype = TREE_TYPE (off);
3879 STRIP_NOPS (off);
3880 continue;
3882 break;
3883 default:
3884 break;
3886 break;
3889 /* If at the end OFF still isn't a SSA_NAME or isn't
3890 defined in the loop, punt. */
3891 if (TREE_CODE (off) != SSA_NAME
3892 || expr_invariant_in_loop_p (loop, off))
3893 return false;
3895 if (offtype == NULL_TREE)
3896 offtype = TREE_TYPE (off);
3898 if (use_ifn_p)
3900 if (!vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p, vectype,
3901 memory_type, TYPE_PRECISION (offtype),
3902 TYPE_SIGN (offtype), scale, &ifn,
3903 &element_type))
3904 return false;
3906 else
3908 if (DR_IS_READ (dr))
3910 if (targetm.vectorize.builtin_gather)
3911 decl = targetm.vectorize.builtin_gather (vectype, offtype, scale);
3913 else
3915 if (targetm.vectorize.builtin_scatter)
3916 decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale);
3919 if (!decl)
3920 return false;
3922 ifn = IFN_LAST;
3923 element_type = TREE_TYPE (vectype);
3926 info->ifn = ifn;
3927 info->decl = decl;
3928 info->base = base;
3929 info->offset = off;
3930 info->offset_dt = vect_unknown_def_type;
3931 info->offset_vectype = NULL_TREE;
3932 info->scale = scale;
3933 info->element_type = element_type;
3934 info->memory_type = memory_type;
3935 return true;
3938 /* Function vect_analyze_data_refs.
3940 Find all the data references in the loop or basic block.
3942 The general structure of the analysis of data refs in the vectorizer is as
3943 follows:
3944 1- vect_analyze_data_refs(loop/bb): call
3945 compute_data_dependences_for_loop/bb to find and analyze all data-refs
3946 in the loop/bb and their dependences.
3947 2- vect_analyze_dependences(): apply dependence testing using ddrs.
3948 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
3949 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
3953 bool
3954 vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf)
3956 struct loop *loop = NULL;
3957 unsigned int i;
3958 struct data_reference *dr;
3959 tree scalar_type;
3961 if (dump_enabled_p ())
3962 dump_printf_loc (MSG_NOTE, vect_location,
3963 "=== vect_analyze_data_refs ===\n");
3965 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3966 loop = LOOP_VINFO_LOOP (loop_vinfo);
3968 /* Go through the data-refs, check that the analysis succeeded. Update
3969 pointer from stmt_vec_info struct to DR and vectype. */
3971 vec<data_reference_p> datarefs = vinfo->datarefs;
3972 FOR_EACH_VEC_ELT (datarefs, i, dr)
3974 gimple *stmt;
3975 stmt_vec_info stmt_info;
3976 tree base, offset, init;
3977 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
3978 bool simd_lane_access = false;
3979 poly_uint64 vf;
3981 again:
3982 if (!dr || !DR_REF (dr))
3984 if (dump_enabled_p ())
3985 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3986 "not vectorized: unhandled data-ref\n");
3987 return false;
3990 stmt = DR_STMT (dr);
3991 stmt_info = vinfo_for_stmt (stmt);
3993 /* Discard clobbers from the dataref vector. We will remove
3994 clobber stmts during vectorization. */
3995 if (gimple_clobber_p (stmt))
3997 free_data_ref (dr);
3998 if (i == datarefs.length () - 1)
4000 datarefs.pop ();
4001 break;
4003 datarefs.ordered_remove (i);
4004 dr = datarefs[i];
4005 goto again;
4008 /* Check that analysis of the data-ref succeeded. */
4009 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
4010 || !DR_STEP (dr))
4012 bool maybe_gather
4013 = DR_IS_READ (dr)
4014 && !TREE_THIS_VOLATILE (DR_REF (dr))
4015 && (targetm.vectorize.builtin_gather != NULL
4016 || supports_vec_gather_load_p ());
4017 bool maybe_scatter
4018 = DR_IS_WRITE (dr)
4019 && !TREE_THIS_VOLATILE (DR_REF (dr))
4020 && (targetm.vectorize.builtin_scatter != NULL
4021 || supports_vec_scatter_store_p ());
4022 bool maybe_simd_lane_access
4023 = is_a <loop_vec_info> (vinfo) && loop->simduid;
4025 /* If target supports vector gather loads or scatter stores, or if
4026 this might be a SIMD lane access, see if they can be used. */
4027 if (is_a <loop_vec_info> (vinfo)
4028 && (maybe_gather || maybe_scatter || maybe_simd_lane_access)
4029 && !nested_in_vect_loop_p (loop, stmt))
4031 struct data_reference *newdr
4032 = create_data_ref (NULL, loop_containing_stmt (stmt),
4033 DR_REF (dr), stmt, !maybe_scatter,
4034 DR_IS_CONDITIONAL_IN_STMT (dr));
4035 gcc_assert (newdr != NULL && DR_REF (newdr));
4036 if (DR_BASE_ADDRESS (newdr)
4037 && DR_OFFSET (newdr)
4038 && DR_INIT (newdr)
4039 && DR_STEP (newdr)
4040 && integer_zerop (DR_STEP (newdr)))
4042 if (maybe_simd_lane_access)
4044 tree off = DR_OFFSET (newdr);
4045 STRIP_NOPS (off);
4046 if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
4047 && TREE_CODE (off) == MULT_EXPR
4048 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
4050 tree step = TREE_OPERAND (off, 1);
4051 off = TREE_OPERAND (off, 0);
4052 STRIP_NOPS (off);
4053 if (CONVERT_EXPR_P (off)
4054 && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
4055 0)))
4056 < TYPE_PRECISION (TREE_TYPE (off)))
4057 off = TREE_OPERAND (off, 0);
4058 if (TREE_CODE (off) == SSA_NAME)
4060 gimple *def = SSA_NAME_DEF_STMT (off);
4061 tree reft = TREE_TYPE (DR_REF (newdr));
4062 if (is_gimple_call (def)
4063 && gimple_call_internal_p (def)
4064 && (gimple_call_internal_fn (def)
4065 == IFN_GOMP_SIMD_LANE))
4067 tree arg = gimple_call_arg (def, 0);
4068 gcc_assert (TREE_CODE (arg) == SSA_NAME);
4069 arg = SSA_NAME_VAR (arg);
4070 if (arg == loop->simduid
4071 /* For now. */
4072 && tree_int_cst_equal
4073 (TYPE_SIZE_UNIT (reft),
4074 step))
4076 DR_OFFSET (newdr) = ssize_int (0);
4077 DR_STEP (newdr) = step;
4078 DR_OFFSET_ALIGNMENT (newdr)
4079 = BIGGEST_ALIGNMENT;
4080 DR_STEP_ALIGNMENT (newdr)
4081 = highest_pow2_factor (step);
4082 dr = newdr;
4083 simd_lane_access = true;
4089 if (!simd_lane_access && (maybe_gather || maybe_scatter))
4091 dr = newdr;
4092 if (maybe_gather)
4093 gatherscatter = GATHER;
4094 else
4095 gatherscatter = SCATTER;
4098 if (gatherscatter == SG_NONE && !simd_lane_access)
4099 free_data_ref (newdr);
4102 if (gatherscatter == SG_NONE && !simd_lane_access)
4104 if (dump_enabled_p ())
4106 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4107 "not vectorized: data ref analysis "
4108 "failed ");
4109 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4112 if (is_a <bb_vec_info> (vinfo))
4113 break;
4115 return false;
4119 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
4121 if (dump_enabled_p ())
4122 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4123 "not vectorized: base addr of dr is a "
4124 "constant\n");
4126 if (is_a <bb_vec_info> (vinfo))
4127 break;
4129 if (gatherscatter != SG_NONE || simd_lane_access)
4130 free_data_ref (dr);
4131 return false;
4134 if (TREE_THIS_VOLATILE (DR_REF (dr)))
4136 if (dump_enabled_p ())
4138 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4139 "not vectorized: volatile type ");
4140 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4143 if (is_a <bb_vec_info> (vinfo))
4144 break;
4146 return false;
4149 if (stmt_can_throw_internal (stmt))
4151 if (dump_enabled_p ())
4153 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4154 "not vectorized: statement can throw an "
4155 "exception ");
4156 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4159 if (is_a <bb_vec_info> (vinfo))
4160 break;
4162 if (gatherscatter != SG_NONE || simd_lane_access)
4163 free_data_ref (dr);
4164 return false;
4167 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
4168 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
4170 if (dump_enabled_p ())
4172 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4173 "not vectorized: statement is bitfield "
4174 "access ");
4175 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4178 if (is_a <bb_vec_info> (vinfo))
4179 break;
4181 if (gatherscatter != SG_NONE || simd_lane_access)
4182 free_data_ref (dr);
4183 return false;
4186 base = unshare_expr (DR_BASE_ADDRESS (dr));
4187 offset = unshare_expr (DR_OFFSET (dr));
4188 init = unshare_expr (DR_INIT (dr));
4190 if (is_gimple_call (stmt)
4191 && (!gimple_call_internal_p (stmt)
4192 || (gimple_call_internal_fn (stmt) != IFN_MASK_LOAD
4193 && gimple_call_internal_fn (stmt) != IFN_MASK_STORE)))
4195 if (dump_enabled_p ())
4197 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4198 "not vectorized: dr in a call ");
4199 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4202 if (is_a <bb_vec_info> (vinfo))
4203 break;
4205 if (gatherscatter != SG_NONE || simd_lane_access)
4206 free_data_ref (dr);
4207 return false;
4210 /* Update DR field in stmt_vec_info struct. */
4212 /* If the dataref is in an inner-loop of the loop that is considered for
4213 vectorization, we also want to analyze the access relative to
4214 the outer-loop (DR contains information only relative to the
4215 inner-most enclosing loop). We do that by building a reference to the
4216 first location accessed by the inner-loop, and analyze it relative to
4217 the outer-loop. */
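/* For example (an illustrative sketch, not part of the original comment):
   for a dataref in[i+j] in a j-loop nested in the vectorized i-loop, with
   'in' pointing to short, the reference built below is *(&in + i*2B),
   i.e. in[i]; it is invariant in the inner loop and can therefore be
   analyzed relative to the outer loop.  */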
4218 if (loop && nested_in_vect_loop_p (loop, stmt))
4220 /* Build a reference to the first location accessed by the
4221 inner loop: *(BASE + INIT + OFFSET). By construction,
4222 this address must be invariant in the inner loop, so we
4223 can consider it as being used in the outer loop. */
4224 tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
4225 init, offset);
4226 tree init_addr = fold_build_pointer_plus (base, init_offset);
4227 tree init_ref = build_fold_indirect_ref (init_addr);
4229 if (dump_enabled_p ())
4231 dump_printf_loc (MSG_NOTE, vect_location,
4232 "analyze in outer loop: ");
4233 dump_generic_expr (MSG_NOTE, TDF_SLIM, init_ref);
4234 dump_printf (MSG_NOTE, "\n");
4237 if (!dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
4238 init_ref, loop))
4239 /* dr_analyze_innermost already explained the failure. */
4240 return false;
4242 if (dump_enabled_p ())
4244 dump_printf_loc (MSG_NOTE, vect_location,
4245 "\touter base_address: ");
4246 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4247 STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
4248 dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
4249 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4250 STMT_VINFO_DR_OFFSET (stmt_info));
4251 dump_printf (MSG_NOTE,
4252 "\n\touter constant offset from base address: ");
4253 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4254 STMT_VINFO_DR_INIT (stmt_info));
4255 dump_printf (MSG_NOTE, "\n\touter step: ");
4256 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4257 STMT_VINFO_DR_STEP (stmt_info));
4258 dump_printf (MSG_NOTE, "\n\touter base alignment: %d\n",
4259 STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info));
4260 dump_printf (MSG_NOTE, "\n\touter base misalignment: %d\n",
4261 STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info));
4262 dump_printf (MSG_NOTE, "\n\touter offset alignment: %d\n",
4263 STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info));
4264 dump_printf (MSG_NOTE, "\n\touter step alignment: %d\n",
4265 STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
4269 if (STMT_VINFO_DATA_REF (stmt_info))
4271 if (dump_enabled_p ())
4273 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4274 "not vectorized: more than one data ref "
4275 "in stmt: ");
4276 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4279 if (is_a <bb_vec_info> (vinfo))
4280 break;
4282 if (gatherscatter != SG_NONE || simd_lane_access)
4283 free_data_ref (dr);
4284 return false;
4287 STMT_VINFO_DATA_REF (stmt_info) = dr;
4288 if (simd_lane_access)
4290 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
4291 free_data_ref (datarefs[i]);
4292 datarefs[i] = dr;
4295 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == ADDR_EXPR
4296 && VAR_P (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0))
4297 && DECL_NONALIASED (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0)))
4299 if (dump_enabled_p ())
4301 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4302 "not vectorized: base object not addressable "
4303 "for stmt: ");
4304 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4306 if (is_a <bb_vec_info> (vinfo))
4308 /* In BB vectorization the ref can still participate
4309 in dependence analysis, we just can't vectorize it. */
4310 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4311 continue;
4313 return false;
4316 /* Set vectype for STMT. */
4317 scalar_type = TREE_TYPE (DR_REF (dr));
4318 STMT_VINFO_VECTYPE (stmt_info)
4319 = get_vectype_for_scalar_type (scalar_type);
4320 if (!STMT_VINFO_VECTYPE (stmt_info))
4322 if (dump_enabled_p ())
4324 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4325 "not vectorized: no vectype for stmt: ");
4326 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4327 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
4328 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
4329 scalar_type);
4330 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
4333 if (is_a <bb_vec_info> (vinfo))
4335 /* No vector type is fine, the ref can still participate
4336 in dependence analysis, we just can't vectorize it. */
4337 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4338 continue;
4341 if (gatherscatter != SG_NONE || simd_lane_access)
4343 STMT_VINFO_DATA_REF (stmt_info) = NULL;
4344 if (gatherscatter != SG_NONE)
4345 free_data_ref (dr);
4347 return false;
4349 else
4351 if (dump_enabled_p ())
4353 dump_printf_loc (MSG_NOTE, vect_location,
4354 "got vectype for stmt: ");
4355 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
4356 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4357 STMT_VINFO_VECTYPE (stmt_info));
4358 dump_printf (MSG_NOTE, "\n");
4362 /* Adjust the minimal vectorization factor according to the
4363 vector type. */
4364 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
4365 *min_vf = upper_bound (*min_vf, vf);
4367 if (gatherscatter != SG_NONE)
4369 gather_scatter_info gs_info;
4370 if (!vect_check_gather_scatter (stmt, as_a <loop_vec_info> (vinfo),
4371 &gs_info)
4372 || !get_vectype_for_scalar_type (TREE_TYPE (gs_info.offset)))
4374 STMT_VINFO_DATA_REF (stmt_info) = NULL;
4375 free_data_ref (dr);
4376 if (dump_enabled_p ())
4378 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4379 (gatherscatter == GATHER) ?
4380 "not vectorized: not suitable for gather "
4381 "load " :
4382 "not vectorized: not suitable for scatter "
4383 "store ");
4384 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4386 return false;
4389 free_data_ref (datarefs[i]);
4390 datarefs[i] = dr;
4391 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
4394 else if (is_a <loop_vec_info> (vinfo)
4395 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
4397 if (nested_in_vect_loop_p (loop, stmt))
4399 if (dump_enabled_p ())
4401 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4402 "not vectorized: not suitable for strided "
4403 "load ");
4404 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4406 return false;
4408 STMT_VINFO_STRIDED_P (stmt_info) = true;
4412 /* If we stopped analysis at the first dataref we could not analyze
4413 when trying to vectorize a basic-block, mark the rest of the datarefs
4414 as not vectorizable and truncate the vector of datarefs. That
4415 avoids spending useless time in analyzing their dependence. */
4416 if (i != datarefs.length ())
4418 gcc_assert (is_a <bb_vec_info> (vinfo));
4419 for (unsigned j = i; j < datarefs.length (); ++j)
4421 data_reference_p dr = datarefs[j];
4422 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
4423 free_data_ref (dr);
4425 datarefs.truncate (i);
4428 return true;
4432 /* Function vect_get_new_vect_var.
4434 Returns a name for a new variable. The current naming scheme uses a
4435 prefix determined by the value of VAR_KIND (such as "vect_" or "vectp_")
4436 for vectorizer generated variables, and appends NAME to that prefix
4437 if NAME is provided. */
4439 tree
4440 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
4442 const char *prefix;
4443 tree new_vect_var;
4445 switch (var_kind)
4447 case vect_simple_var:
4448 prefix = "vect";
4449 break;
4450 case vect_scalar_var:
4451 prefix = "stmp";
4452 break;
4453 case vect_mask_var:
4454 prefix = "mask";
4455 break;
4456 case vect_pointer_var:
4457 prefix = "vectp";
4458 break;
4459 default:
4460 gcc_unreachable ();
4463 if (name)
4465 char* tmp = concat (prefix, "_", name, NULL);
4466 new_vect_var = create_tmp_reg (type, tmp);
4467 free (tmp);
4469 else
4470 new_vect_var = create_tmp_reg (type, prefix);
4472 return new_vect_var;
4475 /* Like vect_get_new_vect_var but return an SSA name. */
4477 tree
4478 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
4480 const char *prefix;
4481 tree new_vect_var;
4483 switch (var_kind)
4485 case vect_simple_var:
4486 prefix = "vect";
4487 break;
4488 case vect_scalar_var:
4489 prefix = "stmp";
4490 break;
4491 case vect_pointer_var:
4492 prefix = "vectp";
4493 break;
4494 default:
4495 gcc_unreachable ();
4498 if (name)
4500 char* tmp = concat (prefix, "_", name, NULL);
4501 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4502 free (tmp);
4504 else
4505 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4507 return new_vect_var;
4510 /* Duplicate ptr info and set alignment/misalignment on NAME from DR. */
4512 static void
4513 vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr)
4515 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr));
4516 int misalign = DR_MISALIGNMENT (dr);
4517 if (misalign == DR_MISALIGNMENT_UNKNOWN)
4518 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4519 else
4520 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name),
4521 DR_TARGET_ALIGNMENT (dr), misalign);
4524 /* Function vect_create_addr_base_for_vector_ref.
4526 Create an expression that computes the address of the first memory location
4527 that will be accessed for a data reference.
4529 Input:
4530 STMT: The statement containing the data reference.
4531 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4532 OFFSET: Optional. If supplied, it is added to the initial address.
4533 LOOP: Specify relative to which loop-nest the address should be computed.
4534 For example, when the dataref is in an inner-loop nested in an
4535 outer-loop that is now being vectorized, LOOP can be either the
4536 outer-loop, or the inner-loop. The first memory location accessed
4537 by the following dataref ('in' points to short):
4539 for (i=0; i<N; i++)
4540 for (j=0; j<M; j++)
4541 s += in[i+j]
4543 is as follows:
4544 if LOOP=i_loop: &in (relative to i_loop)
4545 if LOOP=j_loop: &in+i*2B (relative to j_loop)
4546 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
4547 initial address. Unlike OFFSET, which is the number of elements to
4548 be added, BYTE_OFFSET is measured in bytes.
4550 Output:
4551 1. Return an SSA_NAME whose value is the address of the memory location of
4552 the first vector of the data reference.
4553 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4554 these statement(s) which define the returned SSA_NAME.
4556 FORNOW: We are only handling array accesses with step 1. */
4558 tree
4559 vect_create_addr_base_for_vector_ref (gimple *stmt,
4560 gimple_seq *new_stmt_list,
4561 tree offset,
4562 tree byte_offset)
4564 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4565 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4566 const char *base_name;
4567 tree addr_base;
4568 tree dest;
4569 gimple_seq seq = NULL;
4570 tree vect_ptr_type;
4571 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
4572 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4573 innermost_loop_behavior *drb = vect_dr_behavior (dr);
4575 tree data_ref_base = unshare_expr (drb->base_address);
4576 tree base_offset = unshare_expr (drb->offset);
4577 tree init = unshare_expr (drb->init);
4579 if (loop_vinfo)
4580 base_name = get_name (data_ref_base);
4581 else
4583 base_offset = ssize_int (0);
4584 init = ssize_int (0);
4585 base_name = get_name (DR_REF (dr));
4588 /* Create base_offset */
4589 base_offset = size_binop (PLUS_EXPR,
4590 fold_convert (sizetype, base_offset),
4591 fold_convert (sizetype, init));
4593 if (offset)
4595 offset = fold_build2 (MULT_EXPR, sizetype,
4596 fold_convert (sizetype, offset), step);
4597 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4598 base_offset, offset);
4600 if (byte_offset)
4602 byte_offset = fold_convert (sizetype, byte_offset);
4603 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4604 base_offset, byte_offset);
4607 /* base + base_offset */
4608 if (loop_vinfo)
4609 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4610 else
4612 addr_base = build1 (ADDR_EXPR,
4613 build_pointer_type (TREE_TYPE (DR_REF (dr))),
4614 unshare_expr (DR_REF (dr)));
4617 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4618 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4619 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4620 gimple_seq_add_seq (new_stmt_list, seq);
4622 if (DR_PTR_INFO (dr)
4623 && TREE_CODE (addr_base) == SSA_NAME
4624 && !SSA_NAME_PTR_INFO (addr_base))
4626 vect_duplicate_ssa_name_ptr_info (addr_base, dr);
4627 if (offset || byte_offset)
4628 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4631 if (dump_enabled_p ())
4633 dump_printf_loc (MSG_NOTE, vect_location, "created ");
4634 dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
4635 dump_printf (MSG_NOTE, "\n");
4638 return addr_base;
4642 /* Function vect_create_data_ref_ptr.
4644 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4645 location accessed in the loop by STMT, along with the def-use update
4646 chain to appropriately advance the pointer through the loop iterations.
4647 Also set aliasing information for the pointer. This pointer is used by
4648 the callers to this function to create a memory reference expression for
4649 vector load/store access.
4651 Input:
4652 1. STMT: a stmt that references memory. Expected to be of the form
4653 GIMPLE_ASSIGN <name, data-ref> or
4654 GIMPLE_ASSIGN <data-ref, name>.
4655 2. AGGR_TYPE: the type of the reference, which should be either a vector
4656 or an array.
4657 3. AT_LOOP: the loop where the vector memref is to be created.
4658 4. OFFSET (optional): an offset to be added to the initial address accessed
4659 by the data-ref in STMT.
4660 5. BSI: location where the new stmts are to be placed if there is no loop
4661 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4662 pointing to the initial address.
4663 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4664 to the initial address accessed by the data-ref in STMT. This is
4665 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4666 in bytes.
4667 8. IV_STEP (optional, defaults to NULL): the amount that should be added
4668 to the IV during each iteration of the loop. NULL says to move
4669 by one copy of AGGR_TYPE up or down, depending on the step of the
4670 data reference.
4672 Output:
4673 1. Declare a new ptr to vector_type, and have it point to the base of the
4674 data reference (initial address accessed by the data reference).
4675 For example, for vector of type V8HI, the following code is generated:
4677 v8hi *ap;
4678 ap = (v8hi *)initial_address;
4680 if OFFSET is not supplied:
4681 initial_address = &a[init];
4682 if OFFSET is supplied:
4683 initial_address = &a[init + OFFSET];
4684 if BYTE_OFFSET is supplied:
4685 initial_address = &a[init] + BYTE_OFFSET;
4687 Return the initial_address in INITIAL_ADDRESS.
4689 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4690 update the pointer in each iteration of the loop.
4692 Return the increment stmt that updates the pointer in PTR_INCR.
4694 3. Set INV_P to true if the access pattern of the data reference in the
4695 vectorized loop is invariant. Set it to false otherwise.
4697 4. Return the pointer. */
4699 tree
4700 vect_create_data_ref_ptr (gimple *stmt, tree aggr_type, struct loop *at_loop,
4701 tree offset, tree *initial_address,
4702 gimple_stmt_iterator *gsi, gimple **ptr_incr,
4703 bool only_init, bool *inv_p, tree byte_offset,
4704 tree iv_step)
4706 const char *base_name;
4707 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4708 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4709 struct loop *loop = NULL;
4710 bool nested_in_vect_loop = false;
4711 struct loop *containing_loop = NULL;
4712 tree aggr_ptr_type;
4713 tree aggr_ptr;
4714 tree new_temp;
4715 gimple_seq new_stmt_list = NULL;
4716 edge pe = NULL;
4717 basic_block new_bb;
4718 tree aggr_ptr_init;
4719 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4720 tree aptr;
4721 gimple_stmt_iterator incr_gsi;
4722 bool insert_after;
4723 tree indx_before_incr, indx_after_incr;
4724 gimple *incr;
4725 tree step;
4726 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4728 gcc_assert (iv_step != NULL_TREE
4729 || TREE_CODE (aggr_type) == ARRAY_TYPE
4730 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4732 if (loop_vinfo)
4734 loop = LOOP_VINFO_LOOP (loop_vinfo);
4735 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4736 containing_loop = (gimple_bb (stmt))->loop_father;
4737 pe = loop_preheader_edge (loop);
4739 else
4741 gcc_assert (bb_vinfo);
4742 only_init = true;
4743 *ptr_incr = NULL;
4746 /* Check the step (evolution) of the load in LOOP, and record
4747 whether it's invariant. */
4748 step = vect_dr_behavior (dr)->step;
4749 if (integer_zerop (step))
4750 *inv_p = true;
4751 else
4752 *inv_p = false;
4754 /* Create an expression for the first address accessed by this load
4755 in LOOP. */
4756 base_name = get_name (DR_BASE_ADDRESS (dr));
4758 if (dump_enabled_p ())
4760 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4761 dump_printf_loc (MSG_NOTE, vect_location,
4762 "create %s-pointer variable to type: ",
4763 get_tree_code_name (TREE_CODE (aggr_type)));
4764 dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4765 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4766 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4767 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4768 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4769 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4770 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4771 else
4772 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4773 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4774 dump_printf (MSG_NOTE, "\n");
4777 /* (1) Create the new aggregate-pointer variable.
4778 Vector and array types inherit the alias set of their component
4779 type by default so we need to use a ref-all pointer if the data
4780 reference does not conflict with the created aggregated data
4781 reference because it is not addressable. */
4782 bool need_ref_all = false;
4783 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4784 get_alias_set (DR_REF (dr))))
4785 need_ref_all = true;
4786 /* Likewise for any of the data references in the stmt group. */
4787 else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
4789 gimple *orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
4792 stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
4793 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4794 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4795 get_alias_set (DR_REF (sdr))))
4797 need_ref_all = true;
4798 break;
4800 orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);
4802 while (orig_stmt);
4804 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4805 need_ref_all);
4806 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4809 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4810 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4811 def-use update cycles for the pointer: one relative to the outer-loop
4812 (LOOP), which is what steps (3) and (4) below do. The other is relative
4813 to the inner-loop (which is the inner-most loop containing the dataref),
4814 and this is done by step (5) below.
4816 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4817 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4818 redundant. Steps (3),(4) create the following:
4820 vp0 = &base_addr;
4821 LOOP: vp1 = phi(vp0,vp2)
4824 vp2 = vp1 + step
4825 goto LOOP
4827 If there is an inner-loop nested in loop, then step (5) will also be
4828 applied, and an additional update in the inner-loop will be created:
4830 vp0 = &base_addr;
4831 LOOP: vp1 = phi(vp0,vp2)
4833 inner: vp3 = phi(vp1,vp4)
4834 vp4 = vp3 + inner_step
4835 if () goto inner
4837 vp2 = vp1 + step
4838 if () goto LOOP */
4840 /* (2) Calculate the initial address of the aggregate-pointer, and set
4841 the aggregate-pointer to point to it before the loop. */
4843 /* Create: &(base[init_val+offset]) + byte_offset in the loop preheader. */
4845 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
4846 offset, byte_offset);
4847 if (new_stmt_list)
4849 if (pe)
4851 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4852 gcc_assert (!new_bb);
4854 else
4855 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4858 *initial_address = new_temp;
4859 aggr_ptr_init = new_temp;
4861 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4862 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4863 inner-loop nested in LOOP (during outer-loop vectorization). */
4865 /* No update in loop is required. */
4866 if (only_init && (!loop_vinfo || at_loop == loop))
4867 aptr = aggr_ptr_init;
4868 else
4870 if (iv_step == NULL_TREE)
4872 /* The step of the aggregate pointer is the type size. */
4873 iv_step = TYPE_SIZE_UNIT (aggr_type);
4874 /* One exception to the above is when the scalar step of the load in
4875 LOOP is zero. In this case the step here is also zero. */
4876 if (*inv_p)
4877 iv_step = size_zero_node;
4878 else if (tree_int_cst_sgn (step) == -1)
4879 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4882 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4884 create_iv (aggr_ptr_init,
4885 fold_convert (aggr_ptr_type, iv_step),
4886 aggr_ptr, loop, &incr_gsi, insert_after,
4887 &indx_before_incr, &indx_after_incr);
4888 incr = gsi_stmt (incr_gsi);
4889 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4891 /* Copy the points-to information if it exists. */
4892 if (DR_PTR_INFO (dr))
4894 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr);
4895 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr);
4897 if (ptr_incr)
4898 *ptr_incr = incr;
4900 aptr = indx_before_incr;
4903 if (!nested_in_vect_loop || only_init)
4904 return aptr;
4907 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4908 nested in LOOP, if it exists. */
4910 gcc_assert (nested_in_vect_loop);
4911 if (!only_init)
4913 standard_iv_increment_position (containing_loop, &incr_gsi,
4914 &insert_after);
4915 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4916 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4917 &indx_after_incr);
4918 incr = gsi_stmt (incr_gsi);
4919 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4921 /* Copy the points-to information if it exists. */
4922 if (DR_PTR_INFO (dr))
4924 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr);
4925 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr);
4927 if (ptr_incr)
4928 *ptr_incr = incr;
4930 return indx_before_incr;
4932 else
4933 gcc_unreachable ();
4937 /* Function bump_vector_ptr
4939 Increment a pointer (to a vector type) by vector-size. If requested,
4940 i.e. if PTR-INCR is given, then also connect the new increment stmt
4941 to the existing def-use update-chain of the pointer, by modifying
4942 the PTR_INCR as illustrated below:
4944 The pointer def-use update-chain before this function:
4945 DATAREF_PTR = phi (p_0, p_2)
4946 ....
4947 PTR_INCR: p_2 = DATAREF_PTR + step
4949 The pointer def-use update-chain after this function:
4950 DATAREF_PTR = phi (p_0, p_2)
4951 ....
4952 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4953 ....
4954 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4956 Input:
4957 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4958 in the loop.
4959 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4960 the loop. The increment amount across iterations is expected
4961 to be vector_size.
4962 BSI - location where the new update stmt is to be placed.
4963 STMT - the original scalar memory-access stmt that is being vectorized.
4964 BUMP - optional. The offset by which to bump the pointer. If not given,
4965 the offset is assumed to be vector_size.
4967 Output: Return NEW_DATAREF_PTR as illustrated above.
4971 tree
4972 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4973 gimple *stmt, tree bump)
4975 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4976 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4977 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4978 tree update = TYPE_SIZE_UNIT (vectype);
4979 gassign *incr_stmt;
4980 ssa_op_iter iter;
4981 use_operand_p use_p;
4982 tree new_dataref_ptr;
4984 if (bump)
4985 update = bump;
4987 if (TREE_CODE (dataref_ptr) == SSA_NAME)
4988 new_dataref_ptr = copy_ssa_name (dataref_ptr);
4989 else
4990 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4991 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4992 dataref_ptr, update);
4993 vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4995 /* Copy the points-to information if it exists. */
4996 if (DR_PTR_INFO (dr))
4998 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4999 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
5002 if (!ptr_incr)
5003 return new_dataref_ptr;
5005 /* Update the vector-pointer's cross-iteration increment. */
5006 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
5008 tree use = USE_FROM_PTR (use_p);
5010 if (use == dataref_ptr)
5011 SET_USE (use_p, new_dataref_ptr);
5012 else
5013 gcc_assert (operand_equal_p (use, update, 0));
5016 return new_dataref_ptr;
5020 /* Copy memory reference info such as base/clique from the SRC reference
5021 to the DEST MEM_REF. */
5023 void
5024 vect_copy_ref_info (tree dest, tree src)
5026 if (TREE_CODE (dest) != MEM_REF)
5027 return;
5029 tree src_base = src;
5030 while (handled_component_p (src_base))
5031 src_base = TREE_OPERAND (src_base, 0);
5032 if (TREE_CODE (src_base) != MEM_REF
5033 && TREE_CODE (src_base) != TARGET_MEM_REF)
5034 return;
5036 MR_DEPENDENCE_CLIQUE (dest) = MR_DEPENDENCE_CLIQUE (src_base);
5037 MR_DEPENDENCE_BASE (dest) = MR_DEPENDENCE_BASE (src_base);
5041 /* Function vect_create_destination_var.
5043 Create a new temporary of type VECTYPE. */
5045 tree
5046 vect_create_destination_var (tree scalar_dest, tree vectype)
5048 tree vec_dest;
5049 const char *name;
5050 char *new_name;
5051 tree type;
5052 enum vect_var_kind kind;
5054 kind = vectype
5055 ? VECTOR_BOOLEAN_TYPE_P (vectype)
5056 ? vect_mask_var
5057 : vect_simple_var
5058 : vect_scalar_var;
5059 type = vectype ? vectype : TREE_TYPE (scalar_dest);
5061 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
5063 name = get_name (scalar_dest);
5064 if (name)
5065 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
5066 else
5067 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
5068 vec_dest = vect_get_new_vect_var (type, kind, new_name);
5069 free (new_name);
5071 return vec_dest;
5074 /* Function vect_grouped_store_supported.
5076 Returns TRUE if interleave high and interleave low permutations
5077 are supported, and FALSE otherwise. */
5079 bool
5080 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
5082 machine_mode mode = TYPE_MODE (vectype);
5084 /* vect_permute_store_chain requires the group size to be equal to 3 or
5085 be a power of two. */
5086 if (count != 3 && exact_log2 (count) == -1)
5088 if (dump_enabled_p ())
5089 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5090 "the size of the group of accesses"
5091 " is not a power of 2 or not eqaul to 3\n");
5092 return false;
5095 /* Check that the permutation is supported. */
5096 if (VECTOR_MODE_P (mode))
5098 unsigned int i;
5099 if (count == 3)
5101 unsigned int j0 = 0, j1 = 0, j2 = 0;
5102 unsigned int i, j;
5104 unsigned int nelt;
5105 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5107 if (dump_enabled_p ())
5108 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5109 "cannot handle groups of 3 stores for"
5110 " variable-length vectors\n");
5111 return false;
5114 vec_perm_builder sel (nelt, nelt, 1);
5115 sel.quick_grow (nelt);
5116 vec_perm_indices indices;
5117 for (j = 0; j < 3; j++)
5119 int nelt0 = ((3 - j) * nelt) % 3;
5120 int nelt1 = ((3 - j) * nelt + 1) % 3;
5121 int nelt2 = ((3 - j) * nelt + 2) % 3;
5122 for (i = 0; i < nelt; i++)
5124 if (3 * i + nelt0 < nelt)
5125 sel[3 * i + nelt0] = j0++;
5126 if (3 * i + nelt1 < nelt)
5127 sel[3 * i + nelt1] = nelt + j1++;
5128 if (3 * i + nelt2 < nelt)
5129 sel[3 * i + nelt2] = 0;
5131 indices.new_vector (sel, 2, nelt);
5132 if (!can_vec_perm_const_p (mode, indices))
5134 if (dump_enabled_p ())
5135 dump_printf (MSG_MISSED_OPTIMIZATION,
5136 "permutation op not supported by target.\n");
5137 return false;
5140 for (i = 0; i < nelt; i++)
5142 if (3 * i + nelt0 < nelt)
5143 sel[3 * i + nelt0] = 3 * i + nelt0;
5144 if (3 * i + nelt1 < nelt)
5145 sel[3 * i + nelt1] = 3 * i + nelt1;
5146 if (3 * i + nelt2 < nelt)
5147 sel[3 * i + nelt2] = nelt + j2++;
5149 indices.new_vector (sel, 2, nelt);
5150 if (!can_vec_perm_const_p (mode, indices))
5152 if (dump_enabled_p ())
5153 dump_printf (MSG_MISSED_OPTIMIZATION,
5154 "permutation op not supported by target.\n");
5155 return false;
5158 return true;
5160 else
5162 /* If length is not equal to 3 then only a power of 2 is supported. */
5163 gcc_assert (pow2p_hwi (count));
5164 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5166 /* The encoding has 2 interleaved stepped patterns. */
5167 vec_perm_builder sel (nelt, 2, 3);
5168 sel.quick_grow (6);
5169 for (i = 0; i < 3; i++)
5171 sel[i * 2] = i;
5172 sel[i * 2 + 1] = i + nelt;
5174 vec_perm_indices indices (sel, 2, nelt);
5175 if (can_vec_perm_const_p (mode, indices))
5177 for (i = 0; i < 6; i++)
5178 sel[i] += exact_div (nelt, 2);
5179 indices.new_vector (sel, 2, nelt);
5180 if (can_vec_perm_const_p (mode, indices))
5181 return true;
5186 if (dump_enabled_p ())
5187 dump_printf (MSG_MISSED_OPTIMIZATION,
5188 "permutaion op not supported by target.\n");
5189 return false;
5193 /* Return TRUE if vec_{mask_}store_lanes is available for COUNT vectors of
5194 type VECTYPE. MASKED_P says whether the masked form is needed. */
5196 bool
5197 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5198 bool masked_p)
5200 if (masked_p)
5201 return vect_lanes_optab_supported_p ("vec_mask_store_lanes",
5202 vec_mask_store_lanes_optab,
5203 vectype, count);
5204 else
5205 return vect_lanes_optab_supported_p ("vec_store_lanes",
5206 vec_store_lanes_optab,
5207 vectype, count);
5211 /* Function vect_permute_store_chain.
5213 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
5214 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
5215 the data correctly for the stores. Return the final references for stores
5216 in RESULT_CHAIN.
5218 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5219 The input is 4 vectors each containing 8 elements. We assign a number to
5220 each element, the input sequence is:
5222 1st vec: 0 1 2 3 4 5 6 7
5223 2nd vec: 8 9 10 11 12 13 14 15
5224 3rd vec: 16 17 18 19 20 21 22 23
5225 4th vec: 24 25 26 27 28 29 30 31
5227 The output sequence should be:
5229 1st vec: 0 8 16 24 1 9 17 25
5230 2nd vec: 2 10 18 26 3 11 19 27
5231 3rd vec: 4 12 20 28 5 13 21 29
5232 4th vec: 6 14 22 30 7 15 23 31
5234 i.e., we interleave the contents of the four vectors in their order.
5236 We use interleave_high/low instructions to create such output. The input of
5237 each interleave_high/low operation is two vectors:
5238 1st vec 2nd vec
5239 0 1 2 3 4 5 6 7
5240 the even elements of the result vector are obtained left-to-right from the
5241 high/low elements of the first vector. The odd elements of the result are
5242 obtained left-to-right from the high/low elements of the second vector.
5243 The output of interleave_high will be: 0 4 1 5
5244 and of interleave_low: 2 6 3 7
5247 The permutation is done in log LENGTH stages. In each stage interleave_high
5248 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5249 where the first argument is taken from the first half of DR_CHAIN and the
5250 second argument from its second half.
5251 In our example,
5253 I1: interleave_high (1st vec, 3rd vec)
5254 I2: interleave_low (1st vec, 3rd vec)
5255 I3: interleave_high (2nd vec, 4th vec)
5256 I4: interleave_low (2nd vec, 4th vec)
5258 The output for the first stage is:
5260 I1: 0 16 1 17 2 18 3 19
5261 I2: 4 20 5 21 6 22 7 23
5262 I3: 8 24 9 25 10 26 11 27
5263 I4: 12 28 13 29 14 30 15 31
5265 The output of the second stage, i.e. the final result is:
5267 I1: 0 8 16 24 1 9 17 25
5268 I2: 2 10 18 26 3 11 19 27
5269 I3: 4 12 20 28 5 13 21 29
5270 I4: 6 14 22 30 7 15 23 31. */
5272 void
5273 vect_permute_store_chain (vec<tree> dr_chain,
5274 unsigned int length,
5275 gimple *stmt,
5276 gimple_stmt_iterator *gsi,
5277 vec<tree> *result_chain)
5279 tree vect1, vect2, high, low;
5280 gimple *perm_stmt;
5281 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5282 tree perm_mask_low, perm_mask_high;
5283 tree data_ref;
5284 tree perm3_mask_low, perm3_mask_high;
5285 unsigned int i, j, n, log_length = exact_log2 (length);
5287 result_chain->quick_grow (length);
5288 memcpy (result_chain->address (), dr_chain.address (),
5289 length * sizeof (tree));
5291 if (length == 3)
5293 /* vect_grouped_store_supported ensures that this is constant. */
5294 unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5295 unsigned int j0 = 0, j1 = 0, j2 = 0;
5297 vec_perm_builder sel (nelt, nelt, 1);
5298 sel.quick_grow (nelt);
5299 vec_perm_indices indices;
5300 for (j = 0; j < 3; j++)
5302 int nelt0 = ((3 - j) * nelt) % 3;
5303 int nelt1 = ((3 - j) * nelt + 1) % 3;
5304 int nelt2 = ((3 - j) * nelt + 2) % 3;
5306 for (i = 0; i < nelt; i++)
5308 if (3 * i + nelt0 < nelt)
5309 sel[3 * i + nelt0] = j0++;
5310 if (3 * i + nelt1 < nelt)
5311 sel[3 * i + nelt1] = nelt + j1++;
5312 if (3 * i + nelt2 < nelt)
5313 sel[3 * i + nelt2] = 0;
5315 indices.new_vector (sel, 2, nelt);
5316 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5318 for (i = 0; i < nelt; i++)
5320 if (3 * i + nelt0 < nelt)
5321 sel[3 * i + nelt0] = 3 * i + nelt0;
5322 if (3 * i + nelt1 < nelt)
5323 sel[3 * i + nelt1] = 3 * i + nelt1;
5324 if (3 * i + nelt2 < nelt)
5325 sel[3 * i + nelt2] = nelt + j2++;
5327 indices.new_vector (sel, 2, nelt);
5328 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5330 vect1 = dr_chain[0];
5331 vect2 = dr_chain[1];
5333 /* Create interleaving stmt:
5334 low = VEC_PERM_EXPR <vect1, vect2,
5335 {j, nelt, *, j + 1, nelt + j + 1, *,
5336 j + 2, nelt + j + 2, *, ...}> */
5337 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5338 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5339 vect2, perm3_mask_low);
5340 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5342 vect1 = data_ref;
5343 vect2 = dr_chain[2];
5344 /* Create interleaving stmt:
5345 low = VEC_PERM_EXPR <vect1, vect2,
5346 {0, 1, nelt + j, 3, 4, nelt + j + 1,
5347 6, 7, nelt + j + 2, ...}> */
5348 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5349 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5350 vect2, perm3_mask_high);
5351 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5352 (*result_chain)[j] = data_ref;
5355 else
5357 /* If length is not equal to 3 then only a power of 2 is supported. */
5358 gcc_assert (pow2p_hwi (length));
5360 /* The encoding has 2 interleaved stepped patterns. */
5361 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5362 vec_perm_builder sel (nelt, 2, 3);
5363 sel.quick_grow (6);
5364 for (i = 0; i < 3; i++)
5366 sel[i * 2] = i;
5367 sel[i * 2 + 1] = i + nelt;
5369 vec_perm_indices indices (sel, 2, nelt);
5370 perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5372 for (i = 0; i < 6; i++)
5373 sel[i] += exact_div (nelt, 2);
5374 indices.new_vector (sel, 2, nelt);
5375 perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5377 for (i = 0, n = log_length; i < n; i++)
5379 for (j = 0; j < length/2; j++)
5381 vect1 = dr_chain[j];
5382 vect2 = dr_chain[j+length/2];
5384 /* Create interleaving stmt:
5385 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
5386 ...}> */
5387 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
5388 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
5389 vect2, perm_mask_high);
5390 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5391 (*result_chain)[2*j] = high;
5393 /* Create interleaving stmt:
5394 low = VEC_PERM_EXPR <vect1, vect2,
5395 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
5396 ...}> */
5397 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
5398 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
5399 vect2, perm_mask_low);
5400 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5401 (*result_chain)[2*j+1] = low;
5403 memcpy (dr_chain.address (), result_chain->address (),
5404 length * sizeof (tree));
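
/* Illustrative sketch (not part of this file): a standalone program that
   mimics the interleave_high/low selectors built above for the power-of-two
   case, run for LENGTH = 4 vectors of NELT = 8 elements so it reproduces the
   worked example in the function comment.  The helper names
   (emulate_vec_perm, chain, ...) are invented for this sketch;
   VEC_PERM_EXPR itself is only emulated.  */

#include <stdio.h>
#include <string.h>

#define NELT 8
#define LENGTH 4

/* Emulate VEC_PERM_EXPR <v1, v2, sel>: element I of the result is element
   SEL[I] of the concatenation of V1 and V2.  */
static void
emulate_vec_perm (const int *v1, const int *v2, const int *sel, int *out)
{
  for (int i = 0; i < NELT; i++)
    out[i] = sel[i] < NELT ? v1[sel[i]] : v2[sel[i] - NELT];
}

int
main (void)
{
  int chain[LENGTH][NELT], result[LENGTH][NELT];
  int sel_high[NELT], sel_low[NELT];

  /* Input: elements numbered 0..31 as in the comment above.  */
  for (int j = 0; j < LENGTH; j++)
    for (int i = 0; i < NELT; i++)
      chain[j][i] = j * NELT + i;

  /* interleave_high selector {0, nelt, 1, nelt + 1, ...} and
     interleave_low selector {nelt/2, nelt*3/2, nelt/2 + 1, ...}.  */
  for (int i = 0; i < NELT / 2; i++)
    {
      sel_high[2 * i] = i;
      sel_high[2 * i + 1] = i + NELT;
      sel_low[2 * i] = i + NELT / 2;
      sel_low[2 * i + 1] = i + NELT / 2 + NELT;
    }

  /* log2 (LENGTH) stages; each stage pairs the two halves of the chain.  */
  for (int stage = 0; stage < 2; stage++)
    {
      for (int j = 0; j < LENGTH / 2; j++)
        {
          emulate_vec_perm (chain[j], chain[j + LENGTH / 2], sel_high,
                            result[2 * j]);
          emulate_vec_perm (chain[j], chain[j + LENGTH / 2], sel_low,
                            result[2 * j + 1]);
        }
      memcpy (chain, result, sizeof (chain));
    }

  /* Prints 0 8 16 24 1 9 17 25, then 2 10 18 26 ..., matching the final
     result shown in the comment.  */
  for (int j = 0; j < LENGTH; j++)
    {
      for (int i = 0; i < NELT; i++)
        printf ("%d ", chain[j][i]);
      printf ("\n");
    }
  return 0;
}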
5409 /* Function vect_setup_realignment
5411 This function is called when vectorizing an unaligned load using
5412 the dr_explicit_realign[_optimized] scheme.
5413 This function generates the following code at the loop prolog:
5415 p = initial_addr;
5416 x msq_init = *(floor(p)); # prolog load
5417 realignment_token = call target_builtin;
5418 loop:
5419 x msq = phi (msq_init, ---)
5421 The stmts marked with x are generated only for the case of
5422 dr_explicit_realign_optimized.
5424 The code above sets up a new (vector) pointer, pointing to the first
5425 location accessed by STMT, and a "floor-aligned" load using that pointer.
5426 It also generates code to compute the "realignment-token" (if the relevant
5427 target hook was defined), and creates a phi-node at the loop-header bb
5428 whose arguments are the result of the prolog-load (created by this
5429 function) and the result of a load that takes place in the loop (to be
5430 created by the caller to this function).
5432 For the case of dr_explicit_realign_optimized:
5433 The caller to this function uses the phi-result (msq) to create the
5434 realignment code inside the loop, and sets up the missing phi argument,
5435 as follows:
5436 loop:
5437 msq = phi (msq_init, lsq)
5438 lsq = *(floor(p')); # load in loop
5439 result = realign_load (msq, lsq, realignment_token);
5441 For the case of dr_explicit_realign:
5442 loop:
5443 msq = *(floor(p)); # load in loop
5444 p' = p + (VS-1);
5445 lsq = *(floor(p')); # load in loop
5446 result = realign_load (msq, lsq, realignment_token);
5448 Input:
5449 STMT - (scalar) load stmt to be vectorized. This load accesses
5450 a memory location that may be unaligned.
5451 BSI - place where new code is to be inserted.
5452 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5453 is used.
5455 Output:
5456 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5457 target hook, if defined.
5458 Return value - the result of the loop-header phi node. */
5460 tree
5461 vect_setup_realignment (gimple *stmt, gimple_stmt_iterator *gsi,
5462 tree *realignment_token,
5463 enum dr_alignment_support alignment_support_scheme,
5464 tree init_addr,
5465 struct loop **at_loop)
5467 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5468 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5469 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5470 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
5471 struct loop *loop = NULL;
5472 edge pe = NULL;
5473 tree scalar_dest = gimple_assign_lhs (stmt);
5474 tree vec_dest;
5475 gimple *inc;
5476 tree ptr;
5477 tree data_ref;
5478 basic_block new_bb;
5479 tree msq_init = NULL_TREE;
5480 tree new_temp;
5481 gphi *phi_stmt;
5482 tree msq = NULL_TREE;
5483 gimple_seq stmts = NULL;
5484 bool inv_p;
5485 bool compute_in_loop = false;
5486 bool nested_in_vect_loop = false;
5487 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
5488 struct loop *loop_for_initial_load = NULL;
5490 if (loop_vinfo)
5492 loop = LOOP_VINFO_LOOP (loop_vinfo);
5493 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
5496 gcc_assert (alignment_support_scheme == dr_explicit_realign
5497 || alignment_support_scheme == dr_explicit_realign_optimized);
5499 /* We need to generate three things:
5500 1. the misalignment computation
5501 2. the extra vector load (for the optimized realignment scheme).
5502 3. the phi node for the two vectors from which the realignment is
5503 done (for the optimized realignment scheme). */
5505 /* 1. Determine where to generate the misalignment computation.
5507 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5508 calculation will be generated by this function, outside the loop (in the
5509 preheader). Otherwise, INIT_ADDR has already been computed for us by the
5510 caller, inside the loop.
5512 Background: If the misalignment remains fixed throughout the iterations of
5513 the loop, then both realignment schemes are applicable, and also the
5514 misalignment computation can be done outside LOOP. This is because we are
5515 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5516 are a multiple of VS (the Vector Size), and therefore the misalignment in
5517 different vectorized LOOP iterations is always the same.
5518 The problem arises only if the memory access is in an inner-loop nested
5519 inside LOOP, which is now being vectorized using outer-loop vectorization.
5520 This is the only case when the misalignment of the memory access may not
5521 remain fixed throughout the iterations of the inner-loop (as explained in
5522 detail in vect_supportable_dr_alignment). In this case, not only is the
5523 optimized realignment scheme not applicable, but also the misalignment
5524 computation (and generation of the realignment token that is passed to
5525 REALIGN_LOAD) has to be done inside the loop.
5527 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5528 or not, which in turn determines if the misalignment is computed inside
5529 the inner-loop, or outside LOOP. */
5531 if (init_addr != NULL_TREE || !loop_vinfo)
5533 compute_in_loop = true;
5534 gcc_assert (alignment_support_scheme == dr_explicit_realign);
5538 /* 2. Determine where to generate the extra vector load.
5540 For the optimized realignment scheme, instead of generating two vector
5541 loads in each iteration, we generate a single extra vector load in the
5542 preheader of the loop, and in each iteration reuse the result of the
5543 vector load from the previous iteration. In case the memory access is in
5544 an inner-loop nested inside LOOP, which is now being vectorized using
5545 outer-loop vectorization, we need to determine whether this initial vector
5546 load should be generated at the preheader of the inner-loop, or can be
5547 generated at the preheader of LOOP. If the memory access has no evolution
5548 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5549 to be generated inside LOOP (in the preheader of the inner-loop). */
5551 if (nested_in_vect_loop)
5553 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5554 bool invariant_in_outerloop =
5555 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5556 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5558 else
5559 loop_for_initial_load = loop;
5560 if (at_loop)
5561 *at_loop = loop_for_initial_load;
5563 if (loop_for_initial_load)
5564 pe = loop_preheader_edge (loop_for_initial_load);
5566 /* 3. For the case of the optimized realignment, create the first vector
5567 load at the loop preheader. */
5569 if (alignment_support_scheme == dr_explicit_realign_optimized)
5571 /* Create msq_init = *(floor(p1)) in the loop preheader */
5572 gassign *new_stmt;
5574 gcc_assert (!compute_in_loop);
5575 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5576 ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
5577 NULL_TREE, &init_addr, NULL, &inc,
5578 true, &inv_p);
5579 if (TREE_CODE (ptr) == SSA_NAME)
5580 new_temp = copy_ssa_name (ptr);
5581 else
5582 new_temp = make_ssa_name (TREE_TYPE (ptr));
5583 unsigned int align = DR_TARGET_ALIGNMENT (dr);
5584 new_stmt = gimple_build_assign
5585 (new_temp, BIT_AND_EXPR, ptr,
5586 build_int_cst (TREE_TYPE (ptr), -(HOST_WIDE_INT) align));
5587 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5588 gcc_assert (!new_bb);
5589 data_ref
5590 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5591 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5592 vect_copy_ref_info (data_ref, DR_REF (dr));
5593 new_stmt = gimple_build_assign (vec_dest, data_ref);
5594 new_temp = make_ssa_name (vec_dest, new_stmt);
5595 gimple_assign_set_lhs (new_stmt, new_temp);
5596 if (pe)
5598 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5599 gcc_assert (!new_bb);
5601 else
5602 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5604 msq_init = gimple_assign_lhs (new_stmt);
5607 /* 4. Create realignment token using a target builtin, if available.
5608 It is done either inside the containing loop, or before LOOP (as
5609 determined above). */
5611 if (targetm.vectorize.builtin_mask_for_load)
5613 gcall *new_stmt;
5614 tree builtin_decl;
5616 /* Compute INIT_ADDR - the initial address accessed by this memref. */
5617 if (!init_addr)
5619 /* Generate the INIT_ADDR computation outside LOOP. */
5620 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
5621 NULL_TREE);
5622 if (loop)
5624 pe = loop_preheader_edge (loop);
5625 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5626 gcc_assert (!new_bb);
5628 else
5629 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5632 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5633 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5634 vec_dest =
5635 vect_create_destination_var (scalar_dest,
5636 gimple_call_return_type (new_stmt));
5637 new_temp = make_ssa_name (vec_dest, new_stmt);
5638 gimple_call_set_lhs (new_stmt, new_temp);
5640 if (compute_in_loop)
5641 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5642 else
5644 /* Generate the misalignment computation outside LOOP. */
5645 pe = loop_preheader_edge (loop);
5646 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5647 gcc_assert (!new_bb);
5650 *realignment_token = gimple_call_lhs (new_stmt);
5652 /* The result of the CALL_EXPR to this builtin is determined from
5653 the value of the parameter, and no global variables are touched,
5654 which makes the builtin a "const" function. Requiring the
5655 builtin to have the "const" attribute makes it unnecessary
5656 to call mark_call_clobbered. */
5657 gcc_assert (TREE_READONLY (builtin_decl));
5660 if (alignment_support_scheme == dr_explicit_realign)
5661 return msq;
5663 gcc_assert (!compute_in_loop);
5664 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5667 /* 5. Create msq = phi <msq_init, lsq> in loop */
5669 pe = loop_preheader_edge (containing_loop);
5670 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5671 msq = make_ssa_name (vec_dest);
5672 phi_stmt = create_phi_node (msq, containing_loop->header);
5673 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5675 return msq;
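
/* Illustrative sketch (not part of this file): the explicit realignment
   scheme above acted out on a plain array.  Two "floor"-aligned loads
   bracket the unaligned address and the realignment token - here simply the
   element offset within the aligned chunk - selects VS consecutive elements
   from their concatenation.  VS, floor_load and realign_load are invented
   names; a real target implements realign_load as a single permute-style
   instruction driven by the token from builtin_mask_for_load.  */

#include <stdio.h>

#define VS 4   /* elements per vector */

typedef struct { int e[VS]; } vec;

/* Load VS elements from P rounded down to a VS-element boundary of BASE.  */
static vec
floor_load (const int *base, const int *p)
{
  int idx = (int) (p - base) / VS * VS;
  vec v;
  for (int i = 0; i < VS; i++)
    v.e[i] = base[idx + i];
  return v;
}

/* result = realign_load (msq, lsq, token): VS consecutive elements starting
   at offset TOKEN in the concatenation of MSQ and LSQ.  */
static vec
realign_load (vec msq, vec lsq, int token)
{
  vec v;
  for (int i = 0; i < VS; i++)
    v.e[i] = token + i < VS ? msq.e[token + i] : lsq.e[token + i - VS];
  return v;
}

int
main (void)
{
  int a[16];
  for (int i = 0; i < 16; i++)
    a[i] = i;

  const int *p = &a[3];                   /* misaligned by 3 elements */
  int token = 3;                          /* realignment token */
  vec msq = floor_load (a, p);            /* msq = *(floor(p)) */
  vec lsq = floor_load (a, p + VS - 1);   /* lsq = *(floor(p + VS - 1)) */
  vec r = realign_load (msq, lsq, token);

  for (int i = 0; i < VS; i++)
    printf ("%d ", r.e[i]);               /* prints 3 4 5 6 */
  printf ("\n");
  return 0;
}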
5679 /* Function vect_grouped_load_supported.
5681 COUNT is the size of the load group (the number of statements plus the
5682 number of gaps). SINGLE_ELEMENT_P is true if there is actually
5683 only one statement, with a gap of COUNT - 1.
5685 Returns true if a suitable permute exists. */
5687 bool
5688 vect_grouped_load_supported (tree vectype, bool single_element_p,
5689 unsigned HOST_WIDE_INT count)
5691 machine_mode mode = TYPE_MODE (vectype);
5693 /* If this is single-element interleaving with an element distance
5694 that leaves unused vector loads around, punt - we at least create
5695 very sub-optimal code in that case (and blow up memory,
5696 see PR65518). */
5697 if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype)))
5699 if (dump_enabled_p ())
5700 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5701 "single-element interleaving not supported "
5702 "for not adjacent vector loads\n");
5703 return false;
5706 /* vect_permute_load_chain requires the group size to be equal to 3 or
5707 be a power of two. */
5708 if (count != 3 && exact_log2 (count) == -1)
5710 if (dump_enabled_p ())
5711 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5712 "the size of the group of accesses"
5713 " is not a power of 2 or not equal to 3\n");
5714 return false;
5717 /* Check that the permutation is supported. */
5718 if (VECTOR_MODE_P (mode))
5720 unsigned int i, j;
5721 if (count == 3)
5723 unsigned int nelt;
5724 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5726 if (dump_enabled_p ())
5727 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5728 "cannot handle groups of 3 loads for"
5729 " variable-length vectors\n");
5730 return false;
5733 vec_perm_builder sel (nelt, nelt, 1);
5734 sel.quick_grow (nelt);
5735 vec_perm_indices indices;
5736 unsigned int k;
5737 for (k = 0; k < 3; k++)
5739 for (i = 0; i < nelt; i++)
5740 if (3 * i + k < 2 * nelt)
5741 sel[i] = 3 * i + k;
5742 else
5743 sel[i] = 0;
5744 indices.new_vector (sel, 2, nelt);
5745 if (!can_vec_perm_const_p (mode, indices))
5747 if (dump_enabled_p ())
5748 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5749 "shuffle of 3 loads is not supported by"
5750 " target\n");
5751 return false;
5753 for (i = 0, j = 0; i < nelt; i++)
5754 if (3 * i + k < 2 * nelt)
5755 sel[i] = i;
5756 else
5757 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5758 indices.new_vector (sel, 2, nelt);
5759 if (!can_vec_perm_const_p (mode, indices))
5761 if (dump_enabled_p ())
5762 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5763 "shuffle of 3 loads is not supported by"
5764 " target\n");
5765 return false;
5768 return true;
5770 else
5772 /* If length is not equal to 3 then only a power of 2 is supported. */
5773 gcc_assert (pow2p_hwi (count));
5774 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5776 /* The encoding has a single stepped pattern. */
5777 vec_perm_builder sel (nelt, 1, 3);
5778 sel.quick_grow (3);
5779 for (i = 0; i < 3; i++)
5780 sel[i] = i * 2;
5781 vec_perm_indices indices (sel, 2, nelt);
5782 if (can_vec_perm_const_p (mode, indices))
5784 for (i = 0; i < 3; i++)
5785 sel[i] = i * 2 + 1;
5786 indices.new_vector (sel, 2, nelt);
5787 if (can_vec_perm_const_p (mode, indices))
5788 return true;
5793 if (dump_enabled_p ())
5794 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5795 "extract even/odd not supported by target\n");
5796 return false;
5799 /* Return TRUE if vec_{masked_}load_lanes is available for COUNT vectors of
5800 type VECTYPE. MASKED_P says whether the masked form is needed. */
5802 bool
5803 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5804 bool masked_p)
5806 if (masked_p)
5807 return vect_lanes_optab_supported_p ("vec_mask_load_lanes",
5808 vec_mask_load_lanes_optab,
5809 vectype, count);
5810 else
5811 return vect_lanes_optab_supported_p ("vec_load_lanes",
5812 vec_load_lanes_optab,
5813 vectype, count);
5816 /* Function vect_permute_load_chain.
5818 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5819 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5820 the input data correctly. Return the final references for loads in
5821 RESULT_CHAIN.
5823 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5824 The input is 4 vectors each containing 8 elements. We assign a number to each
5825 element, the input sequence is:
5827 1st vec: 0 1 2 3 4 5 6 7
5828 2nd vec: 8 9 10 11 12 13 14 15
5829 3rd vec: 16 17 18 19 20 21 22 23
5830 4th vec: 24 25 26 27 28 29 30 31
5832 The output sequence should be:
5834 1st vec: 0 4 8 12 16 20 24 28
5835 2nd vec: 1 5 9 13 17 21 25 29
5836 3rd vec: 2 6 10 14 18 22 26 30
5837 4th vec: 3 7 11 15 19 23 27 31
5839 i.e., the first output vector should contain the first elements of each
5840 interleaving group, etc.
5842 We use extract_even/odd instructions to create such output. The input of
5843 each extract_even/odd operation is two vectors
5844 1st vec 2nd vec
5845 0 1 2 3 4 5 6 7
5847 and the output is the vector of extracted even/odd elements. The output of
5848 extract_even will be: 0 2 4 6
5849 and of extract_odd: 1 3 5 7
5852 The permutation is done in log LENGTH stages. In each stage extract_even
5853 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5854 their order. In our example,
5856 E1: extract_even (1st vec, 2nd vec)
5857 E2: extract_odd (1st vec, 2nd vec)
5858 E3: extract_even (3rd vec, 4th vec)
5859 E4: extract_odd (3rd vec, 4th vec)
5861 The output for the first stage will be:
5863 E1: 0 2 4 6 8 10 12 14
5864 E2: 1 3 5 7 9 11 13 15
5865 E3: 16 18 20 22 24 26 28 30
5866 E4: 17 19 21 23 25 27 29 31
5868 In order to proceed and create the correct sequence for the next stage (or
5869 for the correct output, if the second stage is the last one, as in our
5870 example), we first put the output of the extract_even operation and then the
5871 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5872 The input for the second stage is:
5874 1st vec (E1): 0 2 4 6 8 10 12 14
5875 2nd vec (E3): 16 18 20 22 24 26 28 30
5876 3rd vec (E2): 1 3 5 7 9 11 13 15
5877 4th vec (E4): 17 19 21 23 25 27 29 31
5879 The output of the second stage:
5881 E1: 0 4 8 12 16 20 24 28
5882 E2: 2 6 10 14 18 22 26 30
5883 E3: 1 5 9 13 17 21 25 29
5884 E4: 3 7 11 15 19 23 27 31
5886 And RESULT_CHAIN after reordering:
5888 1st vec (E1): 0 4 8 12 16 20 24 28
5889 2nd vec (E3): 1 5 9 13 17 21 25 29
5890 3rd vec (E2): 2 6 10 14 18 22 26 30
5891 4th vec (E4): 3 7 11 15 19 23 27 31. */
5893 static void
5894 vect_permute_load_chain (vec<tree> dr_chain,
5895 unsigned int length,
5896 gimple *stmt,
5897 gimple_stmt_iterator *gsi,
5898 vec<tree> *result_chain)
5900 tree data_ref, first_vect, second_vect;
5901 tree perm_mask_even, perm_mask_odd;
5902 tree perm3_mask_low, perm3_mask_high;
5903 gimple *perm_stmt;
5904 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5905 unsigned int i, j, log_length = exact_log2 (length);
5907 result_chain->quick_grow (length);
5908 memcpy (result_chain->address (), dr_chain.address (),
5909 length * sizeof (tree));
5911 if (length == 3)
5913 /* vect_grouped_load_supported ensures that this is constant. */
5914 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5915 unsigned int k;
5917 vec_perm_builder sel (nelt, nelt, 1);
5918 sel.quick_grow (nelt);
5919 vec_perm_indices indices;
5920 for (k = 0; k < 3; k++)
5922 for (i = 0; i < nelt; i++)
5923 if (3 * i + k < 2 * nelt)
5924 sel[i] = 3 * i + k;
5925 else
5926 sel[i] = 0;
5927 indices.new_vector (sel, 2, nelt);
5928 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5930 for (i = 0, j = 0; i < nelt; i++)
5931 if (3 * i + k < 2 * nelt)
5932 sel[i] = i;
5933 else
5934 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5935 indices.new_vector (sel, 2, nelt);
5936 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5938 first_vect = dr_chain[0];
5939 second_vect = dr_chain[1];
5941 /* Create interleaving stmt (low part of):
5942 low = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5943 ...}> */
5944 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5945 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5946 second_vect, perm3_mask_low);
5947 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5949 /* Create interleaving stmt (high part of):
5950 high = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5951 ...}> */
5952 first_vect = data_ref;
5953 second_vect = dr_chain[2];
5954 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5955 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5956 second_vect, perm3_mask_high);
5957 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5958 (*result_chain)[k] = data_ref;
5961 else
5963 /* If length is not equal to 3 then only a power of 2 is supported. */
5964 gcc_assert (pow2p_hwi (length));
5966 /* The encoding has a single stepped pattern. */
5967 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5968 vec_perm_builder sel (nelt, 1, 3);
5969 sel.quick_grow (3);
5970 for (i = 0; i < 3; ++i)
5971 sel[i] = i * 2;
5972 vec_perm_indices indices (sel, 2, nelt);
5973 perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);
5975 for (i = 0; i < 3; ++i)
5976 sel[i] = i * 2 + 1;
5977 indices.new_vector (sel, 2, nelt);
5978 perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
5980 for (i = 0; i < log_length; i++)
5982 for (j = 0; j < length; j += 2)
5984 first_vect = dr_chain[j];
5985 second_vect = dr_chain[j+1];
5987 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5988 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5989 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5990 first_vect, second_vect,
5991 perm_mask_even);
5992 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5993 (*result_chain)[j/2] = data_ref;
5995 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5996 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5997 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5998 first_vect, second_vect,
5999 perm_mask_odd);
6000 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6001 (*result_chain)[j/2+length/2] = data_ref;
6003 memcpy (dr_chain.address (), result_chain->address (),
6004 length * sizeof (tree));
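
/* Illustrative sketch (not part of this file): the LENGTH == 3 mask
   construction above, evaluated for NELT == 8 so the effect is visible.
   For each K the first VEC_PERM_EXPR gathers elements K, 3+K, 6+K, ... that
   live in the first two vectors, and the second one fills the remaining
   slots from the third vector, yielding {K, 3+K, ..., 21+K} for inputs
   0..23.  emulate_vec_perm and the variable names are invented for this
   sketch.  */

#include <stdio.h>

#define NELT 8

/* Emulate VEC_PERM_EXPR <v1, v2, sel>.  */
static void
emulate_vec_perm (const int *v1, const int *v2, const int *sel, int *out)
{
  for (int i = 0; i < NELT; i++)
    out[i] = sel[i] < NELT ? v1[sel[i]] : v2[sel[i] - NELT];
}

int
main (void)
{
  int chain[3][NELT], low[NELT], out[NELT], sel[NELT];

  for (int k = 0; k < 3; k++)
    for (int i = 0; i < NELT; i++)
      chain[k][i] = k * NELT + i;

  for (int k = 0; k < 3; k++)
    {
      /* perm3_mask_low: pick elements k, 3+k, 6+k, ... while they fall in
         the first two vectors; the remaining lanes are don't-cares (0).  */
      for (int i = 0; i < NELT; i++)
        sel[i] = 3 * i + k < 2 * NELT ? 3 * i + k : 0;
      emulate_vec_perm (chain[0], chain[1], sel, low);

      /* perm3_mask_high: keep what the first permute produced and take the
         leftover elements from the third vector.  */
      for (int i = 0, j = 0; i < NELT; i++)
        sel[i] = 3 * i + k < 2 * NELT ? i : NELT + (NELT + k) % 3 + 3 * j++;
      emulate_vec_perm (low, chain[2], sel, out);

      /* k = 0 prints 0 3 6 9 12 15 18 21, k = 1 the 1 4 7 ... row, etc.  */
      for (int i = 0; i < NELT; i++)
        printf ("%d ", out[i]);
      printf ("\n");
    }
  return 0;
}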
6009 /* Function vect_shift_permute_load_chain.
6011 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
6012 sequence of stmts to reorder the input data accordingly.
6013 Return the final references for loads in RESULT_CHAIN.
6014 Return true if successful, false otherwise.
6016 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
6017 The input is 3 vectors each containing 8 elements. We assign a
6018 number to each element, the input sequence is:
6020 1st vec: 0 1 2 3 4 5 6 7
6021 2nd vec: 8 9 10 11 12 13 14 15
6022 3rd vec: 16 17 18 19 20 21 22 23
6024 The output sequence should be:
6026 1st vec: 0 3 6 9 12 15 18 21
6027 2nd vec: 1 4 7 10 13 16 19 22
6028 3rd vec: 2 5 8 11 14 17 20 23
6030 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
6032 First we shuffle all 3 vectors to get correct elements order:
6034 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
6035 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
6036 3rd vec: (16 19 22) (17 20 23) (18 21)
6038 Next we unite and shift vector 3 times:
6040 1st step:
6041 shift right by 6 the concatenation of:
6042 "1st vec" and "2nd vec"
6043 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
6044 "2nd vec" and "3rd vec"
6045 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
6046 "3rd vec" and "1st vec"
6047 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
6048 | New vectors |
6050 So that now new vectors are:
6052 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
6053 2nd vec: (10 13) (16 19 22) (17 20 23)
6054 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
6056 2nd step:
6057 shift right by 5 the concatenation of:
6058 "1st vec" and "3rd vec"
6059 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
6060 "2nd vec" and "1st vec"
6061 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
6062 "3rd vec" and "2nd vec"
6063 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
6064 | New vectors |
6066 So that now new vectors are:
6068 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
6069 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
6070 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
6072 3rd step:
6073 shift right by 5 the concatenation of:
6074 "1st vec" and "1st vec"
6075 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
6076 shift right by 3 the concatenation of:
6077 "2nd vec" and "2nd vec"
6078 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
6079 | New vectors |
6081 So that now all vectors are READY:
6082 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
6083 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
6084 3rd vec: ( 1 4 7) (10 13) (16 19 22)
6086 This algorithm is faster than one in vect_permute_load_chain if:
6087 1. "shift of a concatenation" is faster than general permutation.
6088 This is usually so.
6089 2. The TARGET machine can't execute vector instructions in parallel.
6090 This is because each step of the algorithm depends on the previous one.
6091 The algorithm in vect_permute_load_chain is much more parallel.
6093 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
6096 static bool
6097 vect_shift_permute_load_chain (vec<tree> dr_chain,
6098 unsigned int length,
6099 gimple *stmt,
6100 gimple_stmt_iterator *gsi,
6101 vec<tree> *result_chain)
6103 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
6104 tree perm2_mask1, perm2_mask2, perm3_mask;
6105 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
6106 gimple *perm_stmt;
6108 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
6109 unsigned int i;
6110 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6111 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6113 unsigned HOST_WIDE_INT nelt, vf;
6114 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt)
6115 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
6116 /* Not supported for variable-length vectors. */
6117 return false;
6119 vec_perm_builder sel (nelt, nelt, 1);
6120 sel.quick_grow (nelt);
6122 result_chain->quick_grow (length);
6123 memcpy (result_chain->address (), dr_chain.address (),
6124 length * sizeof (tree));
6126 if (pow2p_hwi (length) && vf > 4)
6128 unsigned int j, log_length = exact_log2 (length);
6129 for (i = 0; i < nelt / 2; ++i)
6130 sel[i] = i * 2;
6131 for (i = 0; i < nelt / 2; ++i)
6132 sel[nelt / 2 + i] = i * 2 + 1;
6133 vec_perm_indices indices (sel, 2, nelt);
6134 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6136 if (dump_enabled_p ())
6137 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6138 "shuffle of 2 fields structure is not \
6139 supported by target\n");
6140 return false;
6142 perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);
6144 for (i = 0; i < nelt / 2; ++i)
6145 sel[i] = i * 2 + 1;
6146 for (i = 0; i < nelt / 2; ++i)
6147 sel[nelt / 2 + i] = i * 2;
6148 indices.new_vector (sel, 2, nelt);
6149 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6151 if (dump_enabled_p ())
6152 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6153 "shuffle of 2 fields structure is not \
6154 supported by target\n");
6155 return false;
6157 perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);
6159 /* Generating permutation constant to shift all elements.
6160 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
6161 for (i = 0; i < nelt; i++)
6162 sel[i] = nelt / 2 + i;
6163 indices.new_vector (sel, 2, nelt);
6164 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6166 if (dump_enabled_p ())
6167 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6168 "shift permutation is not supported by target\n");
6169 return false;
6171 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6173 /* Generating permutation constant to select vector from 2.
6174 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
6175 for (i = 0; i < nelt / 2; i++)
6176 sel[i] = i;
6177 for (i = nelt / 2; i < nelt; i++)
6178 sel[i] = nelt + i;
6179 indices.new_vector (sel, 2, nelt);
6180 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6182 if (dump_enabled_p ())
6183 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6184 "select is not supported by target\n");
6185 return false;
6187 select_mask = vect_gen_perm_mask_checked (vectype, indices);
6189 for (i = 0; i < log_length; i++)
6191 for (j = 0; j < length; j += 2)
6193 first_vect = dr_chain[j];
6194 second_vect = dr_chain[j + 1];
6196 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6197 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6198 first_vect, first_vect,
6199 perm2_mask1);
6200 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6201 vect[0] = data_ref;
6203 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6204 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6205 second_vect, second_vect,
6206 perm2_mask2);
6207 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6208 vect[1] = data_ref;
6210 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
6211 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6212 vect[0], vect[1], shift1_mask);
6213 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6214 (*result_chain)[j/2 + length/2] = data_ref;
6216 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
6217 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6218 vect[0], vect[1], select_mask);
6219 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6220 (*result_chain)[j/2] = data_ref;
6222 memcpy (dr_chain.address (), result_chain->address (),
6223 length * sizeof (tree));
6225 return true;
6227 if (length == 3 && vf > 2)
6229 unsigned int k = 0, l = 0;
6231 /* Generating permutation constant to get all elements in right order.
6232 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
6233 for (i = 0; i < nelt; i++)
6235 if (3 * k + (l % 3) >= nelt)
6237 k = 0;
6238 l += (3 - (nelt % 3));
6240 sel[i] = 3 * k + (l % 3);
6241 k++;
6243 vec_perm_indices indices (sel, 2, nelt);
6244 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6246 if (dump_enabled_p ())
6247 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6248 "shuffle of 3 fields structure is not \
6249 supported by target\n");
6250 return false;
6252 perm3_mask = vect_gen_perm_mask_checked (vectype, indices);
6254 /* Generating permutation constant to shift all elements.
6255 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
6256 for (i = 0; i < nelt; i++)
6257 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
6258 indices.new_vector (sel, 2, nelt);
6259 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6261 if (dump_enabled_p ())
6262 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6263 "shift permutation is not supported by target\n");
6264 return false;
6266 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6268 /* Generating permutation constant to shift all elements.
6269 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6270 for (i = 0; i < nelt; i++)
6271 sel[i] = 2 * (nelt / 3) + 1 + i;
6272 indices.new_vector (sel, 2, nelt);
6273 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6275 if (dump_enabled_p ())
6276 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6277 "shift permutation is not supported by target\n");
6278 return false;
6280 shift2_mask = vect_gen_perm_mask_checked (vectype, indices);
6282 /* Generating permutation constant to shift all elements.
6283 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
6284 for (i = 0; i < nelt; i++)
6285 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
6286 indices.new_vector (sel, 2, nelt);
6287 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6289 if (dump_enabled_p ())
6290 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6291 "shift permutation is not supported by target\n");
6292 return false;
6294 shift3_mask = vect_gen_perm_mask_checked (vectype, indices);
6296 /* Generating permutation constant to shift all elements.
6297 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6298 for (i = 0; i < nelt; i++)
6299 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
6300 indices.new_vector (sel, 2, nelt);
6301 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6303 if (dump_enabled_p ())
6304 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6305 "shift permutation is not supported by target\n");
6306 return false;
6308 shift4_mask = vect_gen_perm_mask_checked (vectype, indices);
6310 for (k = 0; k < 3; k++)
6312 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
6313 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6314 dr_chain[k], dr_chain[k],
6315 perm3_mask);
6316 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6317 vect[k] = data_ref;
6320 for (k = 0; k < 3; k++)
6322 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
6323 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6324 vect[k % 3], vect[(k + 1) % 3],
6325 shift1_mask);
6326 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6327 vect_shift[k] = data_ref;
6330 for (k = 0; k < 3; k++)
6332 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
6333 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6334 vect_shift[(4 - k) % 3],
6335 vect_shift[(3 - k) % 3],
6336 shift2_mask);
6337 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6338 vect[k] = data_ref;
6341 (*result_chain)[3 - (nelt % 3)] = vect[2];
6343 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
6344 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
6345 vect[0], shift3_mask);
6346 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6347 (*result_chain)[nelt % 3] = data_ref;
6349 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
6350 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
6351 vect[1], shift4_mask);
6352 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6353 (*result_chain)[0] = data_ref;
6354 return true;
6356 return false;
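
/* Illustrative sketch (not part of this file): the power-of-two branch above
   for LENGTH == 2 and NELT == 8.  Each input vector is first de-interleaved
   within itself; one "shift of a concatenation" then yields the odd elements
   and one "select" the even elements, matching the masks quoted in the
   comments ({4 5 6 7 8 9 10 11} and {0 1 2 3 12 13 14 15}).  The helper and
   variable names are invented for this sketch.  */

#include <stdio.h>

#define NELT 8

static void
emulate_vec_perm (const int *v1, const int *v2, const int *sel, int *out)
{
  for (int i = 0; i < NELT; i++)
    out[i] = sel[i] < NELT ? v1[sel[i]] : v2[sel[i] - NELT];
}

int
main (void)
{
  int a[NELT], b[NELT], v0[NELT], v1[NELT], odd[NELT], even[NELT];
  int perm2_mask1[NELT], perm2_mask2[NELT];
  int shift1_mask[NELT], select_mask[NELT];

  for (int i = 0; i < NELT; i++)
    {
      a[i] = i;            /* 1st vec: 0 1 2 ... 7 */
      b[i] = NELT + i;     /* 2nd vec: 8 9 10 ... 15 */
    }

  for (int i = 0; i < NELT / 2; i++)
    {
      perm2_mask1[i] = i * 2;                    /* {0 2 4 6 1 3 5 7} */
      perm2_mask1[NELT / 2 + i] = i * 2 + 1;
      perm2_mask2[i] = i * 2 + 1;                /* {1 3 5 7 0 2 4 6} */
      perm2_mask2[NELT / 2 + i] = i * 2;
      select_mask[i] = i;                        /* {0 1 2 3 12 13 14 15} */
      select_mask[NELT / 2 + i] = NELT + NELT / 2 + i;
    }
  for (int i = 0; i < NELT; i++)
    shift1_mask[i] = NELT / 2 + i;               /* {4 5 6 7 8 9 10 11} */

  emulate_vec_perm (a, a, perm2_mask1, v0);      /* 0 2 4 6 1 3 5 7 */
  emulate_vec_perm (b, b, perm2_mask2, v1);      /* 9 11 13 15 8 10 12 14 */
  emulate_vec_perm (v0, v1, shift1_mask, odd);   /* 1 3 5 ... 15 */
  emulate_vec_perm (v0, v1, select_mask, even);  /* 0 2 4 ... 14 */

  for (int i = 0; i < NELT; i++)
    printf ("%d ", even[i]);
  printf ("\n");
  for (int i = 0; i < NELT; i++)
    printf ("%d ", odd[i]);
  printf ("\n");
  return 0;
}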
6359 /* Function vect_transform_grouped_load.
6361 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
6362 to perform their permutation and ascribe the result vectorized statements to
6363 the scalar statements.
6366 void
6367 vect_transform_grouped_load (gimple *stmt, vec<tree> dr_chain, int size,
6368 gimple_stmt_iterator *gsi)
6370 machine_mode mode;
6371 vec<tree> result_chain = vNULL;
6373 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
6374 RESULT_CHAIN is the output of vect_permute_load_chain; it contains permuted
6375 vectors that are ready for vector computation. */
6376 result_chain.create (size);
6378 /* If the reassociation width for the vector type is 2 or greater, the target
6379 machine can execute 2 or more vector instructions in parallel. Otherwise try
6380 to get the chain for the load group using vect_shift_permute_load_chain. */
6381 mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
6382 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
6383 || pow2p_hwi (size)
6384 || !vect_shift_permute_load_chain (dr_chain, size, stmt,
6385 gsi, &result_chain))
6386 vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
6387 vect_record_grouped_load_vectors (stmt, result_chain);
6388 result_chain.release ();
6391 /* RESULT_CHAIN contains the output of a group of grouped loads that were
6392 generated as part of the vectorization of STMT. Assign the statement
6393 for each vector to the associated scalar statement. */
6395 void
6396 vect_record_grouped_load_vectors (gimple *stmt, vec<tree> result_chain)
6398 gimple *first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
6399 gimple *next_stmt, *new_stmt;
6400 unsigned int i, gap_count;
6401 tree tmp_data_ref;
6403 /* Put a permuted data-ref in the VECTORIZED_STMT field.
6404 Since we scan the chain starting from its first node, their order
6405 corresponds to the order of data-refs in RESULT_CHAIN. */
6406 next_stmt = first_stmt;
6407 gap_count = 1;
6408 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
6410 if (!next_stmt)
6411 break;
6413 /* Skip the gaps. Loads created for the gaps will be removed by dead
6414 code elimination pass later. No need to check for the first stmt in
6415 the group, since it always exists.
6416 GROUP_GAP is the number of steps in elements from the previous
6417 access (if there is no gap GROUP_GAP is 1). We skip loads that
6418 correspond to the gaps. */
6419 if (next_stmt != first_stmt
6420 && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
6422 gap_count++;
6423 continue;
6426 while (next_stmt)
6428 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
6429 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
6430 copies, and we put the new vector statement in the first available
6431 RELATED_STMT. */
6432 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
6433 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
6434 else
6436 if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
6438 gimple *prev_stmt =
6439 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
6440 gimple *rel_stmt =
6441 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
6442 while (rel_stmt)
6444 prev_stmt = rel_stmt;
6445 rel_stmt =
6446 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
6449 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
6450 new_stmt;
6454 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
6455 gap_count = 1;
6456 /* If NEXT_STMT accesses the same DR as the previous statement,
6457 put the same TMP_DATA_REF as its vectorized statement; otherwise
6458 get the next data-ref from RESULT_CHAIN. */
6459 if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
6460 break;
6465 /* Function vect_can_force_dr_alignment_p.
6467 Returns whether the alignment of DECL can be forced to an
6468 ALIGNMENT-bit boundary. */
6470 bool
6471 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
6473 if (!VAR_P (decl))
6474 return false;
6476 if (decl_in_symtab_p (decl)
6477 && !symtab_node::get (decl)->can_increase_alignment_p ())
6478 return false;
6480 if (TREE_STATIC (decl))
6481 return (alignment <= MAX_OFILE_ALIGNMENT);
6482 else
6483 return (alignment <= MAX_STACK_ALIGNMENT);
6487 /* Return whether the data reference DR is supported with respect to its
6488 alignment.
6489 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
6490 if it is aligned, i.e., check if it is possible to vectorize it with a different
6491 alignment. */
6493 enum dr_alignment_support
6494 vect_supportable_dr_alignment (struct data_reference *dr,
6495 bool check_aligned_accesses)
6497 gimple *stmt = DR_STMT (dr);
6498 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6499 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6500 machine_mode mode = TYPE_MODE (vectype);
6501 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6502 struct loop *vect_loop = NULL;
6503 bool nested_in_vect_loop = false;
6505 if (aligned_access_p (dr) && !check_aligned_accesses)
6506 return dr_aligned;
6508 /* For now assume all conditional loads/stores support unaligned
6509 access without any special code. */
6510 if (is_gimple_call (stmt)
6511 && gimple_call_internal_p (stmt)
6512 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
6513 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
6514 return dr_unaligned_supported;
6516 if (loop_vinfo)
6518 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
6519 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
6522 /* Possibly unaligned access. */
6524 /* We can choose between using the implicit realignment scheme (generating
6525 a misaligned_move stmt) and the explicit realignment scheme (generating
6526 aligned loads with a REALIGN_LOAD). There are two variants to the
6527 explicit realignment scheme: optimized, and unoptimized.
6528 We can optimize the realignment only if the step between consecutive
6529 vector loads is equal to the vector size. Since the vector memory
6530 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
6531 is guaranteed that the misalignment amount remains the same throughout the
6532 execution of the vectorized loop. Therefore, we can create the
6533 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
6534 at the loop preheader.
6536 However, in the case of outer-loop vectorization, when vectorizing a
6537 memory access in the inner-loop nested within the LOOP that is now being
6538 vectorized, while it is guaranteed that the misalignment of the
6539 vectorized memory access will remain the same in different outer-loop
6540 iterations, it is *not* guaranteed that it will remain the same throughout
6541 the execution of the inner-loop. This is because the inner-loop advances
6542 with the original scalar step (and not in steps of VS). If the inner-loop
6543 step happens to be a multiple of VS, then the misalignment remains fixed
6544 and we can use the optimized realignment scheme. For example:
6546 for (i=0; i<N; i++)
6547 for (j=0; j<M; j++)
6548 s += a[i+j];
6550 When vectorizing the i-loop in the above example, the step between
6551 consecutive vector loads is 1, and so the misalignment does not remain
6552 fixed across the execution of the inner-loop, and the realignment cannot
6553 be optimized (as illustrated in the following pseudo vectorized loop):
6555 for (i=0; i<N; i+=4)
6556 for (j=0; j<M; j++){
6557 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
6558 // when j is {0,1,2,3,4,5,6,7,...} respectively.
6559 // (assuming that we start from an aligned address).
6562 We therefore have to use the unoptimized realignment scheme:
6564 for (i=0; i<N; i+=4)
6565 for (j=k; j<M; j+=4)
6566 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
6567 // that the misalignment of the initial address is
6568 // 0).
6570 The loop can then be vectorized as follows:
6572 for (k=0; k<4; k++){
6573 rt = get_realignment_token (&vp[k]);
6574 for (i=0; i<N; i+=4){
6575 v1 = vp[i+k];
6576 for (j=k; j<M; j+=4){
6577 v2 = vp[i+j+VS-1];
6578 va = REALIGN_LOAD <v1,v2,rt>;
6579 vs += va;
6580 v1 = v2;
6583 } */
6585 if (DR_IS_READ (dr))
6587 bool is_packed = false;
6588 tree type = (TREE_TYPE (DR_REF (dr)));
6590 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
6591 && (!targetm.vectorize.builtin_mask_for_load
6592 || targetm.vectorize.builtin_mask_for_load ()))
6594 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6596 /* If we are doing SLP then the accesses need not have the
6597 same alignment, instead it depends on the SLP group size. */
6598 if (loop_vinfo
6599 && STMT_SLP_TYPE (stmt_info)
6600 && !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6601 * GROUP_SIZE (vinfo_for_stmt
6602 (GROUP_FIRST_ELEMENT (stmt_info))),
6603 TYPE_VECTOR_SUBPARTS (vectype)))
6605 else if (!loop_vinfo
6606 || (nested_in_vect_loop
6607 && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
6608 GET_MODE_SIZE (TYPE_MODE (vectype)))))
6609 return dr_explicit_realign;
6610 else
6611 return dr_explicit_realign_optimized;
6613 if (!known_alignment_for_access_p (dr))
6614 is_packed = not_size_aligned (DR_REF (dr));
6616 if (targetm.vectorize.support_vector_misalignment
6617 (mode, type, DR_MISALIGNMENT (dr), is_packed))
6618 /* Can't software pipeline the loads, but can at least do them. */
6619 return dr_unaligned_supported;
6621 else
6623 bool is_packed = false;
6624 tree type = (TREE_TYPE (DR_REF (dr)));
6626 if (!known_alignment_for_access_p (dr))
6627 is_packed = not_size_aligned (DR_REF (dr));
6629 if (targetm.vectorize.support_vector_misalignment
6630 (mode, type, DR_MISALIGNMENT (dr), is_packed))
6631 return dr_unaligned_supported;
6634 /* Unsupported. */
6635 return dr_unaligned_unsupported;
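
/* Illustrative sketch (not part of this file): the misalignment pattern the
   comment above reasons about, for a vector of VS == 4 elements.  In the
   vectorized i-loop the element misalignment of &vp[i+j] cycles as j
   advances, so no single realignment token works; after strip-mining the
   j-loop by the residue k the misalignment is constant, which is what lets
   the token be computed once in the preheader.  Plain arithmetic only; no
   GCC interfaces are used.  */

#include <stdio.h>

#define VS 4

int
main (void)
{
  /* Misalignment of &vp[i+j] as j advances, starting from an aligned base:
     0 1 2 3 0 1 2 3 ...  */
  printf ("varying j:   ");
  for (int j = 0; j < 8; j++)
    printf ("%d ", j % VS);
  printf ("\n");

  /* Strip-mined form: for a fixed residue k, j = k, k + VS, k + 2*VS, ...
     always gives misalignment k.  */
  for (int k = 0; k < VS; k++)
    {
      printf ("k = %d:      ", k);
      for (int j = k; j < k + 4 * VS; j += VS)
        printf ("%d ", j % VS);
      printf ("\n");
    }
  return 0;
}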