gcc/tree-vect-data-refs.c (official-gcc.git, blob df362b0e1c5cd9ec4709c93ce9848101c0ee07f1)
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2018 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "memmodel.h"
32 #include "tm_p.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "cgraph.h"
36 #include "dumpfile.h"
37 #include "alias.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "tree-eh.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop.h"
47 #include "cfgloop.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
50 #include "expr.h"
51 #include "builtins.h"
52 #include "params.h"
53 #include "tree-cfg.h"
54 #include "tree-hash-traits.h"
55 #include "vec-perm-indices.h"
56 #include "internal-fn.h"
58 /* Return true if load- or store-lanes optab OPTAB is implemented for
59 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
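   /* As an illustration (the numbers are only an example): for VECTYPE
      V4SI and COUNT 2 the required array mode must cover 2 * 128 bits.
      If the target provides such a mode and the convert optab (e.g.
      "vec_load_lanes") has a handler for <array_mode, V4SI>, the
      function below reports the lanes optab as usable.  */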
61 static bool
62 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
63 tree vectype, unsigned HOST_WIDE_INT count)
65 machine_mode mode, array_mode;
66 bool limit_p;
68 mode = TYPE_MODE (vectype);
69 if (!targetm.array_mode (mode, count).exists (&array_mode))
71 poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
72 limit_p = !targetm.array_mode_supported_p (mode, count);
73 if (!int_mode_for_size (bits, limit_p).exists (&array_mode))
75 if (dump_enabled_p ())
76 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
77 "no array mode for %s["
78 HOST_WIDE_INT_PRINT_DEC "]\n",
79 GET_MODE_NAME (mode), count);
80 return false;
84 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
86 if (dump_enabled_p ())
87 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
88 "cannot use %s<%s><%s>\n", name,
89 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
90 return false;
93 if (dump_enabled_p ())
94 dump_printf_loc (MSG_NOTE, vect_location,
95 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
96 GET_MODE_NAME (mode));
98 return true;
102 /* Return the smallest scalar part of STMT.
103 This is used to determine the vectype of the stmt. We generally set the
104 vectype according to the type of the result (lhs). For stmts whose
105 result-type is different than the type of the arguments (e.g., demotion,
106 promotion), vectype will be reset appropriately (later). Note that we have
107 to visit the smallest datatype in this function, because that determines the
108 VF. If the smallest datatype in the loop is present only as the rhs of a
109 promotion operation - we'd miss it.
110 Such a case, where a variable of this datatype does not appear in the lhs
111 anywhere in the loop, can only occur if it's an invariant: e.g.:
112 'int_x = (int) short_inv', which we'd expect to have been optimized away by
113 invariant motion. However, we cannot rely on invariant motion to always
114 take invariants out of the loop, and so in the case of promotion we also
115 have to check the rhs.
116 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
117 types. */
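   /* For example, in a loop such as

        short *s;  int *d;
        for (i = 0; i < n; i++)
          d[i] = (int) s[i];

      the smallest scalar type is 'short' even though it only appears on
      the rhs of the promotion, and it is the type that has to drive the
      choice of the VF.  */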
119 tree
120 vect_get_smallest_scalar_type (gimple *stmt, HOST_WIDE_INT *lhs_size_unit,
121 HOST_WIDE_INT *rhs_size_unit)
123 tree scalar_type = gimple_expr_type (stmt);
124 HOST_WIDE_INT lhs, rhs;
126 /* During the analysis phase, this function is called on arbitrary
127 statements that might not have scalar results. */
128 if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
129 return scalar_type;
131 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
133 if (is_gimple_assign (stmt)
134 && (gimple_assign_cast_p (stmt)
135 || gimple_assign_rhs_code (stmt) == DOT_PROD_EXPR
136 || gimple_assign_rhs_code (stmt) == WIDEN_SUM_EXPR
137 || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
138 || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
139 || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
141 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
143 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
144 if (rhs < lhs)
145 scalar_type = rhs_type;
148 *lhs_size_unit = lhs;
149 *rhs_size_unit = rhs;
150 return scalar_type;
154 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
155 tested at run-time. Return TRUE if DDR was successfully inserted.
156 Return false if versioning is not supported. */
158 static bool
159 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
161 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
163 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
164 return false;
166 if (!runtime_alias_check_p (ddr, loop,
167 optimize_loop_nest_for_speed_p (loop)))
168 return false;
170 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
171 return true;
174 /* Record that loop LOOP_VINFO needs to check that VALUE is nonzero. */
176 static void
177 vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value)
179 vec<tree> checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo);
180 for (unsigned int i = 0; i < checks.length(); ++i)
181 if (checks[i] == value)
182 return;
184 if (dump_enabled_p ())
186 dump_printf_loc (MSG_NOTE, vect_location, "need run-time check that ");
187 dump_generic_expr (MSG_NOTE, TDF_SLIM, value);
188 dump_printf (MSG_NOTE, " is nonzero\n");
190 LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value);
193 /* Return true if we know that the order of vectorized STMT_A and
194 vectorized STMT_B will be the same as the order of STMT_A and STMT_B.
195 At least one of the statements is a write. */
197 static bool
198 vect_preserves_scalar_order_p (gimple *stmt_a, gimple *stmt_b)
200 stmt_vec_info stmtinfo_a = vinfo_for_stmt (stmt_a);
201 stmt_vec_info stmtinfo_b = vinfo_for_stmt (stmt_b);
203 /* Single statements are always kept in their original order. */
204 if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
205 && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
206 return true;
208 /* STMT_A and STMT_B belong to overlapping groups. All loads in a
209 group are emitted at the position of the first scalar load and all
210 stores in a group are emitted at the position of the last scalar store.
211 Thus writes will happen no earlier than their current position
212 (but could happen later) while reads will happen no later than their
213 current position (but could happen earlier). Reordering is therefore
214 only possible if the first access is a write. */
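  /* For example, if the earlier access is a scalar load that belongs to a
     load group and the later access is a store, the group load is emitted
     no later than its current position and the store no earlier than its
     current position, so the scalar load/store order is preserved.  If
     instead the write comes first, sinking it to the last store of its
     group could move it past the read, so we must be conservative.  */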
215 gimple *earlier_stmt = get_earlier_stmt (stmt_a, stmt_b);
216 return !DR_IS_WRITE (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt)));
219 /* A subroutine of vect_analyze_data_ref_dependence. Handle
220 DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
221 distances. These distances are conservatively correct but they don't
222 reflect a guaranteed dependence.
224 Return true if this function does all the work necessary to avoid
225 an alias or false if the caller should use the dependence distances
226 to limit the vectorization factor in the usual way. LOOP_DEPTH is
227 the depth of the loop described by LOOP_VINFO and the other arguments
228 are as for vect_analyze_data_ref_dependence. */
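   /* For example, two array arguments accessed as a[i] and b[i] may get a
      recorded dependence distance that would only be real if 'a' and 'b'
      were in fact the same object; preferring a run-time alias check here
      avoids needlessly limiting the VF when they are independent.  */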
230 static bool
231 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
232 loop_vec_info loop_vinfo,
233 int loop_depth, unsigned int *max_vf)
235 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
236 lambda_vector dist_v;
237 unsigned int i;
238 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
240 int dist = dist_v[loop_depth];
241 if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
243 /* If the user asserted safelen >= DIST consecutive iterations
244 can be executed concurrently, assume independence.
246 ??? An alternative would be to add the alias check even
247 in this case, and vectorize the fallback loop with the
248 maximum VF set to safelen. However, if the user has
249 explicitly given a length, it's less likely that that
250 would be a win. */
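	  /* E.g. with "#pragma omp simd safelen(8)" the user guarantees
	     that any 8 consecutive iterations may execute concurrently,
	     so a distance of, say, 3 can be ignored as long as the
	     vectorization factor is capped at 8.  */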
251 if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
253 if ((unsigned int) loop->safelen < *max_vf)
254 *max_vf = loop->safelen;
255 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
256 continue;
259 /* For dependence distances of 2 or more, we have the option
260 of limiting VF or checking for an alias at runtime.
261 Prefer to check at runtime if we can, to avoid limiting
262 the VF unnecessarily when the bases are in fact independent.
264 Note that the alias checks will be removed if the VF ends up
265 being small enough. */
266 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
269 return true;
273 /* Function vect_analyze_data_ref_dependence.
275 Return TRUE if there (might) exist a dependence between a memory-reference
 276    DRA and a memory-reference DRB.  When versioning for alias can handle
 277    any dependence with a run-time check, return FALSE.  Adjust *MAX_VF
 278    according to the data dependence. */
280 static bool
281 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
282 loop_vec_info loop_vinfo,
283 unsigned int *max_vf)
285 unsigned int i;
286 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
287 struct data_reference *dra = DDR_A (ddr);
288 struct data_reference *drb = DDR_B (ddr);
289 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
290 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
291 lambda_vector dist_v;
292 unsigned int loop_depth;
294 /* In loop analysis all data references should be vectorizable. */
295 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
296 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
297 gcc_unreachable ();
299 /* Independent data accesses. */
300 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
301 return false;
303 if (dra == drb
304 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
305 return false;
307 /* We do not have to consider dependences between accesses that belong
308 to the same group. */
309 if (GROUP_FIRST_ELEMENT (stmtinfo_a)
310 && GROUP_FIRST_ELEMENT (stmtinfo_a) == GROUP_FIRST_ELEMENT (stmtinfo_b))
311 return false;
313 /* Even if we have an anti-dependence then, as the vectorized loop covers at
314 least two scalar iterations, there is always also a true dependence.
315 As the vectorizer does not re-order loads and stores we can ignore
316 the anti-dependence if TBAA can disambiguate both DRs similar to the
317 case with known negative distance anti-dependences (positive
318 distance anti-dependences would violate TBAA constraints). */
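  /* For instance, a read through an 'int *' and a write through a
     'float *' have non-conflicting alias sets, so under TBAA they cannot
     access the same location and the anti-dependence can be ignored.  */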
319 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
320 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
321 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
322 get_alias_set (DR_REF (drb))))
323 return false;
325 /* Unknown data dependence. */
326 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
328 /* If user asserted safelen consecutive iterations can be
329 executed concurrently, assume independence. */
330 if (loop->safelen >= 2)
332 if ((unsigned int) loop->safelen < *max_vf)
333 *max_vf = loop->safelen;
334 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
335 return false;
338 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
339 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
341 if (dump_enabled_p ())
343 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
344 "versioning for alias not supported for: "
345 "can't determine dependence between ");
346 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
347 DR_REF (dra));
348 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
349 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
350 DR_REF (drb));
351 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
353 return true;
356 if (dump_enabled_p ())
358 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
359 "versioning for alias required: "
360 "can't determine dependence between ");
361 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
362 DR_REF (dra));
363 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
364 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
365 DR_REF (drb));
366 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
369 /* Add to list of ddrs that need to be tested at run-time. */
370 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
373 /* Known data dependence. */
374 if (DDR_NUM_DIST_VECTS (ddr) == 0)
376 /* If user asserted safelen consecutive iterations can be
377 executed concurrently, assume independence. */
378 if (loop->safelen >= 2)
380 if ((unsigned int) loop->safelen < *max_vf)
381 *max_vf = loop->safelen;
382 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
383 return false;
386 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
387 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
389 if (dump_enabled_p ())
391 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
392 "versioning for alias not supported for: "
393 "bad dist vector for ");
394 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
395 DR_REF (dra));
396 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
397 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
398 DR_REF (drb));
399 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
401 return true;
404 if (dump_enabled_p ())
406 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
407 "versioning for alias required: "
408 "bad dist vector for ");
409 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
410 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
411 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
412 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
414 /* Add to list of ddrs that need to be tested at run-time. */
415 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
418 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
420 if (DDR_COULD_BE_INDEPENDENT_P (ddr)
421 && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
422 loop_depth, max_vf))
423 return false;
425 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
427 int dist = dist_v[loop_depth];
429 if (dump_enabled_p ())
430 dump_printf_loc (MSG_NOTE, vect_location,
431 "dependence distance = %d.\n", dist);
433 if (dist == 0)
435 if (dump_enabled_p ())
437 dump_printf_loc (MSG_NOTE, vect_location,
438 "dependence distance == 0 between ");
439 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
440 dump_printf (MSG_NOTE, " and ");
441 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
 442           dump_printf (MSG_NOTE, "\n");
445 /* When we perform grouped accesses and perform implicit CSE
446 by detecting equal accesses and doing disambiguation with
447 runtime alias tests like for
448 .. = a[i];
449 .. = a[i+1];
450 a[i] = ..;
451 a[i+1] = ..;
452 *p = ..;
453 .. = a[i];
454 .. = a[i+1];
455 where we will end up loading { a[i], a[i+1] } once, make
456 sure that inserting group loads before the first load and
457 stores after the last store will do the right thing.
458 Similar for groups like
459 a[i] = ...;
460 ... = a[i];
461 a[i+1] = ...;
462 where loads from the group interleave with the store. */
463 if (!vect_preserves_scalar_order_p (DR_STMT (dra), DR_STMT (drb)))
465 if (dump_enabled_p ())
466 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
467 "READ_WRITE dependence in interleaving.\n");
468 return true;
471 if (loop->safelen < 2)
473 tree indicator = dr_zero_step_indicator (dra);
474 if (TREE_CODE (indicator) != INTEGER_CST)
475 vect_check_nonzero_value (loop_vinfo, indicator);
476 else if (integer_zerop (indicator))
478 if (dump_enabled_p ())
479 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
480 "access also has a zero step\n");
481 return true;
484 continue;
487 if (dist > 0 && DDR_REVERSED_P (ddr))
489 /* If DDR_REVERSED_P the order of the data-refs in DDR was
490 reversed (to make distance vector positive), and the actual
491 distance is negative. */
492 if (dump_enabled_p ())
493 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
494 "dependence distance negative.\n");
495 /* Record a negative dependence distance to later limit the
496 amount of stmt copying / unrolling we can perform.
497 Only need to handle read-after-write dependence. */
498 if (DR_IS_READ (drb)
499 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
500 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
501 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
502 continue;
505 unsigned int abs_dist = abs (dist);
506 if (abs_dist >= 2 && abs_dist < *max_vf)
508 /* The dependence distance requires reduction of the maximal
509 vectorization factor. */
510 *max_vf = abs (dist);
511 if (dump_enabled_p ())
512 dump_printf_loc (MSG_NOTE, vect_location,
513 "adjusting maximal vectorization factor to %i\n",
514 *max_vf);
517 if (abs_dist >= *max_vf)
519 /* Dependence distance does not create dependence, as far as
520 vectorization is concerned, in this case. */
521 if (dump_enabled_p ())
522 dump_printf_loc (MSG_NOTE, vect_location,
523 "dependence distance >= VF.\n");
524 continue;
527 if (dump_enabled_p ())
529 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
530 "not vectorized, possible dependence "
531 "between data-refs ");
 532       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
 533       dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 534       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
 535       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
538 return true;
541 return false;
544 /* Function vect_analyze_data_ref_dependences.
546 Examine all the data references in the loop, and make sure there do not
547 exist any data dependences between them. Set *MAX_VF according to
548 the maximum vectorization factor the data dependences allow. */
550 bool
551 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
552 unsigned int *max_vf)
554 unsigned int i;
555 struct data_dependence_relation *ddr;
557 if (dump_enabled_p ())
558 dump_printf_loc (MSG_NOTE, vect_location,
559 "=== vect_analyze_data_ref_dependences ===\n");
561 LOOP_VINFO_DDRS (loop_vinfo)
562 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
563 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
564 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
565 /* We need read-read dependences to compute STMT_VINFO_SAME_ALIGN_REFS. */
566 if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
567 &LOOP_VINFO_DDRS (loop_vinfo),
568 LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
569 return false;
571 /* For epilogues we either have no aliases or alias versioning
 572      was applied to the original loop.  Therefore we may just get max_vf
 573      using the VF of the original loop. */
574 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
575 *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
576 else
577 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
578 if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
579 return false;
581 return true;
585 /* Function vect_slp_analyze_data_ref_dependence.
587 Return TRUE if there (might) exist a dependence between a memory-reference
 588    DRA and a memory-reference DRB in DDR, i.e. if the accesses might
 589    conflict in a way that would prevent SLP vectorization.  Return FALSE
 590    if the accesses are known to be independent. */
592 static bool
593 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
595 struct data_reference *dra = DDR_A (ddr);
596 struct data_reference *drb = DDR_B (ddr);
598 /* We need to check dependences of statements marked as unvectorizable
599 as well, they still can prohibit vectorization. */
601 /* Independent data accesses. */
602 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
603 return false;
605 if (dra == drb)
606 return false;
608 /* Read-read is OK. */
609 if (DR_IS_READ (dra) && DR_IS_READ (drb))
610 return false;
612 /* If dra and drb are part of the same interleaving chain consider
613 them independent. */
614 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
615 && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
616 == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
617 return false;
619 /* Unknown data dependence. */
620 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
622 if (dump_enabled_p ())
624 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
625 "can't determine dependence between ");
626 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
627 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
628 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
629 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
632 else if (dump_enabled_p ())
634 dump_printf_loc (MSG_NOTE, vect_location,
635 "determined dependence between ");
636 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
637 dump_printf (MSG_NOTE, " and ");
638 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
639 dump_printf (MSG_NOTE, "\n");
642 return true;
646 /* Analyze dependences involved in the transform of SLP NODE. STORES
647 contain the vector of scalar stores of this instance if we are
648 disambiguating the loads. */
650 static bool
651 vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
652 vec<gimple *> stores, gimple *last_store)
654 /* This walks over all stmts involved in the SLP load/store done
655 in NODE verifying we can sink them up to the last stmt in the
656 group. */
657 gimple *last_access = vect_find_last_scalar_stmt_in_slp (node);
658 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
660 gimple *access = SLP_TREE_SCALAR_STMTS (node)[k];
661 if (access == last_access)
662 continue;
663 data_reference *dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (access));
664 for (gimple_stmt_iterator gsi = gsi_for_stmt (access);
665 gsi_stmt (gsi) != last_access; gsi_next (&gsi))
667 gimple *stmt = gsi_stmt (gsi);
668 if (! gimple_vuse (stmt)
669 || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
670 continue;
672 /* If we couldn't record a (single) data reference for this
673 stmt we have to give up. */
674 /* ??? Here and below if dependence analysis fails we can resort
675 to the alias oracle which can handle more kinds of stmts. */
676 data_reference *dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
677 if (!dr_b)
678 return false;
680 bool dependent = false;
681 /* If we run into a store of this same instance (we've just
682 marked those) then delay dependence checking until we run
683 into the last store because this is where it will have
684 been sunk to (and we verify if we can do that as well). */
685 if (gimple_visited_p (stmt))
687 if (stmt != last_store)
688 continue;
689 unsigned i;
690 gimple *store;
691 FOR_EACH_VEC_ELT (stores, i, store)
693 data_reference *store_dr
694 = STMT_VINFO_DATA_REF (vinfo_for_stmt (store));
695 ddr_p ddr = initialize_data_dependence_relation
696 (dr_a, store_dr, vNULL);
697 dependent = vect_slp_analyze_data_ref_dependence (ddr);
698 free_dependence_relation (ddr);
699 if (dependent)
700 break;
703 else
705 ddr_p ddr = initialize_data_dependence_relation (dr_a,
706 dr_b, vNULL);
707 dependent = vect_slp_analyze_data_ref_dependence (ddr);
708 free_dependence_relation (ddr);
710 if (dependent)
711 return false;
714 return true;
 718 /* Function vect_slp_analyze_instance_dependence.
 720    Examine all the data references in the SLP instance INSTANCE, and make
 721    sure there do not exist any data dependences between them that would
 722    prevent vectorizing the instance. */
724 bool
725 vect_slp_analyze_instance_dependence (slp_instance instance)
727 if (dump_enabled_p ())
728 dump_printf_loc (MSG_NOTE, vect_location,
729 "=== vect_slp_analyze_instance_dependence ===\n");
731 /* The stores of this instance are at the root of the SLP tree. */
732 slp_tree store = SLP_INSTANCE_TREE (instance);
733 if (! STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (store)[0])))
734 store = NULL;
736 /* Verify we can sink stores to the vectorized stmt insert location. */
737 gimple *last_store = NULL;
738 if (store)
740 if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
741 return false;
743 /* Mark stores in this instance and remember the last one. */
744 last_store = vect_find_last_scalar_stmt_in_slp (store);
745 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
746 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], true);
749 bool res = true;
751 /* Verify we can sink loads to the vectorized stmt insert location,
752 special-casing stores of this instance. */
753 slp_tree load;
754 unsigned int i;
755 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
756 if (! vect_slp_analyze_node_dependences (instance, load,
757 store
758 ? SLP_TREE_SCALAR_STMTS (store)
759 : vNULL, last_store))
761 res = false;
762 break;
765 /* Unset the visited flag. */
766 if (store)
767 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
768 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], false);
770 return res;
773 /* Record in VINFO the base alignment guarantee given by DRB. STMT is
774 the statement that contains DRB, which is useful for recording in the
775 dump file. */
777 static void
778 vect_record_base_alignment (vec_info *vinfo, gimple *stmt,
779 innermost_loop_behavior *drb)
781 bool existed;
782 innermost_loop_behavior *&entry
783 = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
784 if (!existed || entry->base_alignment < drb->base_alignment)
786 entry = drb;
787 if (dump_enabled_p ())
789 dump_printf_loc (MSG_NOTE, vect_location,
790 "recording new base alignment for ");
791 dump_generic_expr (MSG_NOTE, TDF_SLIM, drb->base_address);
792 dump_printf (MSG_NOTE, "\n");
793 dump_printf_loc (MSG_NOTE, vect_location,
794 " alignment: %d\n", drb->base_alignment);
795 dump_printf_loc (MSG_NOTE, vect_location,
796 " misalignment: %d\n", drb->base_misalignment);
797 dump_printf_loc (MSG_NOTE, vect_location,
798 " based on: ");
799 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
804 /* If the region we're going to vectorize is reached, all unconditional
805 data references occur at least once. We can therefore pool the base
806 alignment guarantees from each unconditional reference. Do this by
807 going through all the data references in VINFO and checking whether
808 the containing statement makes the reference unconditionally. If so,
809 record the alignment of the base address in VINFO so that it can be
810 used for all other references with the same base. */
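/* For example, if the region contains an unconditional load from *p whose
   base is known to be 16-byte aligned, a second reference with the same
   base address that only executes conditionally can still reuse that
   alignment guarantee.  */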
812 void
813 vect_record_base_alignments (vec_info *vinfo)
815 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
816 struct loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
817 data_reference *dr;
818 unsigned int i;
819 FOR_EACH_VEC_ELT (vinfo->datarefs, i, dr)
820 if (!DR_IS_CONDITIONAL_IN_STMT (dr))
822 gimple *stmt = DR_STMT (dr);
823 vect_record_base_alignment (vinfo, stmt, &DR_INNERMOST (dr));
825 /* If DR is nested in the loop that is being vectorized, we can also
826 record the alignment of the base wrt the outer loop. */
827 if (loop && nested_in_vect_loop_p (loop, stmt))
829 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
830 vect_record_base_alignment
831 (vinfo, stmt, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
836 /* Return the target alignment for the vectorized form of DR. */
838 static unsigned int
839 vect_calculate_target_alignment (struct data_reference *dr)
841 gimple *stmt = DR_STMT (dr);
842 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
843 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
844 return targetm.vectorize.preferred_vector_alignment (vectype);
847 /* Function vect_compute_data_ref_alignment
849 Compute the misalignment of the data reference DR.
851 Output:
852 1. If during the misalignment computation it is found that the data reference
853 cannot be vectorized then false is returned.
854 2. DR_MISALIGNMENT (DR) is defined.
856 FOR NOW: No analysis is actually performed. Misalignment is calculated
857 only for trivial cases. TODO. */
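/* As a small worked example (illustrative numbers): with a target vector
   alignment of 16 bytes, a base known to be 16-byte aligned and a constant
   init offset of 4 bytes, the access gets DR_MISALIGNMENT 4.  If the offset
   or step alignment cannot be shown to keep the misalignment constant, it
   is left as DR_MISALIGNMENT_UNKNOWN instead.  */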
859 bool
860 vect_compute_data_ref_alignment (struct data_reference *dr)
862 gimple *stmt = DR_STMT (dr);
863 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
864 vec_base_alignments *base_alignments = &stmt_info->vinfo->base_alignments;
865 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
866 struct loop *loop = NULL;
867 tree ref = DR_REF (dr);
868 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
870 if (dump_enabled_p ())
871 dump_printf_loc (MSG_NOTE, vect_location,
872 "vect_compute_data_ref_alignment:\n");
874 if (loop_vinfo)
875 loop = LOOP_VINFO_LOOP (loop_vinfo);
877 /* Initialize misalignment to unknown. */
878 SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
880 innermost_loop_behavior *drb = vect_dr_behavior (dr);
881 bool step_preserves_misalignment_p;
883 unsigned HOST_WIDE_INT vector_alignment
884 = vect_calculate_target_alignment (dr) / BITS_PER_UNIT;
885 DR_TARGET_ALIGNMENT (dr) = vector_alignment;
887 /* No step for BB vectorization. */
888 if (!loop)
890 gcc_assert (integer_zerop (drb->step));
891 step_preserves_misalignment_p = true;
894 /* In case the dataref is in an inner-loop of the loop that is being
895 vectorized (LOOP), we use the base and misalignment information
896 relative to the outer-loop (LOOP). This is ok only if the misalignment
897 stays the same throughout the execution of the inner-loop, which is why
898 we have to check that the stride of the dataref in the inner-loop evenly
899 divides by the vector alignment. */
900 else if (nested_in_vect_loop_p (loop, stmt))
902 step_preserves_misalignment_p
903 = (DR_STEP_ALIGNMENT (dr) % vector_alignment) == 0;
905 if (dump_enabled_p ())
907 if (step_preserves_misalignment_p)
908 dump_printf_loc (MSG_NOTE, vect_location,
909 "inner step divides the vector alignment.\n");
910 else
911 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
912 "inner step doesn't divide the vector"
913 " alignment.\n");
917 /* Similarly we can only use base and misalignment information relative to
918 an innermost loop if the misalignment stays the same throughout the
919 execution of the loop. As above, this is the case if the stride of
920 the dataref evenly divides by the alignment. */
921 else
923 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
924 step_preserves_misalignment_p
925 = multiple_p (DR_STEP_ALIGNMENT (dr) * vf, vector_alignment);
927 if (!step_preserves_misalignment_p && dump_enabled_p ())
928 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
929 "step doesn't divide the vector alignment.\n");
932 unsigned int base_alignment = drb->base_alignment;
933 unsigned int base_misalignment = drb->base_misalignment;
935 /* Calculate the maximum of the pooled base address alignment and the
936 alignment that we can compute for DR itself. */
937 innermost_loop_behavior **entry = base_alignments->get (drb->base_address);
938 if (entry && base_alignment < (*entry)->base_alignment)
940 base_alignment = (*entry)->base_alignment;
941 base_misalignment = (*entry)->base_misalignment;
944 if (drb->offset_alignment < vector_alignment
945 || !step_preserves_misalignment_p
946 /* We need to know whether the step wrt the vectorized loop is
947 negative when computing the starting misalignment below. */
948 || TREE_CODE (drb->step) != INTEGER_CST)
950 if (dump_enabled_p ())
952 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
953 "Unknown alignment for access: ");
954 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
955 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
957 return true;
960 if (base_alignment < vector_alignment)
962 unsigned int max_alignment;
963 tree base = get_base_for_alignment (drb->base_address, &max_alignment);
964 if (max_alignment < vector_alignment
965 || !vect_can_force_dr_alignment_p (base,
966 vector_alignment * BITS_PER_UNIT))
968 if (dump_enabled_p ())
970 dump_printf_loc (MSG_NOTE, vect_location,
971 "can't force alignment of ref: ");
972 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
973 dump_printf (MSG_NOTE, "\n");
975 return true;
978 /* Force the alignment of the decl.
979 NOTE: This is the only change to the code we make during
980 the analysis phase, before deciding to vectorize the loop. */
981 if (dump_enabled_p ())
983 dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
984 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
985 dump_printf (MSG_NOTE, "\n");
988 DR_VECT_AUX (dr)->base_decl = base;
989 DR_VECT_AUX (dr)->base_misaligned = true;
990 base_misalignment = 0;
992 poly_int64 misalignment
993 = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();
995 /* If this is a backward running DR then first access in the larger
996 vectype actually is N-1 elements before the address in the DR.
997 Adjust misalign accordingly. */
998 if (tree_int_cst_sgn (drb->step) < 0)
999 /* PLUS because STEP is negative. */
1000 misalignment += ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1001 * TREE_INT_CST_LOW (drb->step));
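  /* E.g. for a 4-element vector of 4-byte elements with step -4, the
     adjustment above adds (4 - 1) * -4 = -12 bytes, since the vector
     access starts three elements before the address recorded in the
     DR.  */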
1003 unsigned int const_misalignment;
1004 if (!known_misalignment (misalignment, vector_alignment,
1005 &const_misalignment))
1007 if (dump_enabled_p ())
1009 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1010 "Non-constant misalignment for access: ");
1011 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
1012 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1014 return true;
1017 SET_DR_MISALIGNMENT (dr, const_misalignment);
1019 if (dump_enabled_p ())
1021 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1022 "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
1023 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
1024 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1027 return true;
1030 /* Function vect_update_misalignment_for_peel.
1031 Sets DR's misalignment
1032 - to 0 if it has the same alignment as DR_PEEL,
 1033    - to the misalignment computed using NPEEL if DR's misalignment is known,
1034 - to -1 (unknown) otherwise.
1036 DR - the data reference whose misalignment is to be adjusted.
1037 DR_PEEL - the data reference whose misalignment is being made
1038 zero in the vector loop by the peel.
1039 NPEEL - the number of iterations in the peel loop if the misalignment
1040 of DR_PEEL is known at compile time. */
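/* As an illustrative example: peeling NPEEL = 1 iteration of a forward
   access with element size 4, old misalignment 12 and a 16-byte target
   alignment gives (12 + 1 * 4) & (16 - 1) = 0, i.e. the access becomes
   aligned in the vector loop.  */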
1042 static void
1043 vect_update_misalignment_for_peel (struct data_reference *dr,
1044 struct data_reference *dr_peel, int npeel)
1046 unsigned int i;
1047 vec<dr_p> same_aligned_drs;
1048 struct data_reference *current_dr;
1049 int dr_size = vect_get_scalar_dr_size (dr);
1050 int dr_peel_size = vect_get_scalar_dr_size (dr_peel);
1051 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
1052 stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));
1054 /* For interleaved data accesses the step in the loop must be multiplied by
1055 the size of the interleaving group. */
1056 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1057 dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
1058 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
1059 dr_peel_size *= GROUP_SIZE (peel_stmt_info);
1061 /* It can be assumed that the data refs with the same alignment as dr_peel
1062 are aligned in the vector loop. */
1063 same_aligned_drs
1064 = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
1065 FOR_EACH_VEC_ELT (same_aligned_drs, i, current_dr)
1067 if (current_dr != dr)
1068 continue;
1069 gcc_assert (!known_alignment_for_access_p (dr)
1070 || !known_alignment_for_access_p (dr_peel)
1071 || (DR_MISALIGNMENT (dr) / dr_size
1072 == DR_MISALIGNMENT (dr_peel) / dr_peel_size));
1073 SET_DR_MISALIGNMENT (dr, 0);
1074 return;
1077 if (known_alignment_for_access_p (dr)
1078 && known_alignment_for_access_p (dr_peel))
1080 bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
1081 int misal = DR_MISALIGNMENT (dr);
1082 misal += negative ? -npeel * dr_size : npeel * dr_size;
1083 misal &= DR_TARGET_ALIGNMENT (dr) - 1;
1084 SET_DR_MISALIGNMENT (dr, misal);
1085 return;
1088 if (dump_enabled_p ())
1089 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1090 "to unknown (-1).\n");
1091 SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
1095 /* Function verify_data_ref_alignment
1097 Return TRUE if DR can be handled with respect to alignment. */
1099 static bool
1100 verify_data_ref_alignment (data_reference_p dr)
1102 enum dr_alignment_support supportable_dr_alignment
1103 = vect_supportable_dr_alignment (dr, false);
1104 if (!supportable_dr_alignment)
1106 if (dump_enabled_p ())
1108 if (DR_IS_READ (dr))
1109 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1110 "not vectorized: unsupported unaligned load.");
1111 else
1112 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1113 "not vectorized: unsupported unaligned "
1114 "store.");
1116 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
1117 DR_REF (dr));
1118 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1120 return false;
1123 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
1124 dump_printf_loc (MSG_NOTE, vect_location,
1125 "Vectorizing an unaligned access.\n");
1127 return true;
1130 /* Function vect_verify_datarefs_alignment
1132 Return TRUE if all data references in the loop can be
1133 handled with respect to alignment. */
1135 bool
1136 vect_verify_datarefs_alignment (loop_vec_info vinfo)
1138 vec<data_reference_p> datarefs = vinfo->datarefs;
1139 struct data_reference *dr;
1140 unsigned int i;
1142 FOR_EACH_VEC_ELT (datarefs, i, dr)
1144 gimple *stmt = DR_STMT (dr);
1145 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1147 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1148 continue;
1150 /* For interleaving, only the alignment of the first access matters. */
1151 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1152 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1153 continue;
1155 /* Strided accesses perform only component accesses, alignment is
1156 irrelevant for them. */
1157 if (STMT_VINFO_STRIDED_P (stmt_info)
1158 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1159 continue;
1161 if (! verify_data_ref_alignment (dr))
1162 return false;
1165 return true;
 1168 /* Given a memory reference EXP return whether its alignment is less
1169 than its size. */
1171 static bool
1172 not_size_aligned (tree exp)
1174 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1175 return true;
1177 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1178 > get_object_alignment (exp));
1181 /* Function vector_alignment_reachable_p
1183 Return true if vector alignment for DR is reachable by peeling
1184 a few loop iterations. Return false otherwise. */
1186 static bool
1187 vector_alignment_reachable_p (struct data_reference *dr)
1189 gimple *stmt = DR_STMT (dr);
1190 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1191 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1193 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
 1195       /* For interleaved access we peel only if the number of iterations in
 1196          the prolog loop ({VF - misalignment}) is a multiple of the
 1197          number of interleaved accesses. */
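      /* E.g. with 8 elements per vector and a misalignment of 2 elements,
         the prolog needs 8 - 2 = 6 iterations; a group size of 3 divides
         that evenly, so peeling remains an option, whereas a group size
         of 4 would not.  */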
1198 int elem_size, mis_in_elements;
1200 /* FORNOW: handle only known alignment. */
1201 if (!known_alignment_for_access_p (dr))
1202 return false;
1204 poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1205 poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
1206 elem_size = vector_element_size (vector_size, nelements);
1207 mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
1209 if (!multiple_p (nelements - mis_in_elements, GROUP_SIZE (stmt_info)))
1210 return false;
1213 /* If misalignment is known at the compile time then allow peeling
1214 only if natural alignment is reachable through peeling. */
1215 if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
1217 HOST_WIDE_INT elmsize =
1218 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1219 if (dump_enabled_p ())
1221 dump_printf_loc (MSG_NOTE, vect_location,
1222 "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1223 dump_printf (MSG_NOTE,
1224 ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
1226 if (DR_MISALIGNMENT (dr) % elmsize)
1228 if (dump_enabled_p ())
1229 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1230 "data size does not divide the misalignment.\n");
1231 return false;
1235 if (!known_alignment_for_access_p (dr))
1237 tree type = TREE_TYPE (DR_REF (dr));
1238 bool is_packed = not_size_aligned (DR_REF (dr));
1239 if (dump_enabled_p ())
1240 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1241 "Unknown misalignment, %snaturally aligned\n",
1242 is_packed ? "not " : "");
1243 return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1246 return true;
1250 /* Calculate the cost of the memory access represented by DR. */
1252 static void
1253 vect_get_data_access_cost (struct data_reference *dr,
1254 unsigned int *inside_cost,
1255 unsigned int *outside_cost,
1256 stmt_vector_for_cost *body_cost_vec,
1257 stmt_vector_for_cost *prologue_cost_vec)
1259 gimple *stmt = DR_STMT (dr);
1260 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1261 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1262 int ncopies;
1264 if (PURE_SLP_STMT (stmt_info))
1265 ncopies = 1;
1266 else
1267 ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1269 if (DR_IS_READ (dr))
1270 vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1271 prologue_cost_vec, body_cost_vec, false);
1272 else
1273 vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
1275 if (dump_enabled_p ())
1276 dump_printf_loc (MSG_NOTE, vect_location,
1277 "vect_get_data_access_cost: inside_cost = %d, "
1278 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1282 typedef struct _vect_peel_info
1284 struct data_reference *dr;
1285 int npeel;
1286 unsigned int count;
1287 } *vect_peel_info;
1289 typedef struct _vect_peel_extended_info
1291 struct _vect_peel_info peel_info;
1292 unsigned int inside_cost;
1293 unsigned int outside_cost;
1294 } *vect_peel_extended_info;
1297 /* Peeling hashtable helpers. */
1299 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1301 static inline hashval_t hash (const _vect_peel_info *);
1302 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1305 inline hashval_t
1306 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1308 return (hashval_t) peel_info->npeel;
1311 inline bool
1312 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1314 return (a->npeel == b->npeel);
1318 /* Insert DR into peeling hash table with NPEEL as key. */
1320 static void
1321 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1322 loop_vec_info loop_vinfo, struct data_reference *dr,
1323 int npeel)
1325 struct _vect_peel_info elem, *slot;
1326 _vect_peel_info **new_slot;
1327 bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1329 elem.npeel = npeel;
1330 slot = peeling_htab->find (&elem);
1331 if (slot)
1332 slot->count++;
1333 else
1335 slot = XNEW (struct _vect_peel_info);
1336 slot->npeel = npeel;
1337 slot->dr = dr;
1338 slot->count = 1;
1339 new_slot = peeling_htab->find_slot (slot, INSERT);
1340 *new_slot = slot;
1343 if (!supportable_dr_alignment
1344 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1345 slot->count += VECT_MAX_COST;
1349 /* Traverse peeling hash table to find peeling option that aligns maximum
1350 number of data accesses. */
1353 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1354 _vect_peel_extended_info *max)
1356 vect_peel_info elem = *slot;
1358 if (elem->count > max->peel_info.count
1359 || (elem->count == max->peel_info.count
1360 && max->peel_info.npeel > elem->npeel))
1362 max->peel_info.npeel = elem->npeel;
1363 max->peel_info.count = elem->count;
1364 max->peel_info.dr = elem->dr;
1367 return 1;
1370 /* Get the costs of peeling NPEEL iterations checking data access costs
1371 for all data refs. If UNKNOWN_MISALIGNMENT is true, we assume DR0's
1372 misalignment will be zero after peeling. */
1374 static void
1375 vect_get_peeling_costs_all_drs (vec<data_reference_p> datarefs,
1376 struct data_reference *dr0,
1377 unsigned int *inside_cost,
1378 unsigned int *outside_cost,
1379 stmt_vector_for_cost *body_cost_vec,
1380 stmt_vector_for_cost *prologue_cost_vec,
1381 unsigned int npeel,
1382 bool unknown_misalignment)
1384 unsigned i;
1385 data_reference *dr;
1387 FOR_EACH_VEC_ELT (datarefs, i, dr)
1389 gimple *stmt = DR_STMT (dr);
1390 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1391 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1392 continue;
1394 /* For interleaving, only the alignment of the first access
1395 matters. */
1396 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1397 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1398 continue;
1400 /* Strided accesses perform only component accesses, alignment is
1401 irrelevant for them. */
1402 if (STMT_VINFO_STRIDED_P (stmt_info)
1403 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1404 continue;
1406 int save_misalignment;
1407 save_misalignment = DR_MISALIGNMENT (dr);
1408 if (npeel == 0)
1410 else if (unknown_misalignment && dr == dr0)
1411 SET_DR_MISALIGNMENT (dr, 0);
1412 else
1413 vect_update_misalignment_for_peel (dr, dr0, npeel);
1414 vect_get_data_access_cost (dr, inside_cost, outside_cost,
1415 body_cost_vec, prologue_cost_vec);
1416 SET_DR_MISALIGNMENT (dr, save_misalignment);
1420 /* Traverse peeling hash table and calculate cost for each peeling option.
1421 Find the one with the lowest cost. */
1424 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1425 _vect_peel_extended_info *min)
1427 vect_peel_info elem = *slot;
1428 int dummy;
1429 unsigned int inside_cost = 0, outside_cost = 0;
1430 gimple *stmt = DR_STMT (elem->dr);
1431 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1432 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1433 stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1434 epilogue_cost_vec;
1436 prologue_cost_vec.create (2);
1437 body_cost_vec.create (2);
1438 epilogue_cost_vec.create (2);
1440 vect_get_peeling_costs_all_drs (LOOP_VINFO_DATAREFS (loop_vinfo),
1441 elem->dr, &inside_cost, &outside_cost,
1442 &body_cost_vec, &prologue_cost_vec,
1443 elem->npeel, false);
1445 body_cost_vec.release ();
1447 outside_cost += vect_get_known_peeling_cost
1448 (loop_vinfo, elem->npeel, &dummy,
1449 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1450 &prologue_cost_vec, &epilogue_cost_vec);
1452 /* Prologue and epilogue costs are added to the target model later.
1453 These costs depend only on the scalar iteration cost, the
1454 number of peeling iterations finally chosen, and the number of
1455 misaligned statements. So discard the information found here. */
1456 prologue_cost_vec.release ();
1457 epilogue_cost_vec.release ();
1459 if (inside_cost < min->inside_cost
1460 || (inside_cost == min->inside_cost
1461 && outside_cost < min->outside_cost))
1463 min->inside_cost = inside_cost;
1464 min->outside_cost = outside_cost;
1465 min->peel_info.dr = elem->dr;
1466 min->peel_info.npeel = elem->npeel;
1467 min->peel_info.count = elem->count;
1470 return 1;
1474 /* Choose best peeling option by traversing peeling hash table and either
1475 choosing an option with the lowest cost (if cost model is enabled) or the
1476 option that aligns as many accesses as possible. */
1478 static struct _vect_peel_extended_info
1479 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1480 loop_vec_info loop_vinfo)
1482 struct _vect_peel_extended_info res;
1484 res.peel_info.dr = NULL;
1486 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1488 res.inside_cost = INT_MAX;
1489 res.outside_cost = INT_MAX;
1490 peeling_htab->traverse <_vect_peel_extended_info *,
1491 vect_peeling_hash_get_lowest_cost> (&res);
1493 else
1495 res.peel_info.count = 0;
1496 peeling_htab->traverse <_vect_peel_extended_info *,
1497 vect_peeling_hash_get_most_frequent> (&res);
1498 res.inside_cost = 0;
1499 res.outside_cost = 0;
1502 return res;
1505 /* Return true if the new peeling NPEEL is supported. */
1507 static bool
1508 vect_peeling_supportable (loop_vec_info loop_vinfo, struct data_reference *dr0,
1509 unsigned npeel)
1511 unsigned i;
1512 struct data_reference *dr = NULL;
1513 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1514 gimple *stmt;
1515 stmt_vec_info stmt_info;
1516 enum dr_alignment_support supportable_dr_alignment;
1518 /* Ensure that all data refs can be vectorized after the peel. */
1519 FOR_EACH_VEC_ELT (datarefs, i, dr)
1521 int save_misalignment;
1523 if (dr == dr0)
1524 continue;
1526 stmt = DR_STMT (dr);
1527 stmt_info = vinfo_for_stmt (stmt);
1528 /* For interleaving, only the alignment of the first access
1529 matters. */
1530 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1531 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1532 continue;
1534 /* Strided accesses perform only component accesses, alignment is
1535 irrelevant for them. */
1536 if (STMT_VINFO_STRIDED_P (stmt_info)
1537 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1538 continue;
1540 save_misalignment = DR_MISALIGNMENT (dr);
1541 vect_update_misalignment_for_peel (dr, dr0, npeel);
1542 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1543 SET_DR_MISALIGNMENT (dr, save_misalignment);
1545 if (!supportable_dr_alignment)
1546 return false;
1549 return true;
1552 /* Function vect_enhance_data_refs_alignment
1554 This pass will use loop versioning and loop peeling in order to enhance
1555 the alignment of data references in the loop.
1557 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1558 original loop is to be vectorized. Any other loops that are created by
1559 the transformations performed in this pass - are not supposed to be
1560 vectorized. This restriction will be relaxed.
1562 This pass will require a cost model to guide it whether to apply peeling
1563 or versioning or a combination of the two. For example, the scheme that
 1564    Intel uses when given a loop with several memory accesses, is as follows:
 1565    choose one memory access ('p') whose alignment you want to force by doing
1566 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1567 other accesses are not necessarily aligned, or (2) use loop versioning to
1568 generate one loop in which all accesses are aligned, and another loop in
1569 which only 'p' is necessarily aligned.
1571 ("Automatic Intra-Register Vectorization for the Intel Architecture",
 1572    Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1573 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1575 Devising a cost model is the most critical aspect of this work. It will
1576 guide us on which access to peel for, whether to use loop versioning, how
1577 many versions to create, etc. The cost model will probably consist of
1578 generic considerations as well as target specific considerations (on
1579 powerpc for example, misaligned stores are more painful than misaligned
1580 loads).
1582 Here are the general steps involved in alignment enhancements:
1584 -- original loop, before alignment analysis:
1585 for (i=0; i<N; i++){
1586 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1587 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1590 -- After vect_compute_data_refs_alignment:
1591 for (i=0; i<N; i++){
1592 x = q[i]; # DR_MISALIGNMENT(q) = 3
1593 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1596 -- Possibility 1: we do loop versioning:
1597 if (p is aligned) {
1598 for (i=0; i<N; i++){ # loop 1A
1599 x = q[i]; # DR_MISALIGNMENT(q) = 3
1600 p[i] = y; # DR_MISALIGNMENT(p) = 0
1603 else {
1604 for (i=0; i<N; i++){ # loop 1B
1605 x = q[i]; # DR_MISALIGNMENT(q) = 3
1606 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1610 -- Possibility 2: we do loop peeling:
1611 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1612 x = q[i];
1613 p[i] = y;
1615 for (i = 3; i < N; i++){ # loop 2A
1616 x = q[i]; # DR_MISALIGNMENT(q) = 0
1617 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1620 -- Possibility 3: combination of loop peeling and versioning:
1621 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1622 x = q[i];
1623 p[i] = y;
1625 if (p is aligned) {
1626 for (i = 3; i<N; i++){ # loop 3A
1627 x = q[i]; # DR_MISALIGNMENT(q) = 0
1628 p[i] = y; # DR_MISALIGNMENT(p) = 0
1631 else {
1632 for (i = 3; i<N; i++){ # loop 3B
1633 x = q[i]; # DR_MISALIGNMENT(q) = 0
1634 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1638 These loops are later passed to loop_transform to be vectorized. The
1639 vectorizer will use the alignment information to guide the transformation
1640 (whether to generate regular loads/stores, or with special handling for
1641 misalignment). */
1643 bool
1644 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1646 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1647 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1648 enum dr_alignment_support supportable_dr_alignment;
1649 struct data_reference *dr0 = NULL, *first_store = NULL;
1650 struct data_reference *dr;
1651 unsigned int i, j;
1652 bool do_peeling = false;
1653 bool do_versioning = false;
1654 bool stat;
1655 gimple *stmt;
1656 stmt_vec_info stmt_info;
1657 unsigned int npeel = 0;
1658 bool one_misalignment_known = false;
1659 bool one_misalignment_unknown = false;
1660 bool one_dr_unsupportable = false;
1661 struct data_reference *unsupportable_dr = NULL;
1662 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1663 unsigned possible_npeel_number = 1;
1664 tree vectype;
1665 unsigned int mis, same_align_drs_max = 0;
1666 hash_table<peel_info_hasher> peeling_htab (1);
1668 if (dump_enabled_p ())
1669 dump_printf_loc (MSG_NOTE, vect_location,
1670 "=== vect_enhance_data_refs_alignment ===\n");
1672 /* Reset data so we can safely be called multiple times. */
1673 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1674 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1676 /* While cost model enhancements are expected in the future, the high level
1677 view of the code at this time is as follows:
1679 A) If there is a misaligned access then see if peeling to align
1680 this access can make all data references satisfy
1681 vect_supportable_dr_alignment. If so, update data structures
1682 as needed and return true.
1684 B) If peeling wasn't possible and there is a data reference with an
1685 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1686 then see if loop versioning checks can be used to make all data
1687 references satisfy vect_supportable_dr_alignment. If so, update
1688 data structures as needed and return true.
1690 C) If neither peeling nor versioning were successful then return false if
1691 any data reference does not satisfy vect_supportable_dr_alignment.
1693 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1695 Note, Possibility 3 above (which is peeling and versioning together) is not
1696 being done at this time. */
1698 /* (1) Peeling to force alignment. */
1700 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1701 Considerations:
1702 + How many accesses will become aligned due to the peeling
1703 - How many accesses will become unaligned due to the peeling,
1704 and the cost of misaligned accesses.
1705 - The cost of peeling (the extra runtime checks, the increase
1706 in code size). */
1708 FOR_EACH_VEC_ELT (datarefs, i, dr)
1710 stmt = DR_STMT (dr);
1711 stmt_info = vinfo_for_stmt (stmt);
1713 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1714 continue;
1716 /* For interleaving, only the alignment of the first access
1717 matters. */
1718 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1719 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1720 continue;
1722 /* For invariant accesses there is nothing to enhance. */
1723 if (integer_zerop (DR_STEP (dr)))
1724 continue;
1726 /* Strided accesses perform only component accesses, alignment is
1727 irrelevant for them. */
1728 if (STMT_VINFO_STRIDED_P (stmt_info)
1729 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1730 continue;
1732 supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1733 do_peeling = vector_alignment_reachable_p (dr);
1734 if (do_peeling)
1736 if (known_alignment_for_access_p (dr))
1738 unsigned int npeel_tmp = 0;
1739 bool negative = tree_int_cst_compare (DR_STEP (dr),
1740 size_zero_node) < 0;
1742 vectype = STMT_VINFO_VECTYPE (stmt_info);
1743 unsigned int target_align = DR_TARGET_ALIGNMENT (dr);
1744 unsigned int dr_size = vect_get_scalar_dr_size (dr);
1745 mis = (negative ? DR_MISALIGNMENT (dr) : -DR_MISALIGNMENT (dr));
1746 if (DR_MISALIGNMENT (dr) != 0)
1747 npeel_tmp = (mis & (target_align - 1)) / dr_size;
1749 /* For multiple types, it is possible that the bigger type access
1750 will have more than one peeling option. E.g., a loop with two
1751 types: one of size (vector size / 4), and the other one of
1752 size (vector size / 8). The vectorization factor will be 8. If both
1753 accesses are misaligned by 3, the first one needs one scalar
1754 iteration to be aligned, and the second one needs 5. But the
1755 first one will be aligned also by peeling 5 scalar
1756 iterations, and in that case both accesses will be aligned.
1757 Hence, in addition to the immediate peeling amount, we also want
1758 to try adding a full vector's worth of iterations, as long as we
1759 don't exceed the vectorization factor.
1760 We do this automatically for the cost model, since we calculate
1761 the cost of every peeling option. */
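      /* For example, with a 16-byte target alignment, 4-byte elements and a
         known misalignment of 12 bytes on a forward (positive-step) access,
         NPEEL_TMP starts at ((-12) & 15) / 4 = 1; with the unlimited cost
         model the loop below also records 1 + 4 = 5, 1 + 8 = 9, ..., up to
         POSSIBLE_NPEEL_NUMBER candidate peeling amounts in total.  */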
1762 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1764 poly_uint64 nscalars = (STMT_SLP_TYPE (stmt_info)
1765 ? vf * GROUP_SIZE (stmt_info) : vf);
1766 possible_npeel_number
1767 = vect_get_num_vectors (nscalars, vectype);
1769 /* NPEEL_TMP is 0 when there is no misalignment, but also
1770 allow peeling NELEMENTS. */
1771 if (DR_MISALIGNMENT (dr) == 0)
1772 possible_npeel_number++;
1775 /* Save info about DR in the hash table. Also include peeling
1776 amounts according to the explanation above. */
1777 for (j = 0; j < possible_npeel_number; j++)
1779 vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1780 dr, npeel_tmp);
1781 npeel_tmp += target_align / dr_size;
1784 one_misalignment_known = true;
1786 else
1788 /* If we don't know any misalignment values, we prefer
1789 peeling for the data-ref that has the maximum number of data-refs
1790 with the same alignment, unless the target prefers to align
1791 stores over loads. */
1792 unsigned same_align_drs
1793 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1794 if (!dr0
1795 || same_align_drs_max < same_align_drs)
1797 same_align_drs_max = same_align_drs;
1798 dr0 = dr;
1800 /* For data-refs with the same number of related
1801 accesses prefer the one where the misalign
1802 computation will be invariant in the outermost loop. */
1803 else if (same_align_drs_max == same_align_drs)
1805 struct loop *ivloop0, *ivloop;
1806 ivloop0 = outermost_invariant_loop_for_expr
1807 (loop, DR_BASE_ADDRESS (dr0));
1808 ivloop = outermost_invariant_loop_for_expr
1809 (loop, DR_BASE_ADDRESS (dr));
1810 if ((ivloop && !ivloop0)
1811 || (ivloop && ivloop0
1812 && flow_loop_nested_p (ivloop, ivloop0)))
1813 dr0 = dr;
1816 one_misalignment_unknown = true;
1818 /* Check for data refs with unsupportable alignment that
1819 can be peeled. */
1820 if (!supportable_dr_alignment)
1822 one_dr_unsupportable = true;
1823 unsupportable_dr = dr;
1826 if (!first_store && DR_IS_WRITE (dr))
1827 first_store = dr;
1830 else
1832 if (!aligned_access_p (dr))
1834 if (dump_enabled_p ())
1835 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1836 "vector alignment may not be reachable\n");
1837 break;
1842 /* Check if we can possibly peel the loop. */
1843 if (!vect_can_advance_ivs_p (loop_vinfo)
1844 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1845 || loop->inner)
1846 do_peeling = false;
1848 struct _vect_peel_extended_info peel_for_known_alignment;
1849 struct _vect_peel_extended_info peel_for_unknown_alignment;
1850 struct _vect_peel_extended_info best_peel;
1852 peel_for_unknown_alignment.inside_cost = INT_MAX;
1853 peel_for_unknown_alignment.outside_cost = INT_MAX;
1854 peel_for_unknown_alignment.peel_info.count = 0;
1856 if (do_peeling
1857 && one_misalignment_unknown)
1859 /* Check whether the target prefers aligning stores over loads, i.e., whether
1860 misaligned stores are more expensive than misaligned loads (taking
1861 drs with the same alignment into account). */
1862 unsigned int load_inside_cost = 0;
1863 unsigned int load_outside_cost = 0;
1864 unsigned int store_inside_cost = 0;
1865 unsigned int store_outside_cost = 0;
1866 unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;
1868 stmt_vector_for_cost dummy;
1869 dummy.create (2);
1870 vect_get_peeling_costs_all_drs (datarefs, dr0,
1871 &load_inside_cost,
1872 &load_outside_cost,
1873 &dummy, &dummy, estimated_npeels, true);
1874 dummy.release ();
1876 if (first_store)
1878 dummy.create (2);
1879 vect_get_peeling_costs_all_drs (datarefs, first_store,
1880 &store_inside_cost,
1881 &store_outside_cost,
1882 &dummy, &dummy,
1883 estimated_npeels, true);
1884 dummy.release ();
1886 else
1888 store_inside_cost = INT_MAX;
1889 store_outside_cost = INT_MAX;
1892 if (load_inside_cost > store_inside_cost
1893 || (load_inside_cost == store_inside_cost
1894 && load_outside_cost > store_outside_cost))
1896 dr0 = first_store;
1897 peel_for_unknown_alignment.inside_cost = store_inside_cost;
1898 peel_for_unknown_alignment.outside_cost = store_outside_cost;
1900 else
1902 peel_for_unknown_alignment.inside_cost = load_inside_cost;
1903 peel_for_unknown_alignment.outside_cost = load_outside_cost;
1906 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1907 prologue_cost_vec.create (2);
1908 epilogue_cost_vec.create (2);
1910 int dummy2;
1911 peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
1912 (loop_vinfo, estimated_npeels, &dummy2,
1913 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1914 &prologue_cost_vec, &epilogue_cost_vec);
1916 prologue_cost_vec.release ();
1917 epilogue_cost_vec.release ();
1919 peel_for_unknown_alignment.peel_info.count = 1
1920 + STMT_VINFO_SAME_ALIGN_REFS
1921 (vinfo_for_stmt (DR_STMT (dr0))).length ();
1924 peel_for_unknown_alignment.peel_info.npeel = 0;
1925 peel_for_unknown_alignment.peel_info.dr = dr0;
1927 best_peel = peel_for_unknown_alignment;
1929 peel_for_known_alignment.inside_cost = INT_MAX;
1930 peel_for_known_alignment.outside_cost = INT_MAX;
1931 peel_for_known_alignment.peel_info.count = 0;
1932 peel_for_known_alignment.peel_info.dr = NULL;
1934 if (do_peeling && one_misalignment_known)
1936 /* Peeling is possible, but no data access requires alignment in order
1937 to be supported. So we try to choose the best possible peeling from
1938 the hash table. */
1939 peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
1940 (&peeling_htab, loop_vinfo);
1943 /* Compare costs of peeling for known and unknown alignment. */
1944 if (peel_for_known_alignment.peel_info.dr != NULL
1945 && peel_for_unknown_alignment.inside_cost
1946 >= peel_for_known_alignment.inside_cost)
1948 best_peel = peel_for_known_alignment;
1950 /* If the best peeling for known alignment has NPEEL == 0, perform no
1951 peeling at all except if there is an unsupportable dr that we can
1952 align. */
1953 if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
1954 do_peeling = false;
1957 /* If there is an unsupportable data ref, prefer this over all choices so far
1958 since we'd have to discard a chosen peeling except when it accidentally
1959 aligned the unsupportable data ref. */
1960 if (one_dr_unsupportable)
1961 dr0 = unsupportable_dr;
1962 else if (do_peeling)
1964 /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
1965 TODO: Use nopeel_outside_cost or get rid of it? */
1966 unsigned nopeel_inside_cost = 0;
1967 unsigned nopeel_outside_cost = 0;
1969 stmt_vector_for_cost dummy;
1970 dummy.create (2);
1971 vect_get_peeling_costs_all_drs (datarefs, NULL, &nopeel_inside_cost,
1972 &nopeel_outside_cost, &dummy, &dummy,
1973 0, false);
1974 dummy.release ();
1976 /* Add epilogue costs. As we do not peel for alignment here, no prologue
1977 costs will be recorded. */
1978 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1979 prologue_cost_vec.create (2);
1980 epilogue_cost_vec.create (2);
1982 int dummy2;
1983 nopeel_outside_cost += vect_get_known_peeling_cost
1984 (loop_vinfo, 0, &dummy2,
1985 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1986 &prologue_cost_vec, &epilogue_cost_vec);
1988 prologue_cost_vec.release ();
1989 epilogue_cost_vec.release ();
1991 npeel = best_peel.peel_info.npeel;
1992 dr0 = best_peel.peel_info.dr;
1994 /* If not peeling at all is no more expensive than the best peeling we
1995 have found so far, don't perform any peeling. */
1996 if (nopeel_inside_cost <= best_peel.inside_cost)
1997 do_peeling = false;
2000 if (do_peeling)
2002 stmt = DR_STMT (dr0);
2003 stmt_info = vinfo_for_stmt (stmt);
2004 vectype = STMT_VINFO_VECTYPE (stmt_info);
2006 if (known_alignment_for_access_p (dr0))
2008 bool negative = tree_int_cst_compare (DR_STEP (dr0),
2009 size_zero_node) < 0;
2010 if (!npeel)
2012 /* Since the misalignment is known at compile time, compute the number of
2013 iterations in the peeled loop (the peeling factor) for use in
2014 updating DR_MISALIGNMENT values. The peeling factor is the number
2015 of elements needed to reach the target alignment boundary, i.e. the
2016 target alignment in elements minus the misalignment in elements. */
2017 mis = negative ? DR_MISALIGNMENT (dr0) : -DR_MISALIGNMENT (dr0);
2018 unsigned int target_align = DR_TARGET_ALIGNMENT (dr0);
2019 npeel = ((mis & (target_align - 1))
2020 / vect_get_scalar_dr_size (dr0));
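          /* E.g. for a forward access with a 16-byte target alignment,
             4-byte elements and a misalignment of 4 bytes this gives
             ((-4) & 15) / 4 = 3 peeled scalar iterations, enough to move
             the first access to the next 16-byte boundary.  */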
2023 /* For interleaved data access every iteration accesses all the
2024 members of the group, therefore we divide the number of iterations
2025 by the group size. */
2026 stmt_info = vinfo_for_stmt (DR_STMT (dr0));
2027 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2028 npeel /= GROUP_SIZE (stmt_info);
2030 if (dump_enabled_p ())
2031 dump_printf_loc (MSG_NOTE, vect_location,
2032 "Try peeling by %d\n", npeel);
2035 /* Ensure that all datarefs can be vectorized after the peel. */
2036 if (!vect_peeling_supportable (loop_vinfo, dr0, npeel))
2037 do_peeling = false;
2039 /* Check if all datarefs are supportable and log. */
2040 if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
2042 stat = vect_verify_datarefs_alignment (loop_vinfo);
2043 if (!stat)
2044 do_peeling = false;
2045 else
2046 return stat;
2049 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
2050 if (do_peeling)
2052 unsigned max_allowed_peel
2053 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
2054 if (max_allowed_peel != (unsigned)-1)
2056 unsigned max_peel = npeel;
2057 if (max_peel == 0)
2059 unsigned int target_align = DR_TARGET_ALIGNMENT (dr0);
2060 max_peel = target_align / vect_get_scalar_dr_size (dr0) - 1;
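          /* When NPEEL is zero the peeling amount is only known at run time,
             so assume the worst case: e.g. a 16-byte target alignment and
             4-byte elements give at most 16 / 4 - 1 = 3 peeled iterations.  */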
2062 if (max_peel > max_allowed_peel)
2064 do_peeling = false;
2065 if (dump_enabled_p ())
2066 dump_printf_loc (MSG_NOTE, vect_location,
2067 "Disable peeling, max peels reached: %d\n", max_peel);
2072 /* Cost model #2 - if peeling may result in a remaining loop not
2073 iterating enough to be vectorized then do not peel. Since this
2074 is a cost heuristic rather than a correctness decision, use the
2075 most likely runtime value for variable vectorization factors. */
2076 if (do_peeling
2077 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2079 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2080 unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
2081 if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
2082 < assumed_vf + max_peel)
2083 do_peeling = false;
2086 if (do_peeling)
2088 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2089 If the misalignment of DR_i is identical to that of dr0 then set
2090 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
2091 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2092 by the peeling factor times the element size of DR_i (MOD the
2093 vectorization factor times the size). Otherwise, the
2094 misalignment of DR_i must be set to unknown. */
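      /* Per the rule above: e.g. a DR_i with 4-byte elements and a known
         misalignment of 4 bytes, after peeling 3 iterations with a
         vectorization factor of 4, gets misalignment
         (4 + 3 * 4) mod (4 * 4) = 0.  */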
2095 FOR_EACH_VEC_ELT (datarefs, i, dr)
2096 if (dr != dr0)
2098 /* Strided accesses perform only component accesses, alignment
2099 is irrelevant for them. */
2100 stmt_info = vinfo_for_stmt (DR_STMT (dr));
2101 if (STMT_VINFO_STRIDED_P (stmt_info)
2102 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2103 continue;
2105 vect_update_misalignment_for_peel (dr, dr0, npeel);
2108 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
2109 if (npeel)
2110 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2111 else
2112 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2113 = DR_MISALIGNMENT (dr0);
2114 SET_DR_MISALIGNMENT (dr0, 0);
2115 if (dump_enabled_p ())
2117 dump_printf_loc (MSG_NOTE, vect_location,
2118 "Alignment of access forced using peeling.\n");
2119 dump_printf_loc (MSG_NOTE, vect_location,
2120 "Peeling for alignment will be applied.\n");
2123 /* The inside-loop cost will be accounted for in vectorizable_load
2124 and vectorizable_store correctly with adjusted alignments.
2125 Drop the body_cst_vec on the floor here. */
2126 stat = vect_verify_datarefs_alignment (loop_vinfo);
2127 gcc_assert (stat);
2128 return stat;
2132 /* (2) Versioning to force alignment. */
2134 /* Try versioning if:
2135 1) optimize loop for speed
2136 2) there is at least one unsupported misaligned data ref with an unknown
2137 misalignment, and
2138 3) all misaligned data refs with a known misalignment are supported, and
2139 4) the number of runtime alignment checks is within reason. */
2141 do_versioning =
2142 optimize_loop_nest_for_speed_p (loop)
2143 && (!loop->inner); /* FORNOW */
2145 if (do_versioning)
2147 FOR_EACH_VEC_ELT (datarefs, i, dr)
2149 stmt = DR_STMT (dr);
2150 stmt_info = vinfo_for_stmt (stmt);
2152 /* For interleaving, only the alignment of the first access
2153 matters. */
2154 if (aligned_access_p (dr)
2155 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2156 && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
2157 continue;
2159 if (STMT_VINFO_STRIDED_P (stmt_info))
2161 /* Strided loads perform only component accesses, alignment is
2162 irrelevant for them. */
2163 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
2164 continue;
2165 do_versioning = false;
2166 break;
2169 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
2171 if (!supportable_dr_alignment)
2173 gimple *stmt;
2174 int mask;
2175 tree vectype;
2177 if (known_alignment_for_access_p (dr)
2178 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2179 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
2181 do_versioning = false;
2182 break;
2185 stmt = DR_STMT (dr);
2186 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
2187 gcc_assert (vectype);
2189 /* At present we don't support versioning for alignment
2190 with variable VF, since there's no guarantee that the
2191 VF is a power of two. We could relax this if we added
2192 a way of enforcing a power-of-two size. */
2193 unsigned HOST_WIDE_INT size;
2194 if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
2196 do_versioning = false;
2197 break;
2200 /* The rightmost bits of an aligned address must be zeros.
2201 Construct the mask needed for this test. For example,
2202 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2203 mask must be 15 = 0xf. */
2204 mask = size - 1;
2206 /* FORNOW: use the same mask to test all potentially unaligned
2207 references in the loop. The vectorizer currently supports
2208 a single vector size, see the reference to
2209 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
2210 vectorization factor is computed. */
2211 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
2212 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
2213 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2214 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
2215 DR_STMT (dr));
2219 /* Versioning requires at least one misaligned data reference. */
2220 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2221 do_versioning = false;
2222 else if (!do_versioning)
2223 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2226 if (do_versioning)
2228 vec<gimple *> may_misalign_stmts
2229 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2230 gimple *stmt;
2232 /* It can now be assumed that the data references in the statements
2233 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2234 of the loop being vectorized. */
2235 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
2237 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2238 dr = STMT_VINFO_DATA_REF (stmt_info);
2239 SET_DR_MISALIGNMENT (dr, 0);
2240 if (dump_enabled_p ())
2241 dump_printf_loc (MSG_NOTE, vect_location,
2242 "Alignment of access forced using versioning.\n");
2245 if (dump_enabled_p ())
2246 dump_printf_loc (MSG_NOTE, vect_location,
2247 "Versioning for alignment will be applied.\n");
2249 /* Peeling and versioning can't be done together at this time. */
2250 gcc_assert (! (do_peeling && do_versioning));
2252 stat = vect_verify_datarefs_alignment (loop_vinfo);
2253 gcc_assert (stat);
2254 return stat;
2257 /* This point is reached if neither peeling nor versioning is being done. */
2258 gcc_assert (! (do_peeling || do_versioning));
2260 stat = vect_verify_datarefs_alignment (loop_vinfo);
2261 return stat;
2265 /* Function vect_find_same_alignment_drs.
2267 Update group and alignment relations according to the chosen
2268 vectorization factor. */
2270 static void
2271 vect_find_same_alignment_drs (struct data_dependence_relation *ddr)
2273 struct data_reference *dra = DDR_A (ddr);
2274 struct data_reference *drb = DDR_B (ddr);
2275 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2276 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2278 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
2279 return;
2281 if (dra == drb)
2282 return;
2284 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)
2285 || !operand_equal_p (DR_OFFSET (dra), DR_OFFSET (drb), 0)
2286 || !operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2287 return;
2289 /* Two references with distance zero have the same alignment. */
2290 poly_offset_int diff = (wi::to_poly_offset (DR_INIT (dra))
2291 - wi::to_poly_offset (DR_INIT (drb)));
2292 if (maybe_ne (diff, 0))
2294 /* Get the wider of the two alignments. */
2295 unsigned int align_a = (vect_calculate_target_alignment (dra)
2296 / BITS_PER_UNIT);
2297 unsigned int align_b = (vect_calculate_target_alignment (drb)
2298 / BITS_PER_UNIT);
2299 unsigned int max_align = MAX (align_a, align_b);
2301 /* Require the gap to be a multiple of the larger vector alignment. */
2302 if (!multiple_p (diff, max_align))
2303 return;
2306 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2307 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
2308 if (dump_enabled_p ())
2310 dump_printf_loc (MSG_NOTE, vect_location,
2311 "accesses have the same alignment: ");
2312 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2313 dump_printf (MSG_NOTE, " and ");
2314 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2315 dump_printf (MSG_NOTE, "\n");
2320 /* Function vect_analyze_data_refs_alignment
2322 Analyze the alignment of the data-references in the loop.
2323 Return FALSE if a data reference is found that cannot be vectorized. */
2325 bool
2326 vect_analyze_data_refs_alignment (loop_vec_info vinfo)
2328 if (dump_enabled_p ())
2329 dump_printf_loc (MSG_NOTE, vect_location,
2330 "=== vect_analyze_data_refs_alignment ===\n");
2332 /* Mark groups of data references with same alignment using
2333 data dependence information. */
2334 vec<ddr_p> ddrs = vinfo->ddrs;
2335 struct data_dependence_relation *ddr;
2336 unsigned int i;
2338 FOR_EACH_VEC_ELT (ddrs, i, ddr)
2339 vect_find_same_alignment_drs (ddr);
2341 vec<data_reference_p> datarefs = vinfo->datarefs;
2342 struct data_reference *dr;
2344 vect_record_base_alignments (vinfo);
2345 FOR_EACH_VEC_ELT (datarefs, i, dr)
2347 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
2348 if (STMT_VINFO_VECTORIZABLE (stmt_info)
2349 && !vect_compute_data_ref_alignment (dr))
2351 /* Strided accesses perform only component accesses, misalignment
2352 information is irrelevant for them. */
2353 if (STMT_VINFO_STRIDED_P (stmt_info)
2354 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2355 continue;
2357 if (dump_enabled_p ())
2358 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2359 "not vectorized: can't calculate alignment "
2360 "for data ref.\n");
2362 return false;
2366 return true;
2370 /* Analyze alignment of DRs of stmts in NODE. */
2372 static bool
2373 vect_slp_analyze_and_verify_node_alignment (slp_tree node)
2375 /* We vectorize from the first scalar stmt in the node unless
2376 the node is permuted in which case we start from the first
2377 element in the group. */
2378 gimple *first_stmt = SLP_TREE_SCALAR_STMTS (node)[0];
2379 data_reference_p first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2380 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2381 first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_stmt));
2383 data_reference_p dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2384 if (! vect_compute_data_ref_alignment (dr)
2385 /* For creating the data-ref pointer we need alignment of the
2386 first element anyway. */
2387 || (dr != first_dr
2388 && ! vect_compute_data_ref_alignment (first_dr))
2389 || ! verify_data_ref_alignment (dr))
2391 if (dump_enabled_p ())
2392 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2393 "not vectorized: bad data alignment in basic "
2394 "block.\n");
2395 return false;
2398 return true;
2401 /* Function vect_slp_analyze_instance_alignment
2403 Analyze the alignment of the data-references in the SLP instance.
2404 Return FALSE if a data reference is found that cannot be vectorized. */
2406 bool
2407 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
2409 if (dump_enabled_p ())
2410 dump_printf_loc (MSG_NOTE, vect_location,
2411 "=== vect_slp_analyze_and_verify_instance_alignment ===\n");
2413 slp_tree node;
2414 unsigned i;
2415 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2416 if (! vect_slp_analyze_and_verify_node_alignment (node))
2417 return false;
2419 node = SLP_INSTANCE_TREE (instance);
2420 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]))
2421 && ! vect_slp_analyze_and_verify_node_alignment
2422 (SLP_INSTANCE_TREE (instance)))
2423 return false;
2425 return true;
2429 /* Analyze groups of accesses: check that DR belongs to a group of
2430 accesses of legal size, step, etc. Detect gaps, single element
2431 interleaving, and other special cases. Set grouped access info.
2432 Collect groups of strided stores for further use in SLP analysis.
2433 Worker for vect_analyze_group_access. */
2435 static bool
2436 vect_analyze_group_access_1 (struct data_reference *dr)
2438 tree step = DR_STEP (dr);
2439 tree scalar_type = TREE_TYPE (DR_REF (dr));
2440 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2441 gimple *stmt = DR_STMT (dr);
2442 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2443 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2444 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2445 HOST_WIDE_INT dr_step = -1;
2446 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2447 bool slp_impossible = false;
2449 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2450 size of the interleaving group (including gaps). */
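  /* For example, loads of a[4*i] and a[4*i + 2] have DR_STEP equal to
     4 * sizeof (a[0]), so GROUPSIZE is 4 even though only two elements
     of each group are actually accessed.  */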
2451 if (tree_fits_shwi_p (step))
2453 dr_step = tree_to_shwi (step);
2454 /* Check that STEP is a multiple of type size. Otherwise there is
2455 a non-element-sized gap at the end of the group which we
2456 cannot represent in GROUP_GAP or GROUP_SIZE.
2457 ??? As we can handle non-constant step fine here we should
2458 simply remove uses of GROUP_GAP between the last and first
2459 element and instead rely on DR_STEP. GROUP_SIZE then would
2460 simply not include that gap. */
2461 if ((dr_step % type_size) != 0)
2463 if (dump_enabled_p ())
2465 dump_printf_loc (MSG_NOTE, vect_location,
2466 "Step ");
2467 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2468 dump_printf (MSG_NOTE,
2469 " is not a multiple of the element size for ");
2470 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2471 dump_printf (MSG_NOTE, "\n");
2473 return false;
2475 groupsize = absu_hwi (dr_step) / type_size;
2477 else
2478 groupsize = 0;
2480 /* A non-consecutive access is possible only if it is part of an interleaving group. */
2481 if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2483 /* Check whether this DR is part of an interleaving group and is the single
2484 element of the group that is accessed in the loop. */
2486 /* Gaps are supported only for loads. STEP must be a multiple of the type
2487 size. */
2488 if (DR_IS_READ (dr)
2489 && (dr_step % type_size) == 0
2490 && groupsize > 0)
2492 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2493 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2494 GROUP_GAP (stmt_info) = groupsize - 1;
2495 if (dump_enabled_p ())
2497 dump_printf_loc (MSG_NOTE, vect_location,
2498 "Detected single element interleaving ");
2499 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2500 dump_printf (MSG_NOTE, " step ");
2501 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2502 dump_printf (MSG_NOTE, "\n");
2505 return true;
2508 if (dump_enabled_p ())
2510 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2511 "not consecutive access ");
2512 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2515 if (bb_vinfo)
2517 /* Mark the statement as unvectorizable. */
2518 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2519 return true;
2522 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2523 STMT_VINFO_STRIDED_P (stmt_info) = true;
2524 return true;
2527 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2529 /* First stmt in the interleaving chain. Check the chain. */
2530 gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2531 struct data_reference *data_ref = dr;
2532 unsigned int count = 1;
2533 tree prev_init = DR_INIT (data_ref);
2534 gimple *prev = stmt;
2535 HOST_WIDE_INT diff, gaps = 0;
2537 /* By construction, all group members have INTEGER_CST DR_INITs. */
2538 while (next)
2540 /* Skip identical data-refs. In case two or more stmts share a
2541 data-ref (supported only for loads), we vectorize only the first
2542 stmt, and the rest get their vectorized loads from the first
2543 one. */
2544 if (!tree_int_cst_compare (DR_INIT (data_ref),
2545 DR_INIT (STMT_VINFO_DATA_REF (
2546 vinfo_for_stmt (next)))))
2548 if (DR_IS_WRITE (data_ref))
2550 if (dump_enabled_p ())
2551 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2552 "Two store stmts share the same dr.\n");
2553 return false;
2556 if (dump_enabled_p ())
2557 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2558 "Two or more load stmts share the same dr.\n");
2560 /* For load use the same data-ref load. */
2561 GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2563 prev = next;
2564 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2565 continue;
2568 prev = next;
2569 data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2571 /* All group members have the same STEP by construction. */
2572 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2574 /* Check that the distance between two accesses is equal to the type
2575 size. Otherwise, we have gaps. */
2576 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2577 - TREE_INT_CST_LOW (prev_init)) / type_size;
2578 if (diff != 1)
2580 /* FORNOW: SLP of accesses with gaps is not supported. */
2581 slp_impossible = true;
2582 if (DR_IS_WRITE (data_ref))
2584 if (dump_enabled_p ())
2585 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2586 "interleaved store with gaps\n");
2587 return false;
2590 gaps += diff - 1;
2593 last_accessed_element += diff;
2595 /* Store the gap from the previous member of the group. If there is no
2596 gap in the access, GROUP_GAP is always 1. */
2597 GROUP_GAP (vinfo_for_stmt (next)) = diff;
2599 prev_init = DR_INIT (data_ref);
2600 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2601 /* Count the number of data-refs in the chain. */
2602 count++;
2605 if (groupsize == 0)
2606 groupsize = count + gaps;
2608 /* This could be UINT_MAX, but as we are generating code in a very
2609 inefficient way we have to cap it earlier. See PR78699 for example. */
2610 if (groupsize > 4096)
2612 if (dump_enabled_p ())
2613 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2614 "group is too large\n");
2615 return false;
2618 /* Check that the size of the interleaving is equal to count for stores,
2619 i.e., that there are no gaps. */
2620 if (groupsize != count
2621 && !DR_IS_READ (dr))
2623 if (dump_enabled_p ())
2624 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2625 "interleaved store with gaps\n");
2626 return false;
2629 /* If there is a gap after the last load in the group it is the
2630 difference between the groupsize and the last accessed
2631 element.
2632 When there is no gap, this difference should be 0. */
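  /* E.g. for a group of size 4 whose members access elements 0 and 2,
     LAST_ACCESSED_ELEMENT is 3, so the gap recorded on the first stmt is
     4 - 3 = 1 (element 3 is never accessed).  */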
2633 GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - last_accessed_element;
2635 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2636 if (dump_enabled_p ())
2638 dump_printf_loc (MSG_NOTE, vect_location,
2639 "Detected interleaving ");
2640 if (DR_IS_READ (dr))
2641 dump_printf (MSG_NOTE, "load ");
2642 else
2643 dump_printf (MSG_NOTE, "store ");
2644 dump_printf (MSG_NOTE, "of size %u starting with ",
2645 (unsigned)groupsize);
2646 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
2647 if (GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
2648 dump_printf_loc (MSG_NOTE, vect_location,
2649 "There is a gap of %u elements after the group\n",
2650 GROUP_GAP (vinfo_for_stmt (stmt)));
2653 /* SLP: create an SLP data structure for every interleaving group of
2654 stores for further analysis in vect_analyse_slp. */
2655 if (DR_IS_WRITE (dr) && !slp_impossible)
2657 if (loop_vinfo)
2658 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2659 if (bb_vinfo)
2660 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2664 return true;
2667 /* Analyze groups of accesses: check that DR belongs to a group of
2668 accesses of legal size, step, etc. Detect gaps, single element
2669 interleaving, and other special cases. Set grouped access info.
2670 Collect groups of strided stores for further use in SLP analysis. */
2672 static bool
2673 vect_analyze_group_access (struct data_reference *dr)
2675 if (!vect_analyze_group_access_1 (dr))
2677 /* Dissolve the group if present. */
2678 gimple *next;
2679 gimple *stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dr)));
2680 while (stmt)
2682 stmt_vec_info vinfo = vinfo_for_stmt (stmt);
2683 next = GROUP_NEXT_ELEMENT (vinfo);
2684 GROUP_FIRST_ELEMENT (vinfo) = NULL;
2685 GROUP_NEXT_ELEMENT (vinfo) = NULL;
2686 stmt = next;
2688 return false;
2690 return true;
2693 /* Analyze the access pattern of the data-reference DR.
2694 In case of non-consecutive accesses call vect_analyze_group_access() to
2695 analyze groups of accesses. */
2697 static bool
2698 vect_analyze_data_ref_access (struct data_reference *dr)
2700 tree step = DR_STEP (dr);
2701 tree scalar_type = TREE_TYPE (DR_REF (dr));
2702 gimple *stmt = DR_STMT (dr);
2703 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2704 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2705 struct loop *loop = NULL;
2707 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
2708 return true;
2710 if (loop_vinfo)
2711 loop = LOOP_VINFO_LOOP (loop_vinfo);
2713 if (loop_vinfo && !step)
2715 if (dump_enabled_p ())
2716 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2717 "bad data-ref access in loop\n");
2718 return false;
2721 /* Allow loads with zero step in inner-loop vectorization. */
2722 if (loop_vinfo && integer_zerop (step))
2724 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2725 if (!nested_in_vect_loop_p (loop, stmt))
2726 return DR_IS_READ (dr);
2727 /* Allow references with zero step for outer loops marked
2728 with pragma omp simd only - it guarantees absence of
2729 loop-carried dependencies between inner loop iterations. */
2730 if (loop->safelen < 2)
2732 if (dump_enabled_p ())
2733 dump_printf_loc (MSG_NOTE, vect_location,
2734 "zero step in inner loop of nest\n");
2735 return false;
2739 if (loop && nested_in_vect_loop_p (loop, stmt))
2741 /* Interleaved accesses are not yet supported within outer-loop
2742 vectorization for references in the inner-loop. */
2743 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2745 /* For the rest of the analysis we use the outer-loop step. */
2746 step = STMT_VINFO_DR_STEP (stmt_info);
2747 if (integer_zerop (step))
2749 if (dump_enabled_p ())
2750 dump_printf_loc (MSG_NOTE, vect_location,
2751 "zero step in outer loop.\n");
2752 return DR_IS_READ (dr);
2756 /* Consecutive? */
2757 if (TREE_CODE (step) == INTEGER_CST)
2759 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2760 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2761 || (dr_step < 0
2762 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2764 /* Mark that it is not interleaving. */
2765 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2766 return true;
2770 if (loop && nested_in_vect_loop_p (loop, stmt))
2772 if (dump_enabled_p ())
2773 dump_printf_loc (MSG_NOTE, vect_location,
2774 "grouped access in outer loop.\n");
2775 return false;
2779 /* Assume this is a DR handled by non-constant strided load case. */
2780 if (TREE_CODE (step) != INTEGER_CST)
2781 return (STMT_VINFO_STRIDED_P (stmt_info)
2782 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2783 || vect_analyze_group_access (dr)));
2785 /* Not a consecutive access: check whether it is part of an interleaving group. */
2786 return vect_analyze_group_access (dr);
2789 /* Compare two data-references DRA and DRB, for sorting them into chunks
2790 suitable for grouping. */
2792 static int
2793 dr_group_sort_cmp (const void *dra_, const void *drb_)
2795 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2796 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2797 int cmp;
2799 /* Stabilize sort. */
2800 if (dra == drb)
2801 return 0;
2803 /* DRs in different loops never belong to the same group. */
2804 loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father;
2805 loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father;
2806 if (loopa != loopb)
2807 return loopa->num < loopb->num ? -1 : 1;
2809 /* Ordering of DRs according to base. */
2810 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2811 DR_BASE_ADDRESS (drb));
2812 if (cmp != 0)
2813 return cmp;
2815 /* And according to DR_OFFSET. */
2816 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2817 if (cmp != 0)
2818 return cmp;
2820 /* Put reads before writes. */
2821 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2822 return DR_IS_READ (dra) ? -1 : 1;
2824 /* Then sort by access size. */
2825 cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2826 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2827 if (cmp != 0)
2828 return cmp;
2830 /* And then by step. */
2831 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2832 if (cmp != 0)
2833 return cmp;
2835 /* Then sort by DR_INIT. In case of identical DRs, sort by stmt UID. */
2836 cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
2837 if (cmp == 0)
2838 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2839 return cmp;
2842 /* If OP is the result of a conversion, return the unconverted value,
2843 otherwise return null. */
2845 static tree
2846 strip_conversion (tree op)
2848 if (TREE_CODE (op) != SSA_NAME)
2849 return NULL_TREE;
2850 gimple *stmt = SSA_NAME_DEF_STMT (op);
2851 if (!is_gimple_assign (stmt)
2852 || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt)))
2853 return NULL_TREE;
2854 return gimple_assign_rhs1 (stmt);
2857 /* Return true if vectorizable_* routines can handle statements STMT1
2858 and STMT2 being in a single group. */
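/* For example, two IFN_MASK_LOAD calls whose masks are the same SSA name,
   possibly hidden behind a conversion such as the ones created by
   build_mask_conversion, can be grouped; a masked access never groups with
   an unmasked one, nor a masked load with a masked store.  */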
2860 static bool
2861 can_group_stmts_p (gimple *stmt1, gimple *stmt2)
2863 if (gimple_assign_single_p (stmt1))
2864 return gimple_assign_single_p (stmt2);
2866 if (is_gimple_call (stmt1) && gimple_call_internal_p (stmt1))
2868 /* Check for two masked loads or two masked stores. */
2869 if (!is_gimple_call (stmt2) || !gimple_call_internal_p (stmt2))
2870 return false;
2871 internal_fn ifn = gimple_call_internal_fn (stmt1);
2872 if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE)
2873 return false;
2874 if (ifn != gimple_call_internal_fn (stmt2))
2875 return false;
2877 /* Check that the masks are the same. Cope with casts of masks,
2878 like those created by build_mask_conversion. */
2879 tree mask1 = gimple_call_arg (stmt1, 2);
2880 tree mask2 = gimple_call_arg (stmt2, 2);
2881 if (!operand_equal_p (mask1, mask2, 0))
2883 mask1 = strip_conversion (mask1);
2884 if (!mask1)
2885 return false;
2886 mask2 = strip_conversion (mask2);
2887 if (!mask2)
2888 return false;
2889 if (!operand_equal_p (mask1, mask2, 0))
2890 return false;
2892 return true;
2895 return false;
2898 /* Function vect_analyze_data_ref_accesses.
2900 Analyze the access pattern of all the data references in the loop.
2902 FORNOW: the only access pattern that is considered vectorizable is a
2903 simple step 1 (consecutive) access.
2905 FORNOW: handle only arrays and pointer accesses. */
2907 bool
2908 vect_analyze_data_ref_accesses (vec_info *vinfo)
2910 unsigned int i;
2911 vec<data_reference_p> datarefs = vinfo->datarefs;
2912 struct data_reference *dr;
2914 if (dump_enabled_p ())
2915 dump_printf_loc (MSG_NOTE, vect_location,
2916 "=== vect_analyze_data_ref_accesses ===\n");
2918 if (datarefs.is_empty ())
2919 return true;
2921 /* Sort the array of datarefs to make building the interleaving chains
2922 linear. Don't modify the original vector's order; it is needed for
2923 determining which dependencies are reversed. */
2924 vec<data_reference_p> datarefs_copy = datarefs.copy ();
2925 datarefs_copy.qsort (dr_group_sort_cmp);
2927 /* Build the interleaving chains. */
2928 for (i = 0; i < datarefs_copy.length () - 1;)
2930 data_reference_p dra = datarefs_copy[i];
2931 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2932 stmt_vec_info lastinfo = NULL;
2933 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
2934 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a))
2936 ++i;
2937 continue;
2939 for (i = i + 1; i < datarefs_copy.length (); ++i)
2941 data_reference_p drb = datarefs_copy[i];
2942 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2943 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b)
2944 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
2945 break;
2947 /* ??? Imperfect sorting (non-compatible types, non-modulo
2948 accesses, same accesses) can lead to a group being artificially
2949 split here, as we don't just skip over those. If it really
2950 matters we can push those to a worklist and re-iterate
2951 over them. Then we can just skip ahead to the next DR here. */
2953 /* DRs in a different loop should not be put into the same
2954 interleaving group. */
2955 if (gimple_bb (DR_STMT (dra))->loop_father
2956 != gimple_bb (DR_STMT (drb))->loop_father)
2957 break;
2959 /* Check that the data-refs have the same first location (except init),
2960 that they are both either stores or loads (not a load and a store),
2961 and that masked accesses are only grouped with matching masked accesses. */
2962 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2963 || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2964 DR_BASE_ADDRESS (drb)) != 0
2965 || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
2966 || !can_group_stmts_p (DR_STMT (dra), DR_STMT (drb)))
2967 break;
2969 /* Check that the data-refs have the same constant size. */
2970 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2971 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2972 if (!tree_fits_uhwi_p (sza)
2973 || !tree_fits_uhwi_p (szb)
2974 || !tree_int_cst_equal (sza, szb))
2975 break;
2977 /* Check that the data-refs have the same step. */
2978 if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
2979 break;
2981 /* Check the types are compatible.
2982 ??? We don't distinguish this during sorting. */
2983 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2984 TREE_TYPE (DR_REF (drb))))
2985 break;
2987 /* Check that the DR_INITs are compile-time constants. */
2988 if (TREE_CODE (DR_INIT (dra)) != INTEGER_CST
2989 || TREE_CODE (DR_INIT (drb)) != INTEGER_CST)
2990 break;
2992 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
2993 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2994 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2995 HOST_WIDE_INT init_prev
2996 = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]));
2997 gcc_assert (init_a <= init_b
2998 && init_a <= init_prev
2999 && init_prev <= init_b);
3001 /* Do not place the same access in the interleaving chain twice. */
3002 if (init_b == init_prev)
3004 gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]))
3005 < gimple_uid (DR_STMT (drb)));
3006 /* ??? For now we simply "drop" the later reference which is
3007 otherwise the same, rather than finishing off this group.
3008 In the end we'd want to re-process duplicates, forming
3009 multiple groups from the refs, likely by just collecting
3010 all candidates (including duplicates and split points
3011 below) in a vector and then processing them together. */
3012 continue;
3015 /* If init_b == init_a + the size of the type * k, we have an
3016 interleaving, and DRA is accessed before DRB. */
3017 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
3018 if (type_size_a == 0
3019 || (init_b - init_a) % type_size_a != 0)
3020 break;
3022 /* For stores, require that the accesses be adjacent. This splits
3023 groups into chunks we support (we don't support vectorization
3024 of stores with gaps). */
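          /* For example, stores to a[4*i] and a[4*i + 2] leave element 1
             unwritten, so the check below ends the group at the first store
             rather than forming a store group with a gap.  */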
3025 if (!DR_IS_READ (dra) && init_b - init_prev != type_size_a)
3026 break;
3028 /* If the step (when constant and nonzero) is no greater than the
3029 difference between the data-refs' inits, the second ref lies beyond
3030 one full step of the first; this splits groups into suitable sizes. */
3031 if (tree_fits_shwi_p (DR_STEP (dra)))
3033 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
3034 if (step != 0 && step <= (init_b - init_a))
3035 break;
3038 if (dump_enabled_p ())
3040 dump_printf_loc (MSG_NOTE, vect_location,
3041 "Detected interleaving ");
3042 if (DR_IS_READ (dra))
3043 dump_printf (MSG_NOTE, "load ");
3044 else
3045 dump_printf (MSG_NOTE, "store ");
3046 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
3047 dump_printf (MSG_NOTE, " and ");
3048 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
3049 dump_printf (MSG_NOTE, "\n");
3052 /* Link the found element into the group list. */
3053 if (!GROUP_FIRST_ELEMENT (stmtinfo_a))
3055 GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
3056 lastinfo = stmtinfo_a;
3058 GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
3059 GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
3060 lastinfo = stmtinfo_b;
3064 FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
3065 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
3066 && !vect_analyze_data_ref_access (dr))
3068 if (dump_enabled_p ())
3069 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3070 "not vectorized: complicated access pattern.\n");
3072 if (is_a <bb_vec_info> (vinfo))
3074 /* Mark the statement as not vectorizable. */
3075 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
3076 continue;
3078 else
3080 datarefs_copy.release ();
3081 return false;
3085 datarefs_copy.release ();
3086 return true;
3089 /* Function vect_vfa_segment_size.
3091 Input:
3092 DR: The data reference.
3093 LENGTH_FACTOR: segment length to consider.
3095 Return a value suitable for the dr_with_seg_len::seg_len field.
3096 This is the "distance travelled" by the pointer from the first
3097 iteration in the segment to the last. Note that it does not include
3098 the size of the access; in effect it only describes the first byte. */
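/* For example, a data reference with DR_STEP 4 and a LENGTH_FACTOR of 8
   iterations gets a segment length of 4 * (8 - 1) = 28 bytes.  */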
3100 static tree
3101 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
3103 length_factor = size_binop (MINUS_EXPR,
3104 fold_convert (sizetype, length_factor),
3105 size_one_node);
3106 return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr)),
3107 length_factor);
3110 /* Return a value that, when added to abs (vect_vfa_segment_size (dr)),
3111 gives the worst-case number of bytes covered by the segment. */
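/* For example, a group of 4 accesses of 4 bytes each with GROUP_GAP 1 gives
   4 * (4 - 1) = 12 bytes; the dr_explicit_realign_optimized case below
   additionally accounts for touching a full vector.  */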
3113 static unsigned HOST_WIDE_INT
3114 vect_vfa_access_size (data_reference *dr)
3116 stmt_vec_info stmt_vinfo = vinfo_for_stmt (DR_STMT (dr));
3117 tree ref_type = TREE_TYPE (DR_REF (dr));
3118 unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type));
3119 unsigned HOST_WIDE_INT access_size = ref_size;
3120 if (GROUP_FIRST_ELEMENT (stmt_vinfo))
3122 gcc_assert (GROUP_FIRST_ELEMENT (stmt_vinfo) == DR_STMT (dr));
3123 access_size *= GROUP_SIZE (stmt_vinfo) - GROUP_GAP (stmt_vinfo);
3125 if (STMT_VINFO_VEC_STMT (stmt_vinfo)
3126 && (vect_supportable_dr_alignment (dr, false)
3127 == dr_explicit_realign_optimized))
3129 /* We might access a full vector's worth. */
3130 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
3131 access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size;
3133 return access_size;
3136 /* Get the minimum alignment for all the scalar accesses that DR describes. */
3138 static unsigned int
3139 vect_vfa_align (const data_reference *dr)
3141 return TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr)));
3144 /* Function vect_compile_time_alias.
3146 Given data references A and B with equal base and offset, see whether
3147 the alias relation can be decided at compilation time. Return 1 if
3148 it can and the references alias, 0 if it can and the references do
3149 not alias, and -1 if we cannot decide at compile time. SEGMENT_LENGTH_A,
3150 SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
3151 of dr_with_seg_len::{seg_len,access_size} for A and B. */
3153 static int
3154 vect_compile_time_alias (struct data_reference *a, struct data_reference *b,
3155 tree segment_length_a, tree segment_length_b,
3156 unsigned HOST_WIDE_INT access_size_a,
3157 unsigned HOST_WIDE_INT access_size_b)
3159 poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a));
3160 poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b));
3161 poly_uint64 const_length_a;
3162 poly_uint64 const_length_b;
3164 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3165 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3166 [a, a+12) */
3167 if (tree_int_cst_compare (DR_STEP (a), size_zero_node) < 0)
3169 const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
3170 offset_a = (offset_a + access_size_a) - const_length_a;
3172 else
3173 const_length_a = tree_to_poly_uint64 (segment_length_a);
3174 if (tree_int_cst_compare (DR_STEP (b), size_zero_node) < 0)
3176 const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
3177 offset_b = (offset_b + access_size_b) - const_length_b;
3179 else
3180 const_length_b = tree_to_poly_uint64 (segment_length_b);
3182 const_length_a += access_size_a;
3183 const_length_b += access_size_b;
3185 if (ranges_known_overlap_p (offset_a, const_length_a,
3186 offset_b, const_length_b))
3187 return 1;
3189 if (!ranges_maybe_overlap_p (offset_a, const_length_a,
3190 offset_b, const_length_b))
3191 return 0;
3193 return -1;
3196 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3197 in DDR is >= VF. */
3199 static bool
3200 dependence_distance_ge_vf (data_dependence_relation *ddr,
3201 unsigned int loop_depth, poly_uint64 vf)
3203 if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3204 || DDR_NUM_DIST_VECTS (ddr) == 0)
3205 return false;
3207 /* If the dependence is exact, we should have limited the VF instead. */
3208 gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3210 unsigned int i;
3211 lambda_vector dist_v;
3212 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3214 HOST_WIDE_INT dist = dist_v[loop_depth];
3215 if (dist != 0
3216 && !(dist > 0 && DDR_REVERSED_P (ddr))
3217 && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
3218 return false;
3221 if (dump_enabled_p ())
3223 dump_printf_loc (MSG_NOTE, vect_location,
3224 "dependence distance between ");
3225 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
3226 dump_printf (MSG_NOTE, " and ");
3227 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
3228 dump_printf (MSG_NOTE, " is >= VF\n");
3231 return true;
3234 /* Dump LOWER_BOUND using flags DUMP_KIND. Dumps are known to be enabled. */
3236 static void
3237 dump_lower_bound (int dump_kind, const vec_lower_bound &lower_bound)
3239 dump_printf (dump_kind, "%s (", lower_bound.unsigned_p ? "unsigned" : "abs");
3240 dump_generic_expr (dump_kind, TDF_SLIM, lower_bound.expr);
3241 dump_printf (dump_kind, ") >= ");
3242 dump_dec (dump_kind, lower_bound.min_value);
3245 /* Record that the vectorized loop requires the vec_lower_bound described
3246 by EXPR, UNSIGNED_P and MIN_VALUE. */
3248 static void
3249 vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p,
3250 poly_uint64 min_value)
3252 vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3253 for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3254 if (operand_equal_p (lower_bounds[i].expr, expr, 0))
3256 unsigned_p &= lower_bounds[i].unsigned_p;
3257 min_value = upper_bound (lower_bounds[i].min_value, min_value);
3258 if (lower_bounds[i].unsigned_p != unsigned_p
3259 || maybe_lt (lower_bounds[i].min_value, min_value))
3261 lower_bounds[i].unsigned_p = unsigned_p;
3262 lower_bounds[i].min_value = min_value;
3263 if (dump_enabled_p ())
3265 dump_printf_loc (MSG_NOTE, vect_location,
3266 "updating run-time check to ");
3267 dump_lower_bound (MSG_NOTE, lower_bounds[i]);
3268 dump_printf (MSG_NOTE, "\n");
3271 return;
3274 vec_lower_bound lower_bound (expr, unsigned_p, min_value);
3275 if (dump_enabled_p ())
3277 dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that ");
3278 dump_lower_bound (MSG_NOTE, lower_bound);
3279 dump_printf (MSG_NOTE, "\n");
3281 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (lower_bound);
3284 /* Return true if it's unlikely that the step of the vectorized form of DR
3285 will span fewer than GAP bytes. */
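/* For example, with an estimated vectorization factor of 4, a group size of 2
   and 4-byte scalar accesses, any gap of at most 4 * 2 * 4 = 32 bytes is
   considered small.  */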
3287 static bool
3288 vect_small_gap_p (loop_vec_info loop_vinfo, data_reference *dr, poly_int64 gap)
3290 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
3291 HOST_WIDE_INT count
3292 = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
3293 if (GROUP_FIRST_ELEMENT (stmt_info))
3294 count *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
3295 return estimated_poly_value (gap) <= count * vect_get_scalar_dr_size (dr);
3298 /* Return true if we know that there is no alias between DR_A and DR_B
3299 when abs (DR_STEP (DR_A)) >= N for some N. When returning true, set
3300 *LOWER_BOUND_OUT to this N. */
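/* For example, if DR_A accesses bytes [0, 4) and DR_B bytes [8, 12) of the
   same base in each scalar iteration, the bound stored in *LOWER_BOUND_OUT is
   8 + 4 - 0 = 12; there is no alias whenever abs (DR_STEP) >= 12.  */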
3302 static bool
3303 vectorizable_with_step_bound_p (data_reference *dr_a, data_reference *dr_b,
3304 poly_uint64 *lower_bound_out)
3306 /* Check that there is a constant gap of known sign between DR_A
3307 and DR_B. */
3308 poly_int64 init_a, init_b;
3309 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0)
3310 || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0)
3311 || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)
3312 || !poly_int_tree_p (DR_INIT (dr_a), &init_a)
3313 || !poly_int_tree_p (DR_INIT (dr_b), &init_b)
3314 || !ordered_p (init_a, init_b))
3315 return false;
3317 /* Sort DR_A and DR_B by the address they access. */
3318 if (maybe_lt (init_b, init_a))
3320 std::swap (init_a, init_b);
3321 std::swap (dr_a, dr_b);
3324 /* If the two accesses could be dependent within a scalar iteration,
3325 make sure that we'd retain their order. */
3326 if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_a), init_b)
3327 && !vect_preserves_scalar_order_p (DR_STMT (dr_a), DR_STMT (dr_b)))
3328 return false;
3330 /* There is no alias if abs (DR_STEP) is greater than or equal to
3331 the bytes spanned by the combination of the two accesses. */
3332 *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_b) - init_a;
3333 return true;
3336 /* Function vect_prune_runtime_alias_test_list.
3338 Prune a list of ddrs to be tested at run-time by versioning for alias.
3339 Merge several alias checks into one if possible.
3340 Return FALSE if the resulting list of ddrs is longer than allowed by
3341 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
3343 bool
3344 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3346 typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3347 hash_set <tree_pair_hash> compared_objects;
3349 vec<ddr_p> may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3350 vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3351 = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3352 vec<vec_object_pair> &check_unequal_addrs
3353 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3354 poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3355 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3357 ddr_p ddr;
3358 unsigned int i;
3359 tree length_factor;
3361 if (dump_enabled_p ())
3362 dump_printf_loc (MSG_NOTE, vect_location,
3363 "=== vect_prune_runtime_alias_test_list ===\n");
3365 /* Step values are irrelevant for aliasing if the number of vector
3366 iterations is equal to the number of scalar iterations (which can
3367 happen for fully-SLP loops). */
3368 bool ignore_step_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U);
3370 if (!ignore_step_p)
3372 /* Convert the checks for nonzero steps into bound tests. */
3373 tree value;
3374 FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value)
3375 vect_check_lower_bound (loop_vinfo, value, true, 1);
3378 if (may_alias_ddrs.is_empty ())
3379 return true;
3381 comp_alias_ddrs.create (may_alias_ddrs.length ());
3383 unsigned int loop_depth
3384 = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3385 LOOP_VINFO_LOOP_NEST (loop_vinfo));
3387 /* First, we collect all data ref pairs for aliasing checks. */
3388 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3390 int comp_res;
3391 poly_uint64 lower_bound;
3392 struct data_reference *dr_a, *dr_b;
3393 gimple *dr_group_first_a, *dr_group_first_b;
3394 tree segment_length_a, segment_length_b;
3395 unsigned HOST_WIDE_INT access_size_a, access_size_b;
3396 unsigned int align_a, align_b;
3397 gimple *stmt_a, *stmt_b;
3399 /* Ignore the alias if the VF we chose ended up being no greater
3400 than the dependence distance. */
3401 if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3402 continue;
3404 if (DDR_OBJECT_A (ddr))
3406 vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3407 if (!compared_objects.add (new_pair))
3409 if (dump_enabled_p ())
3411 dump_printf_loc (MSG_NOTE, vect_location, "checking that ");
3412 dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.first);
3413 dump_printf (MSG_NOTE, " and ");
3414 dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.second);
3415 dump_printf (MSG_NOTE, " have different addresses\n");
3417 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3419 continue;
3422 dr_a = DDR_A (ddr);
3423 stmt_a = DR_STMT (DDR_A (ddr));
3425 dr_b = DDR_B (ddr);
3426 stmt_b = DR_STMT (DDR_B (ddr));
3428 /* Skip the pair if inter-iteration dependencies are irrelevant
3429 and intra-iteration dependencies are guaranteed to be honored. */
3430 if (ignore_step_p
3431 && (vect_preserves_scalar_order_p (stmt_a, stmt_b)
3432 || vectorizable_with_step_bound_p (dr_a, dr_b, &lower_bound)))
3434 if (dump_enabled_p ())
3436 dump_printf_loc (MSG_NOTE, vect_location,
3437 "no need for alias check between ");
3438 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
3439 dump_printf (MSG_NOTE, " and ");
3440 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
3441 dump_printf (MSG_NOTE, " when VF is 1\n");
3443 continue;
3446 /* See whether we can handle the alias using a bounds check on
3447 the step, and whether that's likely to be the best approach.
3448 (It might not be, for example, if the minimum step is much larger
3449 than the number of bytes handled by one vector iteration.) */
3450 if (!ignore_step_p
3451 && TREE_CODE (DR_STEP (dr_a)) != INTEGER_CST
3452 && vectorizable_with_step_bound_p (dr_a, dr_b, &lower_bound)
3453 && (vect_small_gap_p (loop_vinfo, dr_a, lower_bound)
3454 || vect_small_gap_p (loop_vinfo, dr_b, lower_bound)))
3456 bool unsigned_p = dr_known_forward_stride_p (dr_a);
3457 if (dump_enabled_p ())
3459 dump_printf_loc (MSG_NOTE, vect_location, "no alias between ");
3460 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
3461 dump_printf (MSG_NOTE, " and ");
3462 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
3463 dump_printf (MSG_NOTE, " when the step ");
3464 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_STEP (dr_a));
3465 dump_printf (MSG_NOTE, " is outside ");
3466 if (unsigned_p)
3467 dump_printf (MSG_NOTE, "[0");
3468 else
3470 dump_printf (MSG_NOTE, "(");
3471 dump_dec (MSG_NOTE, poly_int64 (-lower_bound));
3473 dump_printf (MSG_NOTE, ", ");
3474 dump_dec (MSG_NOTE, lower_bound);
3475 dump_printf (MSG_NOTE, ")\n");
3477 vect_check_lower_bound (loop_vinfo, DR_STEP (dr_a), unsigned_p,
3478 lower_bound);
3479 continue;
3482 dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
3483 if (dr_group_first_a)
3485 stmt_a = dr_group_first_a;
3486 dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
3489 dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
3490 if (dr_group_first_b)
3492 stmt_b = dr_group_first_b;
3493 dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
3496 if (ignore_step_p)
3498 segment_length_a = size_zero_node;
3499 segment_length_b = size_zero_node;
3501 else
3503 if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
3504 length_factor = scalar_loop_iters;
3505 else
3506 length_factor = size_int (vect_factor);
3507 segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
3508 segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
3510 access_size_a = vect_vfa_access_size (dr_a);
3511 access_size_b = vect_vfa_access_size (dr_b);
3512 align_a = vect_vfa_align (dr_a);
3513 align_b = vect_vfa_align (dr_b);
3515 comp_res = data_ref_compare_tree (DR_BASE_ADDRESS (dr_a),
3516 DR_BASE_ADDRESS (dr_b));
3517 if (comp_res == 0)
3518 comp_res = data_ref_compare_tree (DR_OFFSET (dr_a),
3519 DR_OFFSET (dr_b));
3521 /* See whether the alias is known at compilation time. */
3522 if (comp_res == 0
3523 && TREE_CODE (DR_STEP (dr_a)) == INTEGER_CST
3524 && TREE_CODE (DR_STEP (dr_b)) == INTEGER_CST
3525 && poly_int_tree_p (segment_length_a)
3526 && poly_int_tree_p (segment_length_b))
3528 int res = vect_compile_time_alias (dr_a, dr_b,
3529 segment_length_a,
3530 segment_length_b,
3531 access_size_a,
3532 access_size_b);
3533 if (res >= 0 && dump_enabled_p ())
3535 dump_printf_loc (MSG_NOTE, vect_location,
3536 "can tell at compile time that ");
3537 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
3538 dump_printf (MSG_NOTE, " and ");
3539 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
3540 if (res == 0)
3541 dump_printf (MSG_NOTE, " do not alias\n");
3542 else
3543 dump_printf (MSG_NOTE, " alias\n");
3546 if (res == 0)
3547 continue;
3549 if (res == 1)
3551 if (dump_enabled_p ())
3552 dump_printf_loc (MSG_NOTE, vect_location,
3553 "not vectorized: compilation time alias.\n");
3554 return false;
3558 dr_with_seg_len_pair_t dr_with_seg_len_pair
3559 (dr_with_seg_len (dr_a, segment_length_a, access_size_a, align_a),
3560 dr_with_seg_len (dr_b, segment_length_b, access_size_b, align_b));
3562 /* Canonicalize pairs by sorting the two DR members. */
3563 if (comp_res > 0)
3564 std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
3566 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3569 prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);
3571 unsigned int count = (comp_alias_ddrs.length ()
3572 + check_unequal_addrs.length ());
3574 dump_printf_loc (MSG_NOTE, vect_location,
3575 "improved number of alias checks from %d to %d\n",
3576 may_alias_ddrs.length (), count);
3577 if ((int) count > PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
3579 if (dump_enabled_p ())
3580 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3581 "number of versioning for alias "
3582 "run-time tests exceeds %d "
3583 "(--param vect-max-version-for-alias-checks)\n",
3584 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
3585 return false;
3588 return true;
3591 /* Check whether we can use an internal function for a gather load
3592 or scatter store. READ_P is true for loads and false for stores.
3593 MASKED_P is true if the load or store is conditional. MEMORY_TYPE is
3594 the type of the memory elements being loaded or stored. OFFSET_BITS
3595 is the number of bits in each scalar offset and OFFSET_SIGN is the
3596 sign of the offset. SCALE is the amount by which the offset should
3597 be multiplied *after* it has been converted to address width.
3599 Return true if the function is supported, storing the function
3600 id in *IFN_OUT and the type of a vector element in *ELEMENT_TYPE_OUT. */
3602 bool
3603 vect_gather_scatter_fn_p (bool read_p, bool masked_p, tree vectype,
3604 tree memory_type, unsigned int offset_bits,
3605 signop offset_sign, int scale,
3606 internal_fn *ifn_out, tree *element_type_out)
3608 unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
3609 unsigned int element_bits = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (vectype)));
3610 if (offset_bits > element_bits)
3611 /* Internal functions require the offset to be the same width as
3612 the vector elements. We can extend narrower offsets, but it isn't
3613 safe to truncate wider offsets. */
3614 return false;
3616 if (element_bits != memory_bits)
3617 /* For now the vector elements must be the same width as the
3618 memory elements. */
3619 return false;
3621 /* Work out which function we need. */
3622 internal_fn ifn;
3623 if (read_p)
3624 ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
3625 else
3626 ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
3628 /* Test whether the target supports this combination. */
3629 if (!internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
3630 offset_sign, scale))
3631 return false;
3633 *ifn_out = ifn;
3634 *element_type_out = TREE_TYPE (vectype);
3635 return true;
3638 /* CALL is a call to an internal gather load or scatter store function.
3639 Describe the operation in INFO. */
3641 static void
3642 vect_describe_gather_scatter_call (gcall *call, gather_scatter_info *info)
3644 stmt_vec_info stmt_info = vinfo_for_stmt (call);
3645 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3646 data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3648 info->ifn = gimple_call_internal_fn (call);
3649 info->decl = NULL_TREE;
3650 info->base = gimple_call_arg (call, 0);
3651 info->offset = gimple_call_arg (call, 1);
3652 info->offset_dt = vect_unknown_def_type;
3653 info->offset_vectype = NULL_TREE;
3654 info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
3655 info->element_type = TREE_TYPE (vectype);
3656 info->memory_type = TREE_TYPE (DR_REF (dr));
3659 /* Return true if a non-affine read or write in STMT is suitable for a
3660 gather load or scatter store. Describe the operation in *INFO if so. */
3662 bool
3663 vect_check_gather_scatter (gimple *stmt, loop_vec_info loop_vinfo,
3664 gather_scatter_info *info)
3666 HOST_WIDE_INT scale = 1;
3667 poly_int64 pbitpos, pbitsize;
3668 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3669 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3670 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3671 tree offtype = NULL_TREE;
3672 tree decl = NULL_TREE, base, off;
3673 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3674 tree memory_type = TREE_TYPE (DR_REF (dr));
3675 machine_mode pmode;
3676 int punsignedp, reversep, pvolatilep = 0;
3677 internal_fn ifn;
3678 tree element_type;
3679 bool masked_p = false;
3681 /* See whether this is already a call to a gather/scatter internal function.
3682 If not, see whether it's a masked load or store. */
3683 gcall *call = dyn_cast <gcall *> (stmt);
3684 if (call && gimple_call_internal_p (call))
3686 ifn = gimple_call_internal_fn (stmt);
3687 if (internal_gather_scatter_fn_p (ifn))
3689 vect_describe_gather_scatter_call (call, info);
3690 return true;
3692 masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE);
3695 /* True if we should aim to use internal functions rather than
3696 built-in functions. */
3697 bool use_ifn_p = (DR_IS_READ (dr)
3698 ? supports_vec_gather_load_p ()
3699 : supports_vec_scatter_store_p ());
3701 base = DR_REF (dr);
3702 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3703 see if we can use the def stmt of the address. */
3704 if (masked_p
3705 && TREE_CODE (base) == MEM_REF
3706 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3707 && integer_zerop (TREE_OPERAND (base, 1))
3708 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3710 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3711 if (is_gimple_assign (def_stmt)
3712 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3713 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3716 /* The gather and scatter builtins need an address of the form
3717 loop_invariant + vector * {1, 2, 4, 8}
3718 or
3719 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3720 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3721 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3722 multiplications and additions in it. To get a vector, we need
3723 a single SSA_NAME that will be defined in the loop and will
3724 contain everything that is not loop invariant and that can be
3725 vectorized. The following code attempts to find such a preexisting
3726 SSA_NAME OFF and put the loop invariants into a tree BASE
3727 that can be gimplified before the loop. */
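/* Illustrative example (hypothetical names): for a gather such as

     for (i = 0; i < n; i++)
       sum += data[idx[i]];

   the loop-invariant address of data[0] ends up in BASE, the
   loop-varying idx[i] is what remains as OFF, and the element size
   sizeof (data[0]) can become the scale applied to it.  */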
3728 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3729 &punsignedp, &reversep, &pvolatilep);
3730 gcc_assert (base && !reversep);
3731 poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
3733 if (TREE_CODE (base) == MEM_REF)
3735 if (!integer_zerop (TREE_OPERAND (base, 1)))
3737 if (off == NULL_TREE)
3738 off = wide_int_to_tree (sizetype, mem_ref_offset (base));
3739 else
3740 off = size_binop (PLUS_EXPR, off,
3741 fold_convert (sizetype, TREE_OPERAND (base, 1)));
3743 base = TREE_OPERAND (base, 0);
3745 else
3746 base = build_fold_addr_expr (base);
3748 if (off == NULL_TREE)
3749 off = size_zero_node;
3751 /* If base is not loop invariant, then: if off is 0, start with just
3752 the constant offset in the loop invariant BASE and continue with base
3753 as OFF; otherwise give up.
3754 We could handle that case by gimplifying the addition of base + off
3755 into some SSA_NAME and using that as off, but for now punt. */
3756 if (!expr_invariant_in_loop_p (loop, base))
3758 if (!integer_zerop (off))
3759 return false;
3760 off = base;
3761 base = size_int (pbytepos);
3763 /* Otherwise put base + constant offset into the loop invariant BASE
3764 and continue with OFF. */
3765 else
3767 base = fold_convert (sizetype, base);
3768 base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
3771 /* OFF at this point may be either a SSA_NAME or some tree expression
3772 from get_inner_reference. Try to peel off loop invariants from it
3773 into BASE as long as possible. */
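/* Illustrative example (hypothetical values): if OFF is
   (sizetype) i_1 * 4 + 16, the loop-invariant 16 is folded into BASE
   by the PLUS_EXPR case below, the multiplication by 4 can become the
   SCALE when the target supports that scaling (MULT_EXPR case), and
   i_1 is what finally remains as OFF.  */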
3774 STRIP_NOPS (off);
3775 while (offtype == NULL_TREE)
3777 enum tree_code code;
3778 tree op0, op1, add = NULL_TREE;
3780 if (TREE_CODE (off) == SSA_NAME)
3782 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3784 if (expr_invariant_in_loop_p (loop, off))
3785 return false;
3787 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3788 break;
3790 op0 = gimple_assign_rhs1 (def_stmt);
3791 code = gimple_assign_rhs_code (def_stmt);
3792 op1 = gimple_assign_rhs2 (def_stmt);
3794 else
3796 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3797 return false;
3798 code = TREE_CODE (off);
3799 extract_ops_from_tree (off, &code, &op0, &op1);
3801 switch (code)
3803 case POINTER_PLUS_EXPR:
3804 case PLUS_EXPR:
3805 if (expr_invariant_in_loop_p (loop, op0))
3807 add = op0;
3808 off = op1;
3809 do_add:
3810 add = fold_convert (sizetype, add);
3811 if (scale != 1)
3812 add = size_binop (MULT_EXPR, add, size_int (scale));
3813 base = size_binop (PLUS_EXPR, base, add);
3814 continue;
3816 if (expr_invariant_in_loop_p (loop, op1))
3818 add = op1;
3819 off = op0;
3820 goto do_add;
3822 break;
3823 case MINUS_EXPR:
3824 if (expr_invariant_in_loop_p (loop, op1))
3826 add = fold_convert (sizetype, op1);
3827 add = size_binop (MINUS_EXPR, size_zero_node, add);
3828 off = op0;
3829 goto do_add;
3831 break;
3832 case MULT_EXPR:
3833 if (scale == 1 && tree_fits_shwi_p (op1))
3835 int new_scale = tree_to_shwi (op1);
3836 /* Only treat this as a scaling operation if the target
3837 supports it. */
3838 if (use_ifn_p
3839 && !vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p,
3840 vectype, memory_type, 1,
3841 TYPE_SIGN (TREE_TYPE (op0)),
3842 new_scale, &ifn,
3843 &element_type))
3844 break;
3845 scale = new_scale;
3846 off = op0;
3847 continue;
3849 break;
3850 case SSA_NAME:
3851 off = op0;
3852 continue;
3853 CASE_CONVERT:
3854 if (!POINTER_TYPE_P (TREE_TYPE (op0))
3855 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3856 break;
3857 if (TYPE_PRECISION (TREE_TYPE (op0))
3858 == TYPE_PRECISION (TREE_TYPE (off)))
3860 off = op0;
3861 continue;
3864 /* The internal functions need the offset to be the same width
3865 as the elements of VECTYPE. Don't include operations that
3866 cast the offset from that width to a different width. */
3867 if (use_ifn_p
3868 && (int_size_in_bytes (TREE_TYPE (vectype))
3869 == int_size_in_bytes (TREE_TYPE (off))))
3870 break;
3872 if (TYPE_PRECISION (TREE_TYPE (op0))
3873 < TYPE_PRECISION (TREE_TYPE (off)))
3875 off = op0;
3876 offtype = TREE_TYPE (off);
3877 STRIP_NOPS (off);
3878 continue;
3880 break;
3881 default:
3882 break;
3884 break;
3887 /* If at the end OFF still isn't a SSA_NAME or isn't
3888 defined in the loop, punt. */
3889 if (TREE_CODE (off) != SSA_NAME
3890 || expr_invariant_in_loop_p (loop, off))
3891 return false;
3893 if (offtype == NULL_TREE)
3894 offtype = TREE_TYPE (off);
3896 if (use_ifn_p)
3898 if (!vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p, vectype,
3899 memory_type, TYPE_PRECISION (offtype),
3900 TYPE_SIGN (offtype), scale, &ifn,
3901 &element_type))
3902 return false;
3904 else
3906 if (DR_IS_READ (dr))
3908 if (targetm.vectorize.builtin_gather)
3909 decl = targetm.vectorize.builtin_gather (vectype, offtype, scale);
3911 else
3913 if (targetm.vectorize.builtin_scatter)
3914 decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale);
3917 if (!decl)
3918 return false;
3920 ifn = IFN_LAST;
3921 element_type = TREE_TYPE (vectype);
3924 info->ifn = ifn;
3925 info->decl = decl;
3926 info->base = base;
3927 info->offset = off;
3928 info->offset_dt = vect_unknown_def_type;
3929 info->offset_vectype = NULL_TREE;
3930 info->scale = scale;
3931 info->element_type = element_type;
3932 info->memory_type = memory_type;
3933 return true;
3936 /* Function vect_analyze_data_refs.
3938 Find all the data references in the loop or basic block.
3940 The general structure of the analysis of data refs in the vectorizer is as
3941 follows:
3942 1- vect_analyze_data_refs(loop/bb): call
3943 compute_data_dependences_for_loop/bb to find and analyze all data-refs
3944 in the loop/bb and their dependences.
3945 2- vect_analyze_dependences(): apply dependence testing using ddrs.
3946 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
3947 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
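   For illustration, in a loop like

     for (i = 0; i < n; i++)
       a[i] = b[i] + c[i];

   step 1 records three data references (the loads of b[i] and c[i] and
   the store to a[i]); the later steps then test their dependences,
   alignment and access pattern.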
3951 bool
3952 vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf)
3954 struct loop *loop = NULL;
3955 unsigned int i;
3956 struct data_reference *dr;
3957 tree scalar_type;
3959 if (dump_enabled_p ())
3960 dump_printf_loc (MSG_NOTE, vect_location,
3961 "=== vect_analyze_data_refs ===\n");
3963 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3964 loop = LOOP_VINFO_LOOP (loop_vinfo);
3966 /* Go through the data-refs, check that the analysis succeeded. Update
3967 pointer from stmt_vec_info struct to DR and vectype. */
3969 vec<data_reference_p> datarefs = vinfo->datarefs;
3970 FOR_EACH_VEC_ELT (datarefs, i, dr)
3972 gimple *stmt;
3973 stmt_vec_info stmt_info;
3974 tree base, offset, init;
3975 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
3976 bool simd_lane_access = false;
3977 poly_uint64 vf;
3979 again:
3980 if (!dr || !DR_REF (dr))
3982 if (dump_enabled_p ())
3983 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3984 "not vectorized: unhandled data-ref\n");
3985 return false;
3988 stmt = DR_STMT (dr);
3989 stmt_info = vinfo_for_stmt (stmt);
3991 /* Discard clobbers from the dataref vector. We will remove
3992 clobber stmts during vectorization. */
3993 if (gimple_clobber_p (stmt))
3995 free_data_ref (dr);
3996 if (i == datarefs.length () - 1)
3998 datarefs.pop ();
3999 break;
4001 datarefs.ordered_remove (i);
4002 dr = datarefs[i];
4003 goto again;
4006 /* Check that analysis of the data-ref succeeded. */
4007 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
4008 || !DR_STEP (dr))
4010 bool maybe_gather
4011 = DR_IS_READ (dr)
4012 && !TREE_THIS_VOLATILE (DR_REF (dr))
4013 && (targetm.vectorize.builtin_gather != NULL
4014 || supports_vec_gather_load_p ());
4015 bool maybe_scatter
4016 = DR_IS_WRITE (dr)
4017 && !TREE_THIS_VOLATILE (DR_REF (dr))
4018 && (targetm.vectorize.builtin_scatter != NULL
4019 || supports_vec_scatter_store_p ());
4020 bool maybe_simd_lane_access
4021 = is_a <loop_vec_info> (vinfo) && loop->simduid;
4023 /* If target supports vector gather loads or scatter stores, or if
4024 this might be a SIMD lane access, see whether they can be used. */
4025 if (is_a <loop_vec_info> (vinfo)
4026 && (maybe_gather || maybe_scatter || maybe_simd_lane_access)
4027 && !nested_in_vect_loop_p (loop, stmt))
4029 struct data_reference *newdr
4030 = create_data_ref (NULL, loop_containing_stmt (stmt),
4031 DR_REF (dr), stmt, !maybe_scatter,
4032 DR_IS_CONDITIONAL_IN_STMT (dr));
4033 gcc_assert (newdr != NULL && DR_REF (newdr));
4034 if (DR_BASE_ADDRESS (newdr)
4035 && DR_OFFSET (newdr)
4036 && DR_INIT (newdr)
4037 && DR_STEP (newdr)
4038 && integer_zerop (DR_STEP (newdr)))
4040 if (maybe_simd_lane_access)
4042 tree off = DR_OFFSET (newdr);
4043 STRIP_NOPS (off);
4044 if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
4045 && TREE_CODE (off) == MULT_EXPR
4046 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
4048 tree step = TREE_OPERAND (off, 1);
4049 off = TREE_OPERAND (off, 0);
4050 STRIP_NOPS (off);
4051 if (CONVERT_EXPR_P (off)
4052 && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
4053 0)))
4054 < TYPE_PRECISION (TREE_TYPE (off)))
4055 off = TREE_OPERAND (off, 0);
4056 if (TREE_CODE (off) == SSA_NAME)
4058 gimple *def = SSA_NAME_DEF_STMT (off);
4059 tree reft = TREE_TYPE (DR_REF (newdr));
4060 if (is_gimple_call (def)
4061 && gimple_call_internal_p (def)
4062 && (gimple_call_internal_fn (def)
4063 == IFN_GOMP_SIMD_LANE))
4065 tree arg = gimple_call_arg (def, 0);
4066 gcc_assert (TREE_CODE (arg) == SSA_NAME);
4067 arg = SSA_NAME_VAR (arg);
4068 if (arg == loop->simduid
4069 /* For now. */
4070 && tree_int_cst_equal
4071 (TYPE_SIZE_UNIT (reft),
4072 step))
4074 DR_OFFSET (newdr) = ssize_int (0);
4075 DR_STEP (newdr) = step;
4076 DR_OFFSET_ALIGNMENT (newdr)
4077 = BIGGEST_ALIGNMENT;
4078 DR_STEP_ALIGNMENT (newdr)
4079 = highest_pow2_factor (step);
4080 dr = newdr;
4081 simd_lane_access = true;
4087 if (!simd_lane_access && (maybe_gather || maybe_scatter))
4089 dr = newdr;
4090 if (maybe_gather)
4091 gatherscatter = GATHER;
4092 else
4093 gatherscatter = SCATTER;
4096 if (gatherscatter == SG_NONE && !simd_lane_access)
4097 free_data_ref (newdr);
4100 if (gatherscatter == SG_NONE && !simd_lane_access)
4102 if (dump_enabled_p ())
4104 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4105 "not vectorized: data ref analysis "
4106 "failed ");
4107 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4110 if (is_a <bb_vec_info> (vinfo))
4111 break;
4113 return false;
4117 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
4119 if (dump_enabled_p ())
4120 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4121 "not vectorized: base addr of dr is a "
4122 "constant\n");
4124 if (is_a <bb_vec_info> (vinfo))
4125 break;
4127 if (gatherscatter != SG_NONE || simd_lane_access)
4128 free_data_ref (dr);
4129 return false;
4132 if (TREE_THIS_VOLATILE (DR_REF (dr)))
4134 if (dump_enabled_p ())
4136 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4137 "not vectorized: volatile type ");
4138 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4141 if (is_a <bb_vec_info> (vinfo))
4142 break;
4144 return false;
4147 if (stmt_can_throw_internal (stmt))
4149 if (dump_enabled_p ())
4151 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4152 "not vectorized: statement can throw an "
4153 "exception ");
4154 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4157 if (is_a <bb_vec_info> (vinfo))
4158 break;
4160 if (gatherscatter != SG_NONE || simd_lane_access)
4161 free_data_ref (dr);
4162 return false;
4165 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
4166 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
4168 if (dump_enabled_p ())
4170 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4171 "not vectorized: statement is bitfield "
4172 "access ");
4173 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4176 if (is_a <bb_vec_info> (vinfo))
4177 break;
4179 if (gatherscatter != SG_NONE || simd_lane_access)
4180 free_data_ref (dr);
4181 return false;
4184 base = unshare_expr (DR_BASE_ADDRESS (dr));
4185 offset = unshare_expr (DR_OFFSET (dr));
4186 init = unshare_expr (DR_INIT (dr));
4188 if (is_gimple_call (stmt)
4189 && (!gimple_call_internal_p (stmt)
4190 || (gimple_call_internal_fn (stmt) != IFN_MASK_LOAD
4191 && gimple_call_internal_fn (stmt) != IFN_MASK_STORE)))
4193 if (dump_enabled_p ())
4195 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4196 "not vectorized: dr in a call ");
4197 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4200 if (is_a <bb_vec_info> (vinfo))
4201 break;
4203 if (gatherscatter != SG_NONE || simd_lane_access)
4204 free_data_ref (dr);
4205 return false;
4208 /* Update DR field in stmt_vec_info struct. */
4210 /* If the dataref is in an inner-loop of the loop that is considered for
4211 vectorization, we also want to analyze the access relative to
4212 the outer-loop (DR contains information only relative to the
4213 inner-most enclosing loop). We do that by building a reference to the
4214 first location accessed by the inner-loop, and analyze it relative to
4215 the outer-loop. */
4216 if (loop && nested_in_vect_loop_p (loop, stmt))
4218 /* Build a reference to the first location accessed by the
4219 inner loop: *(BASE + INIT + OFFSET). By construction,
4220 this address must be invariant in the inner loop, so we
4221 can consider it as being used in the outer loop. */
4222 tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
4223 init, offset);
4224 tree init_addr = fold_build_pointer_plus (base, init_offset);
4225 tree init_ref = build_fold_indirect_ref (init_addr);
4227 if (dump_enabled_p ())
4229 dump_printf_loc (MSG_NOTE, vect_location,
4230 "analyze in outer loop: ");
4231 dump_generic_expr (MSG_NOTE, TDF_SLIM, init_ref);
4232 dump_printf (MSG_NOTE, "\n");
4235 if (!dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
4236 init_ref, loop))
4237 /* dr_analyze_innermost already explained the failure. */
4238 return false;
4240 if (dump_enabled_p ())
4242 dump_printf_loc (MSG_NOTE, vect_location,
4243 "\touter base_address: ");
4244 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4245 STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
4246 dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
4247 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4248 STMT_VINFO_DR_OFFSET (stmt_info));
4249 dump_printf (MSG_NOTE,
4250 "\n\touter constant offset from base address: ");
4251 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4252 STMT_VINFO_DR_INIT (stmt_info));
4253 dump_printf (MSG_NOTE, "\n\touter step: ");
4254 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4255 STMT_VINFO_DR_STEP (stmt_info));
4256 dump_printf (MSG_NOTE, "\n\touter base alignment: %d\n",
4257 STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info));
4258 dump_printf (MSG_NOTE, "\n\touter base misalignment: %d\n",
4259 STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info));
4260 dump_printf (MSG_NOTE, "\n\touter offset alignment: %d\n",
4261 STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info));
4262 dump_printf (MSG_NOTE, "\n\touter step alignment: %d\n",
4263 STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
4267 if (STMT_VINFO_DATA_REF (stmt_info))
4269 if (dump_enabled_p ())
4271 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4272 "not vectorized: more than one data ref "
4273 "in stmt: ");
4274 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4277 if (is_a <bb_vec_info> (vinfo))
4278 break;
4280 if (gatherscatter != SG_NONE || simd_lane_access)
4281 free_data_ref (dr);
4282 return false;
4285 STMT_VINFO_DATA_REF (stmt_info) = dr;
4286 if (simd_lane_access)
4288 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
4289 free_data_ref (datarefs[i]);
4290 datarefs[i] = dr;
4293 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == ADDR_EXPR
4294 && VAR_P (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0))
4295 && DECL_NONALIASED (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0)))
4297 if (dump_enabled_p ())
4299 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4300 "not vectorized: base object not addressable "
4301 "for stmt: ");
4302 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4304 if (is_a <bb_vec_info> (vinfo))
4306 /* In BB vectorization the ref can still participate
4307 in dependence analysis, we just can't vectorize it. */
4308 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4309 continue;
4311 return false;
4314 /* Set vectype for STMT. */
4315 scalar_type = TREE_TYPE (DR_REF (dr));
4316 STMT_VINFO_VECTYPE (stmt_info)
4317 = get_vectype_for_scalar_type (scalar_type);
4318 if (!STMT_VINFO_VECTYPE (stmt_info))
4320 if (dump_enabled_p ())
4322 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4323 "not vectorized: no vectype for stmt: ");
4324 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4325 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
4326 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
4327 scalar_type);
4328 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
4331 if (is_a <bb_vec_info> (vinfo))
4333 /* No vector type is fine, the ref can still participate
4334 in dependence analysis, we just can't vectorize it. */
4335 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4336 continue;
4339 if (gatherscatter != SG_NONE || simd_lane_access)
4341 STMT_VINFO_DATA_REF (stmt_info) = NULL;
4342 if (gatherscatter != SG_NONE)
4343 free_data_ref (dr);
4345 return false;
4347 else
4349 if (dump_enabled_p ())
4351 dump_printf_loc (MSG_NOTE, vect_location,
4352 "got vectype for stmt: ");
4353 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
4354 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4355 STMT_VINFO_VECTYPE (stmt_info));
4356 dump_printf (MSG_NOTE, "\n");
4360 /* Adjust the minimal vectorization factor according to the
4361 vector type. */
4362 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
4363 *min_vf = upper_bound (*min_vf, vf);
4365 if (gatherscatter != SG_NONE)
4367 gather_scatter_info gs_info;
4368 if (!vect_check_gather_scatter (stmt, as_a <loop_vec_info> (vinfo),
4369 &gs_info)
4370 || !get_vectype_for_scalar_type (TREE_TYPE (gs_info.offset)))
4372 STMT_VINFO_DATA_REF (stmt_info) = NULL;
4373 free_data_ref (dr);
4374 if (dump_enabled_p ())
4376 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4377 (gatherscatter == GATHER) ?
4378 "not vectorized: not suitable for gather "
4379 "load " :
4380 "not vectorized: not suitable for scatter "
4381 "store ");
4382 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4384 return false;
4387 free_data_ref (datarefs[i]);
4388 datarefs[i] = dr;
4389 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
4392 else if (is_a <loop_vec_info> (vinfo)
4393 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
4395 if (nested_in_vect_loop_p (loop, stmt))
4397 if (dump_enabled_p ())
4399 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4400 "not vectorized: not suitable for strided "
4401 "load ");
4402 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4404 return false;
4406 STMT_VINFO_STRIDED_P (stmt_info) = true;
4410 /* If we stopped analysis at the first dataref that we could not analyze
4411 when trying to vectorize a basic block, mark the rest of the datarefs
4412 as not vectorizable and truncate the vector of datarefs. That
4413 avoids spending useless time in analyzing their dependence. */
4414 if (i != datarefs.length ())
4416 gcc_assert (is_a <bb_vec_info> (vinfo));
4417 for (unsigned j = i; j < datarefs.length (); ++j)
4419 data_reference_p dr = datarefs[j];
4420 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
4421 free_data_ref (dr);
4423 datarefs.truncate (i);
4426 return true;
4430 /* Function vect_get_new_vect_var.
4432 Returns a name for a new variable. The current naming scheme prepends a
4433 prefix chosen according to VAR_KIND ("vect", "vectp", "stmp" or "mask")
4434 to NAME when NAME is provided (e.g. vect_pointer_var with NAME "foo"
4435 yields "vectp_foo"), and otherwise uses the prefix alone. */
4437 tree
4438 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
4440 const char *prefix;
4441 tree new_vect_var;
4443 switch (var_kind)
4445 case vect_simple_var:
4446 prefix = "vect";
4447 break;
4448 case vect_scalar_var:
4449 prefix = "stmp";
4450 break;
4451 case vect_mask_var:
4452 prefix = "mask";
4453 break;
4454 case vect_pointer_var:
4455 prefix = "vectp";
4456 break;
4457 default:
4458 gcc_unreachable ();
4461 if (name)
4463 char* tmp = concat (prefix, "_", name, NULL);
4464 new_vect_var = create_tmp_reg (type, tmp);
4465 free (tmp);
4467 else
4468 new_vect_var = create_tmp_reg (type, prefix);
4470 return new_vect_var;
4473 /* Like vect_get_new_vect_var but return an SSA name. */
4475 tree
4476 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
4478 const char *prefix;
4479 tree new_vect_var;
4481 switch (var_kind)
4483 case vect_simple_var:
4484 prefix = "vect";
4485 break;
4486 case vect_scalar_var:
4487 prefix = "stmp";
4488 break;
4489 case vect_pointer_var:
4490 prefix = "vectp";
4491 break;
4492 default:
4493 gcc_unreachable ();
4496 if (name)
4498 char* tmp = concat (prefix, "_", name, NULL);
4499 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4500 free (tmp);
4502 else
4503 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4505 return new_vect_var;
4508 /* Duplicate ptr info and set alignment/misalignment on NAME from DR. */
4510 static void
4511 vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr)
4513 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr));
4514 int misalign = DR_MISALIGNMENT (dr);
4515 if (misalign == DR_MISALIGNMENT_UNKNOWN)
4516 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4517 else
4518 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name),
4519 DR_TARGET_ALIGNMENT (dr), misalign);
4522 /* Function vect_create_addr_base_for_vector_ref.
4524 Create an expression that computes the address of the first memory location
4525 that will be accessed for a data reference.
4527 Input:
4528 STMT: The statement containing the data reference.
4529 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4530 OFFSET: Optional. If supplied, it is added to the initial address.
4531 LOOP: Specify relative to which loop-nest should the address be computed.
4532 For example, when the dataref is in an inner-loop nested in an
4533 outer-loop that is now being vectorized, LOOP can be either the
4534 outer-loop, or the inner-loop. The first memory location accessed
4535 by the following dataref ('in' points to short):
4537 for (i=0; i<N; i++)
4538 for (j=0; j<M; j++)
4539 s += in[i+j]
4541 is as follows:
4542 if LOOP=i_loop: &in (relative to i_loop)
4543 if LOOP=j_loop: &in+i*2B (relative to j_loop)
4544 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
4545 initial address. Unlike OFFSET, which is number of elements to
4546 be added, BYTE_OFFSET is measured in bytes.
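              For illustration: if the dataref accesses 'short' elements,
              OFFSET 4 advances the initial address by 4 * sizeof (short)
              == 8 bytes, whereas BYTE_OFFSET 4 advances it by 4 bytes.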
4548 Output:
4549 1. Return an SSA_NAME whose value is the address of the memory location of
4550 the first vector of the data reference.
4551 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4552 these statement(s) which define the returned SSA_NAME.
4554 FORNOW: We are only handling array accesses with step 1. */
4556 tree
4557 vect_create_addr_base_for_vector_ref (gimple *stmt,
4558 gimple_seq *new_stmt_list,
4559 tree offset,
4560 tree byte_offset)
4562 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4563 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4564 const char *base_name;
4565 tree addr_base;
4566 tree dest;
4567 gimple_seq seq = NULL;
4568 tree vect_ptr_type;
4569 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
4570 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4571 innermost_loop_behavior *drb = vect_dr_behavior (dr);
4573 tree data_ref_base = unshare_expr (drb->base_address);
4574 tree base_offset = unshare_expr (drb->offset);
4575 tree init = unshare_expr (drb->init);
4577 if (loop_vinfo)
4578 base_name = get_name (data_ref_base);
4579 else
4581 base_offset = ssize_int (0);
4582 init = ssize_int (0);
4583 base_name = get_name (DR_REF (dr));
4586 /* Create base_offset */
4587 base_offset = size_binop (PLUS_EXPR,
4588 fold_convert (sizetype, base_offset),
4589 fold_convert (sizetype, init));
4591 if (offset)
4593 offset = fold_build2 (MULT_EXPR, sizetype,
4594 fold_convert (sizetype, offset), step);
4595 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4596 base_offset, offset);
4598 if (byte_offset)
4600 byte_offset = fold_convert (sizetype, byte_offset);
4601 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4602 base_offset, byte_offset);
4605 /* base + base_offset */
4606 if (loop_vinfo)
4607 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4608 else
4610 addr_base = build1 (ADDR_EXPR,
4611 build_pointer_type (TREE_TYPE (DR_REF (dr))),
4612 unshare_expr (DR_REF (dr)));
4615 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4616 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4617 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4618 gimple_seq_add_seq (new_stmt_list, seq);
4620 if (DR_PTR_INFO (dr)
4621 && TREE_CODE (addr_base) == SSA_NAME
4622 && !SSA_NAME_PTR_INFO (addr_base))
4624 vect_duplicate_ssa_name_ptr_info (addr_base, dr);
4625 if (offset || byte_offset)
4626 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4629 if (dump_enabled_p ())
4631 dump_printf_loc (MSG_NOTE, vect_location, "created ");
4632 dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
4633 dump_printf (MSG_NOTE, "\n");
4636 return addr_base;
4640 /* Function vect_create_data_ref_ptr.
4642 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4643 location accessed in the loop by STMT, along with the def-use update
4644 chain to appropriately advance the pointer through the loop iterations.
4645 Also set aliasing information for the pointer. This pointer is used by
4646 the callers to this function to create a memory reference expression for
4647 vector load/store access.
4649 Input:
4650 1. STMT: a stmt that references memory. Expected to be of the form
4651 GIMPLE_ASSIGN <name, data-ref> or
4652 GIMPLE_ASSIGN <data-ref, name>.
4653 2. AGGR_TYPE: the type of the reference, which should be either a vector
4654 or an array.
4655 3. AT_LOOP: the loop where the vector memref is to be created.
4656 4. OFFSET (optional): an offset to be added to the initial address accessed
4657 by the data-ref in STMT.
4658 5. BSI: location where the new stmts are to be placed if there is no loop
4659 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4660 pointing to the initial address.
4661 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4662 to the initial address accessed by the data-ref in STMT. This is
4663 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4664 in bytes.
4665 8. IV_STEP (optional, defaults to NULL): the amount that should be added
4666 to the IV during each iteration of the loop. NULL says to move
4667 by one copy of AGGR_TYPE up or down, depending on the step of the
4668 data reference.
4670 Output:
4671 1. Declare a new ptr to vector_type, and have it point to the base of the
4672 data reference (the initial address accessed by the data reference).
4673 For example, for vector of type V8HI, the following code is generated:
4675 v8hi *ap;
4676 ap = (v8hi *)initial_address;
4678 if OFFSET is not supplied:
4679 initial_address = &a[init];
4680 if OFFSET is supplied:
4681 initial_address = &a[init + OFFSET];
4682 if BYTE_OFFSET is supplied:
4683 initial_address = &a[init] + BYTE_OFFSET;
4685 Return the initial_address in INITIAL_ADDRESS.
4687 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4688 update the pointer in each iteration of the loop.
4690 Return the increment stmt that updates the pointer in PTR_INCR.
4692 3. Set INV_P to true if the access pattern of the data reference in the
4693 vectorized loop is invariant. Set it to false otherwise.
4695 4. Return the pointer. */
4697 tree
4698 vect_create_data_ref_ptr (gimple *stmt, tree aggr_type, struct loop *at_loop,
4699 tree offset, tree *initial_address,
4700 gimple_stmt_iterator *gsi, gimple **ptr_incr,
4701 bool only_init, bool *inv_p, tree byte_offset,
4702 tree iv_step)
4704 const char *base_name;
4705 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4706 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4707 struct loop *loop = NULL;
4708 bool nested_in_vect_loop = false;
4709 struct loop *containing_loop = NULL;
4710 tree aggr_ptr_type;
4711 tree aggr_ptr;
4712 tree new_temp;
4713 gimple_seq new_stmt_list = NULL;
4714 edge pe = NULL;
4715 basic_block new_bb;
4716 tree aggr_ptr_init;
4717 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4718 tree aptr;
4719 gimple_stmt_iterator incr_gsi;
4720 bool insert_after;
4721 tree indx_before_incr, indx_after_incr;
4722 gimple *incr;
4723 tree step;
4724 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4726 gcc_assert (iv_step != NULL_TREE
4727 || TREE_CODE (aggr_type) == ARRAY_TYPE
4728 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4730 if (loop_vinfo)
4732 loop = LOOP_VINFO_LOOP (loop_vinfo);
4733 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4734 containing_loop = (gimple_bb (stmt))->loop_father;
4735 pe = loop_preheader_edge (loop);
4737 else
4739 gcc_assert (bb_vinfo);
4740 only_init = true;
4741 *ptr_incr = NULL;
4744 /* Check the step (evolution) of the load in LOOP, and record
4745 whether it's invariant. */
4746 step = vect_dr_behavior (dr)->step;
4747 if (integer_zerop (step))
4748 *inv_p = true;
4749 else
4750 *inv_p = false;
4752 /* Create an expression for the first address accessed by this load
4753 in LOOP. */
4754 base_name = get_name (DR_BASE_ADDRESS (dr));
4756 if (dump_enabled_p ())
4758 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4759 dump_printf_loc (MSG_NOTE, vect_location,
4760 "create %s-pointer variable to type: ",
4761 get_tree_code_name (TREE_CODE (aggr_type)));
4762 dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4763 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4764 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4765 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4766 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4767 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4768 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4769 else
4770 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4771 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4772 dump_printf (MSG_NOTE, "\n");
4775 /* (1) Create the new aggregate-pointer variable.
4776 Vector and array types inherit the alias set of their component
4777 type by default so we need to use a ref-all pointer if the data
4778 reference does not conflict with the created aggregated data
4779 reference because it is not addressable. */
4780 bool need_ref_all = false;
4781 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4782 get_alias_set (DR_REF (dr))))
4783 need_ref_all = true;
4784 /* Likewise for any of the data references in the stmt group. */
4785 else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
4787 gimple *orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
4790 stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
4791 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4792 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4793 get_alias_set (DR_REF (sdr))))
4795 need_ref_all = true;
4796 break;
4798 orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);
4800 while (orig_stmt);
4802 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4803 need_ref_all);
4804 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4807 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4808 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4809 def-use update cycles for the pointer: one relative to the outer-loop
4810 (LOOP), which is what steps (3) and (4) below do. The other is relative
4811 to the inner-loop (which is the inner-most loop containing the dataref),
4812 and this is done by step (5) below.
4814 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4815 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4816 redundant. Steps (3),(4) create the following:
4818 vp0 = &base_addr;
4819 LOOP: vp1 = phi(vp0,vp2)
4822 vp2 = vp1 + step
4823 goto LOOP
4825 If there is an inner-loop nested in loop, then step (5) will also be
4826 applied, and an additional update in the inner-loop will be created:
4828 vp0 = &base_addr;
4829 LOOP: vp1 = phi(vp0,vp2)
4831 inner: vp3 = phi(vp1,vp4)
4832 vp4 = vp3 + inner_step
4833 if () goto inner
4835 vp2 = vp1 + step
4836 if () goto LOOP */
4838 /* (2) Calculate the initial address of the aggregate-pointer, and set
4839 the aggregate-pointer to point to it before the loop. */
4841 /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader. */
4843 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
4844 offset, byte_offset);
4845 if (new_stmt_list)
4847 if (pe)
4849 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4850 gcc_assert (!new_bb);
4852 else
4853 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4856 *initial_address = new_temp;
4857 aggr_ptr_init = new_temp;
4859 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4860 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4861 inner-loop nested in LOOP (during outer-loop vectorization). */
4863 /* No update in loop is required. */
4864 if (only_init && (!loop_vinfo || at_loop == loop))
4865 aptr = aggr_ptr_init;
4866 else
4868 if (iv_step == NULL_TREE)
4870 /* The step of the aggregate pointer is the type size. */
4871 iv_step = TYPE_SIZE_UNIT (aggr_type);
4872 /* One exception to the above is when the scalar step of the load in
4873 LOOP is zero. In this case the step here is also zero. */
4874 if (*inv_p)
4875 iv_step = size_zero_node;
4876 else if (tree_int_cst_sgn (step) == -1)
4877 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4880 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4882 create_iv (aggr_ptr_init,
4883 fold_convert (aggr_ptr_type, iv_step),
4884 aggr_ptr, loop, &incr_gsi, insert_after,
4885 &indx_before_incr, &indx_after_incr);
4886 incr = gsi_stmt (incr_gsi);
4887 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4889 /* Copy the points-to information if it exists. */
4890 if (DR_PTR_INFO (dr))
4892 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr);
4893 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr);
4895 if (ptr_incr)
4896 *ptr_incr = incr;
4898 aptr = indx_before_incr;
4901 if (!nested_in_vect_loop || only_init)
4902 return aptr;
4905 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4906 nested in LOOP, if it exists. */
4908 gcc_assert (nested_in_vect_loop);
4909 if (!only_init)
4911 standard_iv_increment_position (containing_loop, &incr_gsi,
4912 &insert_after);
4913 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4914 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4915 &indx_after_incr);
4916 incr = gsi_stmt (incr_gsi);
4917 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4919 /* Copy the points-to information if it exists. */
4920 if (DR_PTR_INFO (dr))
4922 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr);
4923 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr);
4925 if (ptr_incr)
4926 *ptr_incr = incr;
4928 return indx_before_incr;
4930 else
4931 gcc_unreachable ();
4935 /* Function bump_vector_ptr
4937 Increment a pointer (to a vector type) by vector-size. If requested,
4938 i.e. if PTR-INCR is given, then also connect the new increment stmt
4939 to the existing def-use update-chain of the pointer, by modifying
4940 the PTR_INCR as illustrated below:
4942 The pointer def-use update-chain before this function:
4943 DATAREF_PTR = phi (p_0, p_2)
4944 ....
4945 PTR_INCR: p_2 = DATAREF_PTR + step
4947 The pointer def-use update-chain after this function:
4948 DATAREF_PTR = phi (p_0, p_2)
4949 ....
4950 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4951 ....
4952 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4954 Input:
4955 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4956 in the loop.
4957 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4958 the loop. The increment amount across iterations is expected
4959 to be vector_size.
4960 BSI - location where the new update stmt is to be placed.
4961 STMT - the original scalar memory-access stmt that is being vectorized.
4962 BUMP - optional. The offset by which to bump the pointer. If not given,
4963 the offset is assumed to be vector_size.
4965 Output: Return NEW_DATAREF_PTR as illustrated above.
4969 tree
4970 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4971 gimple *stmt, tree bump)
4973 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4974 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4975 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4976 tree update = TYPE_SIZE_UNIT (vectype);
4977 gassign *incr_stmt;
4978 ssa_op_iter iter;
4979 use_operand_p use_p;
4980 tree new_dataref_ptr;
4982 if (bump)
4983 update = bump;
4985 if (TREE_CODE (dataref_ptr) == SSA_NAME)
4986 new_dataref_ptr = copy_ssa_name (dataref_ptr);
4987 else
4988 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4989 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4990 dataref_ptr, update);
4991 vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4993 /* Copy the points-to information if it exists. */
4994 if (DR_PTR_INFO (dr))
4996 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4997 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
5000 if (!ptr_incr)
5001 return new_dataref_ptr;
5003 /* Update the vector-pointer's cross-iteration increment. */
5004 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
5006 tree use = USE_FROM_PTR (use_p);
5008 if (use == dataref_ptr)
5009 SET_USE (use_p, new_dataref_ptr);
5010 else
5011 gcc_assert (operand_equal_p (use, update, 0));
5014 return new_dataref_ptr;
5018 /* Copy memory reference info such as base/clique from the SRC reference
5019 to the DEST MEM_REF. */
5021 void
5022 vect_copy_ref_info (tree dest, tree src)
5024 if (TREE_CODE (dest) != MEM_REF)
5025 return;
5027 tree src_base = src;
5028 while (handled_component_p (src_base))
5029 src_base = TREE_OPERAND (src_base, 0);
5030 if (TREE_CODE (src_base) != MEM_REF
5031 && TREE_CODE (src_base) != TARGET_MEM_REF)
5032 return;
5034 MR_DEPENDENCE_CLIQUE (dest) = MR_DEPENDENCE_CLIQUE (src_base);
5035 MR_DEPENDENCE_BASE (dest) = MR_DEPENDENCE_BASE (src_base);
5039 /* Function vect_create_destination_var.
5041 Create a new temporary of type VECTYPE. */
5043 tree
5044 vect_create_destination_var (tree scalar_dest, tree vectype)
5046 tree vec_dest;
5047 const char *name;
5048 char *new_name;
5049 tree type;
5050 enum vect_var_kind kind;
5052 kind = vectype
5053 ? VECTOR_BOOLEAN_TYPE_P (vectype)
5054 ? vect_mask_var
5055 : vect_simple_var
5056 : vect_scalar_var;
5057 type = vectype ? vectype : TREE_TYPE (scalar_dest);
5059 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
5061 name = get_name (scalar_dest);
5062 if (name)
5063 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
5064 else
5065 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
5066 vec_dest = vect_get_new_vect_var (type, kind, new_name);
5067 free (new_name);
5069 return vec_dest;
5072 /* Function vect_grouped_store_supported.
5074 Returns TRUE if interleave high and interleave low permutations
5075 are supported, and FALSE otherwise. */
5077 bool
5078 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
5080 machine_mode mode = TYPE_MODE (vectype);
5082 /* vect_permute_store_chain requires the group size to be equal to 3 or
5083 be a power of two. */
5084 if (count != 3 && exact_log2 (count) == -1)
5086 if (dump_enabled_p ())
5087 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5088 "the size of the group of accesses"
5089 " is not a power of 2 or not equal to 3\n");
5090 return false;
5093 /* Check that the permutation is supported. */
5094 if (VECTOR_MODE_P (mode))
5096 unsigned int i;
5097 if (count == 3)
5099 unsigned int j0 = 0, j1 = 0, j2 = 0;
5100 unsigned int i, j;
5102 unsigned int nelt;
5103 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5105 if (dump_enabled_p ())
5106 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5107 "cannot handle groups of 3 stores for"
5108 " variable-length vectors\n");
5109 return false;
5112 vec_perm_builder sel (nelt, nelt, 1);
5113 sel.quick_grow (nelt);
5114 vec_perm_indices indices;
5115 for (j = 0; j < 3; j++)
5117 int nelt0 = ((3 - j) * nelt) % 3;
5118 int nelt1 = ((3 - j) * nelt + 1) % 3;
5119 int nelt2 = ((3 - j) * nelt + 2) % 3;
5120 for (i = 0; i < nelt; i++)
5122 if (3 * i + nelt0 < nelt)
5123 sel[3 * i + nelt0] = j0++;
5124 if (3 * i + nelt1 < nelt)
5125 sel[3 * i + nelt1] = nelt + j1++;
5126 if (3 * i + nelt2 < nelt)
5127 sel[3 * i + nelt2] = 0;
5129 indices.new_vector (sel, 2, nelt);
5130 if (!can_vec_perm_const_p (mode, indices))
5132 if (dump_enabled_p ())
5133 dump_printf (MSG_MISSED_OPTIMIZATION,
5134 "permutation op not supported by target.\n");
5135 return false;
5138 for (i = 0; i < nelt; i++)
5140 if (3 * i + nelt0 < nelt)
5141 sel[3 * i + nelt0] = 3 * i + nelt0;
5142 if (3 * i + nelt1 < nelt)
5143 sel[3 * i + nelt1] = 3 * i + nelt1;
5144 if (3 * i + nelt2 < nelt)
5145 sel[3 * i + nelt2] = nelt + j2++;
5147 indices.new_vector (sel, 2, nelt);
5148 if (!can_vec_perm_const_p (mode, indices))
5150 if (dump_enabled_p ())
5151 dump_printf (MSG_MISSED_OPTIMIZATION,
5152 "permutation op not supported by target.\n");
5153 return false;
5156 return true;
5158 else
5160 /* If length is not equal to 3 then only power of 2 is supported. */
5161 gcc_assert (pow2p_hwi (count));
5162 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5164 /* The encoding has 2 interleaved stepped patterns. */
5165 vec_perm_builder sel (nelt, 2, 3);
5166 sel.quick_grow (6);
5167 for (i = 0; i < 3; i++)
5169 sel[i * 2] = i;
5170 sel[i * 2 + 1] = i + nelt;
5172 vec_perm_indices indices (sel, 2, nelt);
5173 if (can_vec_perm_const_p (mode, indices))
5175 for (i = 0; i < 6; i++)
5176 sel[i] += exact_div (nelt, 2);
5177 indices.new_vector (sel, 2, nelt);
5178 if (can_vec_perm_const_p (mode, indices))
5179 return true;
5184 if (dump_enabled_p ())
5185 dump_printf (MSG_MISSED_OPTIMIZATION,
5186 "permutation op not supported by target.\n");
5187 return false;
5191 /* Return TRUE if vec_{mask_}store_lanes is available for COUNT vectors of
5192 type VECTYPE. MASKED_P says whether the masked form is needed. */
5194 bool
5195 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5196 bool masked_p)
5198 if (masked_p)
5199 return vect_lanes_optab_supported_p ("vec_mask_store_lanes",
5200 vec_mask_store_lanes_optab,
5201 vectype, count);
5202 else
5203 return vect_lanes_optab_supported_p ("vec_store_lanes",
5204 vec_store_lanes_optab,
5205 vectype, count);
5209 /* Function vect_permute_store_chain.
5211 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
5212 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
5213 the data correctly for the stores. Return the final references for stores
5214 in RESULT_CHAIN.
5216 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5217 The input is 4 vectors each containing 8 elements. We assign a number to
5218 each element, the input sequence is:
5220 1st vec: 0 1 2 3 4 5 6 7
5221 2nd vec: 8 9 10 11 12 13 14 15
5222 3rd vec: 16 17 18 19 20 21 22 23
5223 4th vec: 24 25 26 27 28 29 30 31
5225 The output sequence should be:
5227 1st vec: 0 8 16 24 1 9 17 25
5228 2nd vec: 2 10 18 26 3 11 19 27
5229 3rd vec: 4 12 20 28 5 13 21 29
5230 4th vec: 6 14 22 30 7 15 23 31
5232 i.e., we interleave the contents of the four vectors in their order.
5234 We use interleave_high/low instructions to create such output. The input of
5235 each interleave_high/low operation is two vectors:
5236 1st vec 2nd vec
5237 0 1 2 3 4 5 6 7
5238 the even elements of the result vector are obtained left-to-right from the
5239 high/low elements of the first vector. The odd elements of the result are
5240 obtained left-to-right from the high/low elements of the second vector.
5241 The output of interleave_high will be: 0 4 1 5
5242 and of interleave_low: 2 6 3 7
5245 The permutation is done in log LENGTH stages. In each stage interleave_high
5246 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5247 where the first argument is taken from the first half of DR_CHAIN and the
5248 second argument from its second half.
5249 In our example,
5251 I1: interleave_high (1st vec, 3rd vec)
5252 I2: interleave_low (1st vec, 3rd vec)
5253 I3: interleave_high (2nd vec, 4th vec)
5254 I4: interleave_low (2nd vec, 4th vec)
5256 The output for the first stage is:
5258 I1: 0 16 1 17 2 18 3 19
5259 I2: 4 20 5 21 6 22 7 23
5260 I3: 8 24 9 25 10 26 11 27
5261 I4: 12 28 13 29 14 30 15 31
5263 The output of the second stage, i.e. the final result is:
5265 I1: 0 8 16 24 1 9 17 25
5266 I2: 2 10 18 26 3 11 19 27
5267 I3: 4 12 20 28 5 13 21 29
5268 I4: 6 14 22 30 7 15 23 31. */
5270 void
5271 vect_permute_store_chain (vec<tree> dr_chain,
5272 unsigned int length,
5273 gimple *stmt,
5274 gimple_stmt_iterator *gsi,
5275 vec<tree> *result_chain)
5277 tree vect1, vect2, high, low;
5278 gimple *perm_stmt;
5279 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5280 tree perm_mask_low, perm_mask_high;
5281 tree data_ref;
5282 tree perm3_mask_low, perm3_mask_high;
5283 unsigned int i, j, n, log_length = exact_log2 (length);
5285 result_chain->quick_grow (length);
5286 memcpy (result_chain->address (), dr_chain.address (),
5287 length * sizeof (tree));
5289 if (length == 3)
5291 /* vect_grouped_store_supported ensures that this is constant. */
5292 unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5293 unsigned int j0 = 0, j1 = 0, j2 = 0;
5295 vec_perm_builder sel (nelt, nelt, 1);
5296 sel.quick_grow (nelt);
5297 vec_perm_indices indices;
5298 for (j = 0; j < 3; j++)
5300 int nelt0 = ((3 - j) * nelt) % 3;
5301 int nelt1 = ((3 - j) * nelt + 1) % 3;
5302 int nelt2 = ((3 - j) * nelt + 2) % 3;
5304 for (i = 0; i < nelt; i++)
5306 if (3 * i + nelt0 < nelt)
5307 sel[3 * i + nelt0] = j0++;
5308 if (3 * i + nelt1 < nelt)
5309 sel[3 * i + nelt1] = nelt + j1++;
5310 if (3 * i + nelt2 < nelt)
5311 sel[3 * i + nelt2] = 0;
5313 indices.new_vector (sel, 2, nelt);
5314 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5316 for (i = 0; i < nelt; i++)
5318 if (3 * i + nelt0 < nelt)
5319 sel[3 * i + nelt0] = 3 * i + nelt0;
5320 if (3 * i + nelt1 < nelt)
5321 sel[3 * i + nelt1] = 3 * i + nelt1;
5322 if (3 * i + nelt2 < nelt)
5323 sel[3 * i + nelt2] = nelt + j2++;
5325 indices.new_vector (sel, 2, nelt);
5326 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5328 vect1 = dr_chain[0];
5329 vect2 = dr_chain[1];
5331 /* Create interleaving stmt:
5332 low = VEC_PERM_EXPR <vect1, vect2,
5333 {j, nelt, *, j + 1, nelt + j + 1, *,
5334 j + 2, nelt + j + 2, *, ...}> */
5335 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5336 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5337 vect2, perm3_mask_low);
5338 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5340 vect1 = data_ref;
5341 vect2 = dr_chain[2];
5342 /* Create interleaving stmt:
5343 low = VEC_PERM_EXPR <vect1, vect2,
5344 {0, 1, nelt + j, 3, 4, nelt + j + 1,
5345 6, 7, nelt + j + 2, ...}> */
5346 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5347 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5348 vect2, perm3_mask_high);
5349 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5350 (*result_chain)[j] = data_ref;
5353 else
5355 /* If length is not equal to 3 then only power of 2 is supported. */
5356 gcc_assert (pow2p_hwi (length));
5358 /* The encoding has 2 interleaved stepped patterns. */
5359 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5360 vec_perm_builder sel (nelt, 2, 3);
5361 sel.quick_grow (6);
5362 for (i = 0; i < 3; i++)
5364 sel[i * 2] = i;
5365 sel[i * 2 + 1] = i + nelt;
5367 vec_perm_indices indices (sel, 2, nelt);
5368 perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5370 for (i = 0; i < 6; i++)
5371 sel[i] += exact_div (nelt, 2);
5372 indices.new_vector (sel, 2, nelt);
5373 perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
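      /* Illustration (assuming an 8-element vector): the masks built above
         expand to
           perm_mask_high: {0, 8, 1, 9, 2, 10, 3, 11}
           perm_mask_low:  {4, 12, 5, 13, 6, 14, 7, 15}
         i.e. the first interleaves the low halves of the two input vectors
         and the second interleaves their high halves.  */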
5375 for (i = 0, n = log_length; i < n; i++)
5377 for (j = 0; j < length/2; j++)
5379 vect1 = dr_chain[j];
5380 vect2 = dr_chain[j+length/2];
5382 /* Create interleaving stmt:
5383 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
5384 ...}> */
5385 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
5386 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
5387 vect2, perm_mask_high);
5388 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5389 (*result_chain)[2*j] = high;
5391 /* Create interleaving stmt:
5392 low = VEC_PERM_EXPR <vect1, vect2,
5393 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
5394 ...}> */
5395 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
5396 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
5397 vect2, perm_mask_low);
5398 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5399 (*result_chain)[2*j+1] = low;
5401 memcpy (dr_chain.address (), result_chain->address (),
5402 length * sizeof (tree));
5407 /* Function vect_setup_realignment
5409 This function is called when vectorizing an unaligned load using
5410 the dr_explicit_realign[_optimized] scheme.
5411 This function generates the following code at the loop prolog:
5413 p = initial_addr;
5414 x msq_init = *(floor(p)); # prolog load
5415 realignment_token = call target_builtin;
5416 loop:
5417 x msq = phi (msq_init, ---)
5419 The stmts marked with x are generated only for the case of
5420 dr_explicit_realign_optimized.
5422 The code above sets up a new (vector) pointer, pointing to the first
5423 location accessed by STMT, and a "floor-aligned" load using that pointer.
5424 It also generates code to compute the "realignment-token" (if the relevant
5425 target hook was defined), and creates a phi-node at the loop-header bb
5426 whose arguments are the result of the prolog-load (created by this
5427 function) and the result of a load that takes place in the loop (to be
5428 created by the caller to this function).
5430 For the case of dr_explicit_realign_optimized:
5431 The caller to this function uses the phi-result (msq) to create the
5432 realignment code inside the loop, and sets up the missing phi argument,
5433 as follows:
5434 loop:
5435 msq = phi (msq_init, lsq)
5436 lsq = *(floor(p')); # load in loop
5437 result = realign_load (msq, lsq, realignment_token);
5439 For the case of dr_explicit_realign:
5440 loop:
5441 msq = *(floor(p)); # load in loop
5442 p' = p + (VS-1);
5443 lsq = *(floor(p')); # load in loop
5444 result = realign_load (msq, lsq, realignment_token);
5446 Input:
5447 STMT - (scalar) load stmt to be vectorized. This load accesses
5448 a memory location that may be unaligned.
5449 GSI - place where new code is to be inserted.
5450 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5451 is used.
5453 Output:
5454 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5455 target hook, if defined.
5456 Return value - the result of the loop-header phi node. */
5458 tree
5459 vect_setup_realignment (gimple *stmt, gimple_stmt_iterator *gsi,
5460 tree *realignment_token,
5461 enum dr_alignment_support alignment_support_scheme,
5462 tree init_addr,
5463 struct loop **at_loop)
5465 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5466 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5467 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5468 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
5469 struct loop *loop = NULL;
5470 edge pe = NULL;
5471 tree scalar_dest = gimple_assign_lhs (stmt);
5472 tree vec_dest;
5473 gimple *inc;
5474 tree ptr;
5475 tree data_ref;
5476 basic_block new_bb;
5477 tree msq_init = NULL_TREE;
5478 tree new_temp;
5479 gphi *phi_stmt;
5480 tree msq = NULL_TREE;
5481 gimple_seq stmts = NULL;
5482 bool inv_p;
5483 bool compute_in_loop = false;
5484 bool nested_in_vect_loop = false;
5485 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
5486 struct loop *loop_for_initial_load = NULL;
5488 if (loop_vinfo)
5490 loop = LOOP_VINFO_LOOP (loop_vinfo);
5491 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
5494 gcc_assert (alignment_support_scheme == dr_explicit_realign
5495 || alignment_support_scheme == dr_explicit_realign_optimized);
5497 /* We need to generate three things:
5498 1. the misalignment computation
5499 2. the extra vector load (for the optimized realignment scheme).
5500 3. the phi node for the two vectors from which the realignment is
5501 done (for the optimized realignment scheme). */
5503 /* 1. Determine where to generate the misalignment computation.
5505 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5506 calculation will be generated by this function, outside the loop (in the
5507 preheader). Otherwise, INIT_ADDR has already been computed for us by the
5508 caller, inside the loop.
5510 Background: If the misalignment remains fixed throughout the iterations of
5511 the loop, then both realignment schemes are applicable, and also the
5512 misalignment computation can be done outside LOOP. This is because we are
5513 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5514 are a multiple of VS (the Vector Size), and therefore the misalignment in
5515 different vectorized LOOP iterations is always the same.
5516 The problem arises only if the memory access is in an inner-loop nested
5517 inside LOOP, which is now being vectorized using outer-loop vectorization.
5518 This is the only case when the misalignment of the memory access may not
5519 remain fixed throughout the iterations of the inner-loop (as explained in
5520 detail in vect_supportable_dr_alignment). In this case, not only is the
5521 optimized realignment scheme not applicable, but also the misalignment
5522 computation (and generation of the realignment token that is passed to
5523 REALIGN_LOAD) have to be done inside the loop.
5525 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5526 or not, which in turn determines if the misalignment is computed inside
5527 the inner-loop, or outside LOOP. */
5529 if (init_addr != NULL_TREE || !loop_vinfo)
5531 compute_in_loop = true;
5532 gcc_assert (alignment_support_scheme == dr_explicit_realign);
5536 /* 2. Determine where to generate the extra vector load.
5538 For the optimized realignment scheme, instead of generating two vector
5539 loads in each iteration, we generate a single extra vector load in the
5540 preheader of the loop, and in each iteration reuse the result of the
5541 vector load from the previous iteration. In case the memory access is in
5542 an inner-loop nested inside LOOP, which is now being vectorized using
5543 outer-loop vectorization, we need to determine whether this initial vector
5544 load should be generated at the preheader of the inner-loop, or can be
5545 generated at the preheader of LOOP. If the memory access has no evolution
5546 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5547 to be generated inside LOOP (in the preheader of the inner-loop). */
5549 if (nested_in_vect_loop)
5551 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5552 bool invariant_in_outerloop =
5553 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5554 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5556 else
5557 loop_for_initial_load = loop;
5558 if (at_loop)
5559 *at_loop = loop_for_initial_load;
5561 if (loop_for_initial_load)
5562 pe = loop_preheader_edge (loop_for_initial_load);
5564 /* 3. For the case of the optimized realignment, create the first vector
5565 load at the loop preheader. */
5567 if (alignment_support_scheme == dr_explicit_realign_optimized)
5569 /* Create msq_init = *(floor(p1)) in the loop preheader */
5570 gassign *new_stmt;
5572 gcc_assert (!compute_in_loop);
5573 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5574 ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
5575 NULL_TREE, &init_addr, NULL, &inc,
5576 true, &inv_p);
5577 if (TREE_CODE (ptr) == SSA_NAME)
5578 new_temp = copy_ssa_name (ptr);
5579 else
5580 new_temp = make_ssa_name (TREE_TYPE (ptr));
5581 unsigned int align = DR_TARGET_ALIGNMENT (dr);
5582 new_stmt = gimple_build_assign
5583 (new_temp, BIT_AND_EXPR, ptr,
5584 build_int_cst (TREE_TYPE (ptr), -(HOST_WIDE_INT) align));
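      /* The BIT_AND_EXPR built above implements the floor(p) of the function
         comment: e.g. (illustrative) with a 16-byte DR_TARGET_ALIGNMENT the
         pointer is masked with -16, clearing the four low address bits.  */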
5585 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5586 gcc_assert (!new_bb);
5587 data_ref
5588 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5589 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5590 vect_copy_ref_info (data_ref, DR_REF (dr));
5591 new_stmt = gimple_build_assign (vec_dest, data_ref);
5592 new_temp = make_ssa_name (vec_dest, new_stmt);
5593 gimple_assign_set_lhs (new_stmt, new_temp);
5594 if (pe)
5596 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5597 gcc_assert (!new_bb);
5599 else
5600 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5602 msq_init = gimple_assign_lhs (new_stmt);
5605 /* 4. Create realignment token using a target builtin, if available.
5606 It is done either inside the containing loop, or before LOOP (as
5607 determined above). */
5609 if (targetm.vectorize.builtin_mask_for_load)
5611 gcall *new_stmt;
5612 tree builtin_decl;
5614 /* Compute INIT_ADDR - the initial address accessed by this memref. */
5615 if (!init_addr)
5617 /* Generate the INIT_ADDR computation outside LOOP. */
5618 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
5619 NULL_TREE);
5620 if (loop)
5622 pe = loop_preheader_edge (loop);
5623 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5624 gcc_assert (!new_bb);
5626 else
5627 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5630 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5631 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5632 vec_dest =
5633 vect_create_destination_var (scalar_dest,
5634 gimple_call_return_type (new_stmt));
5635 new_temp = make_ssa_name (vec_dest, new_stmt);
5636 gimple_call_set_lhs (new_stmt, new_temp);
5638 if (compute_in_loop)
5639 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5640 else
5642 /* Generate the misalignment computation outside LOOP. */
5643 pe = loop_preheader_edge (loop);
5644 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5645 gcc_assert (!new_bb);
5648 *realignment_token = gimple_call_lhs (new_stmt);
5650 /* The result of the CALL_EXPR to this builtin is determined from
5651 the value of the parameter and no global variables are touched,
5652 which makes the builtin a "const" function. Requiring the
5653 builtin to have the "const" attribute makes it unnecessary
5654 to call mark_call_clobbered. */
5655 gcc_assert (TREE_READONLY (builtin_decl));
5658 if (alignment_support_scheme == dr_explicit_realign)
5659 return msq;
5661 gcc_assert (!compute_in_loop);
5662 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5665 /* 5. Create msq = phi <msq_init, lsq> in loop */
5667 pe = loop_preheader_edge (containing_loop);
5668 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5669 msq = make_ssa_name (vec_dest);
5670 phi_stmt = create_phi_node (msq, containing_loop->header);
5671 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5673 return msq;
5677 /* Function vect_grouped_load_supported.
5679 COUNT is the size of the load group (the number of statements plus the
5680 number of gaps). SINGLE_ELEMENT_P is true if there is actually
5681 only one statement, with a gap of COUNT - 1.
5683 Returns true if a suitable permute exists. */
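/* An illustrative example (not from the original comment): a loop that reads
   only a[4 * i] out of each group of four elements has COUNT == 4 and
   SINGLE_ELEMENT_P true (one statement, gap of 3), while a loop reading both
   a[2 * i] and a[2 * i + 1] has COUNT == 2 and SINGLE_ELEMENT_P false.  */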
5685 bool
5686 vect_grouped_load_supported (tree vectype, bool single_element_p,
5687 unsigned HOST_WIDE_INT count)
5689 machine_mode mode = TYPE_MODE (vectype);
5691 /* If this is single-element interleaving with an element distance
5692 that leaves unused vector loads around, punt - at best we create
5693 very sub-optimal code in that case (and blow up memory,
5694 see PR65518). */
5695 if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype)))
5697 if (dump_enabled_p ())
5698 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5699 "single-element interleaving not supported "
5700 "for not adjacent vector loads\n");
5701 return false;
5704 /* vect_permute_load_chain requires the group size to be equal to 3 or
5705 be a power of two. */
5706 if (count != 3 && exact_log2 (count) == -1)
5708 if (dump_enabled_p ())
5709 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5710 "the size of the group of accesses"
5711 " is not a power of 2 or not equal to 3\n");
5712 return false;
5715 /* Check that the permutation is supported. */
5716 if (VECTOR_MODE_P (mode))
5718 unsigned int i, j;
5719 if (count == 3)
5721 unsigned int nelt;
5722 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5724 if (dump_enabled_p ())
5725 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5726 "cannot handle groups of 3 loads for"
5727 " variable-length vectors\n");
5728 return false;
5731 vec_perm_builder sel (nelt, nelt, 1);
5732 sel.quick_grow (nelt);
5733 vec_perm_indices indices;
5734 unsigned int k;
5735 for (k = 0; k < 3; k++)
5737 for (i = 0; i < nelt; i++)
5738 if (3 * i + k < 2 * nelt)
5739 sel[i] = 3 * i + k;
5740 else
5741 sel[i] = 0;
5742 indices.new_vector (sel, 2, nelt);
5743 if (!can_vec_perm_const_p (mode, indices))
5745 if (dump_enabled_p ())
5746 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5747 "shuffle of 3 loads is not supported by"
5748 " target\n");
5749 return false;
5751 for (i = 0, j = 0; i < nelt; i++)
5752 if (3 * i + k < 2 * nelt)
5753 sel[i] = i;
5754 else
5755 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5756 indices.new_vector (sel, 2, nelt);
5757 if (!can_vec_perm_const_p (mode, indices))
5759 if (dump_enabled_p ())
5760 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5761 "shuffle of 3 loads is not supported by"
5762 " target\n");
5763 return false;
5766 return true;
5768 else
5770 /* If length is not equal to 3 then only power of 2 is supported. */
5771 gcc_assert (pow2p_hwi (count));
5772 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5774 /* The encoding has a single stepped pattern. */
5775 vec_perm_builder sel (nelt, 1, 3);
5776 sel.quick_grow (3);
5777 for (i = 0; i < 3; i++)
5778 sel[i] = i * 2;
5779 vec_perm_indices indices (sel, 2, nelt);
5780 if (can_vec_perm_const_p (mode, indices))
5782 for (i = 0; i < 3; i++)
5783 sel[i] = i * 2 + 1;
5784 indices.new_vector (sel, 2, nelt);
5785 if (can_vec_perm_const_p (mode, indices))
5786 return true;
5791 if (dump_enabled_p ())
5792 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5793 "extract even/odd not supported by target\n");
5794 return false;
5797 /* Return TRUE if vec_{masked_}load_lanes is available for COUNT vectors of
5798 type VECTYPE. MASKED_P says whether the masked form is needed. */
5800 bool
5801 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5802 bool masked_p)
5804 if (masked_p)
5805 return vect_lanes_optab_supported_p ("vec_mask_load_lanes",
5806 vec_mask_load_lanes_optab,
5807 vectype, count);
5808 else
5809 return vect_lanes_optab_supported_p ("vec_load_lanes",
5810 vec_load_lanes_optab,
5811 vectype, count);
5814 /* Function vect_permute_load_chain.
5816 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5817 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5818 the input data correctly. Return the final references for loads in
5819 RESULT_CHAIN.
5821 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5822 The input is 4 vectors each containing 8 elements. We assign a number to each
5823 element, the input sequence is:
5825 1st vec: 0 1 2 3 4 5 6 7
5826 2nd vec: 8 9 10 11 12 13 14 15
5827 3rd vec: 16 17 18 19 20 21 22 23
5828 4th vec: 24 25 26 27 28 29 30 31
5830 The output sequence should be:
5832 1st vec: 0 4 8 12 16 20 24 28
5833 2nd vec: 1 5 9 13 17 21 25 29
5834 3rd vec: 2 6 10 14 18 22 26 30
5835 4th vec: 3 7 11 15 19 23 27 31
5837 i.e., the first output vector should contain the first elements of each
5838 interleaving group, etc.
5840 We use extract_even/odd instructions to create such output. The input of
5841 each extract_even/odd operation is two vectors
5842 1st vec 2nd vec
5843 0 1 2 3 4 5 6 7
5845 and the output is the vector of extracted even/odd elements. The output of
5846 extract_even will be: 0 2 4 6
5847 and of extract_odd: 1 3 5 7
5850 The permutation is done in log LENGTH stages. In each stage extract_even
5851 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5852 their order. In our example,
5854 E1: extract_even (1st vec, 2nd vec)
5855 E2: extract_odd (1st vec, 2nd vec)
5856 E3: extract_even (3rd vec, 4th vec)
5857 E4: extract_odd (3rd vec, 4th vec)
5859 The output for the first stage will be:
5861 E1: 0 2 4 6 8 10 12 14
5862 E2: 1 3 5 7 9 11 13 15
5863 E3: 16 18 20 22 24 26 28 30
5864 E4: 17 19 21 23 25 27 29 31
5866 In order to proceed and create the correct sequence for the next stage (or
5867 for the correct output, if the second stage is the last one, as in our
5868 example), we first put the output of the extract_even operation and then the
5869 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5870 The input for the second stage is:
5872 1st vec (E1): 0 2 4 6 8 10 12 14
5873 2nd vec (E3): 16 18 20 22 24 26 28 30
5874 3rd vec (E2): 1 3 5 7 9 11 13 15
5875 4th vec (E4): 17 19 21 23 25 27 29 31
5877 The output of the second stage:
5879 E1: 0 4 8 12 16 20 24 28
5880 E2: 2 6 10 14 18 22 26 30
5881 E3: 1 5 9 13 17 21 25 29
5882 E4: 3 7 11 15 19 23 27 31
5884 And RESULT_CHAIN after reordering:
5886 1st vec (E1): 0 4 8 12 16 20 24 28
5887 2nd vec (E3): 1 5 9 13 17 21 25 29
5888 3rd vec (E2): 2 6 10 14 18 22 26 30
5889 4th vec (E4): 3 7 11 15 19 23 27 31. */
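/* For illustration only: a scalar loop of roughly the following shape gives
   rise to a load group of LENGTH 4 that this routine has to de-interleave:

     for (i = 0; i < n; i++)
       sum += a[4 * i] + a[4 * i + 1] + a[4 * i + 2] + a[4 * i + 3];

   DR_CHAIN holds the LENGTH vectors loaded from consecutive memory and
   RESULT_CHAIN receives the de-interleaved vectors described above.  */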
5891 static void
5892 vect_permute_load_chain (vec<tree> dr_chain,
5893 unsigned int length,
5894 gimple *stmt,
5895 gimple_stmt_iterator *gsi,
5896 vec<tree> *result_chain)
5898 tree data_ref, first_vect, second_vect;
5899 tree perm_mask_even, perm_mask_odd;
5900 tree perm3_mask_low, perm3_mask_high;
5901 gimple *perm_stmt;
5902 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5903 unsigned int i, j, log_length = exact_log2 (length);
5905 result_chain->quick_grow (length);
5906 memcpy (result_chain->address (), dr_chain.address (),
5907 length * sizeof (tree));
5909 if (length == 3)
5911 /* vect_grouped_load_supported ensures that this is constant. */
5912 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5913 unsigned int k;
5915 vec_perm_builder sel (nelt, nelt, 1);
5916 sel.quick_grow (nelt);
5917 vec_perm_indices indices;
5918 for (k = 0; k < 3; k++)
5920 for (i = 0; i < nelt; i++)
5921 if (3 * i + k < 2 * nelt)
5922 sel[i] = 3 * i + k;
5923 else
5924 sel[i] = 0;
5925 indices.new_vector (sel, 2, nelt);
5926 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5928 for (i = 0, j = 0; i < nelt; i++)
5929 if (3 * i + k < 2 * nelt)
5930 sel[i] = i;
5931 else
5932 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5933 indices.new_vector (sel, 2, nelt);
5934 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
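      /* Illustration (assuming nelt == 8): for k == 0 the masks built above
         expand to
           perm3_mask_low:  {0, 3, 6, 9, 12, 15, 0, 0}
           perm3_mask_high: {0, 1, 2, 3, 4, 5, 10, 13}
         so the two VEC_PERM_EXPRs below produce {0, 3, 6, 9, 12, 15, 18, 21},
         numbering the elements of dr_chain[0..2] consecutively from 0.  */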
5936 first_vect = dr_chain[0];
5937 second_vect = dr_chain[1];
5939 /* Create interleaving stmt (low part of):
5940 low = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5941 ...}> */
5942 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5943 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5944 second_vect, perm3_mask_low);
5945 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5947 /* Create interleaving stmt (high part of):
5948 high = VEC_PERM_EXPR <first_vect, second_vect,
5949 {0, 1, ..., nelt + (nelt + k) % 3, nelt + (nelt + k) % 3 + 3, ...}> */
5950 first_vect = data_ref;
5951 second_vect = dr_chain[2];
5952 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5953 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5954 second_vect, perm3_mask_high);
5955 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5956 (*result_chain)[k] = data_ref;
5959 else
5961 /* If length is not equal to 3 then only power of 2 is supported. */
5962 gcc_assert (pow2p_hwi (length));
5964 /* The encoding has a single stepped pattern. */
5965 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5966 vec_perm_builder sel (nelt, 1, 3);
5967 sel.quick_grow (3);
5968 for (i = 0; i < 3; ++i)
5969 sel[i] = i * 2;
5970 vec_perm_indices indices (sel, 2, nelt);
5971 perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);
5973 for (i = 0; i < 3; ++i)
5974 sel[i] = i * 2 + 1;
5975 indices.new_vector (sel, 2, nelt);
5976 perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
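      /* Illustration (assuming an 8-element vector): the masks expand to
           perm_mask_even: {0, 2, 4, 6, 8, 10, 12, 14}
           perm_mask_odd:  {1, 3, 5, 7, 9, 11, 13, 15}
         i.e. the extract-even / extract-odd selectors used in the example
         in the function comment.  */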
5978 for (i = 0; i < log_length; i++)
5980 for (j = 0; j < length; j += 2)
5982 first_vect = dr_chain[j];
5983 second_vect = dr_chain[j+1];
5985 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5986 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5987 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5988 first_vect, second_vect,
5989 perm_mask_even);
5990 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5991 (*result_chain)[j/2] = data_ref;
5993 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5994 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5995 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5996 first_vect, second_vect,
5997 perm_mask_odd);
5998 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5999 (*result_chain)[j/2+length/2] = data_ref;
6001 memcpy (dr_chain.address (), result_chain->address (),
6002 length * sizeof (tree));
6007 /* Function vect_shift_permute_load_chain.
6009 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
6010 a sequence of stmts to reorder the input data accordingly.
6011 Return the final references for loads in RESULT_CHAIN.
6012 Return true if successful, false otherwise.
6014 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
6015 The input is 3 vectors each containing 8 elements. We assign a
6016 number to each element, the input sequence is:
6018 1st vec: 0 1 2 3 4 5 6 7
6019 2nd vec: 8 9 10 11 12 13 14 15
6020 3rd vec: 16 17 18 19 20 21 22 23
6022 The output sequence should be:
6024 1st vec: 0 3 6 9 12 15 18 21
6025 2nd vec: 1 4 7 10 13 16 19 22
6026 3rd vec: 2 5 8 11 14 17 20 23
6028 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
6030 First we shuffle all 3 vectors to get the correct element order:
6032 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
6033 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
6034 3rd vec: (16 19 22) (17 20 23) (18 21)
6036 Next we unite and shift the vectors 3 times:
6038 1st step:
6039 shift right by 6 the concatenation of:
6040 "1st vec" and "2nd vec"
6041 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
6042 "2nd vec" and "3rd vec"
6043 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
6044 "3rd vec" and "1st vec"
6045 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
6046 | New vectors |
6048 So that now new vectors are:
6050 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
6051 2nd vec: (10 13) (16 19 22) (17 20 23)
6052 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
6054 2nd step:
6055 shift right by 5 the concatenation of:
6056 "1st vec" and "3rd vec"
6057 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
6058 "2nd vec" and "1st vec"
6059 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
6060 "3rd vec" and "2nd vec"
6061 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
6062 | New vectors |
6064 So that now new vectors are:
6066 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
6067 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
6068 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
6070 3rd step:
6071 shift right by 5 the concatenation of:
6072 "1st vec" and "1st vec"
6073 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
6074 shift right by 3 the concatenation of:
6075 "2nd vec" and "2nd vec"
6076 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
6077 | New vectors |
6079 So that now all vectors are READY:
6080 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
6081 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
6082 3rd vec: ( 1 4 7) (10 13) (16 19 22)
6084 This algorithm is faster than the one in vect_permute_load_chain if:
6085 1. "shift of a concatenation" is faster than general permutation.
6086 This is usually so.
6087 2. The TARGET machine can't execute vector instructions in parallel.
6088 This is because each step of the algorithm depends on the previous one.
6089 The algorithm in vect_permute_load_chain is much more parallel.
6091 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.  */
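/* A further illustration (LENGTH == 2 with 8-element vectors, a case not
   covered by the example above).  With inputs

     1st vec:  0  1  2  3  4  5  6  7
     2nd vec:  8  9 10 11 12 13 14 15

   the two per-vector shuffles below produce

     vect[0]:  0  2  4  6  1  3  5  7
     vect[1]:  9 11 13 15  8 10 12 14

   shifting their concatenation by nelt/2 yields all odd elements

     1  3  5  7  9 11 13 15

   and selecting the outer halves yields all even elements

     0  2  4  6  8 10 12 14

   which become the 2nd and 1st output vectors respectively.  */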
6094 static bool
6095 vect_shift_permute_load_chain (vec<tree> dr_chain,
6096 unsigned int length,
6097 gimple *stmt,
6098 gimple_stmt_iterator *gsi,
6099 vec<tree> *result_chain)
6101 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
6102 tree perm2_mask1, perm2_mask2, perm3_mask;
6103 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
6104 gimple *perm_stmt;
6106 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
6107 unsigned int i;
6108 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6109 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6111 unsigned HOST_WIDE_INT nelt, vf;
6112 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt)
6113 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
6114 /* Not supported for variable-length vectors. */
6115 return false;
6117 vec_perm_builder sel (nelt, nelt, 1);
6118 sel.quick_grow (nelt);
6120 result_chain->quick_grow (length);
6121 memcpy (result_chain->address (), dr_chain.address (),
6122 length * sizeof (tree));
6124 if (pow2p_hwi (length) && vf > 4)
6126 unsigned int j, log_length = exact_log2 (length);
6127 for (i = 0; i < nelt / 2; ++i)
6128 sel[i] = i * 2;
6129 for (i = 0; i < nelt / 2; ++i)
6130 sel[nelt / 2 + i] = i * 2 + 1;
6131 vec_perm_indices indices (sel, 2, nelt);
6132 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6134 if (dump_enabled_p ())
6135 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6136 "shuffle of 2 fields structure is not \
6137 supported by target\n");
6138 return false;
6140 perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);
6142 for (i = 0; i < nelt / 2; ++i)
6143 sel[i] = i * 2 + 1;
6144 for (i = 0; i < nelt / 2; ++i)
6145 sel[nelt / 2 + i] = i * 2;
6146 indices.new_vector (sel, 2, nelt);
6147 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6149 if (dump_enabled_p ())
6150 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6151 "shuffle of 2 fields structure is not \
6152 supported by target\n");
6153 return false;
6155 perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);
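      /* Illustration: for vector length 8 the two masks above are
         {0 2 4 6 1 3 5 7} and {1 3 5 7 0 2 4 6} respectively.  */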
6157 /* Generating permutation constant to shift all elements.
6158 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
6159 for (i = 0; i < nelt; i++)
6160 sel[i] = nelt / 2 + i;
6161 indices.new_vector (sel, 2, nelt);
6162 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6164 if (dump_enabled_p ())
6165 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6166 "shift permutation is not supported by target\n");
6167 return false;
6169 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6171 /* Generating permutation constant to select the low half of vect[0] and the high half of vect[1].
6172 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
6173 for (i = 0; i < nelt / 2; i++)
6174 sel[i] = i;
6175 for (i = nelt / 2; i < nelt; i++)
6176 sel[i] = nelt + i;
6177 indices.new_vector (sel, 2, nelt);
6178 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6180 if (dump_enabled_p ())
6181 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6182 "select is not supported by target\n");
6183 return false;
6185 select_mask = vect_gen_perm_mask_checked (vectype, indices);
6187 for (i = 0; i < log_length; i++)
6189 for (j = 0; j < length; j += 2)
6191 first_vect = dr_chain[j];
6192 second_vect = dr_chain[j + 1];
6194 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6195 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6196 first_vect, first_vect,
6197 perm2_mask1);
6198 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6199 vect[0] = data_ref;
6201 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6202 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6203 second_vect, second_vect,
6204 perm2_mask2);
6205 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6206 vect[1] = data_ref;
6208 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
6209 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6210 vect[0], vect[1], shift1_mask);
6211 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6212 (*result_chain)[j/2 + length/2] = data_ref;
6214 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
6215 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6216 vect[0], vect[1], select_mask);
6217 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6218 (*result_chain)[j/2] = data_ref;
6220 memcpy (dr_chain.address (), result_chain->address (),
6221 length * sizeof (tree));
6223 return true;
6225 if (length == 3 && vf > 2)
6227 unsigned int k = 0, l = 0;
6229 /* Generating permutation constant to get all elements in right order.
6230 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
6231 for (i = 0; i < nelt; i++)
6233 if (3 * k + (l % 3) >= nelt)
6235 k = 0;
6236 l += (3 - (nelt % 3));
6238 sel[i] = 3 * k + (l % 3);
6239 k++;
6241 vec_perm_indices indices (sel, 2, nelt);
6242 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6244 if (dump_enabled_p ())
6245 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6246 "shuffle of 3 fields structure is not \
6247 supported by target\n");
6248 return false;
6250 perm3_mask = vect_gen_perm_mask_checked (vectype, indices);
6252 /* Generating permutation constant to shift all elements.
6253 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
6254 for (i = 0; i < nelt; i++)
6255 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
6256 indices.new_vector (sel, 2, nelt);
6257 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6259 if (dump_enabled_p ())
6260 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6261 "shift permutation is not supported by target\n");
6262 return false;
6264 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6266 /* Generating permutation constant to shift all elements.
6267 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6268 for (i = 0; i < nelt; i++)
6269 sel[i] = 2 * (nelt / 3) + 1 + i;
6270 indices.new_vector (sel, 2, nelt);
6271 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6273 if (dump_enabled_p ())
6274 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6275 "shift permutation is not supported by target\n");
6276 return false;
6278 shift2_mask = vect_gen_perm_mask_checked (vectype, indices);
6280 /* Generating permutation constant to shift all elements.
6281 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
6282 for (i = 0; i < nelt; i++)
6283 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
6284 indices.new_vector (sel, 2, nelt);
6285 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6287 if (dump_enabled_p ())
6288 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6289 "shift permutation is not supported by target\n");
6290 return false;
6292 shift3_mask = vect_gen_perm_mask_checked (vectype, indices);
6294 /* Generating permutation constant to shift all elements.
6295 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6296 for (i = 0; i < nelt; i++)
6297 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
6298 indices.new_vector (sel, 2, nelt);
6299 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6301 if (dump_enabled_p ())
6302 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6303 "shift permutation is not supported by target\n");
6304 return false;
6306 shift4_mask = vect_gen_perm_mask_checked (vectype, indices);
6308 for (k = 0; k < 3; k++)
6310 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
6311 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6312 dr_chain[k], dr_chain[k],
6313 perm3_mask);
6314 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6315 vect[k] = data_ref;
6318 for (k = 0; k < 3; k++)
6320 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
6321 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6322 vect[k % 3], vect[(k + 1) % 3],
6323 shift1_mask);
6324 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6325 vect_shift[k] = data_ref;
6328 for (k = 0; k < 3; k++)
6330 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
6331 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6332 vect_shift[(4 - k) % 3],
6333 vect_shift[(3 - k) % 3],
6334 shift2_mask);
6335 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6336 vect[k] = data_ref;
6339 (*result_chain)[3 - (nelt % 3)] = vect[2];
6341 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
6342 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
6343 vect[0], shift3_mask);
6344 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6345 (*result_chain)[nelt % 3] = data_ref;
6347 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
6348 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
6349 vect[1], shift4_mask);
6350 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6351 (*result_chain)[0] = data_ref;
6352 return true;
6354 return false;
6357 /* Function vect_transform_grouped_load.
6359 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
6360 to perform their permutation and ascribe the resulting vectorized statements to
6361 the scalar statements.  */
6364 void
6365 vect_transform_grouped_load (gimple *stmt, vec<tree> dr_chain, int size,
6366 gimple_stmt_iterator *gsi)
6368 machine_mode mode;
6369 vec<tree> result_chain = vNULL;
6371 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
6372 RESULT_CHAIN is the output of vect_permute_load_chain; it contains permuted
6373 vectors that are ready for vector computation. */
6374 result_chain.create (size);
6376 /* If the reassociation width for the vector type is 2 or greater, the target
6377 machine can execute 2 or more vector instructions in parallel. Otherwise
6378 try to get the chain for the load group using vect_shift_permute_load_chain. */
6379 mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
6380 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
6381 || pow2p_hwi (size)
6382 || !vect_shift_permute_load_chain (dr_chain, size, stmt,
6383 gsi, &result_chain))
6384 vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
6385 vect_record_grouped_load_vectors (stmt, result_chain);
6386 result_chain.release ();
6389 /* RESULT_CHAIN contains the output of a group of grouped loads that were
6390 generated as part of the vectorization of STMT. Assign the statement
6391 for each vector to the associated scalar statement. */
6393 void
6394 vect_record_grouped_load_vectors (gimple *stmt, vec<tree> result_chain)
6396 gimple *first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
6397 gimple *next_stmt, *new_stmt;
6398 unsigned int i, gap_count;
6399 tree tmp_data_ref;
6401 /* Put a permuted data-ref in the VECTORIZED_STMT field.
6402 Since we scan the chain starting from its first node, their order
6403 corresponds to the order of data-refs in RESULT_CHAIN. */
6404 next_stmt = first_stmt;
6405 gap_count = 1;
6406 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
6408 if (!next_stmt)
6409 break;
6411 /* Skip the gaps. Loads created for the gaps will be removed by the dead
6412 code elimination pass later. No need to check for the first stmt in
6413 the group, since it always exists.
6414 GROUP_GAP is the number of steps in elements from the previous
6415 access (if there is no gap GROUP_GAP is 1). We skip loads that
6416 correspond to the gaps. */
6417 if (next_stmt != first_stmt
6418 && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
6420 gap_count++;
6421 continue;
6424 while (next_stmt)
6426 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
6427 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
6428 copies, and we put the new vector statement in the first available
6429 RELATED_STMT. */
6430 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
6431 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
6432 else
6434 if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
6436 gimple *prev_stmt =
6437 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
6438 gimple *rel_stmt =
6439 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
6440 while (rel_stmt)
6442 prev_stmt = rel_stmt;
6443 rel_stmt =
6444 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
6447 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
6448 new_stmt;
6452 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
6453 gap_count = 1;
6454 /* If NEXT_STMT accesses the same DR as the previous statement,
6455 put the same TMP_DATA_REF as its vectorized statement; otherwise
6456 get the next data-ref from RESULT_CHAIN. */
6457 if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
6458 break;
6463 /* Function vect_can_force_dr_alignment_p.
6465 Returns whether the alignment of a DECL can be forced to be aligned
6466 on ALIGNMENT bit boundary. */
6468 bool
6469 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
6471 if (!VAR_P (decl))
6472 return false;
6474 if (decl_in_symtab_p (decl)
6475 && !symtab_node::get (decl)->can_increase_alignment_p ())
6476 return false;
6478 if (TREE_STATIC (decl))
6479 return (alignment <= MAX_OFILE_ALIGNMENT);
6480 else
6481 return (alignment <= MAX_STACK_ALIGNMENT);
6485 /* Return whether the data reference DR is supported with respect to its
6486 alignment.
6487 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
6488 if it is aligned, i.e., check if it is possible to vectorize it with different
6489 alignment. */
6491 enum dr_alignment_support
6492 vect_supportable_dr_alignment (struct data_reference *dr,
6493 bool check_aligned_accesses)
6495 gimple *stmt = DR_STMT (dr);
6496 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6497 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6498 machine_mode mode = TYPE_MODE (vectype);
6499 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6500 struct loop *vect_loop = NULL;
6501 bool nested_in_vect_loop = false;
6503 if (aligned_access_p (dr) && !check_aligned_accesses)
6504 return dr_aligned;
6506 /* For now assume all conditional loads/stores support unaligned
6507 access without any special code. */
6508 if (is_gimple_call (stmt)
6509 && gimple_call_internal_p (stmt)
6510 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
6511 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
6512 return dr_unaligned_supported;
6514 if (loop_vinfo)
6516 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
6517 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
6520 /* Possibly unaligned access. */
6522 /* We can choose between using the implicit realignment scheme (generating
6523 a misaligned_move stmt) and the explicit realignment scheme (generating
6524 aligned loads with a REALIGN_LOAD). There are two variants to the
6525 explicit realignment scheme: optimized, and unoptimized.
6526 We can optimize the realignment only if the step between consecutive
6527 vector loads is equal to the vector size. Since the vector memory
6528 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
6529 is guaranteed that the misalignment amount remains the same throughout the
6530 execution of the vectorized loop. Therefore, we can create the
6531 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
6532 at the loop preheader.
6534 However, in the case of outer-loop vectorization, when vectorizing a
6535 memory access in the inner-loop nested within the LOOP that is now being
6536 vectorized, while it is guaranteed that the misalignment of the
6537 vectorized memory access will remain the same in different outer-loop
6538 iterations, it is *not* guaranteed that it will remain the same throughout
6539 the execution of the inner-loop. This is because the inner-loop advances
6540 with the original scalar step (and not in steps of VS). If the inner-loop
6541 step happens to be a multiple of VS, then the misalignment remains fixed
6542 and we can use the optimized realignment scheme. For example:
6544 for (i=0; i<N; i++)
6545 for (j=0; j<M; j++)
6546 s += a[i+j];
6548 When vectorizing the i-loop in the above example, the step between
6549 consecutive vector loads is 1, and so the misalignment does not remain
6550 fixed across the execution of the inner-loop, and the realignment cannot
6551 be optimized (as illustrated in the following pseudo vectorized loop):
6553 for (i=0; i<N; i+=4)
6554 for (j=0; j<M; j++){
6555 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
6556 // when j is {0,1,2,3,4,5,6,7,...} respectively.
6557 // (assuming that we start from an aligned address).
6560 We therefore have to use the unoptimized realignment scheme:
6562 for (i=0; i<N; i+=4)
6563 for (j=k; j<M; j+=4)
6564 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
6565 // that the misalignment of the initial address is
6566 // 0).
6568 The loop can then be vectorized as follows:
6570 for (k=0; k<4; k++){
6571 rt = get_realignment_token (&vp[k]);
6572 for (i=0; i<N; i+=4){
6573 v1 = vp[i+k];
6574 for (j=k; j<M; j+=4){
6575 v2 = vp[i+j+VS-1];
6576 va = REALIGN_LOAD <v1,v2,rt>;
6577 vs += va;
6578 v1 = v2;
6581 } */
6583 if (DR_IS_READ (dr))
6585 bool is_packed = false;
6586 tree type = (TREE_TYPE (DR_REF (dr)));
6588 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
6589 && (!targetm.vectorize.builtin_mask_for_load
6590 || targetm.vectorize.builtin_mask_for_load ()))
6592 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5594 /* If we are doing SLP then the accesses need not have the
5595 same alignment; instead it depends on the SLP group size. */
6596 if (loop_vinfo
6597 && STMT_SLP_TYPE (stmt_info)
6598 && !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6599 * GROUP_SIZE (vinfo_for_stmt
6600 (GROUP_FIRST_ELEMENT (stmt_info))),
6601 TYPE_VECTOR_SUBPARTS (vectype)))
6603 else if (!loop_vinfo
6604 || (nested_in_vect_loop
6605 && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
6606 GET_MODE_SIZE (TYPE_MODE (vectype)))))
6607 return dr_explicit_realign;
6608 else
6609 return dr_explicit_realign_optimized;
6611 if (!known_alignment_for_access_p (dr))
6612 is_packed = not_size_aligned (DR_REF (dr));
6614 if (targetm.vectorize.support_vector_misalignment
6615 (mode, type, DR_MISALIGNMENT (dr), is_packed))
6616 /* Can't software pipeline the loads, but can at least do them. */
6617 return dr_unaligned_supported;
6619 else
6621 bool is_packed = false;
6622 tree type = (TREE_TYPE (DR_REF (dr)));
6624 if (!known_alignment_for_access_p (dr))
6625 is_packed = not_size_aligned (DR_REF (dr));
6627 if (targetm.vectorize.support_vector_misalignment
6628 (mode, type, DR_MISALIGNMENT (dr), is_packed))
6629 return dr_unaligned_supported;
6632 /* Unsupported. */
6633 return dr_unaligned_unsupported;