[official-gcc.git] / gcc / tree-vect-data-refs.c
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2018 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "memmodel.h"
32 #include "tm_p.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "cgraph.h"
36 #include "dumpfile.h"
37 #include "alias.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "tree-eh.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop.h"
47 #include "cfgloop.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
50 #include "expr.h"
51 #include "builtins.h"
52 #include "params.h"
53 #include "tree-cfg.h"
54 #include "tree-hash-traits.h"
55 #include "vec-perm-indices.h"
56 #include "internal-fn.h"
58 /* Return true if load- or store-lanes optab OPTAB is implemented for
59 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
61 static bool
62 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
63 tree vectype, unsigned HOST_WIDE_INT count)
65 machine_mode mode, array_mode;
66 bool limit_p;
68 mode = TYPE_MODE (vectype);
69 if (!targetm.array_mode (mode, count).exists (&array_mode))
71 poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
72 limit_p = !targetm.array_mode_supported_p (mode, count);
73 if (!int_mode_for_size (bits, limit_p).exists (&array_mode))
75 if (dump_enabled_p ())
76 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
77 "no array mode for %s["
78 HOST_WIDE_INT_PRINT_DEC "]\n",
79 GET_MODE_NAME (mode), count);
80 return false;
84 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
86 if (dump_enabled_p ())
87 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
88 "cannot use %s<%s><%s>\n", name,
89 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
90 return false;
93 if (dump_enabled_p ())
94 dump_printf_loc (MSG_NOTE, vect_location,
95 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
96 GET_MODE_NAME (mode));
98 return true;
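/* Illustrative sketch (hypothetical code, not taken from this file): a loop
   with a two-way interleaved access that the vectorizer can implement with a
   load-lanes instruction (e.g. LD2 on AArch64) when the check above reports
   that the load-lanes optab supports two vectors of the element type:

     void
     deinterleave (float *restrict re, float *restrict im,
                   const float *restrict c, int n)
     {
       for (int i = 0; i < n; i++)
         {
           re[i] = c[2 * i];
           im[i] = c[2 * i + 1];
         }
     }
*/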
102 /* Return the smallest scalar part of STMT.
103 This is used to determine the vectype of the stmt. We generally set the
104 vectype according to the type of the result (lhs). For stmts whose
105 result-type is different than the type of the arguments (e.g., demotion,
106 promotion), vectype will be reset appropriately (later). Note that we have
107 to visit the smallest datatype in this function, because that determines the
108 VF. If the smallest datatype in the loop is present only as the rhs of a
109 promotion operation - we'd miss it.
110 Such a case, where a variable of this datatype does not appear in the lhs
111 anywhere in the loop, can only occur if it's an invariant: e.g.:
112 'int_x = (int) short_inv', which we'd expect to have been optimized away by
113 invariant motion. However, we cannot rely on invariant motion to always
114 take invariants out of the loop, and so in the case of promotion we also
115 have to check the rhs.
116 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
117 types. */
119 tree
120 vect_get_smallest_scalar_type (gimple *stmt, HOST_WIDE_INT *lhs_size_unit,
121 HOST_WIDE_INT *rhs_size_unit)
123 tree scalar_type = gimple_expr_type (stmt);
124 HOST_WIDE_INT lhs, rhs;
126 /* During the analysis phase, this function is called on arbitrary
127 statements that might not have scalar results. */
128 if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
129 return scalar_type;
131 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
133 if (is_gimple_assign (stmt)
134 && (gimple_assign_cast_p (stmt)
135 || gimple_assign_rhs_code (stmt) == DOT_PROD_EXPR
136 || gimple_assign_rhs_code (stmt) == WIDEN_SUM_EXPR
137 || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
138 || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
139 || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
141 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
143 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
144 if (rhs < lhs)
145 scalar_type = rhs_type;
148 *lhs_size_unit = lhs;
149 *rhs_size_unit = rhs;
150 return scalar_type;
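/* Illustrative sketch (hypothetical code): in

     void
     widen_copy (int *restrict dst, const short *restrict src, int n)
     {
       for (int i = 0; i < n; i++)
         dst[i] = src[i];
     }

   the widening conversion statement has an int result but a short rhs, so
   the function above returns the 2-byte short type for it; the
   vectorization factor is then derived from the smallest type, e.g.
   VF == 8 rather than 4 with 128-bit vectors.  */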
154 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
155 tested at run-time. Return TRUE if DDR was successfully inserted.
156 Return false if versioning is not supported. */
158 static bool
159 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
161 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
163 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
164 return false;
166 if (!runtime_alias_check_p (ddr, loop,
167 optimize_loop_nest_for_speed_p (loop)))
168 return false;
170 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
171 return true;
174 /* Record that loop LOOP_VINFO needs to check that VALUE is nonzero. */
176 static void
177 vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value)
179 vec<tree> checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo);
180 for (unsigned int i = 0; i < checks.length(); ++i)
181 if (checks[i] == value)
182 return;
184 if (dump_enabled_p ())
186 dump_printf_loc (MSG_NOTE, vect_location, "need run-time check that ");
187 dump_generic_expr (MSG_NOTE, TDF_SLIM, value);
188 dump_printf (MSG_NOTE, " is nonzero\n");
190 LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value);
193 /* Return true if we know that the order of vectorized STMT_A and
194 vectorized STMT_B will be the same as the order of STMT_A and STMT_B.
195 At least one of the statements is a write. */
197 static bool
198 vect_preserves_scalar_order_p (gimple *stmt_a, gimple *stmt_b)
200 stmt_vec_info stmtinfo_a = vinfo_for_stmt (stmt_a);
201 stmt_vec_info stmtinfo_b = vinfo_for_stmt (stmt_b);
203 /* Single statements are always kept in their original order. */
204 if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
205 && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
206 return true;
208 /* STMT_A and STMT_B belong to overlapping groups. All loads in a
209 group are emitted at the position of the first scalar load and all
210 stores in a group are emitted at the position of the last scalar store.
211 Thus writes will happen no earlier than their current position
212 (but could happen later) while reads will happen no later than their
213 current position (but could happen earlier). Reordering is therefore
214 only possible if the first access is a write. */
215 gimple *earlier_stmt = get_earlier_stmt (stmt_a, stmt_b);
216 return !DR_IS_WRITE (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt)));
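/* Illustrative sketch (hypothetical code): for grouped accesses such as

     ... = a[i];	<- load group, emitted at the first scalar load
     a[i] = ...;	<- store group, emitted at the
     a[i+1] = ...;	   position of the last scalar store
     ... = a[i+1];	<- this load moves up to the first scalar load

   the second load is emitted before the stores, so the original
   write-before-read order on a[i+1] is not necessarily kept; accordingly
   the function above only guarantees the order when the earlier of the two
   statements is not a write.  */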
219 /* A subroutine of vect_analyze_data_ref_dependence. Handle
220 DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
221 distances. These distances are conservatively correct but they don't
222 reflect a guaranteed dependence.
224 Return true if this function does all the work necessary to avoid
225 an alias or false if the caller should use the dependence distances
226 to limit the vectorization factor in the usual way. LOOP_DEPTH is
227 the depth of the loop described by LOOP_VINFO and the other arguments
228 are as for vect_analyze_data_ref_dependence. */
230 static bool
231 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
232 loop_vec_info loop_vinfo,
233 int loop_depth, unsigned int *max_vf)
235 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
236 lambda_vector dist_v;
237 unsigned int i;
238 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
240 int dist = dist_v[loop_depth];
241 if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
243 /* If the user asserted safelen >= DIST consecutive iterations
244 can be executed concurrently, assume independence.
246 ??? An alternative would be to add the alias check even
247 in this case, and vectorize the fallback loop with the
248 maximum VF set to safelen. However, if the user has
249 explicitly given a length, it's less likely that that
250 would be a win. */
251 if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
253 if ((unsigned int) loop->safelen < *max_vf)
254 *max_vf = loop->safelen;
255 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
256 continue;
259 /* For dependence distances of 2 or more, we have the option
260 of limiting VF or checking for an alias at runtime.
261 Prefer to check at runtime if we can, to avoid limiting
262 the VF unnecessarily when the bases are in fact independent.
264 Note that the alias checks will be removed if the VF ends up
265 being small enough. */
266 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
269 return true;
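/* Illustrative sketch (hypothetical code): with OpenMP SIMD (compiled with
   -fopenmp-simd) the user can assert that SAFELEN consecutive iterations
   may execute concurrently.  For the possibly-aliasing pointers below a
   dependence distance of up to 8 is then ignored as in the safelen case
   above, and the loop can be vectorized with VF <= 8 without a run-time
   alias check:

     void
     scale (float *a, float *b, int n)
     {
     #pragma omp simd safelen(8)
       for (int i = 0; i < n; i++)
         a[i] = 2.0f * b[i];
     }
*/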
273 /* Function vect_analyze_data_ref_dependence.
275 Return TRUE if there (might) exist a dependence between a memory-reference
276 DRA and a memory-reference DRB. When versioning for alias can handle the
277 dependence with a run-time check, return FALSE. Adjust *MAX_VF according to
278 the data dependence. */
280 static bool
281 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
282 loop_vec_info loop_vinfo,
283 unsigned int *max_vf)
285 unsigned int i;
286 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
287 struct data_reference *dra = DDR_A (ddr);
288 struct data_reference *drb = DDR_B (ddr);
289 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
290 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
291 lambda_vector dist_v;
292 unsigned int loop_depth;
294 /* In loop analysis all data references should be vectorizable. */
295 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
296 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
297 gcc_unreachable ();
299 /* Independent data accesses. */
300 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
301 return false;
303 if (dra == drb
304 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
305 return false;
307 /* We do not have to consider dependences between accesses that belong
308 to the same group. */
309 if (GROUP_FIRST_ELEMENT (stmtinfo_a)
310 && GROUP_FIRST_ELEMENT (stmtinfo_a) == GROUP_FIRST_ELEMENT (stmtinfo_b))
311 return false;
313 /* Even if we have an anti-dependence then, as the vectorized loop covers at
314 least two scalar iterations, there is always also a true dependence.
315 As the vectorizer does not re-order loads and stores we can ignore
316 the anti-dependence if TBAA can disambiguate both DRs similar to the
317 case with known negative distance anti-dependences (positive
318 distance anti-dependences would violate TBAA constraints). */
319 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
320 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
321 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
322 get_alias_set (DR_REF (drb))))
323 return false;
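/* Illustrative sketch (hypothetical code): in

     void
     f (int *a, float *b, int n)
     {
       for (int i = 0; i < n; i++)
         b[i] = (float) a[i];
     }

   the read from *a and the write to *b have non-conflicting alias sets, so
   the check above treats the pair as independent and no run-time alias
   check is needed for it.  */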
325 /* Unknown data dependence. */
326 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
328 /* If user asserted safelen consecutive iterations can be
329 executed concurrently, assume independence. */
330 if (loop->safelen >= 2)
332 if ((unsigned int) loop->safelen < *max_vf)
333 *max_vf = loop->safelen;
334 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
335 return false;
338 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
339 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
341 if (dump_enabled_p ())
343 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
344 "versioning for alias not supported for: "
345 "can't determine dependence between ");
346 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
347 DR_REF (dra));
348 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
349 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
350 DR_REF (drb));
351 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
353 return true;
356 if (dump_enabled_p ())
358 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
359 "versioning for alias required: "
360 "can't determine dependence between ");
361 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
362 DR_REF (dra));
363 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
364 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
365 DR_REF (drb));
366 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
369 /* Add to list of ddrs that need to be tested at run-time. */
370 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
373 /* Known data dependence. */
374 if (DDR_NUM_DIST_VECTS (ddr) == 0)
376 /* If user asserted safelen consecutive iterations can be
377 executed concurrently, assume independence. */
378 if (loop->safelen >= 2)
380 if ((unsigned int) loop->safelen < *max_vf)
381 *max_vf = loop->safelen;
382 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
383 return false;
386 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
387 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
389 if (dump_enabled_p ())
391 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
392 "versioning for alias not supported for: "
393 "bad dist vector for ");
394 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
395 DR_REF (dra));
396 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
397 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
398 DR_REF (drb));
399 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
401 return true;
404 if (dump_enabled_p ())
406 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
407 "versioning for alias required: "
408 "bad dist vector for ");
409 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
410 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
411 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
412 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
414 /* Add to list of ddrs that need to be tested at run-time. */
415 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
418 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
420 if (DDR_COULD_BE_INDEPENDENT_P (ddr)
421 && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
422 loop_depth, max_vf))
423 return false;
425 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
427 int dist = dist_v[loop_depth];
429 if (dump_enabled_p ())
430 dump_printf_loc (MSG_NOTE, vect_location,
431 "dependence distance = %d.\n", dist);
433 if (dist == 0)
435 if (dump_enabled_p ())
437 dump_printf_loc (MSG_NOTE, vect_location,
438 "dependence distance == 0 between ");
439 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
440 dump_printf (MSG_NOTE, " and ");
441 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
442 dump_printf (MSG_NOTE, "\n");
445 /* When we perform grouped accesses and perform implicit CSE
446 by detecting equal accesses and doing disambiguation with
447 runtime alias tests like for
448 .. = a[i];
449 .. = a[i+1];
450 a[i] = ..;
451 a[i+1] = ..;
452 *p = ..;
453 .. = a[i];
454 .. = a[i+1];
455 where we will end up loading { a[i], a[i+1] } once, make
456 sure that inserting group loads before the first load and
457 stores after the last store will do the right thing.
458 Similar for groups like
459 a[i] = ...;
460 ... = a[i];
461 a[i+1] = ...;
462 where loads from the group interleave with the store. */
463 if (!vect_preserves_scalar_order_p (DR_STMT (dra), DR_STMT (drb)))
465 if (dump_enabled_p ())
466 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
467 "READ_WRITE dependence in interleaving.\n");
468 return true;
471 if (loop->safelen < 2)
473 tree indicator = dr_zero_step_indicator (dra);
474 if (TREE_CODE (indicator) != INTEGER_CST)
475 vect_check_nonzero_value (loop_vinfo, indicator);
476 else if (integer_zerop (indicator))
478 if (dump_enabled_p ())
479 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
480 "access also has a zero step\n");
481 return true;
484 continue;
487 if (dist > 0 && DDR_REVERSED_P (ddr))
489 /* If DDR_REVERSED_P the order of the data-refs in DDR was
490 reversed (to make distance vector positive), and the actual
491 distance is negative. */
492 if (dump_enabled_p ())
493 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
494 "dependence distance negative.\n");
495 /* Record a negative dependence distance to later limit the
496 amount of stmt copying / unrolling we can perform.
497 Only need to handle read-after-write dependence. */
498 if (DR_IS_READ (drb)
499 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
500 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
501 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
502 continue;
505 unsigned int abs_dist = abs (dist);
506 if (abs_dist >= 2 && abs_dist < *max_vf)
508 /* The dependence distance requires reduction of the maximal
509 vectorization factor. */
510 *max_vf = abs (dist);
511 if (dump_enabled_p ())
512 dump_printf_loc (MSG_NOTE, vect_location,
513 "adjusting maximal vectorization factor to %i\n",
514 *max_vf);
517 if (abs_dist >= *max_vf)
519 /* Dependence distance does not create dependence, as far as
520 vectorization is concerned, in this case. */
521 if (dump_enabled_p ())
522 dump_printf_loc (MSG_NOTE, vect_location,
523 "dependence distance >= VF.\n");
524 continue;
527 if (dump_enabled_p ())
529 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
530 "not vectorized, possible dependence "
531 "between data-refs ");
532 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
533 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
534 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
535 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
538 return true;
541 return false;
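/* Illustrative sketch (hypothetical code): in

     for (int i = 0; i < n; i++)
       a[i + 4] = a[i] + 1.0f;

   the write in iteration I feeds the read in iteration I + 4, giving a
   dependence distance of 4.  Vector iterations that cover at most 4 scalar
   iterations only read elements written by earlier vector iterations, so
   the analysis above adjusts *MAX_VF to 4 and the loop can still be
   vectorized with that factor.  */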
544 /* Function vect_analyze_data_ref_dependences.
546 Examine all the data references in the loop, and make sure there do not
547 exist any data dependences between them. Set *MAX_VF according to
548 the maximum vectorization factor the data dependences allow. */
550 bool
551 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
552 unsigned int *max_vf)
554 unsigned int i;
555 struct data_dependence_relation *ddr;
557 if (dump_enabled_p ())
558 dump_printf_loc (MSG_NOTE, vect_location,
559 "=== vect_analyze_data_ref_dependences ===\n");
561 LOOP_VINFO_DDRS (loop_vinfo)
562 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
563 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
564 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
565 /* We need read-read dependences to compute STMT_VINFO_SAME_ALIGN_REFS. */
566 if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
567 &LOOP_VINFO_DDRS (loop_vinfo),
568 LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
569 return false;
571 /* For epilogues we either have no aliases or alias versioning
572 was applied to original loop. Therefore we may just get max_vf
573 using VF of original loop. */
574 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
575 *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
576 else
577 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
578 if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
579 return false;
581 return true;
585 /* Function vect_slp_analyze_data_ref_dependence.
587 Return TRUE if there (might) exist a dependence between a memory-reference
588 DRA and a memory-reference DRB. Unlike the loop case there is no
589 versioning for alias and no vectorization factor to adjust, so an
590 unknown dependence simply counts as a dependence. */
592 static bool
593 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
595 struct data_reference *dra = DDR_A (ddr);
596 struct data_reference *drb = DDR_B (ddr);
598 /* We need to check dependences of statements marked as unvectorizable
599 as well, they still can prohibit vectorization. */
601 /* Independent data accesses. */
602 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
603 return false;
605 if (dra == drb)
606 return false;
608 /* Read-read is OK. */
609 if (DR_IS_READ (dra) && DR_IS_READ (drb))
610 return false;
612 /* If dra and drb are part of the same interleaving chain consider
613 them independent. */
614 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
615 && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
616 == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
617 return false;
619 /* Unknown data dependence. */
620 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
622 if (dump_enabled_p ())
624 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
625 "can't determine dependence between ");
626 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
627 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
628 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
629 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
632 else if (dump_enabled_p ())
634 dump_printf_loc (MSG_NOTE, vect_location,
635 "determined dependence between ");
636 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
637 dump_printf (MSG_NOTE, " and ");
638 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
639 dump_printf (MSG_NOTE, "\n");
642 return true;
646 /* Analyze dependences involved in the transform of SLP NODE. STORES
647 contain the vector of scalar stores of this instance if we are
648 disambiguating the loads. */
650 static bool
651 vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
652 vec<gimple *> stores, gimple *last_store)
654 /* This walks over all stmts involved in the SLP load/store done
655 in NODE verifying we can sink them up to the last stmt in the
656 group. */
657 gimple *last_access = vect_find_last_scalar_stmt_in_slp (node);
658 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
660 gimple *access = SLP_TREE_SCALAR_STMTS (node)[k];
661 if (access == last_access)
662 continue;
663 data_reference *dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (access));
664 for (gimple_stmt_iterator gsi = gsi_for_stmt (access);
665 gsi_stmt (gsi) != last_access; gsi_next (&gsi))
667 gimple *stmt = gsi_stmt (gsi);
668 if (! gimple_vuse (stmt)
669 || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
670 continue;
672 /* If we couldn't record a (single) data reference for this
673 stmt we have to give up. */
674 /* ??? Here and below if dependence analysis fails we can resort
675 to the alias oracle which can handle more kinds of stmts. */
676 data_reference *dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
677 if (!dr_b)
678 return false;
680 bool dependent = false;
681 /* If we run into a store of this same instance (we've just
682 marked those) then delay dependence checking until we run
683 into the last store because this is where it will have
684 been sunk to (and we verify if we can do that as well). */
685 if (gimple_visited_p (stmt))
687 if (stmt != last_store)
688 continue;
689 unsigned i;
690 gimple *store;
691 FOR_EACH_VEC_ELT (stores, i, store)
693 data_reference *store_dr
694 = STMT_VINFO_DATA_REF (vinfo_for_stmt (store));
695 ddr_p ddr = initialize_data_dependence_relation
696 (dr_a, store_dr, vNULL);
697 dependent = vect_slp_analyze_data_ref_dependence (ddr);
698 free_dependence_relation (ddr);
699 if (dependent)
700 break;
703 else
705 ddr_p ddr = initialize_data_dependence_relation (dr_a,
706 dr_b, vNULL);
707 dependent = vect_slp_analyze_data_ref_dependence (ddr);
708 free_dependence_relation (ddr);
710 if (dependent)
711 return false;
714 return true;
718 /* Function vect_slp_analyze_instance_dependence.
720 Examine all the data references in the SLP instance, and make sure there
721 do not exist any data dependences between them that would prevent the
722 instance from being vectorized. */
724 bool
725 vect_slp_analyze_instance_dependence (slp_instance instance)
727 if (dump_enabled_p ())
728 dump_printf_loc (MSG_NOTE, vect_location,
729 "=== vect_slp_analyze_instance_dependence ===\n");
731 /* The stores of this instance are at the root of the SLP tree. */
732 slp_tree store = SLP_INSTANCE_TREE (instance);
733 if (! STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (store)[0])))
734 store = NULL;
736 /* Verify we can sink stores to the vectorized stmt insert location. */
737 gimple *last_store = NULL;
738 if (store)
740 if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
741 return false;
743 /* Mark stores in this instance and remember the last one. */
744 last_store = vect_find_last_scalar_stmt_in_slp (store);
745 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
746 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], true);
749 bool res = true;
751 /* Verify we can sink loads to the vectorized stmt insert location,
752 special-casing stores of this instance. */
753 slp_tree load;
754 unsigned int i;
755 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
756 if (! vect_slp_analyze_node_dependences (instance, load,
757 store
758 ? SLP_TREE_SCALAR_STMTS (store)
759 : vNULL, last_store))
761 res = false;
762 break;
765 /* Unset the visited flag. */
766 if (store)
767 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
768 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], false);
770 return res;
773 /* Record in VINFO the base alignment guarantee given by DRB. STMT is
774 the statement that contains DRB, which is useful for recording in the
775 dump file. */
777 static void
778 vect_record_base_alignment (vec_info *vinfo, gimple *stmt,
779 innermost_loop_behavior *drb)
781 bool existed;
782 innermost_loop_behavior *&entry
783 = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
784 if (!existed || entry->base_alignment < drb->base_alignment)
786 entry = drb;
787 if (dump_enabled_p ())
789 dump_printf_loc (MSG_NOTE, vect_location,
790 "recording new base alignment for ");
791 dump_generic_expr (MSG_NOTE, TDF_SLIM, drb->base_address);
792 dump_printf (MSG_NOTE, "\n");
793 dump_printf_loc (MSG_NOTE, vect_location,
794 " alignment: %d\n", drb->base_alignment);
795 dump_printf_loc (MSG_NOTE, vect_location,
796 " misalignment: %d\n", drb->base_misalignment);
797 dump_printf_loc (MSG_NOTE, vect_location,
798 " based on: ");
799 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
804 /* If the region we're going to vectorize is reached, all unconditional
805 data references occur at least once. We can therefore pool the base
806 alignment guarantees from each unconditional reference. Do this by
807 going through all the data references in VINFO and checking whether
808 the containing statement makes the reference unconditionally. If so,
809 record the alignment of the base address in VINFO so that it can be
810 used for all other references with the same base. */
812 void
813 vect_record_base_alignments (vec_info *vinfo)
815 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
816 struct loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
817 data_reference *dr;
818 unsigned int i;
819 FOR_EACH_VEC_ELT (vinfo->datarefs, i, dr)
820 if (!DR_IS_CONDITIONAL_IN_STMT (dr))
822 gimple *stmt = DR_STMT (dr);
823 vect_record_base_alignment (vinfo, stmt, &DR_INNERMOST (dr));
825 /* If DR is nested in the loop that is being vectorized, we can also
826 record the alignment of the base wrt the outer loop. */
827 if (loop && nested_in_vect_loop_p (loop, stmt))
829 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
830 vect_record_base_alignment
831 (vinfo, stmt, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
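/* Illustrative sketch (hypothetical code): in

     void
     g (int *p, int *q, int n)
     {
       for (int i = 0; i < n; i++)
         {
           p[i] = 0;		<- unconditional reference
           if (q)
             q[i] = 0;		<- conditional reference
         }
     }

   the store to p[i] executes whenever the loop body is reached, so the
   alignment guarantee of its base address is recorded above and can be
   reused for other references with the same base.  The store to q[i] is
   conditional (after if-conversion it becomes a masked store), so no
   guarantee is pooled from it.  */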
836 /* Return the target alignment for the vectorized form of DR. */
838 static unsigned int
839 vect_calculate_target_alignment (struct data_reference *dr)
841 gimple *stmt = DR_STMT (dr);
842 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
843 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
844 return targetm.vectorize.preferred_vector_alignment (vectype);
847 /* Function vect_compute_data_ref_alignment
849 Compute the misalignment of the data reference DR.
851 Output:
852 1. If during the misalignment computation it is found that the data reference
853 cannot be vectorized then false is returned.
854 2. DR_MISALIGNMENT (DR) is defined.
856 FOR NOW: No analysis is actually performed. Misalignment is calculated
857 only for trivial cases. TODO. */
859 bool
860 vect_compute_data_ref_alignment (struct data_reference *dr)
862 gimple *stmt = DR_STMT (dr);
863 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
864 vec_base_alignments *base_alignments = &stmt_info->vinfo->base_alignments;
865 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
866 struct loop *loop = NULL;
867 tree ref = DR_REF (dr);
868 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
870 if (dump_enabled_p ())
871 dump_printf_loc (MSG_NOTE, vect_location,
872 "vect_compute_data_ref_alignment:\n");
874 if (loop_vinfo)
875 loop = LOOP_VINFO_LOOP (loop_vinfo);
877 /* Initialize misalignment to unknown. */
878 SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
880 innermost_loop_behavior *drb = vect_dr_behavior (dr);
881 bool step_preserves_misalignment_p;
883 unsigned HOST_WIDE_INT vector_alignment
884 = vect_calculate_target_alignment (dr) / BITS_PER_UNIT;
885 DR_TARGET_ALIGNMENT (dr) = vector_alignment;
887 /* No step for BB vectorization. */
888 if (!loop)
890 gcc_assert (integer_zerop (drb->step));
891 step_preserves_misalignment_p = true;
894 /* In case the dataref is in an inner-loop of the loop that is being
895 vectorized (LOOP), we use the base and misalignment information
896 relative to the outer-loop (LOOP). This is ok only if the misalignment
897 stays the same throughout the execution of the inner-loop, which is why
898 we have to check that the stride of the dataref in the inner-loop evenly
899 divides by the vector alignment. */
900 else if (nested_in_vect_loop_p (loop, stmt))
902 step_preserves_misalignment_p
903 = (DR_STEP_ALIGNMENT (dr) % vector_alignment) == 0;
905 if (dump_enabled_p ())
907 if (step_preserves_misalignment_p)
908 dump_printf_loc (MSG_NOTE, vect_location,
909 "inner step divides the vector alignment.\n");
910 else
911 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
912 "inner step doesn't divide the vector"
913 " alignment.\n");
917 /* Similarly we can only use base and misalignment information relative to
918 an innermost loop if the misalignment stays the same throughout the
919 execution of the loop. As above, this is the case if the stride of
920 the dataref evenly divides by the alignment. */
921 else
923 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
924 step_preserves_misalignment_p
925 = multiple_p (DR_STEP_ALIGNMENT (dr) * vf, vector_alignment);
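/* Illustrative arithmetic (assuming a 16-byte preferred vector alignment):
   a data-ref with DR_STEP 12 has a known step alignment of 4.  With
   VF == 4 the per-vector-iteration step is guaranteed to be a multiple of
   4 * 4 == 16, so the misalignment of the first access is the same in
   every vector iteration; with VF == 2 the guarantee is only 4 * 2 == 8,
   not a multiple of 16, so the misalignment may change and is treated as
   unknown.  */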
927 if (!step_preserves_misalignment_p && dump_enabled_p ())
928 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
929 "step doesn't divide the vector alignment.\n");
932 unsigned int base_alignment = drb->base_alignment;
933 unsigned int base_misalignment = drb->base_misalignment;
935 /* Calculate the maximum of the pooled base address alignment and the
936 alignment that we can compute for DR itself. */
937 innermost_loop_behavior **entry = base_alignments->get (drb->base_address);
938 if (entry && base_alignment < (*entry)->base_alignment)
940 base_alignment = (*entry)->base_alignment;
941 base_misalignment = (*entry)->base_misalignment;
944 if (drb->offset_alignment < vector_alignment
945 || !step_preserves_misalignment_p
946 /* We need to know whether the step wrt the vectorized loop is
947 negative when computing the starting misalignment below. */
948 || TREE_CODE (drb->step) != INTEGER_CST)
950 if (dump_enabled_p ())
952 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
953 "Unknown alignment for access: ");
954 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
955 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
957 return true;
960 if (base_alignment < vector_alignment)
962 unsigned int max_alignment;
963 tree base = get_base_for_alignment (drb->base_address, &max_alignment);
964 if (max_alignment < vector_alignment
965 || !vect_can_force_dr_alignment_p (base,
966 vector_alignment * BITS_PER_UNIT))
968 if (dump_enabled_p ())
970 dump_printf_loc (MSG_NOTE, vect_location,
971 "can't force alignment of ref: ");
972 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
973 dump_printf (MSG_NOTE, "\n");
975 return true;
978 /* Force the alignment of the decl.
979 NOTE: This is the only change to the code we make during
980 the analysis phase, before deciding to vectorize the loop. */
981 if (dump_enabled_p ())
983 dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
984 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
985 dump_printf (MSG_NOTE, "\n");
988 DR_VECT_AUX (dr)->base_decl = base;
989 DR_VECT_AUX (dr)->base_misaligned = true;
990 base_misalignment = 0;
992 poly_int64 misalignment
993 = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();
995 /* If this is a backward running DR then first access in the larger
996 vectype actually is N-1 elements before the address in the DR.
997 Adjust misalign accordingly. */
998 if (tree_int_cst_sgn (drb->step) < 0)
999 /* PLUS because STEP is negative. */
1000 misalignment += ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1001 * TREE_INT_CST_LOW (drb->step));
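/* Illustrative arithmetic (assuming a V4SI vector and 16-byte vector
   alignment): for a backward access with DR_STEP == -4 the adjustment
   above adds (4 - 1) * -4 == -12, because the vector access starts 12
   bytes below the address recorded in the data-ref.  An access whose
   recorded address is misaligned by 12 therefore ends up with a vector
   misalignment of 0.  */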
1003 unsigned int const_misalignment;
1004 if (!known_misalignment (misalignment, vector_alignment,
1005 &const_misalignment))
1007 if (dump_enabled_p ())
1009 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1010 "Non-constant misalignment for access: ");
1011 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
1012 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1014 return true;
1017 SET_DR_MISALIGNMENT (dr, const_misalignment);
1019 if (dump_enabled_p ())
1021 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1022 "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
1023 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
1024 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1027 return true;
1030 /* Function vect_update_misalignment_for_peel.
1031 Sets DR's misalignment
1032 - to 0 if it has the same alignment as DR_PEEL,
1033 - to the misalignment computed using NPEEL if DR's alignment is known,
1034 - to -1 (unknown) otherwise.
1036 DR - the data reference whose misalignment is to be adjusted.
1037 DR_PEEL - the data reference whose misalignment is being made
1038 zero in the vector loop by the peel.
1039 NPEEL - the number of iterations in the peel loop if the misalignment
1040 of DR_PEEL is known at compile time. */
1042 static void
1043 vect_update_misalignment_for_peel (struct data_reference *dr,
1044 struct data_reference *dr_peel, int npeel)
1046 unsigned int i;
1047 vec<dr_p> same_aligned_drs;
1048 struct data_reference *current_dr;
1049 int dr_size = vect_get_scalar_dr_size (dr);
1050 int dr_peel_size = vect_get_scalar_dr_size (dr_peel);
1051 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
1052 stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));
1054 /* For interleaved data accesses the step in the loop must be multiplied by
1055 the size of the interleaving group. */
1056 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1057 dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
1058 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
1059 dr_peel_size *= GROUP_SIZE (peel_stmt_info);
1061 /* It can be assumed that the data refs with the same alignment as dr_peel
1062 are aligned in the vector loop. */
1063 same_aligned_drs
1064 = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
1065 FOR_EACH_VEC_ELT (same_aligned_drs, i, current_dr)
1067 if (current_dr != dr)
1068 continue;
1069 gcc_assert (!known_alignment_for_access_p (dr)
1070 || !known_alignment_for_access_p (dr_peel)
1071 || (DR_MISALIGNMENT (dr) / dr_size
1072 == DR_MISALIGNMENT (dr_peel) / dr_peel_size));
1073 SET_DR_MISALIGNMENT (dr, 0);
1074 return;
1077 if (known_alignment_for_access_p (dr)
1078 && known_alignment_for_access_p (dr_peel))
1080 bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
1081 int misal = DR_MISALIGNMENT (dr);
1082 misal += negative ? -npeel * dr_size : npeel * dr_size;
1083 misal &= DR_TARGET_ALIGNMENT (dr) - 1;
1084 SET_DR_MISALIGNMENT (dr, misal);
1085 return;
1088 if (dump_enabled_p ())
1089 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1090 "to unknown (-1).\n");
1091 SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
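/* Illustrative arithmetic (assuming DR_TARGET_ALIGNMENT == 16): peeling
   NPEEL == 2 iterations of a forward int access (dr_size == 4) that was
   misaligned by 8 bytes gives misal = 8 + 2 * 4 = 16, and 16 & 15 == 0, so
   DR becomes aligned in the vector loop.  For a backward access the same
   peel gives (8 - 8) & 15 == 0 as well, while NPEEL == 1 would leave a
   misalignment of 12 or 4 respectively.  */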
1095 /* Function verify_data_ref_alignment
1097 Return TRUE if DR can be handled with respect to alignment. */
1099 static bool
1100 verify_data_ref_alignment (data_reference_p dr)
1102 enum dr_alignment_support supportable_dr_alignment
1103 = vect_supportable_dr_alignment (dr, false);
1104 if (!supportable_dr_alignment)
1106 if (dump_enabled_p ())
1108 if (DR_IS_READ (dr))
1109 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1110 "not vectorized: unsupported unaligned load.");
1111 else
1112 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1113 "not vectorized: unsupported unaligned "
1114 "store.");
1116 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
1117 DR_REF (dr));
1118 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1120 return false;
1123 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
1124 dump_printf_loc (MSG_NOTE, vect_location,
1125 "Vectorizing an unaligned access.\n");
1127 return true;
1130 /* Function vect_verify_datarefs_alignment
1132 Return TRUE if all data references in the loop can be
1133 handled with respect to alignment. */
1135 bool
1136 vect_verify_datarefs_alignment (loop_vec_info vinfo)
1138 vec<data_reference_p> datarefs = vinfo->datarefs;
1139 struct data_reference *dr;
1140 unsigned int i;
1142 FOR_EACH_VEC_ELT (datarefs, i, dr)
1144 gimple *stmt = DR_STMT (dr);
1145 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1147 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1148 continue;
1150 /* For interleaving, only the alignment of the first access matters. */
1151 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1152 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1153 continue;
1155 /* Strided accesses perform only component accesses, alignment is
1156 irrelevant for them. */
1157 if (STMT_VINFO_STRIDED_P (stmt_info)
1158 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1159 continue;
1161 if (! verify_data_ref_alignment (dr))
1162 return false;
1165 return true;
1168 /* Given a memory reference EXP return whether its alignment is less
1169 than its size. */
1171 static bool
1172 not_size_aligned (tree exp)
1174 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1175 return true;
1177 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1178 > get_object_alignment (exp));
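/* Illustrative sketch (hypothetical code): for

     struct __attribute__ ((packed)) rec { char tag; int val; };

   an access to REC->val has a 4-byte size but only a 1-byte alignment
   guarantee, so not_size_aligned returns true and the reference is treated
   as packed.  */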
1181 /* Function vector_alignment_reachable_p
1183 Return true if vector alignment for DR is reachable by peeling
1184 a few loop iterations. Return false otherwise. */
1186 static bool
1187 vector_alignment_reachable_p (struct data_reference *dr)
1189 gimple *stmt = DR_STMT (dr);
1190 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1191 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1193 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1195 /* For interleaved access we peel only if the number of iterations in
1196 the prolog loop (VF - misalignment) is a multiple of the
1197 number of the interleaved accesses. */
1198 int elem_size, mis_in_elements;
1200 /* FORNOW: handle only known alignment. */
1201 if (!known_alignment_for_access_p (dr))
1202 return false;
1204 poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1205 poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
1206 elem_size = vector_element_size (vector_size, nelements);
1207 mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
1209 if (!multiple_p (nelements - mis_in_elements, GROUP_SIZE (stmt_info)))
1210 return false;
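/* Illustrative arithmetic (assuming a V8HI vector, i.e. 8 two-byte
   elements): a grouped access misaligned by 4 bytes is off by 2 elements,
   so 8 - 2 == 6 more elements are needed to reach the next vector
   boundary.  For a group of size 3, peeling 2 scalar iterations advances
   by exactly 6 elements and aligns the access; for a group of size 4 no
   whole number of scalar iterations does, and the check above gives up.  */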
1213 /* If misalignment is known at the compile time then allow peeling
1214 only if natural alignment is reachable through peeling. */
1215 if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
1217 HOST_WIDE_INT elmsize =
1218 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1219 if (dump_enabled_p ())
1221 dump_printf_loc (MSG_NOTE, vect_location,
1222 "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1223 dump_printf (MSG_NOTE,
1224 ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
1226 if (DR_MISALIGNMENT (dr) % elmsize)
1228 if (dump_enabled_p ())
1229 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1230 "data size does not divide the misalignment.\n");
1231 return false;
1235 if (!known_alignment_for_access_p (dr))
1237 tree type = TREE_TYPE (DR_REF (dr));
1238 bool is_packed = not_size_aligned (DR_REF (dr));
1239 if (dump_enabled_p ())
1240 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1241 "Unknown misalignment, %snaturally aligned\n",
1242 is_packed ? "not " : "");
1243 return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1246 return true;
1250 /* Calculate the cost of the memory access represented by DR. */
1252 static void
1253 vect_get_data_access_cost (struct data_reference *dr,
1254 unsigned int *inside_cost,
1255 unsigned int *outside_cost,
1256 stmt_vector_for_cost *body_cost_vec)
1258 gimple *stmt = DR_STMT (dr);
1259 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1260 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1261 int ncopies;
1263 if (PURE_SLP_STMT (stmt_info))
1264 ncopies = 1;
1265 else
1266 ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1268 if (DR_IS_READ (dr))
1269 vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1270 NULL, body_cost_vec, false);
1271 else
1272 vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
1274 if (dump_enabled_p ())
1275 dump_printf_loc (MSG_NOTE, vect_location,
1276 "vect_get_data_access_cost: inside_cost = %d, "
1277 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1281 typedef struct _vect_peel_info
1283 struct data_reference *dr;
1284 int npeel;
1285 unsigned int count;
1286 } *vect_peel_info;
1288 typedef struct _vect_peel_extended_info
1290 struct _vect_peel_info peel_info;
1291 unsigned int inside_cost;
1292 unsigned int outside_cost;
1293 } *vect_peel_extended_info;
1296 /* Peeling hashtable helpers. */
1298 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1300 static inline hashval_t hash (const _vect_peel_info *);
1301 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1304 inline hashval_t
1305 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1307 return (hashval_t) peel_info->npeel;
1310 inline bool
1311 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1313 return (a->npeel == b->npeel);
1317 /* Insert DR into peeling hash table with NPEEL as key. */
1319 static void
1320 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1321 loop_vec_info loop_vinfo, struct data_reference *dr,
1322 int npeel)
1324 struct _vect_peel_info elem, *slot;
1325 _vect_peel_info **new_slot;
1326 bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1328 elem.npeel = npeel;
1329 slot = peeling_htab->find (&elem);
1330 if (slot)
1331 slot->count++;
1332 else
1334 slot = XNEW (struct _vect_peel_info);
1335 slot->npeel = npeel;
1336 slot->dr = dr;
1337 slot->count = 1;
1338 new_slot = peeling_htab->find_slot (slot, INSERT);
1339 *new_slot = slot;
1342 if (!supportable_dr_alignment
1343 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1344 slot->count += VECT_MAX_COST;
1348 /* Traverse peeling hash table to find peeling option that aligns maximum
1349 number of data accesses. */
1352 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1353 _vect_peel_extended_info *max)
1355 vect_peel_info elem = *slot;
1357 if (elem->count > max->peel_info.count
1358 || (elem->count == max->peel_info.count
1359 && max->peel_info.npeel > elem->npeel))
1361 max->peel_info.npeel = elem->npeel;
1362 max->peel_info.count = elem->count;
1363 max->peel_info.dr = elem->dr;
1366 return 1;
1369 /* Get the costs of peeling NPEEL iterations checking data access costs
1370 for all data refs. If UNKNOWN_MISALIGNMENT is true, we assume DR0's
1371 misalignment will be zero after peeling. */
1373 static void
1374 vect_get_peeling_costs_all_drs (vec<data_reference_p> datarefs,
1375 struct data_reference *dr0,
1376 unsigned int *inside_cost,
1377 unsigned int *outside_cost,
1378 stmt_vector_for_cost *body_cost_vec,
1379 unsigned int npeel,
1380 bool unknown_misalignment)
1382 unsigned i;
1383 data_reference *dr;
1385 FOR_EACH_VEC_ELT (datarefs, i, dr)
1387 gimple *stmt = DR_STMT (dr);
1388 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1389 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1390 continue;
1392 /* For interleaving, only the alignment of the first access
1393 matters. */
1394 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1395 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1396 continue;
1398 /* Strided accesses perform only component accesses, alignment is
1399 irrelevant for them. */
1400 if (STMT_VINFO_STRIDED_P (stmt_info)
1401 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1402 continue;
1404 int save_misalignment;
1405 save_misalignment = DR_MISALIGNMENT (dr);
1406 if (npeel == 0)
1408 else if (unknown_misalignment && dr == dr0)
1409 SET_DR_MISALIGNMENT (dr, 0);
1410 else
1411 vect_update_misalignment_for_peel (dr, dr0, npeel);
1412 vect_get_data_access_cost (dr, inside_cost, outside_cost,
1413 body_cost_vec);
1414 SET_DR_MISALIGNMENT (dr, save_misalignment);
1418 /* Traverse peeling hash table and calculate cost for each peeling option.
1419 Find the one with the lowest cost. */
1422 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1423 _vect_peel_extended_info *min)
1425 vect_peel_info elem = *slot;
1426 int dummy;
1427 unsigned int inside_cost = 0, outside_cost = 0;
1428 gimple *stmt = DR_STMT (elem->dr);
1429 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1430 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1431 stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1432 epilogue_cost_vec;
1434 prologue_cost_vec.create (2);
1435 body_cost_vec.create (2);
1436 epilogue_cost_vec.create (2);
1438 vect_get_peeling_costs_all_drs (LOOP_VINFO_DATAREFS (loop_vinfo),
1439 elem->dr, &inside_cost, &outside_cost,
1440 &body_cost_vec, elem->npeel, false);
1442 body_cost_vec.release ();
1444 outside_cost += vect_get_known_peeling_cost
1445 (loop_vinfo, elem->npeel, &dummy,
1446 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1447 &prologue_cost_vec, &epilogue_cost_vec);
1449 /* Prologue and epilogue costs are added to the target model later.
1450 These costs depend only on the scalar iteration cost, the
1451 number of peeling iterations finally chosen, and the number of
1452 misaligned statements. So discard the information found here. */
1453 prologue_cost_vec.release ();
1454 epilogue_cost_vec.release ();
1456 if (inside_cost < min->inside_cost
1457 || (inside_cost == min->inside_cost
1458 && outside_cost < min->outside_cost))
1460 min->inside_cost = inside_cost;
1461 min->outside_cost = outside_cost;
1462 min->peel_info.dr = elem->dr;
1463 min->peel_info.npeel = elem->npeel;
1464 min->peel_info.count = elem->count;
1467 return 1;
1471 /* Choose best peeling option by traversing peeling hash table and either
1472 choosing an option with the lowest cost (if cost model is enabled) or the
1473 option that aligns as many accesses as possible. */
1475 static struct _vect_peel_extended_info
1476 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1477 loop_vec_info loop_vinfo)
1479 struct _vect_peel_extended_info res;
1481 res.peel_info.dr = NULL;
1483 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1485 res.inside_cost = INT_MAX;
1486 res.outside_cost = INT_MAX;
1487 peeling_htab->traverse <_vect_peel_extended_info *,
1488 vect_peeling_hash_get_lowest_cost> (&res);
1490 else
1492 res.peel_info.count = 0;
1493 peeling_htab->traverse <_vect_peel_extended_info *,
1494 vect_peeling_hash_get_most_frequent> (&res);
1495 res.inside_cost = 0;
1496 res.outside_cost = 0;
1499 return res;
1502 /* Return true if the new peeling NPEEL is supported. */
1504 static bool
1505 vect_peeling_supportable (loop_vec_info loop_vinfo, struct data_reference *dr0,
1506 unsigned npeel)
1508 unsigned i;
1509 struct data_reference *dr = NULL;
1510 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1511 gimple *stmt;
1512 stmt_vec_info stmt_info;
1513 enum dr_alignment_support supportable_dr_alignment;
1515 /* Ensure that all data refs can be vectorized after the peel. */
1516 FOR_EACH_VEC_ELT (datarefs, i, dr)
1518 int save_misalignment;
1520 if (dr == dr0)
1521 continue;
1523 stmt = DR_STMT (dr);
1524 stmt_info = vinfo_for_stmt (stmt);
1525 /* For interleaving, only the alignment of the first access
1526 matters. */
1527 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1528 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1529 continue;
1531 /* Strided accesses perform only component accesses, alignment is
1532 irrelevant for them. */
1533 if (STMT_VINFO_STRIDED_P (stmt_info)
1534 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1535 continue;
1537 save_misalignment = DR_MISALIGNMENT (dr);
1538 vect_update_misalignment_for_peel (dr, dr0, npeel);
1539 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1540 SET_DR_MISALIGNMENT (dr, save_misalignment);
1542 if (!supportable_dr_alignment)
1543 return false;
1546 return true;
1549 /* Function vect_enhance_data_refs_alignment
1551 This pass will use loop versioning and loop peeling in order to enhance
1552 the alignment of data references in the loop.
1554 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1555 original loop is to be vectorized. Any other loops that are created by
1556 the transformations performed in this pass are not supposed to be
1557 vectorized. This restriction will be relaxed.
1559 This pass will require a cost model to guide it whether to apply peeling
1560 or versioning or a combination of the two. For example, the scheme that
1561 Intel uses when given a loop with several memory accesses, is as follows:
1562 choose one memory access ('p') which alignment you want to force by doing
1563 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1564 other accesses are not necessarily aligned, or (2) use loop versioning to
1565 generate one loop in which all accesses are aligned, and another loop in
1566 which only 'p' is necessarily aligned.
1568 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1569 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1570 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1572 Devising a cost model is the most critical aspect of this work. It will
1573 guide us on which access to peel for, whether to use loop versioning, how
1574 many versions to create, etc. The cost model will probably consist of
1575 generic considerations as well as target specific considerations (on
1576 powerpc for example, misaligned stores are more painful than misaligned
1577 loads).
1579 Here are the general steps involved in alignment enhancements:
1581 -- original loop, before alignment analysis:
1582 for (i=0; i<N; i++){
1583 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1584 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1587 -- After vect_compute_data_refs_alignment:
1588 for (i=0; i<N; i++){
1589 x = q[i]; # DR_MISALIGNMENT(q) = 3
1590 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1593 -- Possibility 1: we do loop versioning:
1594 if (p is aligned) {
1595 for (i=0; i<N; i++){ # loop 1A
1596 x = q[i]; # DR_MISALIGNMENT(q) = 3
1597 p[i] = y; # DR_MISALIGNMENT(p) = 0
1600 else {
1601 for (i=0; i<N; i++){ # loop 1B
1602 x = q[i]; # DR_MISALIGNMENT(q) = 3
1603 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1607 -- Possibility 2: we do loop peeling:
1608 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1609 x = q[i];
1610 p[i] = y;
1612 for (i = 3; i < N; i++){ # loop 2A
1613 x = q[i]; # DR_MISALIGNMENT(q) = 0
1614 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1617 -- Possibility 3: combination of loop peeling and versioning:
1618 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1619 x = q[i];
1620 p[i] = y;
1622 if (p is aligned) {
1623 for (i = 3; i<N; i++){ # loop 3A
1624 x = q[i]; # DR_MISALIGNMENT(q) = 0
1625 p[i] = y; # DR_MISALIGNMENT(p) = 0
1628 else {
1629 for (i = 3; i<N; i++){ # loop 3B
1630 x = q[i]; # DR_MISALIGNMENT(q) = 0
1631 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1635 These loops are later passed to loop_transform to be vectorized. The
1636 vectorizer will use the alignment information to guide the transformation
1637 (whether to generate regular loads/stores, or with special handling for
1638 misalignment). */
1640 bool
1641 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1643 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1644 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1645 enum dr_alignment_support supportable_dr_alignment;
1646 struct data_reference *dr0 = NULL, *first_store = NULL;
1647 struct data_reference *dr;
1648 unsigned int i, j;
1649 bool do_peeling = false;
1650 bool do_versioning = false;
1651 bool stat;
1652 gimple *stmt;
1653 stmt_vec_info stmt_info;
1654 unsigned int npeel = 0;
1655 bool one_misalignment_known = false;
1656 bool one_misalignment_unknown = false;
1657 bool one_dr_unsupportable = false;
1658 struct data_reference *unsupportable_dr = NULL;
1659 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1660 unsigned possible_npeel_number = 1;
1661 tree vectype;
1662 unsigned int mis, same_align_drs_max = 0;
1663 hash_table<peel_info_hasher> peeling_htab (1);
1665 if (dump_enabled_p ())
1666 dump_printf_loc (MSG_NOTE, vect_location,
1667 "=== vect_enhance_data_refs_alignment ===\n");
1669 /* Reset data so we can safely be called multiple times. */
1670 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1671 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1673 /* While cost model enhancements are expected in the future, the high level
1674 view of the code at this time is as follows:
1676 A) If there is a misaligned access then see if peeling to align
1677 this access can make all data references satisfy
1678 vect_supportable_dr_alignment. If so, update data structures
1679 as needed and return true.
1681 B) If peeling wasn't possible and there is a data reference with an
1682 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1683 then see if loop versioning checks can be used to make all data
1684 references satisfy vect_supportable_dr_alignment. If so, update
1685 data structures as needed and return true.
1687 C) If neither peeling nor versioning were successful then return false if
1688 any data reference does not satisfy vect_supportable_dr_alignment.
1690 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1692 Note, Possibility 3 above (which is peeling and versioning together) is not
1693 being done at this time. */
1695 /* (1) Peeling to force alignment. */
1697 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1698 Considerations:
1699 + How many accesses will become aligned due to the peeling
1700 - How many accesses will become unaligned due to the peeling,
1701 and the cost of misaligned accesses.
1702 - The cost of peeling (the extra runtime checks, the increase
1703 in code size). */
1705 FOR_EACH_VEC_ELT (datarefs, i, dr)
1707 stmt = DR_STMT (dr);
1708 stmt_info = vinfo_for_stmt (stmt);
1710 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1711 continue;
1713 /* For interleaving, only the alignment of the first access
1714 matters. */
1715 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1716 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1717 continue;
1719 /* For invariant accesses there is nothing to enhance. */
1720 if (integer_zerop (DR_STEP (dr)))
1721 continue;
1723 /* Strided accesses perform only component accesses, alignment is
1724 irrelevant for them. */
1725 if (STMT_VINFO_STRIDED_P (stmt_info)
1726 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1727 continue;
1729 supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1730 do_peeling = vector_alignment_reachable_p (dr);
1731 if (do_peeling)
1733 if (known_alignment_for_access_p (dr))
1735 unsigned int npeel_tmp = 0;
1736 bool negative = tree_int_cst_compare (DR_STEP (dr),
1737 size_zero_node) < 0;
1739 vectype = STMT_VINFO_VECTYPE (stmt_info);
1740 unsigned int target_align = DR_TARGET_ALIGNMENT (dr);
1741 unsigned int dr_size = vect_get_scalar_dr_size (dr);
1742 mis = (negative ? DR_MISALIGNMENT (dr) : -DR_MISALIGNMENT (dr));
1743 if (DR_MISALIGNMENT (dr) != 0)
1744 npeel_tmp = (mis & (target_align - 1)) / dr_size;
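   /* Illustrative example (hypothetical values): with a positive step,
      4-byte elements, DR_MISALIGNMENT (dr) == 4 and target_align == 16,
      MIS is -4, (mis & 15) == 12 and npeel_tmp == 3: peeling three scalar
      iterations advances the access by 12 bytes, making 4 + 12 a multiple
      of 16.  */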
1746 /* For multiple types, it is possible that the bigger type access
1747 will have more than one peeling option. E.g., a loop with two
1748 types: one of size (vector size / 4), and the other one of
1749 size (vector size / 8). The vectorization factor will be 8. If both
1750 accesses are misaligned by 3, the first one needs one scalar
1751 iteration to be aligned, and the second one needs 5. But the
1752 first one will be aligned also by peeling 5 scalar
1753 iterations, and in that case both accesses will be aligned.
1754 Hence, except for the immediate peeling amount, we also want
1755 to try to add full vector size, while we don't exceed
1756 vectorization factor.
1757 We do this automatically for the cost model, since we calculate
1758 the cost for every peeling option. */
1759 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1761 poly_uint64 nscalars = (STMT_SLP_TYPE (stmt_info)
1762 ? vf * GROUP_SIZE (stmt_info) : vf);
1763 possible_npeel_number
1764 = vect_get_num_vectors (nscalars, vectype);
1766 /* NPEEL_TMP is 0 when there is no misalignment, but also
1767 allow peeling NELEMENTS. */
1768 if (DR_MISALIGNMENT (dr) == 0)
1769 possible_npeel_number++;
1772 /* Save info about DR in the hash table. Also include peeling
1773 amounts according to the explanation above. */
1774 for (j = 0; j < possible_npeel_number; j++)
1776 vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1777 dr, npeel_tmp);
1778 npeel_tmp += target_align / dr_size;
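   /* Continuing the illustrative example above: with target_align / dr_size
      == 4 and possible_npeel_number == 2 (hypothetical), the peeling
      candidates recorded in the hash table would be 3 and 7.  */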
1781 one_misalignment_known = true;
1783 else
1785 /* If we don't know any misalignment values, we prefer
1786 peeling for the data-ref that has the maximum number of data-refs
1787 with the same alignment, unless the target prefers aligning
1788 stores over loads. */
1789 unsigned same_align_drs
1790 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1791 if (!dr0
1792 || same_align_drs_max < same_align_drs)
1794 same_align_drs_max = same_align_drs;
1795 dr0 = dr;
1797 /* For data-refs with the same number of related
1798 accesses prefer the one where the misalign
1799 computation will be invariant in the outermost loop. */
1800 else if (same_align_drs_max == same_align_drs)
1802 struct loop *ivloop0, *ivloop;
1803 ivloop0 = outermost_invariant_loop_for_expr
1804 (loop, DR_BASE_ADDRESS (dr0));
1805 ivloop = outermost_invariant_loop_for_expr
1806 (loop, DR_BASE_ADDRESS (dr));
1807 if ((ivloop && !ivloop0)
1808 || (ivloop && ivloop0
1809 && flow_loop_nested_p (ivloop, ivloop0)))
1810 dr0 = dr;
1813 one_misalignment_unknown = true;
1815 /* Check for data refs with unsupportable alignment that
1816 can be peeled. */
1817 if (!supportable_dr_alignment)
1819 one_dr_unsupportable = true;
1820 unsupportable_dr = dr;
1823 if (!first_store && DR_IS_WRITE (dr))
1824 first_store = dr;
1827 else
1829 if (!aligned_access_p (dr))
1831 if (dump_enabled_p ())
1832 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1833 "vector alignment may not be reachable\n");
1834 break;
1839 /* Check if we can possibly peel the loop. */
1840 if (!vect_can_advance_ivs_p (loop_vinfo)
1841 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1842 || loop->inner)
1843 do_peeling = false;
1845 struct _vect_peel_extended_info peel_for_known_alignment;
1846 struct _vect_peel_extended_info peel_for_unknown_alignment;
1847 struct _vect_peel_extended_info best_peel;
1849 peel_for_unknown_alignment.inside_cost = INT_MAX;
1850 peel_for_unknown_alignment.outside_cost = INT_MAX;
1851 peel_for_unknown_alignment.peel_info.count = 0;
1853 if (do_peeling
1854 && one_misalignment_unknown)
1856 /* Check whether the target prefers stores over loads, i.e., whether
1857 misaligned stores are more expensive than misaligned loads (taking
1858 drs with the same alignment into account). */
1859 unsigned int load_inside_cost = 0;
1860 unsigned int load_outside_cost = 0;
1861 unsigned int store_inside_cost = 0;
1862 unsigned int store_outside_cost = 0;
1863 unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;
1865 stmt_vector_for_cost dummy;
1866 dummy.create (2);
1867 vect_get_peeling_costs_all_drs (datarefs, dr0,
1868 &load_inside_cost,
1869 &load_outside_cost,
1870 &dummy, estimated_npeels, true);
1871 dummy.release ();
1873 if (first_store)
1875 dummy.create (2);
1876 vect_get_peeling_costs_all_drs (datarefs, first_store,
1877 &store_inside_cost,
1878 &store_outside_cost,
1879 &dummy, estimated_npeels, true);
1880 dummy.release ();
1882 else
1884 store_inside_cost = INT_MAX;
1885 store_outside_cost = INT_MAX;
1888 if (load_inside_cost > store_inside_cost
1889 || (load_inside_cost == store_inside_cost
1890 && load_outside_cost > store_outside_cost))
1892 dr0 = first_store;
1893 peel_for_unknown_alignment.inside_cost = store_inside_cost;
1894 peel_for_unknown_alignment.outside_cost = store_outside_cost;
1896 else
1898 peel_for_unknown_alignment.inside_cost = load_inside_cost;
1899 peel_for_unknown_alignment.outside_cost = load_outside_cost;
1902 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1903 prologue_cost_vec.create (2);
1904 epilogue_cost_vec.create (2);
1906 int dummy2;
1907 peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
1908 (loop_vinfo, estimated_npeels, &dummy2,
1909 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1910 &prologue_cost_vec, &epilogue_cost_vec);
1912 prologue_cost_vec.release ();
1913 epilogue_cost_vec.release ();
1915 peel_for_unknown_alignment.peel_info.count = 1
1916 + STMT_VINFO_SAME_ALIGN_REFS
1917 (vinfo_for_stmt (DR_STMT (dr0))).length ();
1920 peel_for_unknown_alignment.peel_info.npeel = 0;
1921 peel_for_unknown_alignment.peel_info.dr = dr0;
1923 best_peel = peel_for_unknown_alignment;
1925 peel_for_known_alignment.inside_cost = INT_MAX;
1926 peel_for_known_alignment.outside_cost = INT_MAX;
1927 peel_for_known_alignment.peel_info.count = 0;
1928 peel_for_known_alignment.peel_info.dr = NULL;
1930 if (do_peeling && one_misalignment_known)
1932 /* Peeling is possible, but there is no data access that is unsupported
1933 when unaligned. So we try to choose the best possible peeling from
1934 the hash table. */
1935 peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
1936 (&peeling_htab, loop_vinfo);
1939 /* Compare costs of peeling for known and unknown alignment. */
1940 if (peel_for_known_alignment.peel_info.dr != NULL
1941 && peel_for_unknown_alignment.inside_cost
1942 >= peel_for_known_alignment.inside_cost)
1944 best_peel = peel_for_known_alignment;
1946 /* If the best peeling for known alignment has NPEEL == 0, perform no
1947 peeling at all except if there is an unsupportable dr that we can
1948 align. */
1949 if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
1950 do_peeling = false;
1953 /* If there is an unsupportable data ref, prefer it over all choices so far,
1954 since we'd have to discard a chosen peeling unless it happened to
1955 align the unsupportable data ref. */
1956 if (one_dr_unsupportable)
1957 dr0 = unsupportable_dr;
1958 else if (do_peeling)
1960 /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
1961 TODO: Use nopeel_outside_cost or get rid of it? */
1962 unsigned nopeel_inside_cost = 0;
1963 unsigned nopeel_outside_cost = 0;
1965 stmt_vector_for_cost dummy;
1966 dummy.create (2);
1967 vect_get_peeling_costs_all_drs (datarefs, NULL, &nopeel_inside_cost,
1968 &nopeel_outside_cost, &dummy, 0, false);
1969 dummy.release ();
1971 /* Add epilogue costs. As we do not peel for alignment here, no prologue
1972 costs will be recorded. */
1973 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1974 prologue_cost_vec.create (2);
1975 epilogue_cost_vec.create (2);
1977 int dummy2;
1978 nopeel_outside_cost += vect_get_known_peeling_cost
1979 (loop_vinfo, 0, &dummy2,
1980 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1981 &prologue_cost_vec, &epilogue_cost_vec);
1983 prologue_cost_vec.release ();
1984 epilogue_cost_vec.release ();
1986 npeel = best_peel.peel_info.npeel;
1987 dr0 = best_peel.peel_info.dr;
1989 /* If not peeling at all is no more expensive than the best peeling we
1990 have so far, don't perform any peeling. */
1991 if (nopeel_inside_cost <= best_peel.inside_cost)
1992 do_peeling = false;
1995 if (do_peeling)
1997 stmt = DR_STMT (dr0);
1998 stmt_info = vinfo_for_stmt (stmt);
1999 vectype = STMT_VINFO_VECTYPE (stmt_info);
2001 if (known_alignment_for_access_p (dr0))
2003 bool negative = tree_int_cst_compare (DR_STEP (dr0),
2004 size_zero_node) < 0;
2005 if (!npeel)
2007 /* Since it's known at compile time, compute the number of
2008 iterations in the peeled loop (the peeling factor) for use in
2009 updating DR_MISALIGNMENT values. The peeling factor is the
2010 vectorization factor minus the misalignment as an element
2011 count. */
2012 mis = negative ? DR_MISALIGNMENT (dr0) : -DR_MISALIGNMENT (dr0);
2013 unsigned int target_align = DR_TARGET_ALIGNMENT (dr0);
2014 npeel = ((mis & (target_align - 1))
2015 / vect_get_scalar_dr_size (dr0));
2018 /* For interleaved data access every iteration accesses all the
2019 members of the group, therefore we divide the number of iterations
2020 by the group size. */
2021 stmt_info = vinfo_for_stmt (DR_STMT (dr0));
2022 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2023 npeel /= GROUP_SIZE (stmt_info);
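      /* Illustrative example: a group of 2 accesses that needs a peel of
	 2 elements requires only a single peeled iteration, since every
	 iteration accesses both group members.  */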
2025 if (dump_enabled_p ())
2026 dump_printf_loc (MSG_NOTE, vect_location,
2027 "Try peeling by %d\n", npeel);
2030 /* Ensure that all datarefs can be vectorized after the peel. */
2031 if (!vect_peeling_supportable (loop_vinfo, dr0, npeel))
2032 do_peeling = false;
2034 /* Check if all datarefs are supportable and log. */
2035 if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
2037 stat = vect_verify_datarefs_alignment (loop_vinfo);
2038 if (!stat)
2039 do_peeling = false;
2040 else
2041 return stat;
2044 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
2045 if (do_peeling)
2047 unsigned max_allowed_peel
2048 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
2049 if (max_allowed_peel != (unsigned)-1)
2051 unsigned max_peel = npeel;
2052 if (max_peel == 0)
2054 unsigned int target_align = DR_TARGET_ALIGNMENT (dr0);
2055 max_peel = target_align / vect_get_scalar_dr_size (dr0) - 1;
2057 if (max_peel > max_allowed_peel)
2059 do_peeling = false;
2060 if (dump_enabled_p ())
2061 dump_printf_loc (MSG_NOTE, vect_location,
2062 "Disable peeling, max peels reached: %d\n", max_peel);
2067 /* Cost model #2 - if peeling may result in a remaining loop not
2068 iterating enough to be vectorized then do not peel. Since this
2069 is a cost heuristic rather than a correctness decision, use the
2070 most likely runtime value for variable vectorization factors. */
2071 if (do_peeling
2072 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2074 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2075 unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
2076 if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
2077 < assumed_vf + max_peel)
2078 do_peeling = false;
2081 if (do_peeling)
2083 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2084 If the misalignment of DR_i is identical to that of dr0 then set
2085 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
2086 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2087 by the peeling factor times the element size of DR_i (MOD the
2088 vectorization factor times the size). Otherwise, the
2089 misalignment of DR_i must be set to unknown. */
2090 FOR_EACH_VEC_ELT (datarefs, i, dr)
2091 if (dr != dr0)
2093 /* Strided accesses perform only component accesses, alignment
2094 is irrelevant for them. */
2095 stmt_info = vinfo_for_stmt (DR_STMT (dr));
2096 if (STMT_VINFO_STRIDED_P (stmt_info)
2097 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2098 continue;
2100 vect_update_misalignment_for_peel (dr, dr0, npeel);
2103 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
2104 if (npeel)
2105 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2106 else
2107 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2108 = DR_MISALIGNMENT (dr0);
2109 SET_DR_MISALIGNMENT (dr0, 0);
2110 if (dump_enabled_p ())
2112 dump_printf_loc (MSG_NOTE, vect_location,
2113 "Alignment of access forced using peeling.\n");
2114 dump_printf_loc (MSG_NOTE, vect_location,
2115 "Peeling for alignment will be applied.\n");
2118 /* The inside-loop cost will be accounted for in vectorizable_load
2119 and vectorizable_store correctly with adjusted alignments.
2120 Drop the body_cst_vec on the floor here. */
2121 stat = vect_verify_datarefs_alignment (loop_vinfo);
2122 gcc_assert (stat);
2123 return stat;
2127 /* (2) Versioning to force alignment. */
2129 /* Try versioning if:
2130 1) optimize loop for speed
2131 2) there is at least one unsupported misaligned data ref with an unknown
2132 misalignment, and
2133 3) all misaligned data refs with a known misalignment are supported, and
2134 4) the number of runtime alignment checks is within reason. */
2136 do_versioning =
2137 optimize_loop_nest_for_speed_p (loop)
2138 && (!loop->inner); /* FORNOW */
2140 if (do_versioning)
2142 FOR_EACH_VEC_ELT (datarefs, i, dr)
2144 stmt = DR_STMT (dr);
2145 stmt_info = vinfo_for_stmt (stmt);
2147 /* For interleaving, only the alignment of the first access
2148 matters. */
2149 if (aligned_access_p (dr)
2150 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2151 && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
2152 continue;
2154 if (STMT_VINFO_STRIDED_P (stmt_info))
2156 /* Strided loads perform only component accesses, alignment is
2157 irrelevant for them. */
2158 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
2159 continue;
2160 do_versioning = false;
2161 break;
2164 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
2166 if (!supportable_dr_alignment)
2168 gimple *stmt;
2169 int mask;
2170 tree vectype;
2172 if (known_alignment_for_access_p (dr)
2173 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2174 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
2176 do_versioning = false;
2177 break;
2180 stmt = DR_STMT (dr);
2181 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
2182 gcc_assert (vectype);
2184 /* At present we don't support versioning for alignment
2185 with variable VF, since there's no guarantee that the
2186 VF is a power of two. We could relax this if we added
2187 a way of enforcing a power-of-two size. */
2188 unsigned HOST_WIDE_INT size;
2189 if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
2191 do_versioning = false;
2192 break;
2195 /* The rightmost bits of an aligned address must be zeros.
2196 Construct the mask needed for this test. For example,
2197 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2198 mask must be 15 = 0xf. */
2199 mask = size - 1;
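	  /* Sketch of the resulting runtime guard (see
	     vect_create_cond_for_align_checks): roughly, the addresses of
	     the potentially misaligned references are OR-ed together and
	     the vectorized version is entered only if (addrs & mask) == 0.  */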
2201 /* FORNOW: use the same mask to test all potentially unaligned
2202 references in the loop. The vectorizer currently supports
2203 a single vector size, see the reference to
2204 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
2205 vectorization factor is computed. */
2206 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
2207 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
2208 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2209 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
2210 DR_STMT (dr));
2214 /* Versioning requires at least one misaligned data reference. */
2215 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2216 do_versioning = false;
2217 else if (!do_versioning)
2218 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2221 if (do_versioning)
2223 vec<gimple *> may_misalign_stmts
2224 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2225 gimple *stmt;
2227 /* It can now be assumed that the data references in the statements
2228 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2229 of the loop being vectorized. */
2230 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
2232 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2233 dr = STMT_VINFO_DATA_REF (stmt_info);
2234 SET_DR_MISALIGNMENT (dr, 0);
2235 if (dump_enabled_p ())
2236 dump_printf_loc (MSG_NOTE, vect_location,
2237 "Alignment of access forced using versioning.\n");
2240 if (dump_enabled_p ())
2241 dump_printf_loc (MSG_NOTE, vect_location,
2242 "Versioning for alignment will be applied.\n");
2244 /* Peeling and versioning can't be done together at this time. */
2245 gcc_assert (! (do_peeling && do_versioning));
2247 stat = vect_verify_datarefs_alignment (loop_vinfo);
2248 gcc_assert (stat);
2249 return stat;
2252 /* This point is reached if neither peeling nor versioning is being done. */
2253 gcc_assert (! (do_peeling || do_versioning));
2255 stat = vect_verify_datarefs_alignment (loop_vinfo);
2256 return stat;
2260 /* Function vect_find_same_alignment_drs.
2262 Update group and alignment relations according to the chosen
2263 vectorization factor. */
2265 static void
2266 vect_find_same_alignment_drs (struct data_dependence_relation *ddr)
2268 struct data_reference *dra = DDR_A (ddr);
2269 struct data_reference *drb = DDR_B (ddr);
2270 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2271 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2273 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
2274 return;
2276 if (dra == drb)
2277 return;
2279 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)
2280 || !operand_equal_p (DR_OFFSET (dra), DR_OFFSET (drb), 0)
2281 || !operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2282 return;
2284 /* Two references with distance zero have the same alignment. */
2285 poly_offset_int diff = (wi::to_poly_offset (DR_INIT (dra))
2286 - wi::to_poly_offset (DR_INIT (drb)));
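  /* Illustrative example: 4-byte accesses a[i] and a[i+8] have a DR_INIT
     difference of 32 bytes; if the larger of the two target alignments is
     16 bytes, 32 is a multiple of it, so the two references are recorded
     below as having the same alignment.  */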
2287 if (maybe_ne (diff, 0))
2289 /* Get the wider of the two alignments. */
2290 unsigned int align_a = (vect_calculate_target_alignment (dra)
2291 / BITS_PER_UNIT);
2292 unsigned int align_b = (vect_calculate_target_alignment (drb)
2293 / BITS_PER_UNIT);
2294 unsigned int max_align = MAX (align_a, align_b);
2296 /* Require the gap to be a multiple of the larger vector alignment. */
2297 if (!multiple_p (diff, max_align))
2298 return;
2301 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2302 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
2303 if (dump_enabled_p ())
2305 dump_printf_loc (MSG_NOTE, vect_location,
2306 "accesses have the same alignment: ");
2307 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2308 dump_printf (MSG_NOTE, " and ");
2309 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2310 dump_printf (MSG_NOTE, "\n");
2315 /* Function vect_analyze_data_refs_alignment
2317 Analyze the alignment of the data-references in the loop.
2318 Return FALSE if a data reference is found that cannot be vectorized. */
2320 bool
2321 vect_analyze_data_refs_alignment (loop_vec_info vinfo)
2323 if (dump_enabled_p ())
2324 dump_printf_loc (MSG_NOTE, vect_location,
2325 "=== vect_analyze_data_refs_alignment ===\n");
2327 /* Mark groups of data references with same alignment using
2328 data dependence information. */
2329 vec<ddr_p> ddrs = vinfo->ddrs;
2330 struct data_dependence_relation *ddr;
2331 unsigned int i;
2333 FOR_EACH_VEC_ELT (ddrs, i, ddr)
2334 vect_find_same_alignment_drs (ddr);
2336 vec<data_reference_p> datarefs = vinfo->datarefs;
2337 struct data_reference *dr;
2339 vect_record_base_alignments (vinfo);
2340 FOR_EACH_VEC_ELT (datarefs, i, dr)
2342 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
2343 if (STMT_VINFO_VECTORIZABLE (stmt_info)
2344 && !vect_compute_data_ref_alignment (dr))
2346 /* Strided accesses perform only component accesses, misalignment
2347 information is irrelevant for them. */
2348 if (STMT_VINFO_STRIDED_P (stmt_info)
2349 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2350 continue;
2352 if (dump_enabled_p ())
2353 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2354 "not vectorized: can't calculate alignment "
2355 "for data ref.\n");
2357 return false;
2361 return true;
2365 /* Analyze alignment of DRs of stmts in NODE. */
2367 static bool
2368 vect_slp_analyze_and_verify_node_alignment (slp_tree node)
2370 /* We vectorize from the first scalar stmt in the node unless
2371 the node is permuted in which case we start from the first
2372 element in the group. */
2373 gimple *first_stmt = SLP_TREE_SCALAR_STMTS (node)[0];
2374 data_reference_p first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2375 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2376 first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_stmt));
2378 data_reference_p dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2379 if (! vect_compute_data_ref_alignment (dr)
2380 /* For creating the data-ref pointer we need alignment of the
2381 first element anyway. */
2382 || (dr != first_dr
2383 && ! vect_compute_data_ref_alignment (first_dr))
2384 || ! verify_data_ref_alignment (dr))
2386 if (dump_enabled_p ())
2387 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2388 "not vectorized: bad data alignment in basic "
2389 "block.\n");
2390 return false;
2393 return true;
2396 /* Function vect_slp_analyze_and_verify_instance_alignment
2398 Analyze the alignment of the data-references in the SLP instance.
2399 Return FALSE if a data reference is found that cannot be vectorized. */
2401 bool
2402 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
2404 if (dump_enabled_p ())
2405 dump_printf_loc (MSG_NOTE, vect_location,
2406 "=== vect_slp_analyze_and_verify_instance_alignment ===\n");
2408 slp_tree node;
2409 unsigned i;
2410 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2411 if (! vect_slp_analyze_and_verify_node_alignment (node))
2412 return false;
2414 node = SLP_INSTANCE_TREE (instance);
2415 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]))
2416 && ! vect_slp_analyze_and_verify_node_alignment
2417 (SLP_INSTANCE_TREE (instance)))
2418 return false;
2420 return true;
2424 /* Analyze groups of accesses: check that DR belongs to a group of
2425 accesses of legal size, step, etc. Detect gaps, single element
2426 interleaving, and other special cases. Set grouped access info.
2427 Collect groups of strided stores for further use in SLP analysis.
2428 Worker for vect_analyze_group_access. */
2430 static bool
2431 vect_analyze_group_access_1 (struct data_reference *dr)
2433 tree step = DR_STEP (dr);
2434 tree scalar_type = TREE_TYPE (DR_REF (dr));
2435 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2436 gimple *stmt = DR_STMT (dr);
2437 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2438 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2439 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2440 HOST_WIDE_INT dr_step = -1;
2441 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2442 bool slp_impossible = false;
2444 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2445 size of the interleaving group (including gaps). */
2446 if (tree_fits_shwi_p (step))
2448 dr_step = tree_to_shwi (step);
2449 /* Check that STEP is a multiple of type size. Otherwise there is
2450 a non-element-sized gap at the end of the group which we
2451 cannot represent in GROUP_GAP or GROUP_SIZE.
2452 ??? As we can handle non-constant step fine here we should
2453 simply remove uses of GROUP_GAP between the last and first
2454 element and instead rely on DR_STEP. GROUP_SIZE then would
2455 simply not include that gap. */
2456 if ((dr_step % type_size) != 0)
2458 if (dump_enabled_p ())
2460 dump_printf_loc (MSG_NOTE, vect_location,
2461 "Step ");
2462 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2463 dump_printf (MSG_NOTE,
2464 " is not a multiple of the element size for ");
2465 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2466 dump_printf (MSG_NOTE, "\n");
2468 return false;
2470 groupsize = absu_hwi (dr_step) / type_size;
2472 else
2473 groupsize = 0;
2475 /* A non-consecutive access is possible only if it is part of an interleaving group. */
2476 if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2478 /* Check whether this DR is part of an interleaving group, and is a single
2479 element of the group that is accessed in the loop. */
2481 /* Gaps are supported only for loads. STEP must be a multiple of the type
2482 size. */
2483 if (DR_IS_READ (dr)
2484 && (dr_step % type_size) == 0
2485 && groupsize > 0)
2487 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2488 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2489 GROUP_GAP (stmt_info) = groupsize - 1;
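	  /* Illustrative example: a load of a[4*i] with 4-byte elements has
	     a step of 16 bytes, so groupsize == 4 and the single accessed
	     element is followed by a gap of 3 elements.  */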
2490 if (dump_enabled_p ())
2492 dump_printf_loc (MSG_NOTE, vect_location,
2493 "Detected single element interleaving ");
2494 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2495 dump_printf (MSG_NOTE, " step ");
2496 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2497 dump_printf (MSG_NOTE, "\n");
2500 return true;
2503 if (dump_enabled_p ())
2505 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2506 "not consecutive access ");
2507 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2510 if (bb_vinfo)
2512 /* Mark the statement as unvectorizable. */
2513 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2514 return true;
2517 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2518 STMT_VINFO_STRIDED_P (stmt_info) = true;
2519 return true;
2522 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2524 /* First stmt in the interleaving chain. Check the chain. */
2525 gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2526 struct data_reference *data_ref = dr;
2527 unsigned int count = 1;
2528 tree prev_init = DR_INIT (data_ref);
2529 gimple *prev = stmt;
2530 HOST_WIDE_INT diff, gaps = 0;
2532 /* By construction, all group members have INTEGER_CST DR_INITs. */
2533 while (next)
2535 /* Skip same data-refs. In case two or more stmts share a
2536 data-ref (supported only for loads), we vectorize only the first
2537 stmt, and the rest get their vectorized loads from the first
2538 one. */
2539 if (!tree_int_cst_compare (DR_INIT (data_ref),
2540 DR_INIT (STMT_VINFO_DATA_REF (
2541 vinfo_for_stmt (next)))))
2543 if (DR_IS_WRITE (data_ref))
2545 if (dump_enabled_p ())
2546 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2547 "Two store stmts share the same dr.\n");
2548 return false;
2551 if (dump_enabled_p ())
2552 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2553 "Two or more load stmts share the same dr.\n");
2555 /* For loads, use the same data-ref load. */
2556 GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2558 prev = next;
2559 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2560 continue;
2563 prev = next;
2564 data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2566 /* All group members have the same STEP by construction. */
2567 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2569 /* Check that the distance between two accesses is equal to the type
2570 size. Otherwise, we have gaps. */
2571 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2572 - TREE_INT_CST_LOW (prev_init)) / type_size;
2573 if (diff != 1)
2575 /* FORNOW: SLP of accesses with gaps is not supported. */
2576 slp_impossible = true;
2577 if (DR_IS_WRITE (data_ref))
2579 if (dump_enabled_p ())
2580 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2581 "interleaved store with gaps\n");
2582 return false;
2585 gaps += diff - 1;
2588 last_accessed_element += diff;
2590 /* Store the gap from the previous member of the group. If there is no
2591 gap in the access, GROUP_GAP is always 1. */
2592 GROUP_GAP (vinfo_for_stmt (next)) = diff;
2594 prev_init = DR_INIT (data_ref);
2595 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2596 /* Count the number of data-refs in the chain. */
2597 count++;
2600 if (groupsize == 0)
2601 groupsize = count + gaps;
2603 /* This could be UINT_MAX but as we are generating code in a very
2604 inefficient way we have to cap earlier. See PR78699 for example. */
2605 if (groupsize > 4096)
2607 if (dump_enabled_p ())
2608 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2609 "group is too large\n");
2610 return false;
2613 /* Check that the size of the interleaving is equal to count for stores,
2614 i.e., that there are no gaps. */
2615 if (groupsize != count
2616 && !DR_IS_READ (dr))
2618 if (dump_enabled_p ())
2619 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2620 "interleaved store with gaps\n");
2621 return false;
2624 /* If there is a gap after the last load in the group it is the
2625 difference between the groupsize and the last accessed
2626 element.
2627 When there is no gap, this difference should be 0. */
2628 GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - last_accessed_element;
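  /* Illustrative example: loads of a[3*i] and a[3*i+1] with 4-byte elements
     have a 12-byte step, so groupsize == 3 and last_accessed_element == 2,
     leaving a trailing gap of 1 element.  */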
2630 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2631 if (dump_enabled_p ())
2633 dump_printf_loc (MSG_NOTE, vect_location,
2634 "Detected interleaving ");
2635 if (DR_IS_READ (dr))
2636 dump_printf (MSG_NOTE, "load ");
2637 else
2638 dump_printf (MSG_NOTE, "store ");
2639 dump_printf (MSG_NOTE, "of size %u starting with ",
2640 (unsigned)groupsize);
2641 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
2642 if (GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
2643 dump_printf_loc (MSG_NOTE, vect_location,
2644 "There is a gap of %u elements after the group\n",
2645 GROUP_GAP (vinfo_for_stmt (stmt)));
2648 /* SLP: create an SLP data structure for every interleaving group of
2649 stores for further analysis in vect_analyze_slp. */
2650 if (DR_IS_WRITE (dr) && !slp_impossible)
2652 if (loop_vinfo)
2653 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2654 if (bb_vinfo)
2655 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2659 return true;
2662 /* Analyze groups of accesses: check that DR belongs to a group of
2663 accesses of legal size, step, etc. Detect gaps, single element
2664 interleaving, and other special cases. Set grouped access info.
2665 Collect groups of strided stores for further use in SLP analysis. */
2667 static bool
2668 vect_analyze_group_access (struct data_reference *dr)
2670 if (!vect_analyze_group_access_1 (dr))
2672 /* Dissolve the group if present. */
2673 gimple *next;
2674 gimple *stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dr)));
2675 while (stmt)
2677 stmt_vec_info vinfo = vinfo_for_stmt (stmt);
2678 next = GROUP_NEXT_ELEMENT (vinfo);
2679 GROUP_FIRST_ELEMENT (vinfo) = NULL;
2680 GROUP_NEXT_ELEMENT (vinfo) = NULL;
2681 stmt = next;
2683 return false;
2685 return true;
2688 /* Analyze the access pattern of the data-reference DR.
2689 In case of non-consecutive accesses call vect_analyze_group_access() to
2690 analyze groups of accesses. */
2692 static bool
2693 vect_analyze_data_ref_access (struct data_reference *dr)
2695 tree step = DR_STEP (dr);
2696 tree scalar_type = TREE_TYPE (DR_REF (dr));
2697 gimple *stmt = DR_STMT (dr);
2698 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2699 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2700 struct loop *loop = NULL;
2702 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
2703 return true;
2705 if (loop_vinfo)
2706 loop = LOOP_VINFO_LOOP (loop_vinfo);
2708 if (loop_vinfo && !step)
2710 if (dump_enabled_p ())
2711 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2712 "bad data-ref access in loop\n");
2713 return false;
2716 /* Allow loads with zero step in inner-loop vectorization. */
2717 if (loop_vinfo && integer_zerop (step))
2719 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2720 if (!nested_in_vect_loop_p (loop, stmt))
2721 return DR_IS_READ (dr);
2722 /* Allow references with zero step only for outer loops marked
2723 with pragma omp simd - it guarantees the absence of
2724 loop-carried dependencies between inner loop iterations. */
2725 if (loop->safelen < 2)
2727 if (dump_enabled_p ())
2728 dump_printf_loc (MSG_NOTE, vect_location,
2729 "zero step in inner loop of nest\n");
2730 return false;
2734 if (loop && nested_in_vect_loop_p (loop, stmt))
2736 /* Interleaved accesses are not yet supported within outer-loop
2737 vectorization for references in the inner-loop. */
2738 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2740 /* For the rest of the analysis we use the outer-loop step. */
2741 step = STMT_VINFO_DR_STEP (stmt_info);
2742 if (integer_zerop (step))
2744 if (dump_enabled_p ())
2745 dump_printf_loc (MSG_NOTE, vect_location,
2746 "zero step in outer loop.\n");
2747 return DR_IS_READ (dr);
2751 /* Consecutive? */
2752 if (TREE_CODE (step) == INTEGER_CST)
2754 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2755 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2756 || (dr_step < 0
2757 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2759 /* Mark that it is not interleaving. */
2760 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2761 return true;
2765 if (loop && nested_in_vect_loop_p (loop, stmt))
2767 if (dump_enabled_p ())
2768 dump_printf_loc (MSG_NOTE, vect_location,
2769 "grouped access in outer loop.\n");
2770 return false;
2774 /* Assume this is a DR handled by the non-constant strided load case. */
2775 if (TREE_CODE (step) != INTEGER_CST)
2776 return (STMT_VINFO_STRIDED_P (stmt_info)
2777 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2778 || vect_analyze_group_access (dr)));
2780 /* Not a consecutive access - check whether it is part of an interleaving group. */
2781 return vect_analyze_group_access (dr);
2784 /* Compare two data-references DRA and DRB in order to sort them into
2785 chunks suitable for grouping. */
2787 static int
2788 dr_group_sort_cmp (const void *dra_, const void *drb_)
2790 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2791 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2792 int cmp;
2794 /* Stabilize sort. */
2795 if (dra == drb)
2796 return 0;
2798 /* DRs in different loops never belong to the same group. */
2799 loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father;
2800 loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father;
2801 if (loopa != loopb)
2802 return loopa->num < loopb->num ? -1 : 1;
2804 /* Ordering of DRs according to base. */
2805 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2806 DR_BASE_ADDRESS (drb));
2807 if (cmp != 0)
2808 return cmp;
2810 /* And according to DR_OFFSET. */
2811 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2812 if (cmp != 0)
2813 return cmp;
2815 /* Put reads before writes. */
2816 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2817 return DR_IS_READ (dra) ? -1 : 1;
2819 /* Then sort by access size. */
2820 cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2821 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2822 if (cmp != 0)
2823 return cmp;
2825 /* And by step. */
2826 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2827 if (cmp != 0)
2828 return cmp;
2830 /* Then sort by DR_INIT. In case of identical DRs, sort by stmt UID. */
2831 cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
2832 if (cmp == 0)
2833 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2834 return cmp;
2837 /* If OP is the result of a conversion, return the unconverted value,
2838 otherwise return null. */
2840 static tree
2841 strip_conversion (tree op)
2843 if (TREE_CODE (op) != SSA_NAME)
2844 return NULL_TREE;
2845 gimple *stmt = SSA_NAME_DEF_STMT (op);
2846 if (!is_gimple_assign (stmt)
2847 || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt)))
2848 return NULL_TREE;
2849 return gimple_assign_rhs1 (stmt);
2852 /* Return true if vectorizable_* routines can handle statements STMT1
2853 and STMT2 being in a single group. */
2855 static bool
2856 can_group_stmts_p (gimple *stmt1, gimple *stmt2)
2858 if (gimple_assign_single_p (stmt1))
2859 return gimple_assign_single_p (stmt2);
2861 if (is_gimple_call (stmt1) && gimple_call_internal_p (stmt1))
2863 /* Check for two masked loads or two masked stores. */
2864 if (!is_gimple_call (stmt2) || !gimple_call_internal_p (stmt2))
2865 return false;
2866 internal_fn ifn = gimple_call_internal_fn (stmt1);
2867 if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE)
2868 return false;
2869 if (ifn != gimple_call_internal_fn (stmt2))
2870 return false;
2872 /* Check that the masks are the same. Cope with casts of masks,
2873 like those created by build_mask_conversion. */
2874 tree mask1 = gimple_call_arg (stmt1, 2);
2875 tree mask2 = gimple_call_arg (stmt2, 2);
2876 if (!operand_equal_p (mask1, mask2, 0))
2878 mask1 = strip_conversion (mask1);
2879 if (!mask1)
2880 return false;
2881 mask2 = strip_conversion (mask2);
2882 if (!mask2)
2883 return false;
2884 if (!operand_equal_p (mask1, mask2, 0))
2885 return false;
2887 return true;
2890 return false;
2893 /* Function vect_analyze_data_ref_accesses.
2895 Analyze the access pattern of all the data references in the loop.
2897 FORNOW: the only access pattern that is considered vectorizable is a
2898 simple step 1 (consecutive) access.
2900 FORNOW: handle only arrays and pointer accesses. */
2902 bool
2903 vect_analyze_data_ref_accesses (vec_info *vinfo)
2905 unsigned int i;
2906 vec<data_reference_p> datarefs = vinfo->datarefs;
2907 struct data_reference *dr;
2909 if (dump_enabled_p ())
2910 dump_printf_loc (MSG_NOTE, vect_location,
2911 "=== vect_analyze_data_ref_accesses ===\n");
2913 if (datarefs.is_empty ())
2914 return true;
2916 /* Sort the array of datarefs to make building the interleaving chains
2917 linear. Don't modify the original vector's order; it is needed for
2918 determining which dependencies are reversed. */
2919 vec<data_reference_p> datarefs_copy = datarefs.copy ();
2920 datarefs_copy.qsort (dr_group_sort_cmp);
2922 /* Build the interleaving chains. */
2923 for (i = 0; i < datarefs_copy.length () - 1;)
2925 data_reference_p dra = datarefs_copy[i];
2926 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2927 stmt_vec_info lastinfo = NULL;
2928 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
2929 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a))
2931 ++i;
2932 continue;
2934 for (i = i + 1; i < datarefs_copy.length (); ++i)
2936 data_reference_p drb = datarefs_copy[i];
2937 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2938 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b)
2939 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
2940 break;
2942 /* ??? Imperfect sorting (non-compatible types, non-modulo
2943 accesses, same accesses) can lead to a group being artificially
2944 split here, as we don't just skip over those. If it really
2945 matters we can push those to a worklist and re-iterate
2946 over them. Then we can just skip ahead to the next DR here. */
2948 /* DRs in different loops should not be put into the same
2949 interleaving group. */
2950 if (gimple_bb (DR_STMT (dra))->loop_father
2951 != gimple_bb (DR_STMT (drb))->loop_father)
2952 break;
2954 /* Check that the data-refs have the same first location (except init)
2955 and that they are both either stores or loads (not one load and one
2956 store, and not a mix of plain and masked accesses). */
2957 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2958 || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2959 DR_BASE_ADDRESS (drb)) != 0
2960 || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
2961 || !can_group_stmts_p (DR_STMT (dra), DR_STMT (drb)))
2962 break;
2964 /* Check that the data-refs have the same constant size. */
2965 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2966 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2967 if (!tree_fits_uhwi_p (sza)
2968 || !tree_fits_uhwi_p (szb)
2969 || !tree_int_cst_equal (sza, szb))
2970 break;
2972 /* Check that the data-refs have the same step. */
2973 if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
2974 break;
2976 /* Check the types are compatible.
2977 ??? We don't distinguish this during sorting. */
2978 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2979 TREE_TYPE (DR_REF (drb))))
2980 break;
2982 /* Check that the DR_INITs are compile-time constants. */
2983 if (TREE_CODE (DR_INIT (dra)) != INTEGER_CST
2984 || TREE_CODE (DR_INIT (drb)) != INTEGER_CST)
2985 break;
2987 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
2988 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2989 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2990 HOST_WIDE_INT init_prev
2991 = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]));
2992 gcc_assert (init_a <= init_b
2993 && init_a <= init_prev
2994 && init_prev <= init_b);
2996 /* Do not place the same access in the interleaving chain twice. */
2997 if (init_b == init_prev)
2999 gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]))
3000 < gimple_uid (DR_STMT (drb)));
3001 /* ??? For now we simply "drop" the later reference which is
3002 otherwise the same rather than finishing off this group.
3003 In the end we'd want to re-process duplicates forming
3004 multiple groups from the refs, likely by just collecting
3005 all candidates (including duplicates and split points
3006 below) in a vector and then processing them together. */
3007 continue;
3010 /* If init_b == init_a + the size of the type * k, we have an
3011 interleaving, and DRA is accessed before DRB. */
3012 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
3013 if (type_size_a == 0
3014 || (init_b - init_a) % type_size_a != 0)
3015 break;
3017 /* For stores, require that the accesses are adjacent. This splits
3018 groups into chunks we support (we don't support vectorization
3019 of stores with gaps). */
3020 if (!DR_IS_READ (dra) && init_b - init_prev != type_size_a)
3021 break;
3023 /* If the step (when constant and nonzero) is not greater than the
3024 difference between the data-refs' inits, split the group here;
3025 this keeps groups to suitable sizes. */
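	  /* Illustrative example: 4-byte loads with DR_INITs 0 and 8 and a
	     step of 8 bytes describe successive iterations of two separate
	     accesses rather than one interleaving group, so the group is
	     split here (8 <= 8).  */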
3026 if (tree_fits_shwi_p (DR_STEP (dra)))
3028 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
3029 if (step != 0 && step <= (init_b - init_a))
3030 break;
3033 if (dump_enabled_p ())
3035 dump_printf_loc (MSG_NOTE, vect_location,
3036 "Detected interleaving ");
3037 if (DR_IS_READ (dra))
3038 dump_printf (MSG_NOTE, "load ");
3039 else
3040 dump_printf (MSG_NOTE, "store ");
3041 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
3042 dump_printf (MSG_NOTE, " and ");
3043 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
3044 dump_printf (MSG_NOTE, "\n");
3047 /* Link the found element into the group list. */
3048 if (!GROUP_FIRST_ELEMENT (stmtinfo_a))
3050 GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
3051 lastinfo = stmtinfo_a;
3053 GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
3054 GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
3055 lastinfo = stmtinfo_b;
3059 FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
3060 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
3061 && !vect_analyze_data_ref_access (dr))
3063 if (dump_enabled_p ())
3064 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3065 "not vectorized: complicated access pattern.\n");
3067 if (is_a <bb_vec_info> (vinfo))
3069 /* Mark the statement as not vectorizable. */
3070 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
3071 continue;
3073 else
3075 datarefs_copy.release ();
3076 return false;
3080 datarefs_copy.release ();
3081 return true;
3084 /* Function vect_vfa_segment_size.
3086 Input:
3087 DR: The data reference.
3088 LENGTH_FACTOR: segment length to consider.
3090 Return a value suitable for the dr_with_seg_len::seg_len field.
3091 This is the "distance travelled" by the pointer from the first
3092 iteration in the segment to the last. Note that it does not include
3093 the size of the access; in effect it only describes the first byte. */
3095 static tree
3096 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
3098 length_factor = size_binop (MINUS_EXPR,
3099 fold_convert (sizetype, length_factor),
3100 size_one_node);
3101 return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr)),
3102 length_factor);
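/* Illustrative example for vect_vfa_segment_size: with DR_STEP == 4 and
   LENGTH_FACTOR == 8 the returned segment length is 4 * (8 - 1) == 28 bytes,
   the distance travelled from the first to the last iteration in the
   segment.  */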
3105 /* Return a value that, when added to abs (vect_vfa_segment_size (dr)),
3106 gives the worst-case number of bytes covered by the segment. */
3108 static unsigned HOST_WIDE_INT
3109 vect_vfa_access_size (data_reference *dr)
3111 stmt_vec_info stmt_vinfo = vinfo_for_stmt (DR_STMT (dr));
3112 tree ref_type = TREE_TYPE (DR_REF (dr));
3113 unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type));
3114 unsigned HOST_WIDE_INT access_size = ref_size;
3115 if (GROUP_FIRST_ELEMENT (stmt_vinfo))
3117 gcc_assert (GROUP_FIRST_ELEMENT (stmt_vinfo) == DR_STMT (dr));
3118 access_size *= GROUP_SIZE (stmt_vinfo) - GROUP_GAP (stmt_vinfo);
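      /* Illustrative example: a 4-byte reference in a group with
	 GROUP_SIZE == 4 and GROUP_GAP == 1 covers 4 * (4 - 1) == 12 bytes.  */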
3120 if (STMT_VINFO_VEC_STMT (stmt_vinfo)
3121 && (vect_supportable_dr_alignment (dr, false)
3122 == dr_explicit_realign_optimized))
3124 /* We might access a full vector's worth. */
3125 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
3126 access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size;
3128 return access_size;
3131 /* Get the minimum alignment for all the scalar accesses that DR describes. */
3133 static unsigned int
3134 vect_vfa_align (const data_reference *dr)
3136 return TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr)));
3139 /* Function vect_compile_time_alias.
3141 Given data references A and B with equal base and offset, see whether
3142 the alias relation can be decided at compilation time. Return 1 if
3143 it can and the references alias, 0 if it can and the references do
3144 not alias, and -1 if we cannot decide at compile time. SEGMENT_LENGTH_A,
3145 SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
3146 of dr_with_seg_len::{seg_len,access_size} for A and B. */
3148 static int
3149 vect_compile_time_alias (struct data_reference *a, struct data_reference *b,
3150 tree segment_length_a, tree segment_length_b,
3151 unsigned HOST_WIDE_INT access_size_a,
3152 unsigned HOST_WIDE_INT access_size_b)
3154 poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a));
3155 poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b));
3156 poly_uint64 const_length_a;
3157 poly_uint64 const_length_b;
3159 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3160 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3161 [a, a+12) */
3162 if (tree_int_cst_compare (DR_STEP (a), size_zero_node) < 0)
3164 const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
3165 offset_a = (offset_a + access_size_a) - const_length_a;
3167 else
3168 const_length_a = tree_to_poly_uint64 (segment_length_a);
3169 if (tree_int_cst_compare (DR_STEP (b), size_zero_node) < 0)
3171 const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
3172 offset_b = (offset_b + access_size_b) - const_length_b;
3174 else
3175 const_length_b = tree_to_poly_uint64 (segment_length_b);
3177 const_length_a += access_size_a;
3178 const_length_b += access_size_b;
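  /* Illustrative example: with positive steps, offsets 0 and 8, segment
     lengths of 16 bytes and access sizes of 4 bytes, the covered ranges are
     [0, 20) and [8, 28); these are known to overlap, so 1 is returned.  */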
3180 if (ranges_known_overlap_p (offset_a, const_length_a,
3181 offset_b, const_length_b))
3182 return 1;
3184 if (!ranges_maybe_overlap_p (offset_a, const_length_a,
3185 offset_b, const_length_b))
3186 return 0;
3188 return -1;
3191 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3192 in DDR is >= VF. */
3194 static bool
3195 dependence_distance_ge_vf (data_dependence_relation *ddr,
3196 unsigned int loop_depth, poly_uint64 vf)
3198 if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3199 || DDR_NUM_DIST_VECTS (ddr) == 0)
3200 return false;
3202 /* If the dependence is exact, we should have limited the VF instead. */
3203 gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3205 unsigned int i;
3206 lambda_vector dist_v;
3207 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3209 HOST_WIDE_INT dist = dist_v[loop_depth];
3210 if (dist != 0
3211 && !(dist > 0 && DDR_REVERSED_P (ddr))
3212 && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
3213 return false;
3216 if (dump_enabled_p ())
3218 dump_printf_loc (MSG_NOTE, vect_location,
3219 "dependence distance between ");
3220 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
3221 dump_printf (MSG_NOTE, " and ");
3222 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
3223 dump_printf (MSG_NOTE, " is >= VF\n");
3226 return true;
3229 /* Dump LOWER_BOUND using flags DUMP_KIND. Dumps are known to be enabled. */
3231 static void
3232 dump_lower_bound (int dump_kind, const vec_lower_bound &lower_bound)
3234 dump_printf (dump_kind, "%s (", lower_bound.unsigned_p ? "unsigned" : "abs");
3235 dump_generic_expr (dump_kind, TDF_SLIM, lower_bound.expr);
3236 dump_printf (dump_kind, ") >= ");
3237 dump_dec (dump_kind, lower_bound.min_value);
3240 /* Record that the vectorized loop requires the vec_lower_bound described
3241 by EXPR, UNSIGNED_P and MIN_VALUE. */
3243 static void
3244 vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p,
3245 poly_uint64 min_value)
3247 vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3248 for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3249 if (operand_equal_p (lower_bounds[i].expr, expr, 0))
3251 unsigned_p &= lower_bounds[i].unsigned_p;
3252 min_value = upper_bound (lower_bounds[i].min_value, min_value);
3253 if (lower_bounds[i].unsigned_p != unsigned_p
3254 || maybe_lt (lower_bounds[i].min_value, min_value))
3256 lower_bounds[i].unsigned_p = unsigned_p;
3257 lower_bounds[i].min_value = min_value;
3258 if (dump_enabled_p ())
3260 dump_printf_loc (MSG_NOTE, vect_location,
3261 "updating run-time check to ");
3262 dump_lower_bound (MSG_NOTE, lower_bounds[i]);
3263 dump_printf (MSG_NOTE, "\n");
3266 return;
3269 vec_lower_bound lower_bound (expr, unsigned_p, min_value);
3270 if (dump_enabled_p ())
3272 dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that ");
3273 dump_lower_bound (MSG_NOTE, lower_bound);
3274 dump_printf (MSG_NOTE, "\n");
3276 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (lower_bound);
3279 /* Return true if it's unlikely that the step of the vectorized form of DR
3280 will span fewer than GAP bytes. */
3282 static bool
3283 vect_small_gap_p (loop_vec_info loop_vinfo, data_reference *dr, poly_int64 gap)
3285 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
3286 HOST_WIDE_INT count
3287 = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
3288 if (GROUP_FIRST_ELEMENT (stmt_info))
3289 count *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
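  /* Illustrative example: with an estimated VF of 8, a group size of 2 and
     4-byte scalars, one vector step is expected to span 8 * 2 * 4 == 64
     bytes, so any gap of at most 64 bytes counts as small.  */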
3290 return estimated_poly_value (gap) <= count * vect_get_scalar_dr_size (dr);
3293 /* Return true if we know that there is no alias between DR_A and DR_B
3294 when abs (DR_STEP (DR_A)) >= N for some N. When returning true, set
3295 *LOWER_BOUND_OUT to this N. */
3297 static bool
3298 vectorizable_with_step_bound_p (data_reference *dr_a, data_reference *dr_b,
3299 poly_uint64 *lower_bound_out)
3301 /* Check that there is a constant gap of known sign between DR_A
3302 and DR_B. */
3303 poly_int64 init_a, init_b;
3304 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0)
3305 || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0)
3306 || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)
3307 || !poly_int_tree_p (DR_INIT (dr_a), &init_a)
3308 || !poly_int_tree_p (DR_INIT (dr_b), &init_b)
3309 || !ordered_p (init_a, init_b))
3310 return false;
3312 /* Sort DR_A and DR_B by the address they access. */
3313 if (maybe_lt (init_b, init_a))
3315 std::swap (init_a, init_b);
3316 std::swap (dr_a, dr_b);
3319 /* If the two accesses could be dependent within a scalar iteration,
3320 make sure that we'd retain their order. */
3321 if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_a), init_b)
3322 && !vect_preserves_scalar_order_p (DR_STMT (dr_a), DR_STMT (dr_b)))
3323 return false;
3325 /* There is no alias if abs (DR_STEP) is greater than or equal to
3326 the bytes spanned by the combination of the two accesses. */
3327 *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_b) - init_a;
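  /* Illustrative example: for accesses at byte offsets 0 and 8 where the
     second access is 4 bytes wide, *LOWER_BOUND_OUT is 12, i.e. there is
     no alias whenever abs (DR_STEP) >= 12.  */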
3328 return true;
3331 /* Function vect_prune_runtime_alias_test_list.
3333 Prune a list of ddrs to be tested at run-time by versioning for alias.
3334 Merge several alias checks into one if possible.
3335 Return FALSE if the resulting list of ddrs is longer than allowed by
3336 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
3338 bool
3339 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3341 typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3342 hash_set <tree_pair_hash> compared_objects;
3344 vec<ddr_p> may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3345 vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3346 = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3347 vec<vec_object_pair> &check_unequal_addrs
3348 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3349 poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3350 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3352 ddr_p ddr;
3353 unsigned int i;
3354 tree length_factor;
3356 if (dump_enabled_p ())
3357 dump_printf_loc (MSG_NOTE, vect_location,
3358 "=== vect_prune_runtime_alias_test_list ===\n");
3360 /* Step values are irrelevant for aliasing if the number of vector
3361 iterations is equal to the number of scalar iterations (which can
3362 happen for fully-SLP loops). */
3363 bool ignore_step_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U);
3365 if (!ignore_step_p)
3367 /* Convert the checks for nonzero steps into bound tests. */
3368 tree value;
3369 FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value)
3370 vect_check_lower_bound (loop_vinfo, value, true, 1);
3373 if (may_alias_ddrs.is_empty ())
3374 return true;
3376 comp_alias_ddrs.create (may_alias_ddrs.length ());
3378 unsigned int loop_depth
3379 = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3380 LOOP_VINFO_LOOP_NEST (loop_vinfo));
3382 /* First, we collect all data ref pairs for aliasing checks. */
3383 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3385 int comp_res;
3386 poly_uint64 lower_bound;
3387 struct data_reference *dr_a, *dr_b;
3388 gimple *dr_group_first_a, *dr_group_first_b;
3389 tree segment_length_a, segment_length_b;
3390 unsigned HOST_WIDE_INT access_size_a, access_size_b;
3391 unsigned int align_a, align_b;
3392 gimple *stmt_a, *stmt_b;
3394 /* Ignore the alias if the VF we chose ended up being no greater
3395 than the dependence distance. */
3396 if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3397 continue;
3399 if (DDR_OBJECT_A (ddr))
3401 vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3402 if (!compared_objects.add (new_pair))
3404 if (dump_enabled_p ())
3406 dump_printf_loc (MSG_NOTE, vect_location, "checking that ");
3407 dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.first);
3408 dump_printf (MSG_NOTE, " and ");
3409 dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.second);
3410 dump_printf (MSG_NOTE, " have different addresses\n");
3412 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3414 continue;
3417 dr_a = DDR_A (ddr);
3418 stmt_a = DR_STMT (DDR_A (ddr));
3420 dr_b = DDR_B (ddr);
3421 stmt_b = DR_STMT (DDR_B (ddr));
3423 /* Skip the pair if inter-iteration dependencies are irrelevant
3424 and intra-iteration dependencies are guaranteed to be honored. */
3425 if (ignore_step_p
3426 && (vect_preserves_scalar_order_p (stmt_a, stmt_b)
3427 || vectorizable_with_step_bound_p (dr_a, dr_b, &lower_bound)))
3429 if (dump_enabled_p ())
3431 dump_printf_loc (MSG_NOTE, vect_location,
3432 "no need for alias check between ");
3433 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
3434 dump_printf (MSG_NOTE, " and ");
3435 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
3436 dump_printf (MSG_NOTE, " when VF is 1\n");
3438 continue;
3441 /* See whether we can handle the alias using a bounds check on
3442 the step, and whether that's likely to be the best approach.
3443 (It might not be, for example, if the minimum step is much larger
3444 than the number of bytes handled by one vector iteration.) */
3445 if (!ignore_step_p
3446 && TREE_CODE (DR_STEP (dr_a)) != INTEGER_CST
3447 && vectorizable_with_step_bound_p (dr_a, dr_b, &lower_bound)
3448 && (vect_small_gap_p (loop_vinfo, dr_a, lower_bound)
3449 || vect_small_gap_p (loop_vinfo, dr_b, lower_bound)))
3451 bool unsigned_p = dr_known_forward_stride_p (dr_a);
3452 if (dump_enabled_p ())
3454 dump_printf_loc (MSG_NOTE, vect_location, "no alias between ");
3455 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
3456 dump_printf (MSG_NOTE, " and ");
3457 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
3458 dump_printf (MSG_NOTE, " when the step ");
3459 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_STEP (dr_a));
3460 dump_printf (MSG_NOTE, " is outside ");
3461 if (unsigned_p)
3462 dump_printf (MSG_NOTE, "[0");
3463 else
3465 dump_printf (MSG_NOTE, "(");
3466 dump_dec (MSG_NOTE, poly_int64 (-lower_bound));
3468 dump_printf (MSG_NOTE, ", ");
3469 dump_dec (MSG_NOTE, lower_bound);
3470 dump_printf (MSG_NOTE, ")\n");
3472 vect_check_lower_bound (loop_vinfo, DR_STEP (dr_a), unsigned_p,
3473 lower_bound);
3474 continue;
3477 dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
3478 if (dr_group_first_a)
3480 stmt_a = dr_group_first_a;
3481 dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
3484 dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
3485 if (dr_group_first_b)
3487 stmt_b = dr_group_first_b;
3488 dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
3491 if (ignore_step_p)
3493 segment_length_a = size_zero_node;
3494 segment_length_b = size_zero_node;
3496 else
3498 if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
3499 length_factor = scalar_loop_iters;
3500 else
3501 length_factor = size_int (vect_factor);
3502 segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
3503 segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
3505 access_size_a = vect_vfa_access_size (dr_a);
3506 access_size_b = vect_vfa_access_size (dr_b);
3507 align_a = vect_vfa_align (dr_a);
3508 align_b = vect_vfa_align (dr_b);
3510 comp_res = data_ref_compare_tree (DR_BASE_ADDRESS (dr_a),
3511 DR_BASE_ADDRESS (dr_b));
3512 if (comp_res == 0)
3513 comp_res = data_ref_compare_tree (DR_OFFSET (dr_a),
3514 DR_OFFSET (dr_b));
3516 /* See whether the alias is known at compilation time. */
3517 if (comp_res == 0
3518 && TREE_CODE (DR_STEP (dr_a)) == INTEGER_CST
3519 && TREE_CODE (DR_STEP (dr_b)) == INTEGER_CST
3520 && poly_int_tree_p (segment_length_a)
3521 && poly_int_tree_p (segment_length_b))
3523 int res = vect_compile_time_alias (dr_a, dr_b,
3524 segment_length_a,
3525 segment_length_b,
3526 access_size_a,
3527 access_size_b);
3528 if (res >= 0 && dump_enabled_p ())
3530 dump_printf_loc (MSG_NOTE, vect_location,
3531 "can tell at compile time that ");
3532 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
3533 dump_printf (MSG_NOTE, " and ");
3534 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
3535 if (res == 0)
3536 dump_printf (MSG_NOTE, " do not alias\n");
3537 else
3538 dump_printf (MSG_NOTE, " alias\n");
3541 if (res == 0)
3542 continue;
3544 if (res == 1)
3546 if (dump_enabled_p ())
3547 dump_printf_loc (MSG_NOTE, vect_location,
3548 "not vectorized: compilation time alias.\n");
3549 return false;
3553 dr_with_seg_len_pair_t dr_with_seg_len_pair
3554 (dr_with_seg_len (dr_a, segment_length_a, access_size_a, align_a),
3555 dr_with_seg_len (dr_b, segment_length_b, access_size_b, align_b));
3557 /* Canonicalize pairs by sorting the two DR members. */
3558 if (comp_res > 0)
3559 std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
3561 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3564 prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);
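  /* The pairs that survive pruning are later expanded into a runtime
     alias check guarding a versioned copy of the loop; conceptually
     (sketch only, with A/B standing for the two start addresses and
     LEN_A/LEN_B for the extents derived from the segment lengths and
     access sizes recorded above):

	if (A + LEN_A <= B || B + LEN_B <= A)
	  ... vectorized loop ...
	else
	  ... original scalar loop ...

     One such disjoint-segment test is emitted per pair in
     COMP_ALIAS_DDRS, all combined into a single guard.  */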
3566 unsigned int count = (comp_alias_ddrs.length ()
3567 + check_unequal_addrs.length ());
3569 dump_printf_loc (MSG_NOTE, vect_location,
3570 "improved number of alias checks from %d to %d\n",
3571 may_alias_ddrs.length (), count);
3572 if ((int) count > PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
3574 if (dump_enabled_p ())
3575 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3576 "number of versioning for alias "
3577 "run-time tests exceeds %d "
3578 "(--param vect-max-version-for-alias-checks)\n",
3579 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
3580 return false;
3583 return true;
3586 /* Check whether we can use an internal function for a gather load
3587 or scatter store. READ_P is true for loads and false for stores.
3588 MASKED_P is true if the load or store is conditional. MEMORY_TYPE is
3589 the type of the memory elements being loaded or stored. OFFSET_BITS
3590 is the number of bits in each scalar offset and OFFSET_SIGN is the
3591 sign of the offset. SCALE is the amount by which the offset should
3592 be multiplied *after* it has been converted to address width.
3594 Return true if the function is supported, storing the function
3595 id in *IFN_OUT and the type of a vector element in *ELEMENT_TYPE_OUT. */
3597 bool
3598 vect_gather_scatter_fn_p (bool read_p, bool masked_p, tree vectype,
3599 tree memory_type, unsigned int offset_bits,
3600 signop offset_sign, int scale,
3601 internal_fn *ifn_out, tree *element_type_out)
3603 unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
3604 unsigned int element_bits = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (vectype)));
3605 if (offset_bits > element_bits)
3606 /* Internal functions require the offset to be the same width as
3607 the vector elements. We can extend narrower offsets, but it isn't
3608 safe to truncate wider offsets. */
3609 return false;
3611 if (element_bits != memory_bits)
3612 /* For now the vector elements must be the same width as the
3613 memory elements. */
3614 return false;
3616 /* Work out which function we need. */
3617 internal_fn ifn;
3618 if (read_p)
3619 ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
3620 else
3621 ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
3623 /* Test whether the target supports this combination. */
3624 if (!internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
3625 offset_sign, scale))
3626 return false;
3628 *ifn_out = ifn;
3629 *element_type_out = TREE_TYPE (vectype);
3630 return true;
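/* For example (an illustrative query, not a call made verbatim elsewhere):
   a gather of 32-bit elements addressed as

     base + sign_extend (offset32) * 4

   would be checked with READ_P true, OFFSET_BITS == 32, a signed
   OFFSET_SIGN and SCALE == 4, and succeeds only if the target's
   gather-load support makes internal_gather_scatter_fn_supported_p
   return true for the chosen internal function.  */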
3633 /* CALL is a call to an internal gather load or scatter store function.
3634 Describe the operation in INFO. */
3636 static void
3637 vect_describe_gather_scatter_call (gcall *call, gather_scatter_info *info)
3639 stmt_vec_info stmt_info = vinfo_for_stmt (call);
3640 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3641 data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3643 info->ifn = gimple_call_internal_fn (call);
3644 info->decl = NULL_TREE;
3645 info->base = gimple_call_arg (call, 0);
3646 info->offset = gimple_call_arg (call, 1);
3647 info->offset_dt = vect_unknown_def_type;
3648 info->offset_vectype = NULL_TREE;
3649 info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
3650 info->element_type = TREE_TYPE (vectype);
3651 info->memory_type = TREE_TYPE (DR_REF (dr));
3654 /* Return true if a non-affine read or write in STMT is suitable for a
3655 gather load or scatter store. Describe the operation in *INFO if so. */
3657 bool
3658 vect_check_gather_scatter (gimple *stmt, loop_vec_info loop_vinfo,
3659 gather_scatter_info *info)
3661 HOST_WIDE_INT scale = 1;
3662 poly_int64 pbitpos, pbitsize;
3663 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3664 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3665 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3666 tree offtype = NULL_TREE;
3667 tree decl = NULL_TREE, base, off;
3668 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3669 tree memory_type = TREE_TYPE (DR_REF (dr));
3670 machine_mode pmode;
3671 int punsignedp, reversep, pvolatilep = 0;
3672 internal_fn ifn;
3673 tree element_type;
3674 bool masked_p = false;
3676 /* See whether this is already a call to a gather/scatter internal function.
3677 If not, see whether it's a masked load or store. */
3678 gcall *call = dyn_cast <gcall *> (stmt);
3679 if (call && gimple_call_internal_p (call))
3681 ifn = gimple_call_internal_fn (stmt);
3682 if (internal_gather_scatter_fn_p (ifn))
3684 vect_describe_gather_scatter_call (call, info);
3685 return true;
3687 masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE);
3690 /* True if we should aim to use internal functions rather than
3691 built-in functions. */
3692 bool use_ifn_p = (DR_IS_READ (dr)
3693 ? supports_vec_gather_load_p ()
3694 : supports_vec_scatter_store_p ());
3696 base = DR_REF (dr);
3697 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3698 see if we can use the def stmt of the address. */
3699 if (masked_p
3700 && TREE_CODE (base) == MEM_REF
3701 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3702 && integer_zerop (TREE_OPERAND (base, 1))
3703 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3705 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3706 if (is_gimple_assign (def_stmt)
3707 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3708 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3711 /* The gather and scatter builtins need address of the form
3712 loop_invariant + vector * {1, 2, 4, 8}
3713 or
3714 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3715 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3716 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3717 multiplications and additions in it. To get a vector, we need
3718 a single SSA_NAME that will be defined in the loop and will
3719 contain everything that is not loop invariant and that can be
3720 vectorized. The following code attempts to find such a preexisting
3721 SSA_NAME OFF and put the loop invariants into a tree BASE
3722 that can be gimplified before the loop. */
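  /* For instance (illustrative), for a gather-like access

	for (i = 0; i < n; i++)
	  out[i] = data[idx[i]];

     the reference data[idx[i]] is decomposed so that BASE holds the
     loop-invariant part (roughly &data plus any constant byte offset),
     OFF ends up as the SSA_NAME carrying the (possibly sign-extended)
     value of idx[i], and the multiplication by the element size is
     folded into SCALE by the code below.  */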
3723 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3724 &punsignedp, &reversep, &pvolatilep);
3725 gcc_assert (base && !reversep);
3726 poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
3728 if (TREE_CODE (base) == MEM_REF)
3730 if (!integer_zerop (TREE_OPERAND (base, 1)))
3732 if (off == NULL_TREE)
3733 off = wide_int_to_tree (sizetype, mem_ref_offset (base));
3734 else
3735 off = size_binop (PLUS_EXPR, off,
3736 fold_convert (sizetype, TREE_OPERAND (base, 1)));
3738 base = TREE_OPERAND (base, 0);
3740 else
3741 base = build_fold_addr_expr (base);
3743 if (off == NULL_TREE)
3744 off = size_zero_node;
3746 /* If base is not loop invariant, then if off is 0 we start with just
3747 the constant offset in the loop-invariant BASE and continue with base
3748 as OFF; otherwise give up.
3749 We could handle that case by gimplifying the addition of base + off
3750 into some SSA_NAME and using that as off, but for now punt. */
3751 if (!expr_invariant_in_loop_p (loop, base))
3753 if (!integer_zerop (off))
3754 return false;
3755 off = base;
3756 base = size_int (pbytepos);
3758 /* Otherwise put base + constant offset into the loop invariant BASE
3759 and continue with OFF. */
3760 else
3762 base = fold_convert (sizetype, base);
3763 base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
3766 /* OFF at this point may be either a SSA_NAME or some tree expression
3767 from get_inner_reference. Try to peel off loop invariants from it
3768 into BASE as long as possible. */
3769 STRIP_NOPS (off);
3770 while (offtype == NULL_TREE)
3772 enum tree_code code;
3773 tree op0, op1, add = NULL_TREE;
3775 if (TREE_CODE (off) == SSA_NAME)
3777 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3779 if (expr_invariant_in_loop_p (loop, off))
3780 return false;
3782 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3783 break;
3785 op0 = gimple_assign_rhs1 (def_stmt);
3786 code = gimple_assign_rhs_code (def_stmt);
3787 op1 = gimple_assign_rhs2 (def_stmt);
3789 else
3791 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3792 return false;
3793 code = TREE_CODE (off);
3794 extract_ops_from_tree (off, &code, &op0, &op1);
3796 switch (code)
3798 case POINTER_PLUS_EXPR:
3799 case PLUS_EXPR:
3800 if (expr_invariant_in_loop_p (loop, op0))
3802 add = op0;
3803 off = op1;
3804 do_add:
3805 add = fold_convert (sizetype, add);
3806 if (scale != 1)
3807 add = size_binop (MULT_EXPR, add, size_int (scale));
3808 base = size_binop (PLUS_EXPR, base, add);
3809 continue;
3811 if (expr_invariant_in_loop_p (loop, op1))
3813 add = op1;
3814 off = op0;
3815 goto do_add;
3817 break;
3818 case MINUS_EXPR:
3819 if (expr_invariant_in_loop_p (loop, op1))
3821 add = fold_convert (sizetype, op1);
3822 add = size_binop (MINUS_EXPR, size_zero_node, add);
3823 off = op0;
3824 goto do_add;
3826 break;
3827 case MULT_EXPR:
3828 if (scale == 1 && tree_fits_shwi_p (op1))
3830 int new_scale = tree_to_shwi (op1);
3831 /* Only treat this as a scaling operation if the target
3832 supports it. */
3833 if (use_ifn_p
3834 && !vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p,
3835 vectype, memory_type, 1,
3836 TYPE_SIGN (TREE_TYPE (op0)),
3837 new_scale, &ifn,
3838 &element_type))
3839 break;
3840 scale = new_scale;
3841 off = op0;
3842 continue;
3844 break;
3845 case SSA_NAME:
3846 off = op0;
3847 continue;
3848 CASE_CONVERT:
3849 if (!POINTER_TYPE_P (TREE_TYPE (op0))
3850 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3851 break;
3852 if (TYPE_PRECISION (TREE_TYPE (op0))
3853 == TYPE_PRECISION (TREE_TYPE (off)))
3855 off = op0;
3856 continue;
3859 /* The internal functions need the offset to be the same width
3860 as the elements of VECTYPE. Don't include operations that
3861 cast the offset from that width to a different width. */
3862 if (use_ifn_p
3863 && (int_size_in_bytes (TREE_TYPE (vectype))
3864 == int_size_in_bytes (TREE_TYPE (off))))
3865 break;
3867 if (TYPE_PRECISION (TREE_TYPE (op0))
3868 < TYPE_PRECISION (TREE_TYPE (off)))
3870 off = op0;
3871 offtype = TREE_TYPE (off);
3872 STRIP_NOPS (off);
3873 continue;
3875 break;
3876 default:
3877 break;
3879 break;
3882 /* If at the end OFF still isn't a SSA_NAME or isn't
3883 defined in the loop, punt. */
3884 if (TREE_CODE (off) != SSA_NAME
3885 || expr_invariant_in_loop_p (loop, off))
3886 return false;
3888 if (offtype == NULL_TREE)
3889 offtype = TREE_TYPE (off);
3891 if (use_ifn_p)
3893 if (!vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p, vectype,
3894 memory_type, TYPE_PRECISION (offtype),
3895 TYPE_SIGN (offtype), scale, &ifn,
3896 &element_type))
3897 return false;
3899 else
3901 if (DR_IS_READ (dr))
3903 if (targetm.vectorize.builtin_gather)
3904 decl = targetm.vectorize.builtin_gather (vectype, offtype, scale);
3906 else
3908 if (targetm.vectorize.builtin_scatter)
3909 decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale);
3912 if (!decl)
3913 return false;
3915 ifn = IFN_LAST;
3916 element_type = TREE_TYPE (vectype);
3919 info->ifn = ifn;
3920 info->decl = decl;
3921 info->base = base;
3922 info->offset = off;
3923 info->offset_dt = vect_unknown_def_type;
3924 info->offset_vectype = NULL_TREE;
3925 info->scale = scale;
3926 info->element_type = element_type;
3927 info->memory_type = memory_type;
3928 return true;
3931 /* Function vect_analyze_data_refs.
3933 Find all the data references in the loop or basic block.
3935 The general structure of the analysis of data refs in the vectorizer is as
3936 follows:
3937 1- vect_analyze_data_refs(loop/bb): call
3938 compute_data_dependences_for_loop/bb to find and analyze all data-refs
3939 in the loop/bb and their dependences.
3940 2- vect_analyze_dependences(): apply dependence testing using ddrs.
3941 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
3942 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
3946 bool
3947 vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf)
3949 struct loop *loop = NULL;
3950 unsigned int i;
3951 struct data_reference *dr;
3952 tree scalar_type;
3954 if (dump_enabled_p ())
3955 dump_printf_loc (MSG_NOTE, vect_location,
3956 "=== vect_analyze_data_refs ===\n");
3958 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3959 loop = LOOP_VINFO_LOOP (loop_vinfo);
3961 /* Go through the data-refs, check that the analysis succeeded. Update
3962 pointer from stmt_vec_info struct to DR and vectype. */
3964 vec<data_reference_p> datarefs = vinfo->datarefs;
3965 FOR_EACH_VEC_ELT (datarefs, i, dr)
3967 gimple *stmt;
3968 stmt_vec_info stmt_info;
3969 tree base, offset, init;
3970 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
3971 bool simd_lane_access = false;
3972 poly_uint64 vf;
3974 again:
3975 if (!dr || !DR_REF (dr))
3977 if (dump_enabled_p ())
3978 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3979 "not vectorized: unhandled data-ref\n");
3980 return false;
3983 stmt = DR_STMT (dr);
3984 stmt_info = vinfo_for_stmt (stmt);
3986 /* Discard clobbers from the dataref vector. We will remove
3987 clobber stmts during vectorization. */
3988 if (gimple_clobber_p (stmt))
3990 free_data_ref (dr);
3991 if (i == datarefs.length () - 1)
3993 datarefs.pop ();
3994 break;
3996 datarefs.ordered_remove (i);
3997 dr = datarefs[i];
3998 goto again;
4001 /* Check that analysis of the data-ref succeeded. */
4002 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
4003 || !DR_STEP (dr))
4005 bool maybe_gather
4006 = DR_IS_READ (dr)
4007 && !TREE_THIS_VOLATILE (DR_REF (dr))
4008 && (targetm.vectorize.builtin_gather != NULL
4009 || supports_vec_gather_load_p ());
4010 bool maybe_scatter
4011 = DR_IS_WRITE (dr)
4012 && !TREE_THIS_VOLATILE (DR_REF (dr))
4013 && (targetm.vectorize.builtin_scatter != NULL
4014 || supports_vec_scatter_store_p ());
4015 bool maybe_simd_lane_access
4016 = is_a <loop_vec_info> (vinfo) && loop->simduid;
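	  /* A typical source of SIMD lane accesses (illustrative):

	       #pragma omp simd
	       for (i = 0; i < n; i++)
		 {
		   tmp = a[i] + 1;
		   b[i] = tmp * tmp;
		 }

	     where OMP lowering privatizes TMP into a per-lane array
	     indexed by the result of an IFN_GOMP_SIMD_LANE call keyed
	     on LOOP->simduid; that is the pattern matched below.  */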
4018 /* If target supports vector gather loads or scatter stores, or if
4019 this might be a SIMD lane access, see whether they can be used. */
4020 if (is_a <loop_vec_info> (vinfo)
4021 && (maybe_gather || maybe_scatter || maybe_simd_lane_access)
4022 && !nested_in_vect_loop_p (loop, stmt))
4024 struct data_reference *newdr
4025 = create_data_ref (NULL, loop_containing_stmt (stmt),
4026 DR_REF (dr), stmt, !maybe_scatter,
4027 DR_IS_CONDITIONAL_IN_STMT (dr));
4028 gcc_assert (newdr != NULL && DR_REF (newdr));
4029 if (DR_BASE_ADDRESS (newdr)
4030 && DR_OFFSET (newdr)
4031 && DR_INIT (newdr)
4032 && DR_STEP (newdr)
4033 && integer_zerop (DR_STEP (newdr)))
4035 if (maybe_simd_lane_access)
4037 tree off = DR_OFFSET (newdr);
4038 STRIP_NOPS (off);
4039 if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
4040 && TREE_CODE (off) == MULT_EXPR
4041 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
4043 tree step = TREE_OPERAND (off, 1);
4044 off = TREE_OPERAND (off, 0);
4045 STRIP_NOPS (off);
4046 if (CONVERT_EXPR_P (off)
4047 && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
4048 0)))
4049 < TYPE_PRECISION (TREE_TYPE (off)))
4050 off = TREE_OPERAND (off, 0);
4051 if (TREE_CODE (off) == SSA_NAME)
4053 gimple *def = SSA_NAME_DEF_STMT (off);
4054 tree reft = TREE_TYPE (DR_REF (newdr));
4055 if (is_gimple_call (def)
4056 && gimple_call_internal_p (def)
4057 && (gimple_call_internal_fn (def)
4058 == IFN_GOMP_SIMD_LANE))
4060 tree arg = gimple_call_arg (def, 0);
4061 gcc_assert (TREE_CODE (arg) == SSA_NAME);
4062 arg = SSA_NAME_VAR (arg);
4063 if (arg == loop->simduid
4064 /* For now. */
4065 && tree_int_cst_equal
4066 (TYPE_SIZE_UNIT (reft),
4067 step))
4069 DR_OFFSET (newdr) = ssize_int (0);
4070 DR_STEP (newdr) = step;
4071 DR_OFFSET_ALIGNMENT (newdr)
4072 = BIGGEST_ALIGNMENT;
4073 DR_STEP_ALIGNMENT (newdr)
4074 = highest_pow2_factor (step);
4075 dr = newdr;
4076 simd_lane_access = true;
4082 if (!simd_lane_access && (maybe_gather || maybe_scatter))
4084 dr = newdr;
4085 if (maybe_gather)
4086 gatherscatter = GATHER;
4087 else
4088 gatherscatter = SCATTER;
4091 if (gatherscatter == SG_NONE && !simd_lane_access)
4092 free_data_ref (newdr);
4095 if (gatherscatter == SG_NONE && !simd_lane_access)
4097 if (dump_enabled_p ())
4099 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4100 "not vectorized: data ref analysis "
4101 "failed ");
4102 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4105 if (is_a <bb_vec_info> (vinfo))
4106 break;
4108 return false;
4112 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
4114 if (dump_enabled_p ())
4115 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4116 "not vectorized: base addr of dr is a "
4117 "constant\n");
4119 if (is_a <bb_vec_info> (vinfo))
4120 break;
4122 if (gatherscatter != SG_NONE || simd_lane_access)
4123 free_data_ref (dr);
4124 return false;
4127 if (TREE_THIS_VOLATILE (DR_REF (dr)))
4129 if (dump_enabled_p ())
4131 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4132 "not vectorized: volatile type ");
4133 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4136 if (is_a <bb_vec_info> (vinfo))
4137 break;
4139 return false;
4142 if (stmt_can_throw_internal (stmt))
4144 if (dump_enabled_p ())
4146 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4147 "not vectorized: statement can throw an "
4148 "exception ");
4149 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4152 if (is_a <bb_vec_info> (vinfo))
4153 break;
4155 if (gatherscatter != SG_NONE || simd_lane_access)
4156 free_data_ref (dr);
4157 return false;
4160 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
4161 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
4163 if (dump_enabled_p ())
4165 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4166 "not vectorized: statement is bitfield "
4167 "access ");
4168 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4171 if (is_a <bb_vec_info> (vinfo))
4172 break;
4174 if (gatherscatter != SG_NONE || simd_lane_access)
4175 free_data_ref (dr);
4176 return false;
4179 base = unshare_expr (DR_BASE_ADDRESS (dr));
4180 offset = unshare_expr (DR_OFFSET (dr));
4181 init = unshare_expr (DR_INIT (dr));
4183 if (is_gimple_call (stmt)
4184 && (!gimple_call_internal_p (stmt)
4185 || (gimple_call_internal_fn (stmt) != IFN_MASK_LOAD
4186 && gimple_call_internal_fn (stmt) != IFN_MASK_STORE)))
4188 if (dump_enabled_p ())
4190 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4191 "not vectorized: dr in a call ");
4192 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4195 if (is_a <bb_vec_info> (vinfo))
4196 break;
4198 if (gatherscatter != SG_NONE || simd_lane_access)
4199 free_data_ref (dr);
4200 return false;
4203 /* Update DR field in stmt_vec_info struct. */
4205 /* If the dataref is in an inner-loop of the loop that is considered for
4206 vectorization, we also want to analyze the access relative to
4207 the outer-loop (DR contains information only relative to the
4208 inner-most enclosing loop). We do that by building a reference to the
4209 first location accessed by the inner-loop, and analyze it relative to
4210 the outer-loop. */
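	 /* For instance (illustrative), when vectorizing the I loop of

	      for (i = 0; i < N; i++)
		for (j = 0; j < M; j++)
		  s += a[i][j];

	    the access a[i][j] has a unit step relative to the J loop,
	    but relative to the outer I loop the interesting reference
	    is a[i][0], whose address advances by a whole row per outer
	    iteration; that is what the analysis below computes.  */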
4211 if (loop && nested_in_vect_loop_p (loop, stmt))
4213 /* Build a reference to the first location accessed by the
4214 inner loop: *(BASE + INIT + OFFSET). By construction,
4215 this address must be invariant in the inner loop, so we
4216 can consider it as being used in the outer loop. */
4217 tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
4218 init, offset);
4219 tree init_addr = fold_build_pointer_plus (base, init_offset);
4220 tree init_ref = build_fold_indirect_ref (init_addr);
4222 if (dump_enabled_p ())
4224 dump_printf_loc (MSG_NOTE, vect_location,
4225 "analyze in outer loop: ");
4226 dump_generic_expr (MSG_NOTE, TDF_SLIM, init_ref);
4227 dump_printf (MSG_NOTE, "\n");
4230 if (!dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
4231 init_ref, loop))
4232 /* dr_analyze_innermost already explained the failure. */
4233 return false;
4235 if (dump_enabled_p ())
4237 dump_printf_loc (MSG_NOTE, vect_location,
4238 "\touter base_address: ");
4239 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4240 STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
4241 dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
4242 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4243 STMT_VINFO_DR_OFFSET (stmt_info));
4244 dump_printf (MSG_NOTE,
4245 "\n\touter constant offset from base address: ");
4246 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4247 STMT_VINFO_DR_INIT (stmt_info));
4248 dump_printf (MSG_NOTE, "\n\touter step: ");
4249 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4250 STMT_VINFO_DR_STEP (stmt_info));
4251 dump_printf (MSG_NOTE, "\n\touter base alignment: %d\n",
4252 STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info));
4253 dump_printf (MSG_NOTE, "\n\touter base misalignment: %d\n",
4254 STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info));
4255 dump_printf (MSG_NOTE, "\n\touter offset alignment: %d\n",
4256 STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info));
4257 dump_printf (MSG_NOTE, "\n\touter step alignment: %d\n",
4258 STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
4262 if (STMT_VINFO_DATA_REF (stmt_info))
4264 if (dump_enabled_p ())
4266 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4267 "not vectorized: more than one data ref "
4268 "in stmt: ");
4269 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4272 if (is_a <bb_vec_info> (vinfo))
4273 break;
4275 if (gatherscatter != SG_NONE || simd_lane_access)
4276 free_data_ref (dr);
4277 return false;
4280 STMT_VINFO_DATA_REF (stmt_info) = dr;
4281 if (simd_lane_access)
4283 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
4284 free_data_ref (datarefs[i]);
4285 datarefs[i] = dr;
4288 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == ADDR_EXPR
4289 && VAR_P (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0))
4290 && DECL_NONALIASED (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0)))
4292 if (dump_enabled_p ())
4294 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4295 "not vectorized: base object not addressable "
4296 "for stmt: ");
4297 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4299 if (is_a <bb_vec_info> (vinfo))
4301 /* In BB vectorization the ref can still participate
4302 in dependence analysis, we just can't vectorize it. */
4303 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4304 continue;
4306 return false;
4309 /* Set vectype for STMT. */
4310 scalar_type = TREE_TYPE (DR_REF (dr));
4311 STMT_VINFO_VECTYPE (stmt_info)
4312 = get_vectype_for_scalar_type (scalar_type);
4313 if (!STMT_VINFO_VECTYPE (stmt_info))
4315 if (dump_enabled_p ())
4317 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4318 "not vectorized: no vectype for stmt: ");
4319 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4320 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
4321 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
4322 scalar_type);
4323 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
4326 if (is_a <bb_vec_info> (vinfo))
4328 /* No vector type is fine, the ref can still participate
4329 in dependence analysis, we just can't vectorize it. */
4330 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4331 continue;
4334 if (gatherscatter != SG_NONE || simd_lane_access)
4336 STMT_VINFO_DATA_REF (stmt_info) = NULL;
4337 if (gatherscatter != SG_NONE)
4338 free_data_ref (dr);
4340 return false;
4342 else
4344 if (dump_enabled_p ())
4346 dump_printf_loc (MSG_NOTE, vect_location,
4347 "got vectype for stmt: ");
4348 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
4349 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4350 STMT_VINFO_VECTYPE (stmt_info));
4351 dump_printf (MSG_NOTE, "\n");
4355 /* Adjust the minimal vectorization factor according to the
4356 vector type. */
4357 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
4358 *min_vf = upper_bound (*min_vf, vf);
4360 if (gatherscatter != SG_NONE)
4362 gather_scatter_info gs_info;
4363 if (!vect_check_gather_scatter (stmt, as_a <loop_vec_info> (vinfo),
4364 &gs_info)
4365 || !get_vectype_for_scalar_type (TREE_TYPE (gs_info.offset)))
4367 STMT_VINFO_DATA_REF (stmt_info) = NULL;
4368 free_data_ref (dr);
4369 if (dump_enabled_p ())
4371 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4372 (gatherscatter == GATHER) ?
4373 "not vectorized: not suitable for gather "
4374 "load " :
4375 "not vectorized: not suitable for scatter "
4376 "store ");
4377 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4379 return false;
4382 free_data_ref (datarefs[i]);
4383 datarefs[i] = dr;
4384 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
4387 else if (is_a <loop_vec_info> (vinfo)
4388 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
4390 if (nested_in_vect_loop_p (loop, stmt))
4392 if (dump_enabled_p ())
4394 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4395 "not vectorized: not suitable for strided "
4396 "load ");
4397 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4399 return false;
4401 STMT_VINFO_STRIDED_P (stmt_info) = true;
4405 /* If we stopped analysis at the first dataref we could not analyze
4406 when trying to vectorize a basic block, mark the rest of the datarefs
4407 as not vectorizable and truncate the vector of datarefs. That
4408 avoids uselessly spending time analyzing their dependences. */
4409 if (i != datarefs.length ())
4411 gcc_assert (is_a <bb_vec_info> (vinfo));
4412 for (unsigned j = i; j < datarefs.length (); ++j)
4414 data_reference_p dr = datarefs[j];
4415 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
4416 free_data_ref (dr);
4418 datarefs.truncate (i);
4421 return true;
4425 /* Function vect_get_new_vect_var.
4427 Returns a name for a new variable. The current naming scheme appends the
4428 prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
4429 the name of vectorizer generated variables, and appends that to NAME if
4430 provided. */
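/* For example, a request such as
     vect_get_new_vect_var (ptr_type, vect_pointer_var, "a")
   yields a temporary named along the lines of "vectp_a" (further
   uniquifying suffixes may be appended later).  */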
4432 tree
4433 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
4435 const char *prefix;
4436 tree new_vect_var;
4438 switch (var_kind)
4440 case vect_simple_var:
4441 prefix = "vect";
4442 break;
4443 case vect_scalar_var:
4444 prefix = "stmp";
4445 break;
4446 case vect_mask_var:
4447 prefix = "mask";
4448 break;
4449 case vect_pointer_var:
4450 prefix = "vectp";
4451 break;
4452 default:
4453 gcc_unreachable ();
4456 if (name)
4458 char* tmp = concat (prefix, "_", name, NULL);
4459 new_vect_var = create_tmp_reg (type, tmp);
4460 free (tmp);
4462 else
4463 new_vect_var = create_tmp_reg (type, prefix);
4465 return new_vect_var;
4468 /* Like vect_get_new_vect_var but return an SSA name. */
4470 tree
4471 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
4473 const char *prefix;
4474 tree new_vect_var;
4476 switch (var_kind)
4478 case vect_simple_var:
4479 prefix = "vect";
4480 break;
4481 case vect_scalar_var:
4482 prefix = "stmp";
4483 break;
4484 case vect_pointer_var:
4485 prefix = "vectp";
4486 break;
4487 default:
4488 gcc_unreachable ();
4491 if (name)
4493 char* tmp = concat (prefix, "_", name, NULL);
4494 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4495 free (tmp);
4497 else
4498 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4500 return new_vect_var;
4503 /* Duplicate ptr info and set alignment/misalignment on NAME from DR. */
4505 static void
4506 vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr)
4508 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr));
4509 int misalign = DR_MISALIGNMENT (dr);
4510 if (misalign == DR_MISALIGNMENT_UNKNOWN)
4511 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4512 else
4513 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name),
4514 DR_TARGET_ALIGNMENT (dr), misalign);
4517 /* Function vect_create_addr_base_for_vector_ref.
4519 Create an expression that computes the address of the first memory location
4520 that will be accessed for a data reference.
4522 Input:
4523 STMT: The statement containing the data reference.
4524 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4525 OFFSET: Optional. If supplied, it is added to the initial address.
4526 LOOP: Specify relative to which loop-nest should the address be computed.
4527 For example, when the dataref is in an inner-loop nested in an
4528 outer-loop that is now being vectorized, LOOP can be either the
4529 outer-loop, or the inner-loop. The first memory location accessed
4530 by the following dataref ('in' points to short):
4532 for (i=0; i<N; i++)
4533 for (j=0; j<M; j++)
4534 s += in[i+j]
4536 is as follows:
4537 if LOOP=i_loop: &in (relative to i_loop)
4538 if LOOP=j_loop: &in+i*2B (relative to j_loop)
4539 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
4540 initial address. Unlike OFFSET, which is the number of elements to
4541 be added, BYTE_OFFSET is measured in bytes.
4543 Output:
4544 1. Return an SSA_NAME whose value is the address of the memory location of
4545 the first vector of the data reference.
4546 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4547 these statement(s) which define the returned SSA_NAME.
4549 FORNOW: We are only handling array accesses with step 1. */
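/* For example (illustrative), for a dataref of type short:
   OFFSET == 4 advances the initial address by 4 elements, i.e. 8 bytes,
   whereas BYTE_OFFSET == 8 advances it by exactly 8 bytes, so both
   describe the same location.  */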
4551 tree
4552 vect_create_addr_base_for_vector_ref (gimple *stmt,
4553 gimple_seq *new_stmt_list,
4554 tree offset,
4555 tree byte_offset)
4557 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4558 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4559 const char *base_name;
4560 tree addr_base;
4561 tree dest;
4562 gimple_seq seq = NULL;
4563 tree vect_ptr_type;
4564 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
4565 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4566 innermost_loop_behavior *drb = vect_dr_behavior (dr);
4568 tree data_ref_base = unshare_expr (drb->base_address);
4569 tree base_offset = unshare_expr (drb->offset);
4570 tree init = unshare_expr (drb->init);
4572 if (loop_vinfo)
4573 base_name = get_name (data_ref_base);
4574 else
4576 base_offset = ssize_int (0);
4577 init = ssize_int (0);
4578 base_name = get_name (DR_REF (dr));
4581 /* Create base_offset */
4582 base_offset = size_binop (PLUS_EXPR,
4583 fold_convert (sizetype, base_offset),
4584 fold_convert (sizetype, init));
4586 if (offset)
4588 offset = fold_build2 (MULT_EXPR, sizetype,
4589 fold_convert (sizetype, offset), step);
4590 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4591 base_offset, offset);
4593 if (byte_offset)
4595 byte_offset = fold_convert (sizetype, byte_offset);
4596 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4597 base_offset, byte_offset);
4600 /* base + base_offset */
4601 if (loop_vinfo)
4602 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4603 else
4605 addr_base = build1 (ADDR_EXPR,
4606 build_pointer_type (TREE_TYPE (DR_REF (dr))),
4607 unshare_expr (DR_REF (dr)));
4610 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4611 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4612 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4613 gimple_seq_add_seq (new_stmt_list, seq);
4615 if (DR_PTR_INFO (dr)
4616 && TREE_CODE (addr_base) == SSA_NAME
4617 && !SSA_NAME_PTR_INFO (addr_base))
4619 vect_duplicate_ssa_name_ptr_info (addr_base, dr);
4620 if (offset || byte_offset)
4621 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4624 if (dump_enabled_p ())
4626 dump_printf_loc (MSG_NOTE, vect_location, "created ");
4627 dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
4628 dump_printf (MSG_NOTE, "\n");
4631 return addr_base;
4635 /* Function vect_create_data_ref_ptr.
4637 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4638 location accessed in the loop by STMT, along with the def-use update
4639 chain to appropriately advance the pointer through the loop iterations.
4640 Also set aliasing information for the pointer. This pointer is used by
4641 the callers to this function to create a memory reference expression for
4642 vector load/store access.
4644 Input:
4645 1. STMT: a stmt that references memory. Expected to be of the form
4646 GIMPLE_ASSIGN <name, data-ref> or
4647 GIMPLE_ASSIGN <data-ref, name>.
4648 2. AGGR_TYPE: the type of the reference, which should be either a vector
4649 or an array.
4650 3. AT_LOOP: the loop where the vector memref is to be created.
4651 4. OFFSET (optional): an offset to be added to the initial address accessed
4652 by the data-ref in STMT.
4653 5. BSI: location where the new stmts are to be placed if there is no loop
4654 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4655 pointing to the initial address.
4656 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4657 to the initial address accessed by the data-ref in STMT. This is
4658 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4659 in bytes.
4660 8. IV_STEP (optional, defaults to NULL): the amount that should be added
4661 to the IV during each iteration of the loop. NULL says to move
4662 by one copy of AGGR_TYPE up or down, depending on the step of the
4663 data reference.
4665 Output:
4666 1. Declare a new ptr to vector_type, and have it point to the base of the
4667 data reference (initial address accessed by the data reference).
4668 For example, for vector of type V8HI, the following code is generated:
4670 v8hi *ap;
4671 ap = (v8hi *)initial_address;
4673 if OFFSET is not supplied:
4674 initial_address = &a[init];
4675 if OFFSET is supplied:
4676 initial_address = &a[init + OFFSET];
4677 if BYTE_OFFSET is supplied:
4678 initial_address = &a[init] + BYTE_OFFSET;
4680 Return the initial_address in INITIAL_ADDRESS.
4682 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4683 update the pointer in each iteration of the loop.
4685 Return the increment stmt that updates the pointer in PTR_INCR.
4687 3. Set INV_P to true if the access pattern of the data reference in the
4688 vectorized loop is invariant. Set it to false otherwise.
4690 4. Return the pointer. */
4692 tree
4693 vect_create_data_ref_ptr (gimple *stmt, tree aggr_type, struct loop *at_loop,
4694 tree offset, tree *initial_address,
4695 gimple_stmt_iterator *gsi, gimple **ptr_incr,
4696 bool only_init, bool *inv_p, tree byte_offset,
4697 tree iv_step)
4699 const char *base_name;
4700 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4701 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4702 struct loop *loop = NULL;
4703 bool nested_in_vect_loop = false;
4704 struct loop *containing_loop = NULL;
4705 tree aggr_ptr_type;
4706 tree aggr_ptr;
4707 tree new_temp;
4708 gimple_seq new_stmt_list = NULL;
4709 edge pe = NULL;
4710 basic_block new_bb;
4711 tree aggr_ptr_init;
4712 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4713 tree aptr;
4714 gimple_stmt_iterator incr_gsi;
4715 bool insert_after;
4716 tree indx_before_incr, indx_after_incr;
4717 gimple *incr;
4718 tree step;
4719 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4721 gcc_assert (iv_step != NULL_TREE
4722 || TREE_CODE (aggr_type) == ARRAY_TYPE
4723 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4725 if (loop_vinfo)
4727 loop = LOOP_VINFO_LOOP (loop_vinfo);
4728 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4729 containing_loop = (gimple_bb (stmt))->loop_father;
4730 pe = loop_preheader_edge (loop);
4732 else
4734 gcc_assert (bb_vinfo);
4735 only_init = true;
4736 *ptr_incr = NULL;
4739 /* Check the step (evolution) of the load in LOOP, and record
4740 whether it's invariant. */
4741 step = vect_dr_behavior (dr)->step;
4742 if (integer_zerop (step))
4743 *inv_p = true;
4744 else
4745 *inv_p = false;
4747 /* Create an expression for the first address accessed by this load
4748 in LOOP. */
4749 base_name = get_name (DR_BASE_ADDRESS (dr));
4751 if (dump_enabled_p ())
4753 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4754 dump_printf_loc (MSG_NOTE, vect_location,
4755 "create %s-pointer variable to type: ",
4756 get_tree_code_name (TREE_CODE (aggr_type)));
4757 dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4758 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4759 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4760 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4761 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4762 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4763 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4764 else
4765 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4766 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4767 dump_printf (MSG_NOTE, "\n");
4770 /* (1) Create the new aggregate-pointer variable.
4771 Vector and array types inherit the alias set of their component
4772 type by default so we need to use a ref-all pointer if the data
4773 reference does not conflict with the created aggregated data
4774 reference because it is not addressable. */
4775 bool need_ref_all = false;
4776 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4777 get_alias_set (DR_REF (dr))))
4778 need_ref_all = true;
4779 /* Likewise for any of the data references in the stmt group. */
4780 else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
4782 gimple *orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
4785 stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
4786 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4787 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4788 get_alias_set (DR_REF (sdr))))
4790 need_ref_all = true;
4791 break;
4793 orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);
4795 while (orig_stmt);
4797 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4798 need_ref_all);
4799 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4802 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4803 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4804 def-use update cycles for the pointer: one relative to the outer-loop
4805 (LOOP), which is what steps (3) and (4) below do. The other is relative
4806 to the inner-loop (which is the inner-most loop containing the dataref),
4807 and this is done by step (5) below.
4809 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4810 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4811 redundant. Steps (3),(4) create the following:
4813 vp0 = &base_addr;
4814 LOOP: vp1 = phi(vp0,vp2)
4817 vp2 = vp1 + step
4818 goto LOOP
4820 If there is an inner-loop nested in loop, then step (5) will also be
4821 applied, and an additional update in the inner-loop will be created:
4823 vp0 = &base_addr;
4824 LOOP: vp1 = phi(vp0,vp2)
4826 inner: vp3 = phi(vp1,vp4)
4827 vp4 = vp3 + inner_step
4828 if () goto inner
4830 vp2 = vp1 + step
4831 if () goto LOOP */
4833 /* (2) Calculate the initial address of the aggregate-pointer, and set
4834 the aggregate-pointer to point to it before the loop. */
4836 /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader. */
4838 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
4839 offset, byte_offset);
4840 if (new_stmt_list)
4842 if (pe)
4844 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4845 gcc_assert (!new_bb);
4847 else
4848 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4851 *initial_address = new_temp;
4852 aggr_ptr_init = new_temp;
4854 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4855 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4856 inner-loop nested in LOOP (during outer-loop vectorization). */
4858 /* No update in loop is required. */
4859 if (only_init && (!loop_vinfo || at_loop == loop))
4860 aptr = aggr_ptr_init;
4861 else
4863 if (iv_step == NULL_TREE)
4865 /* The step of the aggregate pointer is the type size. */
4866 iv_step = TYPE_SIZE_UNIT (aggr_type);
4867 /* One exception to the above is when the scalar step of the load in
4868 LOOP is zero. In this case the step here is also zero. */
4869 if (*inv_p)
4870 iv_step = size_zero_node;
4871 else if (tree_int_cst_sgn (step) == -1)
4872 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4875 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4877 create_iv (aggr_ptr_init,
4878 fold_convert (aggr_ptr_type, iv_step),
4879 aggr_ptr, loop, &incr_gsi, insert_after,
4880 &indx_before_incr, &indx_after_incr);
4881 incr = gsi_stmt (incr_gsi);
4882 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4884 /* Copy the points-to information if it exists. */
4885 if (DR_PTR_INFO (dr))
4887 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr);
4888 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr);
4890 if (ptr_incr)
4891 *ptr_incr = incr;
4893 aptr = indx_before_incr;
4896 if (!nested_in_vect_loop || only_init)
4897 return aptr;
4900 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4901 nested in LOOP, if it exists. */
4903 gcc_assert (nested_in_vect_loop);
4904 if (!only_init)
4906 standard_iv_increment_position (containing_loop, &incr_gsi,
4907 &insert_after);
4908 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4909 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4910 &indx_after_incr);
4911 incr = gsi_stmt (incr_gsi);
4912 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4914 /* Copy the points-to information if it exists. */
4915 if (DR_PTR_INFO (dr))
4917 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr);
4918 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr);
4920 if (ptr_incr)
4921 *ptr_incr = incr;
4923 return indx_before_incr;
4925 else
4926 gcc_unreachable ();
4930 /* Function bump_vector_ptr
4932 Increment a pointer (to a vector type) by vector-size. If requested,
4933 i.e. if PTR-INCR is given, then also connect the new increment stmt
4934 to the existing def-use update-chain of the pointer, by modifying
4935 the PTR_INCR as illustrated below:
4937 The pointer def-use update-chain before this function:
4938 DATAREF_PTR = phi (p_0, p_2)
4939 ....
4940 PTR_INCR: p_2 = DATAREF_PTR + step
4942 The pointer def-use update-chain after this function:
4943 DATAREF_PTR = phi (p_0, p_2)
4944 ....
4945 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4946 ....
4947 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4949 Input:
4950 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4951 in the loop.
4952 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4953 the loop. The increment amount across iterations is expected
4954 to be vector_size.
4955 BSI - location where the new update stmt is to be placed.
4956 STMT - the original scalar memory-access stmt that is being vectorized.
4957 BUMP - optional. The offset by which to bump the pointer. If not given,
4958 the offset is assumed to be vector_size.
4960 Output: Return NEW_DATAREF_PTR as illustrated above.
4964 tree
4965 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4966 gimple *stmt, tree bump)
4968 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4969 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4970 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4971 tree update = TYPE_SIZE_UNIT (vectype);
4972 gassign *incr_stmt;
4973 ssa_op_iter iter;
4974 use_operand_p use_p;
4975 tree new_dataref_ptr;
4977 if (bump)
4978 update = bump;
4980 if (TREE_CODE (dataref_ptr) == SSA_NAME)
4981 new_dataref_ptr = copy_ssa_name (dataref_ptr);
4982 else
4983 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4984 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4985 dataref_ptr, update);
4986 vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4988 /* Copy the points-to information if it exists. */
4989 if (DR_PTR_INFO (dr))
4991 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4992 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4995 if (!ptr_incr)
4996 return new_dataref_ptr;
4998 /* Update the vector-pointer's cross-iteration increment. */
4999 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
5001 tree use = USE_FROM_PTR (use_p);
5003 if (use == dataref_ptr)
5004 SET_USE (use_p, new_dataref_ptr);
5005 else
5006 gcc_assert (operand_equal_p (use, update, 0));
5009 return new_dataref_ptr;
5013 /* Copy memory reference info such as base/clique from the SRC reference
5014 to the DEST MEM_REF. */
5016 void
5017 vect_copy_ref_info (tree dest, tree src)
5019 if (TREE_CODE (dest) != MEM_REF)
5020 return;
5022 tree src_base = src;
5023 while (handled_component_p (src_base))
5024 src_base = TREE_OPERAND (src_base, 0);
5025 if (TREE_CODE (src_base) != MEM_REF
5026 && TREE_CODE (src_base) != TARGET_MEM_REF)
5027 return;
5029 MR_DEPENDENCE_CLIQUE (dest) = MR_DEPENDENCE_CLIQUE (src_base);
5030 MR_DEPENDENCE_BASE (dest) = MR_DEPENDENCE_BASE (src_base);
5034 /* Function vect_create_destination_var.
5036 Create a new temporary of type VECTYPE. */
5038 tree
5039 vect_create_destination_var (tree scalar_dest, tree vectype)
5041 tree vec_dest;
5042 const char *name;
5043 char *new_name;
5044 tree type;
5045 enum vect_var_kind kind;
5047 kind = vectype
5048 ? VECTOR_BOOLEAN_TYPE_P (vectype)
5049 ? vect_mask_var
5050 : vect_simple_var
5051 : vect_scalar_var;
5052 type = vectype ? vectype : TREE_TYPE (scalar_dest);
5054 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
5056 name = get_name (scalar_dest);
5057 if (name)
5058 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
5059 else
5060 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
5061 vec_dest = vect_get_new_vect_var (type, kind, new_name);
5062 free (new_name);
5064 return vec_dest;
5067 /* Function vect_grouped_store_supported.
5069 Returns TRUE if interleave high and interleave low permutations
5070 are supported, and FALSE otherwise. */
5072 bool
5073 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
5075 machine_mode mode = TYPE_MODE (vectype);
5077 /* vect_permute_store_chain requires the group size to be equal to 3 or
5078 be a power of two. */
5079 if (count != 3 && exact_log2 (count) == -1)
5081 if (dump_enabled_p ())
5082 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5083 "the size of the group of accesses"
5084 " is not a power of 2 or not eqaul to 3\n");
5085 return false;
5088 /* Check that the permutation is supported. */
5089 if (VECTOR_MODE_P (mode))
5091 unsigned int i;
5092 if (count == 3)
5094 unsigned int j0 = 0, j1 = 0, j2 = 0;
5095 unsigned int i, j;
5097 unsigned int nelt;
5098 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5100 if (dump_enabled_p ())
5101 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5102 "cannot handle groups of 3 stores for"
5103 " variable-length vectors\n");
5104 return false;
5107 vec_perm_builder sel (nelt, nelt, 1);
5108 sel.quick_grow (nelt);
5109 vec_perm_indices indices;
5110 for (j = 0; j < 3; j++)
5112 int nelt0 = ((3 - j) * nelt) % 3;
5113 int nelt1 = ((3 - j) * nelt + 1) % 3;
5114 int nelt2 = ((3 - j) * nelt + 2) % 3;
5115 for (i = 0; i < nelt; i++)
5117 if (3 * i + nelt0 < nelt)
5118 sel[3 * i + nelt0] = j0++;
5119 if (3 * i + nelt1 < nelt)
5120 sel[3 * i + nelt1] = nelt + j1++;
5121 if (3 * i + nelt2 < nelt)
5122 sel[3 * i + nelt2] = 0;
5124 indices.new_vector (sel, 2, nelt);
5125 if (!can_vec_perm_const_p (mode, indices))
5127 if (dump_enabled_p ())
5128 dump_printf (MSG_MISSED_OPTIMIZATION,
5129 "permutation op not supported by target.\n");
5130 return false;
5133 for (i = 0; i < nelt; i++)
5135 if (3 * i + nelt0 < nelt)
5136 sel[3 * i + nelt0] = 3 * i + nelt0;
5137 if (3 * i + nelt1 < nelt)
5138 sel[3 * i + nelt1] = 3 * i + nelt1;
5139 if (3 * i + nelt2 < nelt)
5140 sel[3 * i + nelt2] = nelt + j2++;
5142 indices.new_vector (sel, 2, nelt);
5143 if (!can_vec_perm_const_p (mode, indices))
5145 if (dump_enabled_p ())
5146 dump_printf (MSG_MISSED_OPTIMIZATION,
5147 "permutation op not supported by target.\n");
5148 return false;
5151 return true;
5153 else
5155 /* If length is not equal to 3 then only power of 2 is supported. */
5156 gcc_assert (pow2p_hwi (count));
5157 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5159 /* The encoding has 2 interleaved stepped patterns. */
5160 vec_perm_builder sel (nelt, 2, 3);
5161 sel.quick_grow (6);
5162 for (i = 0; i < 3; i++)
5164 sel[i * 2] = i;
5165 sel[i * 2 + 1] = i + nelt;
5167 vec_perm_indices indices (sel, 2, nelt);
5168 if (can_vec_perm_const_p (mode, indices))
5170 for (i = 0; i < 6; i++)
5171 sel[i] += exact_div (nelt, 2);
5172 indices.new_vector (sel, 2, nelt);
5173 if (can_vec_perm_const_p (mode, indices))
5174 return true;
5179 if (dump_enabled_p ())
5180 dump_printf (MSG_MISSED_OPTIMIZATION,
5181 "permutaion op not supported by target.\n");
5182 return false;
5186 /* Return TRUE if vec_{mask_}store_lanes is available for COUNT vectors of
5187 type VECTYPE. MASKED_P says whether the masked form is needed. */
5189 bool
5190 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5191 bool masked_p)
5193 if (masked_p)
5194 return vect_lanes_optab_supported_p ("vec_mask_store_lanes",
5195 vec_mask_store_lanes_optab,
5196 vectype, count);
5197 else
5198 return vect_lanes_optab_supported_p ("vec_store_lanes",
5199 vec_store_lanes_optab,
5200 vectype, count);
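/* For instance (illustrative), an interleaved store group such as

     for (i = 0; i < n; i++)
       {
	 out[3 * i] = r[i];
	 out[3 * i + 1] = g[i];
	 out[3 * i + 2] = b[i];
       }

   has COUNT == 3; when the check above succeeds (e.g. on targets with
   ST3-style store-lanes instructions) the group can be emitted as a
   single lanes store, otherwise it falls back to the explicit
   permutations built by vect_permute_store_chain below.  */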
5204 /* Function vect_permute_store_chain.
5206 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
5207 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
5208 the data correctly for the stores. Return the final references for stores
5209 in RESULT_CHAIN.
5211 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5212 The input is 4 vectors each containing 8 elements. We assign a number to
5213 each element, the input sequence is:
5215 1st vec: 0 1 2 3 4 5 6 7
5216 2nd vec: 8 9 10 11 12 13 14 15
5217 3rd vec: 16 17 18 19 20 21 22 23
5218 4th vec: 24 25 26 27 28 29 30 31
5220 The output sequence should be:
5222 1st vec: 0 8 16 24 1 9 17 25
5223 2nd vec: 2 10 18 26 3 11 19 27
5224 3rd vec: 4 12 20 28 5 13 21 29
5225 4th vec: 6 14 22 30 7 15 23 31
5227 i.e., we interleave the contents of the four vectors in their order.
5229 We use interleave_high/low instructions to create such output. The input of
5230 each interleave_high/low operation is two vectors:
5231 1st vec 2nd vec
5232 0 1 2 3 4 5 6 7
5233 the even elements of the result vector are obtained left-to-right from the
5234 high/low elements of the first vector. The odd elements of the result are
5235 obtained left-to-right from the high/low elements of the second vector.
5236 The output of interleave_high will be: 0 4 1 5
5237 and of interleave_low: 2 6 3 7
5240 The permutation is done in log LENGTH stages. In each stage interleave_high
5241 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5242 where the first argument is taken from the first half of DR_CHAIN and the
5243 second argument from its second half.
5244 In our example,
5246 I1: interleave_high (1st vec, 3rd vec)
5247 I2: interleave_low (1st vec, 3rd vec)
5248 I3: interleave_high (2nd vec, 4th vec)
5249 I4: interleave_low (2nd vec, 4th vec)
5251 The output for the first stage is:
5253 I1: 0 16 1 17 2 18 3 19
5254 I2: 4 20 5 21 6 22 7 23
5255 I3: 8 24 9 25 10 26 11 27
5256 I4: 12 28 13 29 14 30 15 31
5258 The output of the second stage, i.e. the final result is:
5260 I1: 0 8 16 24 1 9 17 25
5261 I2: 2 10 18 26 3 11 19 27
5262 I3: 4 12 20 28 5 13 21 29
5263 I4: 6 14 22 30 7 15 23 31. */
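/* For LENGTH == 3 the interleaving is instead done with two VEC_PERM_EXPRs
   per output vector (PERM3_MASK_LOW followed by PERM3_MASK_HIGH);
   e.g. (illustrative) for three input vectors of four elements

     1st vec: 0 1 2 3
     2nd vec: 4 5 6 7
     3rd vec: 8 9 10 11

   the vectors stored back to memory give the sequence
   0 4 8 1 5 9 2 6 10 3 7 11.  */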
5265 void
5266 vect_permute_store_chain (vec<tree> dr_chain,
5267 unsigned int length,
5268 gimple *stmt,
5269 gimple_stmt_iterator *gsi,
5270 vec<tree> *result_chain)
5272 tree vect1, vect2, high, low;
5273 gimple *perm_stmt;
5274 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5275 tree perm_mask_low, perm_mask_high;
5276 tree data_ref;
5277 tree perm3_mask_low, perm3_mask_high;
5278 unsigned int i, j, n, log_length = exact_log2 (length);
5280 result_chain->quick_grow (length);
5281 memcpy (result_chain->address (), dr_chain.address (),
5282 length * sizeof (tree));
5284 if (length == 3)
5286 /* vect_grouped_store_supported ensures that this is constant. */
5287 unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5288 unsigned int j0 = 0, j1 = 0, j2 = 0;
5290 vec_perm_builder sel (nelt, nelt, 1);
5291 sel.quick_grow (nelt);
5292 vec_perm_indices indices;
5293 for (j = 0; j < 3; j++)
5295 int nelt0 = ((3 - j) * nelt) % 3;
5296 int nelt1 = ((3 - j) * nelt + 1) % 3;
5297 int nelt2 = ((3 - j) * nelt + 2) % 3;
5299 for (i = 0; i < nelt; i++)
5301 if (3 * i + nelt0 < nelt)
5302 sel[3 * i + nelt0] = j0++;
5303 if (3 * i + nelt1 < nelt)
5304 sel[3 * i + nelt1] = nelt + j1++;
5305 if (3 * i + nelt2 < nelt)
5306 sel[3 * i + nelt2] = 0;
5308 indices.new_vector (sel, 2, nelt);
5309 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5311 for (i = 0; i < nelt; i++)
5313 if (3 * i + nelt0 < nelt)
5314 sel[3 * i + nelt0] = 3 * i + nelt0;
5315 if (3 * i + nelt1 < nelt)
5316 sel[3 * i + nelt1] = 3 * i + nelt1;
5317 if (3 * i + nelt2 < nelt)
5318 sel[3 * i + nelt2] = nelt + j2++;
5320 indices.new_vector (sel, 2, nelt);
5321 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5323 vect1 = dr_chain[0];
5324 vect2 = dr_chain[1];
5326 /* Create interleaving stmt:
5327 low = VEC_PERM_EXPR <vect1, vect2,
5328 {j, nelt, *, j + 1, nelt + j + 1, *,
5329 j + 2, nelt + j + 2, *, ...}> */
5330 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5331 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5332 vect2, perm3_mask_low);
5333 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5335 vect1 = data_ref;
5336 vect2 = dr_chain[2];
5337 /* Create interleaving stmt:
5338 low = VEC_PERM_EXPR <vect1, vect2,
5339 {0, 1, nelt + j, 3, 4, nelt + j + 1,
5340 6, 7, nelt + j + 2, ...}> */
5341 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5342 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5343 vect2, perm3_mask_high);
5344 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5345 (*result_chain)[j] = data_ref;
5348 else
5350 /* If length is not equal to 3 then only a power of 2 is supported. */
5351 gcc_assert (pow2p_hwi (length));
5353 /* The encoding has 2 interleaved stepped patterns. */
5354 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5355 vec_perm_builder sel (nelt, 2, 3);
5356 sel.quick_grow (6);
5357 for (i = 0; i < 3; i++)
5359 sel[i * 2] = i;
5360 sel[i * 2 + 1] = i + nelt;
5362 vec_perm_indices indices (sel, 2, nelt);
5363 perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5365 for (i = 0; i < 6; i++)
5366 sel[i] += exact_div (nelt, 2);
5367 indices.new_vector (sel, 2, nelt);
5368 perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5370 for (i = 0, n = log_length; i < n; i++)
5372 for (j = 0; j < length/2; j++)
5374 vect1 = dr_chain[j];
5375 vect2 = dr_chain[j+length/2];
5377 /* Create interleaving stmt:
5378 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
5379 ...}> */
5380 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
5381 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
5382 vect2, perm_mask_high);
5383 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5384 (*result_chain)[2*j] = high;
5386 /* Create interleaving stmt:
5387 low = VEC_PERM_EXPR <vect1, vect2,
5388 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
5389 ...}> */
5390 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
5391 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
5392 vect2, perm_mask_low);
5393 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5394 (*result_chain)[2*j+1] = low;
5396 memcpy (dr_chain.address (), result_chain->address (),
5397 length * sizeof (tree));
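/* A standalone sketch, not GCC code, of the full interleave chain performed
   by vect_permute_store_chain for LENGTH == 4 and 8-element vectors numbered
   0..31: log2 (LENGTH) == 2 stages, each interleaving the first half of the
   chain with the second half.  It prints the final result documented above
   (0 8 16 24 1 9 17 25, and so on):

     #include <stdio.h>
     #include <string.h>

     enum { NELT = 8, LENGTH = 4 };

     static void
     interleave (const int *v1, const int *v2, int *high, int *low)
     {
       for (int i = 0; i < NELT / 2; i++)
         {
           high[2 * i] = v1[i];
           high[2 * i + 1] = v2[i];
           low[2 * i] = v1[NELT / 2 + i];
           low[2 * i + 1] = v2[NELT / 2 + i];
         }
     }

     int
     main (void)
     {
       int chain[LENGTH][NELT], result[LENGTH][NELT];
       for (int j = 0; j < LENGTH; j++)
         for (int i = 0; i < NELT; i++)
           chain[j][i] = j * NELT + i;
       for (int stage = 0; stage < 2; stage++)
         {
           for (int j = 0; j < LENGTH / 2; j++)
             interleave (chain[j], chain[j + LENGTH / 2],
                         result[2 * j], result[2 * j + 1]);
           memcpy (chain, result, sizeof (chain));
         }
       for (int j = 0; j < LENGTH; j++)
         {
           for (int i = 0; i < NELT; i++)
             printf ("%d ", chain[j][i]);
           printf ("\n");
         }
       return 0;
     }  */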
5402 /* Function vect_setup_realignment
5404 This function is called when vectorizing an unaligned load using
5405 the dr_explicit_realign[_optimized] scheme.
5406 This function generates the following code at the loop prolog:
5408 p = initial_addr;
5409 x msq_init = *(floor(p)); # prolog load
5410 realignment_token = call target_builtin;
5411 loop:
5412 x msq = phi (msq_init, ---)
5414 The stmts marked with x are generated only for the case of
5415 dr_explicit_realign_optimized.
5417 The code above sets up a new (vector) pointer, pointing to the first
5418 location accessed by STMT, and a "floor-aligned" load using that pointer.
5419 It also generates code to compute the "realignment-token" (if the relevant
5420 target hook was defined), and creates a phi-node at the loop-header bb
5421 whose arguments are the result of the prolog-load (created by this
5422 function) and the result of a load that takes place in the loop (to be
5423 created by the caller to this function).
5425 For the case of dr_explicit_realign_optimized:
5426 The caller to this function uses the phi-result (msq) to create the
5427 realignment code inside the loop, and sets up the missing phi argument,
5428 as follows:
5429 loop:
5430 msq = phi (msq_init, lsq)
5431 lsq = *(floor(p')); # load in loop
5432 result = realign_load (msq, lsq, realignment_token);
5434 For the case of dr_explicit_realign:
5435 loop:
5436 msq = *(floor(p)); # load in loop
5437 p' = p + (VS-1);
5438 lsq = *(floor(p')); # load in loop
5439 result = realign_load (msq, lsq, realignment_token);
5441 Input:
5442 STMT - (scalar) load stmt to be vectorized. This load accesses
5443 a memory location that may be unaligned.
5444 GSI - place where new code is to be inserted.
5445 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5446 is used.
5448 Output:
5449 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5450 target hook, if defined.
5451 Return value - the result of the loop-header phi node. */
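/* A standalone sketch, not GCC code, of what the REALIGN_LOAD sequence set up
   by this function computes at run time.  The realignment token is modelled
   here simply as the element offset of the access within its aligned chunk;
   real targets typically encode it as a permute mask obtained from
   builtin_mask_for_load.  For VS == 4 and an access starting at element 2 the
   program prints "2 3 4 5":

     #include <stdio.h>

     enum { VS = 4 };

     static void
     realign_load (const int *msq, const int *lsq, int rt, int *out)
     {
       for (int i = 0; i < VS; i++)
         out[i] = (rt + i < VS) ? msq[rt + i] : lsq[rt + i - VS];
     }

     int
     main (void)
     {
       int mem[3 * VS];
       for (int i = 0; i < 3 * VS; i++)
         mem[i] = i;
       int p = 2;
       int rt = p % VS;
       const int *msq = mem + (p - rt);
       const int *lsq = msq + VS;
       int out[VS];
       realign_load (msq, lsq, rt, out);
       for (int i = 0; i < VS; i++)
         printf ("%d ", out[i]);
       printf ("\n");
       return 0;
     }  */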
5453 tree
5454 vect_setup_realignment (gimple *stmt, gimple_stmt_iterator *gsi,
5455 tree *realignment_token,
5456 enum dr_alignment_support alignment_support_scheme,
5457 tree init_addr,
5458 struct loop **at_loop)
5460 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5461 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5462 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5463 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
5464 struct loop *loop = NULL;
5465 edge pe = NULL;
5466 tree scalar_dest = gimple_assign_lhs (stmt);
5467 tree vec_dest;
5468 gimple *inc;
5469 tree ptr;
5470 tree data_ref;
5471 basic_block new_bb;
5472 tree msq_init = NULL_TREE;
5473 tree new_temp;
5474 gphi *phi_stmt;
5475 tree msq = NULL_TREE;
5476 gimple_seq stmts = NULL;
5477 bool inv_p;
5478 bool compute_in_loop = false;
5479 bool nested_in_vect_loop = false;
5480 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
5481 struct loop *loop_for_initial_load = NULL;
5483 if (loop_vinfo)
5485 loop = LOOP_VINFO_LOOP (loop_vinfo);
5486 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
5489 gcc_assert (alignment_support_scheme == dr_explicit_realign
5490 || alignment_support_scheme == dr_explicit_realign_optimized);
5492 /* We need to generate three things:
5493 1. the misalignment computation
5494 2. the extra vector load (for the optimized realignment scheme).
5495 3. the phi node for the two vectors from which the realignment is
5496 done (for the optimized realignment scheme). */
5498 /* 1. Determine where to generate the misalignment computation.
5500 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5501 calculation will be generated by this function, outside the loop (in the
5502 preheader). Otherwise, INIT_ADDR has already been computed for us by the
5503 caller, inside the loop.
5505 Background: If the misalignment remains fixed throughout the iterations of
5506 the loop, then both realignment schemes are applicable, and also the
5507 misalignment computation can be done outside LOOP. This is because we are
5508 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5509 are a multiple of VS (the Vector Size), and therefore the misalignment in
5510 different vectorized LOOP iterations is always the same.
5511 The problem arises only if the memory access is in an inner-loop nested
5512 inside LOOP, which is now being vectorized using outer-loop vectorization.
5513 This is the only case when the misalignment of the memory access may not
5514 remain fixed throughout the iterations of the inner-loop (as explained in
5515 detail in vect_supportable_dr_alignment). In this case, not only is the
5516 optimized realignment scheme not applicable, but also the misalignment
5517 computation (and generation of the realignment token that is passed to
5518 REALIGN_LOAD) have to be done inside the loop.
5520 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5521 or not, which in turn determines if the misalignment is computed inside
5522 the inner-loop, or outside LOOP. */
5524 if (init_addr != NULL_TREE || !loop_vinfo)
5526 compute_in_loop = true;
5527 gcc_assert (alignment_support_scheme == dr_explicit_realign);
5531 /* 2. Determine where to generate the extra vector load.
5533 For the optimized realignment scheme, instead of generating two vector
5534 loads in each iteration, we generate a single extra vector load in the
5535 preheader of the loop, and in each iteration reuse the result of the
5536 vector load from the previous iteration. In case the memory access is in
5537 an inner-loop nested inside LOOP, which is now being vectorized using
5538 outer-loop vectorization, we need to determine whether this initial vector
5539 load should be generated at the preheader of the inner-loop, or can be
5540 generated at the preheader of LOOP. If the memory access has no evolution
5541 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5542 to be generated inside LOOP (in the preheader of the inner-loop). */
5544 if (nested_in_vect_loop)
5546 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5547 bool invariant_in_outerloop =
5548 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5549 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5551 else
5552 loop_for_initial_load = loop;
5553 if (at_loop)
5554 *at_loop = loop_for_initial_load;
5556 if (loop_for_initial_load)
5557 pe = loop_preheader_edge (loop_for_initial_load);
5559 /* 3. For the case of the optimized realignment, create the first vector
5560 load at the loop preheader. */
5562 if (alignment_support_scheme == dr_explicit_realign_optimized)
5564 /* Create msq_init = *(floor(p1)) in the loop preheader */
5565 gassign *new_stmt;
5567 gcc_assert (!compute_in_loop);
5568 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5569 ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
5570 NULL_TREE, &init_addr, NULL, &inc,
5571 true, &inv_p);
5572 if (TREE_CODE (ptr) == SSA_NAME)
5573 new_temp = copy_ssa_name (ptr);
5574 else
5575 new_temp = make_ssa_name (TREE_TYPE (ptr));
5576 unsigned int align = DR_TARGET_ALIGNMENT (dr);
5577 new_stmt = gimple_build_assign
5578 (new_temp, BIT_AND_EXPR, ptr,
5579 build_int_cst (TREE_TYPE (ptr), -(HOST_WIDE_INT) align));
5580 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5581 gcc_assert (!new_bb);
5582 data_ref
5583 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5584 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5585 vect_copy_ref_info (data_ref, DR_REF (dr));
5586 new_stmt = gimple_build_assign (vec_dest, data_ref);
5587 new_temp = make_ssa_name (vec_dest, new_stmt);
5588 gimple_assign_set_lhs (new_stmt, new_temp);
5589 if (pe)
5591 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5592 gcc_assert (!new_bb);
5594 else
5595 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5597 msq_init = gimple_assign_lhs (new_stmt);
5600 /* 4. Create realignment token using a target builtin, if available.
5601 It is done either inside the containing loop, or before LOOP (as
5602 determined above). */
5604 if (targetm.vectorize.builtin_mask_for_load)
5606 gcall *new_stmt;
5607 tree builtin_decl;
5609 /* Compute INIT_ADDR - the initial address accessed by this memref. */
5610 if (!init_addr)
5612 /* Generate the INIT_ADDR computation outside LOOP. */
5613 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
5614 NULL_TREE);
5615 if (loop)
5617 pe = loop_preheader_edge (loop);
5618 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5619 gcc_assert (!new_bb);
5621 else
5622 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5625 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5626 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5627 vec_dest =
5628 vect_create_destination_var (scalar_dest,
5629 gimple_call_return_type (new_stmt));
5630 new_temp = make_ssa_name (vec_dest, new_stmt);
5631 gimple_call_set_lhs (new_stmt, new_temp);
5633 if (compute_in_loop)
5634 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5635 else
5637 /* Generate the misalignment computation outside LOOP. */
5638 pe = loop_preheader_edge (loop);
5639 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5640 gcc_assert (!new_bb);
5643 *realignment_token = gimple_call_lhs (new_stmt);
5645 /* The result of the CALL_EXPR to this builtin is determined from
5646 the value of the parameter, and no global variables are touched,
5647 which makes the builtin a "const" function. Requiring the
5648 builtin to have the "const" attribute makes it unnecessary
5649 to call mark_call_clobbered. */
5650 gcc_assert (TREE_READONLY (builtin_decl));
5653 if (alignment_support_scheme == dr_explicit_realign)
5654 return msq;
5656 gcc_assert (!compute_in_loop);
5657 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5660 /* 5. Create msq = phi <msq_init, lsq> in loop */
5662 pe = loop_preheader_edge (containing_loop);
5663 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5664 msq = make_ssa_name (vec_dest);
5665 phi_stmt = create_phi_node (msq, containing_loop->header);
5666 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5668 return msq;
5672 /* Function vect_grouped_load_supported.
5674 COUNT is the size of the load group (the number of statements plus the
5675 number of gaps). SINGLE_ELEMENT_P is true if there is actually
5676 only one statement, with a gap of COUNT - 1.
5678 Returns true if a suitable permute exists. */
5680 bool
5681 vect_grouped_load_supported (tree vectype, bool single_element_p,
5682 unsigned HOST_WIDE_INT count)
5684 machine_mode mode = TYPE_MODE (vectype);
5686 /* If this is single-element interleaving with an element distance
5687 that leaves unused vector loads around punt - we at least create
5688 very sub-optimal code in that case (and blow up memory,
5689 see PR65518). */
5690 if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype)))
5692 if (dump_enabled_p ())
5693 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5694 "single-element interleaving not supported "
5695 "for not adjacent vector loads\n");
5696 return false;
5699 /* vect_permute_load_chain requires the group size to be equal to 3 or
5700 be a power of two. */
5701 if (count != 3 && exact_log2 (count) == -1)
5703 if (dump_enabled_p ())
5704 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5705 "the size of the group of accesses"
5706 " is not a power of 2 or not equal to 3\n");
5707 return false;
5710 /* Check that the permutation is supported. */
5711 if (VECTOR_MODE_P (mode))
5713 unsigned int i, j;
5714 if (count == 3)
5716 unsigned int nelt;
5717 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5719 if (dump_enabled_p ())
5720 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5721 "cannot handle groups of 3 loads for"
5722 " variable-length vectors\n");
5723 return false;
5726 vec_perm_builder sel (nelt, nelt, 1);
5727 sel.quick_grow (nelt);
5728 vec_perm_indices indices;
5729 unsigned int k;
5730 for (k = 0; k < 3; k++)
5732 for (i = 0; i < nelt; i++)
5733 if (3 * i + k < 2 * nelt)
5734 sel[i] = 3 * i + k;
5735 else
5736 sel[i] = 0;
5737 indices.new_vector (sel, 2, nelt);
5738 if (!can_vec_perm_const_p (mode, indices))
5740 if (dump_enabled_p ())
5741 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5742 "shuffle of 3 loads is not supported by"
5743 " target\n");
5744 return false;
5746 for (i = 0, j = 0; i < nelt; i++)
5747 if (3 * i + k < 2 * nelt)
5748 sel[i] = i;
5749 else
5750 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5751 indices.new_vector (sel, 2, nelt);
5752 if (!can_vec_perm_const_p (mode, indices))
5754 if (dump_enabled_p ())
5755 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5756 "shuffle of 3 loads is not supported by"
5757 " target\n");
5758 return false;
5761 return true;
5763 else
5765 /* If length is not equal to 3 then only a power of 2 is supported. */
5766 gcc_assert (pow2p_hwi (count));
5767 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5769 /* The encoding has a single stepped pattern. */
5770 vec_perm_builder sel (nelt, 1, 3);
5771 sel.quick_grow (3);
5772 for (i = 0; i < 3; i++)
5773 sel[i] = i * 2;
5774 vec_perm_indices indices (sel, 2, nelt);
5775 if (can_vec_perm_const_p (mode, indices))
5777 for (i = 0; i < 3; i++)
5778 sel[i] = i * 2 + 1;
5779 indices.new_vector (sel, 2, nelt);
5780 if (can_vec_perm_const_p (mode, indices))
5781 return true;
5786 if (dump_enabled_p ())
5787 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5788 "extract even/odd not supported by target\n");
5789 return false;
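/* Sketch, not GCC code, of how the compressed selector encodings used in this
   file expand, assuming the vec_perm_builder rule that each of the NPATTERNS
   interleaved patterns supplies three explicit elements and every later
   element continues that pattern's linear series.  The program prints the
   full extract-even selector {0 2 4 6 ...} and the interleave-high selector
   {0 8 1 9 ...} for an 8-element vector:

     #include <stdio.h>

     static void
     expand_sel (const int *encoded, int npatterns, int nelt, int *full)
     {
       for (int i = 0; i < nelt; i++)
         {
           int pat = i % npatterns;
           int idx = i / npatterns;
           if (idx < 3)
             full[i] = encoded[idx * npatterns + pat];
           else
             {
               int step = encoded[2 * npatterns + pat]
                          - encoded[npatterns + pat];
               full[i] = encoded[2 * npatterns + pat] + (idx - 2) * step;
             }
         }
     }

     int
     main (void)
     {
       enum { NELT = 8 };
       int even[3] = { 0, 2, 4 };
       int inter_high[6] = { 0, NELT, 1, NELT + 1, 2, NELT + 2 };
       int full[NELT];

       expand_sel (even, 1, NELT, full);
       for (int i = 0; i < NELT; i++)
         printf ("%d ", full[i]);
       printf ("\n");

       expand_sel (inter_high, 2, NELT, full);
       for (int i = 0; i < NELT; i++)
         printf ("%d ", full[i]);
       printf ("\n");
       return 0;
     }  */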
5792 /* Return TRUE if vec_{masked_}load_lanes is available for COUNT vectors of
5793 type VECTYPE. MASKED_P says whether the masked form is needed. */
5795 bool
5796 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5797 bool masked_p)
5799 if (masked_p)
5800 return vect_lanes_optab_supported_p ("vec_mask_load_lanes",
5801 vec_mask_load_lanes_optab,
5802 vectype, count);
5803 else
5804 return vect_lanes_optab_supported_p ("vec_load_lanes",
5805 vec_load_lanes_optab,
5806 vectype, count);
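/* Illustrative only, not the actual caller logic: callers in the vectorizer
   (e.g. the grouped-load analysis in tree-vect-stmts.c) would typically prefer
   load-lanes when the target provides it and otherwise fall back to contiguous
   loads plus a permutation, roughly:

     if (vect_load_lanes_supported (vectype, group_size, masked_p))
       ... use the {mask_,}load_lanes optab ...
     else if (vect_grouped_load_supported (vectype, single_element_p,
                                           group_size))
       ... load full vectors and permute them with
           vect_permute_load_chain or vect_shift_permute_load_chain ...  */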
5809 /* Function vect_permute_load_chain.
5811 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5812 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5813 the input data correctly. Return the final references for loads in
5814 RESULT_CHAIN.
5816 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5817 The input is 4 vectors each containing 8 elements. We assign a number to each
5818 element, the input sequence is:
5820 1st vec: 0 1 2 3 4 5 6 7
5821 2nd vec: 8 9 10 11 12 13 14 15
5822 3rd vec: 16 17 18 19 20 21 22 23
5823 4th vec: 24 25 26 27 28 29 30 31
5825 The output sequence should be:
5827 1st vec: 0 4 8 12 16 20 24 28
5828 2nd vec: 1 5 9 13 17 21 25 29
5829 3rd vec: 2 6 10 14 18 22 26 30
5830 4th vec: 3 7 11 15 19 23 27 31
5832 i.e., the first output vector should contain the first elements of each
5833 interleaving group, etc.
5835 We use extract_even/odd instructions to create such output. The input of
5836 each extract_even/odd operation is two vectors
5837 1st vec 2nd vec
5838 0 1 2 3 4 5 6 7
5840 and the output is the vector of extracted even/odd elements. The output of
5841 extract_even will be: 0 2 4 6
5842 and of extract_odd: 1 3 5 7
5845 The permutation is done in log LENGTH stages. In each stage extract_even
5846 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5847 their order. In our example,
5849 E1: extract_even (1st vec, 2nd vec)
5850 E2: extract_odd (1st vec, 2nd vec)
5851 E3: extract_even (3rd vec, 4th vec)
5852 E4: extract_odd (3rd vec, 4th vec)
5854 The output for the first stage will be:
5856 E1: 0 2 4 6 8 10 12 14
5857 E2: 1 3 5 7 9 11 13 15
5858 E3: 16 18 20 22 24 26 28 30
5859 E4: 17 19 21 23 25 27 29 31
5861 In order to proceed and create the correct sequence for the next stage (or
5862 for the correct output, if the second stage is the last one, as in our
5863 example), we first put the output of the extract_even operation and then the
5864 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5865 The input for the second stage is:
5867 1st vec (E1): 0 2 4 6 8 10 12 14
5868 2nd vec (E3): 16 18 20 22 24 26 28 30
5869 3rd vec (E2): 1 3 5 7 9 11 13 15
5870 4th vec (E4): 17 19 21 23 25 27 29 31
5872 The output of the second stage:
5874 E1: 0 4 8 12 16 20 24 28
5875 E2: 2 6 10 14 18 22 26 30
5876 E3: 1 5 9 13 17 21 25 29
5877 E4: 3 7 11 15 19 23 27 31
5879 And RESULT_CHAIN after reordering:
5881 1st vec (E1): 0 4 8 12 16 20 24 28
5882 2nd vec (E3): 1 5 9 13 17 21 25 29
5883 3rd vec (E2): 2 6 10 14 18 22 26 30
5884 4th vec (E4): 3 7 11 15 19 23 27 31. */
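/* A standalone sketch, not GCC code, of the extract-even/odd chain described
   above: log2 (LENGTH) == 2 stages over LENGTH == 4 vectors of 8 elements,
   storing evens in the first half and odds in the second half of the chain
   exactly as done below.  It prints the four rows of the RESULT_CHAIN shown
   above (0 4 8 12 ..., 1 5 9 13 ..., and so on):

     #include <stdio.h>
     #include <string.h>

     enum { NELT = 8, LENGTH = 4 };

     static void
     extract (const int *v1, const int *v2, int *even, int *odd)
     {
       for (int i = 0; i < NELT / 2; i++)
         {
           even[i] = v1[2 * i];
           even[NELT / 2 + i] = v2[2 * i];
           odd[i] = v1[2 * i + 1];
           odd[NELT / 2 + i] = v2[2 * i + 1];
         }
     }

     int
     main (void)
     {
       int chain[LENGTH][NELT], result[LENGTH][NELT];
       for (int j = 0; j < LENGTH; j++)
         for (int i = 0; i < NELT; i++)
           chain[j][i] = j * NELT + i;
       for (int stage = 0; stage < 2; stage++)
         {
           for (int j = 0; j < LENGTH; j += 2)
             extract (chain[j], chain[j + 1],
                      result[j / 2], result[j / 2 + LENGTH / 2]);
           memcpy (chain, result, sizeof (chain));
         }
       for (int j = 0; j < LENGTH; j++)
         {
           for (int i = 0; i < NELT; i++)
             printf ("%d ", chain[j][i]);
           printf ("\n");
         }
       return 0;
     }  */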
5886 static void
5887 vect_permute_load_chain (vec<tree> dr_chain,
5888 unsigned int length,
5889 gimple *stmt,
5890 gimple_stmt_iterator *gsi,
5891 vec<tree> *result_chain)
5893 tree data_ref, first_vect, second_vect;
5894 tree perm_mask_even, perm_mask_odd;
5895 tree perm3_mask_low, perm3_mask_high;
5896 gimple *perm_stmt;
5897 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5898 unsigned int i, j, log_length = exact_log2 (length);
5900 result_chain->quick_grow (length);
5901 memcpy (result_chain->address (), dr_chain.address (),
5902 length * sizeof (tree));
5904 if (length == 3)
5906 /* vect_grouped_load_supported ensures that this is constant. */
5907 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5908 unsigned int k;
5910 vec_perm_builder sel (nelt, nelt, 1);
5911 sel.quick_grow (nelt);
5912 vec_perm_indices indices;
5913 for (k = 0; k < 3; k++)
5915 for (i = 0; i < nelt; i++)
5916 if (3 * i + k < 2 * nelt)
5917 sel[i] = 3 * i + k;
5918 else
5919 sel[i] = 0;
5920 indices.new_vector (sel, 2, nelt);
5921 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5923 for (i = 0, j = 0; i < nelt; i++)
5924 if (3 * i + k < 2 * nelt)
5925 sel[i] = i;
5926 else
5927 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5928 indices.new_vector (sel, 2, nelt);
5929 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5931 first_vect = dr_chain[0];
5932 second_vect = dr_chain[1];
5934 /* Create interleaving stmt (low part of):
5935 low = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5936 ...}> */
5937 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5938 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5939 second_vect, perm3_mask_low);
5940 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5942 /* Create interleaving stmt (high part of):
5943 high = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5944 ...}> */
5945 first_vect = data_ref;
5946 second_vect = dr_chain[2];
5947 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5948 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5949 second_vect, perm3_mask_high);
5950 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5951 (*result_chain)[k] = data_ref;
5954 else
5956 /* If length is not equal to 3 then only a power of 2 is supported. */
5957 gcc_assert (pow2p_hwi (length));
5959 /* The encoding has a single stepped pattern. */
5960 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5961 vec_perm_builder sel (nelt, 1, 3);
5962 sel.quick_grow (3);
5963 for (i = 0; i < 3; ++i)
5964 sel[i] = i * 2;
5965 vec_perm_indices indices (sel, 2, nelt);
5966 perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);
5968 for (i = 0; i < 3; ++i)
5969 sel[i] = i * 2 + 1;
5970 indices.new_vector (sel, 2, nelt);
5971 perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
5973 for (i = 0; i < log_length; i++)
5975 for (j = 0; j < length; j += 2)
5977 first_vect = dr_chain[j];
5978 second_vect = dr_chain[j+1];
5980 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5981 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5982 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5983 first_vect, second_vect,
5984 perm_mask_even);
5985 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5986 (*result_chain)[j/2] = data_ref;
5988 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5989 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5990 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5991 first_vect, second_vect,
5992 perm_mask_odd);
5993 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5994 (*result_chain)[j/2+length/2] = data_ref;
5996 memcpy (dr_chain.address (), result_chain->address (),
5997 length * sizeof (tree));
6002 /* Function vect_shift_permute_load_chain.
6004 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
6005 sequence of stmts to reorder the input data accordingly.
6006 Return the final references for loads in RESULT_CHAIN.
6007 Return true if successful, false otherwise.
6009 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
6010 The input is 3 vectors each containing 8 elements. We assign a
6011 number to each element, the input sequence is:
6013 1st vec: 0 1 2 3 4 5 6 7
6014 2nd vec: 8 9 10 11 12 13 14 15
6015 3rd vec: 16 17 18 19 20 21 22 23
6017 The output sequence should be:
6019 1st vec: 0 3 6 9 12 15 18 21
6020 2nd vec: 1 4 7 10 13 16 19 22
6021 3rd vec: 2 5 8 11 14 17 20 23
6023 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
6025 First we shuffle all 3 vectors to get correct elements order:
6027 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
6028 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
6029 3rd vec: (16 19 22) (17 20 23) (18 21)
6031 Next we unite and shift vector 3 times:
6033 1st step:
6034 shift right by 6 the concatenation of:
6035 "1st vec" and "2nd vec"
6036 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
6037 "2nd vec" and "3rd vec"
6038 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
6039 "3rd vec" and "1st vec"
6040 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
6041 | New vectors |
6043 So that now new vectors are:
6045 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
6046 2nd vec: (10 13) (16 19 22) (17 20 23)
6047 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
6049 2nd step:
6050 shift right by 5 the concatenation of:
6051 "1st vec" and "3rd vec"
6052 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
6053 "2nd vec" and "1st vec"
6054 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
6055 "3rd vec" and "2nd vec"
6056 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
6057 | New vectors |
6059 So that now new vectors are:
6061 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
6062 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
6063 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
6065 3rd step:
6066 shift right by 5 the concatenation of:
6067 "1st vec" and "1st vec"
6068 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
6069 shift right by 3 the concatenation of:
6070 "2nd vec" and "2nd vec"
6071 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
6072 | New vectors |
6074 So that now all vectors are READY:
6075 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
6076 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
6077 3rd vec: ( 1 4 7) (10 13) (16 19 22)
6079 This algorithm is faster than one in vect_permute_load_chain if:
6080 1. "shift of a concatination" is faster than general permutation.
6081 This is usually so.
6082 2. The TARGET machine can't execute vector instructions in parallel.
6083 This is because each step of the algorithm depends on the previous one.
6084 The algorithm in vect_permute_load_chain is much more parallel.
6086 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
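/* Standalone sketch, not GCC code, of the "shift of a concatenation" primitive
   used below (shift1_mask .. shift4_mask): element I of the output is element
   SHIFT + I of the concatenation of two vectors.  Applied with SHIFT == 6 to
   the already-shuffled 1st and 2nd vectors of the example above, the program
   prints "2 5 8 11 14 9 12 15", i.e. the new 1st vec ( 2 5) ( 8 11 14)
   ( 9 12 15):

     #include <stdio.h>

     enum { NELT = 8 };

     static void
     shift_concat (const int *a, const int *b, int shift, int *out)
     {
       for (int i = 0; i < NELT; i++)
         out[i] = (shift + i < NELT) ? a[shift + i] : b[shift + i - NELT];
     }

     int
     main (void)
     {
       int v1[NELT] = { 0, 3, 6, 1, 4, 7, 2, 5 };
       int v2[NELT] = { 8, 11, 14, 9, 12, 15, 10, 13 };
       int out[NELT];
       shift_concat (v1, v2, 6, out);
       for (int i = 0; i < NELT; i++)
         printf ("%d ", out[i]);
       printf ("\n");
       return 0;
     }  */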
6089 static bool
6090 vect_shift_permute_load_chain (vec<tree> dr_chain,
6091 unsigned int length,
6092 gimple *stmt,
6093 gimple_stmt_iterator *gsi,
6094 vec<tree> *result_chain)
6096 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
6097 tree perm2_mask1, perm2_mask2, perm3_mask;
6098 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
6099 gimple *perm_stmt;
6101 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
6102 unsigned int i;
6103 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6104 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6106 unsigned HOST_WIDE_INT nelt, vf;
6107 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt)
6108 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
6109 /* Not supported for variable-length vectors. */
6110 return false;
6112 vec_perm_builder sel (nelt, nelt, 1);
6113 sel.quick_grow (nelt);
6115 result_chain->quick_grow (length);
6116 memcpy (result_chain->address (), dr_chain.address (),
6117 length * sizeof (tree));
6119 if (pow2p_hwi (length) && vf > 4)
6121 unsigned int j, log_length = exact_log2 (length);
6122 for (i = 0; i < nelt / 2; ++i)
6123 sel[i] = i * 2;
6124 for (i = 0; i < nelt / 2; ++i)
6125 sel[nelt / 2 + i] = i * 2 + 1;
6126 vec_perm_indices indices (sel, 2, nelt);
6127 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6129 if (dump_enabled_p ())
6130 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6131 "shuffle of 2 fields structure is not \
6132 supported by target\n");
6133 return false;
6135 perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);
6137 for (i = 0; i < nelt / 2; ++i)
6138 sel[i] = i * 2 + 1;
6139 for (i = 0; i < nelt / 2; ++i)
6140 sel[nelt / 2 + i] = i * 2;
6141 indices.new_vector (sel, 2, nelt);
6142 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6144 if (dump_enabled_p ())
6145 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6146 "shuffle of 2 fields structure is not \
6147 supported by target\n");
6148 return false;
6150 perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);
6152 /* Generating permutation constant to shift all elements.
6153 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
6154 for (i = 0; i < nelt; i++)
6155 sel[i] = nelt / 2 + i;
6156 indices.new_vector (sel, 2, nelt);
6157 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6159 if (dump_enabled_p ())
6160 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6161 "shift permutation is not supported by target\n");
6162 return false;
6164 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6166 /* Generating permutation constant to select vector from 2.
6167 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
6168 for (i = 0; i < nelt / 2; i++)
6169 sel[i] = i;
6170 for (i = nelt / 2; i < nelt; i++)
6171 sel[i] = nelt + i;
6172 indices.new_vector (sel, 2, nelt);
6173 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6175 if (dump_enabled_p ())
6176 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6177 "select is not supported by target\n");
6178 return false;
6180 select_mask = vect_gen_perm_mask_checked (vectype, indices);
6182 for (i = 0; i < log_length; i++)
6184 for (j = 0; j < length; j += 2)
6186 first_vect = dr_chain[j];
6187 second_vect = dr_chain[j + 1];
6189 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6190 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6191 first_vect, first_vect,
6192 perm2_mask1);
6193 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6194 vect[0] = data_ref;
6196 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6197 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6198 second_vect, second_vect,
6199 perm2_mask2);
6200 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6201 vect[1] = data_ref;
6203 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
6204 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6205 vect[0], vect[1], shift1_mask);
6206 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6207 (*result_chain)[j/2 + length/2] = data_ref;
6209 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
6210 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6211 vect[0], vect[1], select_mask);
6212 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6213 (*result_chain)[j/2] = data_ref;
6215 memcpy (dr_chain.address (), result_chain->address (),
6216 length * sizeof (tree));
6218 return true;
6220 if (length == 3 && vf > 2)
6222 unsigned int k = 0, l = 0;
6224 /* Generating permutation constant to get all elements in right order.
6225 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
6226 for (i = 0; i < nelt; i++)
6228 if (3 * k + (l % 3) >= nelt)
6230 k = 0;
6231 l += (3 - (nelt % 3));
6233 sel[i] = 3 * k + (l % 3);
6234 k++;
6236 vec_perm_indices indices (sel, 2, nelt);
6237 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6239 if (dump_enabled_p ())
6240 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6241 "shuffle of 3 fields structure is not \
6242 supported by target\n");
6243 return false;
6245 perm3_mask = vect_gen_perm_mask_checked (vectype, indices);
6247 /* Generating permutation constant to shift all elements.
6248 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
6249 for (i = 0; i < nelt; i++)
6250 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
6251 indices.new_vector (sel, 2, nelt);
6252 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6254 if (dump_enabled_p ())
6255 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6256 "shift permutation is not supported by target\n");
6257 return false;
6259 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6261 /* Generating permutation constant to shift all elements.
6262 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6263 for (i = 0; i < nelt; i++)
6264 sel[i] = 2 * (nelt / 3) + 1 + i;
6265 indices.new_vector (sel, 2, nelt);
6266 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6268 if (dump_enabled_p ())
6269 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6270 "shift permutation is not supported by target\n");
6271 return false;
6273 shift2_mask = vect_gen_perm_mask_checked (vectype, indices);
6275 /* Generating permutation constant to shift all elements.
6276 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
6277 for (i = 0; i < nelt; i++)
6278 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
6279 indices.new_vector (sel, 2, nelt);
6280 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6282 if (dump_enabled_p ())
6283 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6284 "shift permutation is not supported by target\n");
6285 return false;
6287 shift3_mask = vect_gen_perm_mask_checked (vectype, indices);
6289 /* Generating permutation constant to shift all elements.
6290 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6291 for (i = 0; i < nelt; i++)
6292 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
6293 indices.new_vector (sel, 2, nelt);
6294 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6296 if (dump_enabled_p ())
6297 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6298 "shift permutation is not supported by target\n");
6299 return false;
6301 shift4_mask = vect_gen_perm_mask_checked (vectype, indices);
6303 for (k = 0; k < 3; k++)
6305 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
6306 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6307 dr_chain[k], dr_chain[k],
6308 perm3_mask);
6309 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6310 vect[k] = data_ref;
6313 for (k = 0; k < 3; k++)
6315 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
6316 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6317 vect[k % 3], vect[(k + 1) % 3],
6318 shift1_mask);
6319 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6320 vect_shift[k] = data_ref;
6323 for (k = 0; k < 3; k++)
6325 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
6326 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6327 vect_shift[(4 - k) % 3],
6328 vect_shift[(3 - k) % 3],
6329 shift2_mask);
6330 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6331 vect[k] = data_ref;
6334 (*result_chain)[3 - (nelt % 3)] = vect[2];
6336 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
6337 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
6338 vect[0], shift3_mask);
6339 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6340 (*result_chain)[nelt % 3] = data_ref;
6342 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
6343 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
6344 vect[1], shift4_mask);
6345 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6346 (*result_chain)[0] = data_ref;
6347 return true;
6349 return false;
6352 /* Function vect_transform_grouped_load.
6354 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
6355 to perform their permutation and assign the resulting vectorized statements to
6356 the scalar statements.
6359 void
6360 vect_transform_grouped_load (gimple *stmt, vec<tree> dr_chain, int size,
6361 gimple_stmt_iterator *gsi)
6363 machine_mode mode;
6364 vec<tree> result_chain = vNULL;
6366 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
6367 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
6368 vectors, that are ready for vector computation. */
6369 result_chain.create (size);
6371 /* If the reassociation width for the vector type is 2 or greater, the target
6372 machine can execute 2 or more vector instructions in parallel. Otherwise
6373 try to get the chain for the load group using vect_shift_permute_load_chain. */
6374 mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
6375 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
6376 || pow2p_hwi (size)
6377 || !vect_shift_permute_load_chain (dr_chain, size, stmt,
6378 gsi, &result_chain))
6379 vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
6380 vect_record_grouped_load_vectors (stmt, result_chain);
6381 result_chain.release ();
6384 /* RESULT_CHAIN contains the output of a group of grouped loads that were
6385 generated as part of the vectorization of STMT. Assign the statement
6386 for each vector to the associated scalar statement. */
6388 void
6389 vect_record_grouped_load_vectors (gimple *stmt, vec<tree> result_chain)
6391 gimple *first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
6392 gimple *next_stmt, *new_stmt;
6393 unsigned int i, gap_count;
6394 tree tmp_data_ref;
6396 /* Put a permuted data-ref in the VECTORIZED_STMT field.
6397 Since we scan the chain starting from its first node, their order
6398 corresponds to the order of data-refs in RESULT_CHAIN. */
6399 next_stmt = first_stmt;
6400 gap_count = 1;
6401 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
6403 if (!next_stmt)
6404 break;
6406 /* Skip the gaps. Loads created for the gaps will be removed by dead
6407 code elimination pass later. No need to check for the first stmt in
6408 the group, since it always exists.
6409 GROUP_GAP is the number of steps in elements from the previous
6410 access (if there is no gap GROUP_GAP is 1). We skip loads that
6411 correspond to the gaps. */
6412 if (next_stmt != first_stmt
6413 && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
6415 gap_count++;
6416 continue;
6419 while (next_stmt)
6421 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
6422 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
6423 copies, and we put the new vector statement in the first available
6424 RELATED_STMT. */
6425 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
6426 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
6427 else
6429 if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
6431 gimple *prev_stmt =
6432 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
6433 gimple *rel_stmt =
6434 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
6435 while (rel_stmt)
6437 prev_stmt = rel_stmt;
6438 rel_stmt =
6439 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
6442 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
6443 new_stmt;
6447 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
6448 gap_count = 1;
6449 /* If NEXT_STMT accesses the same DR as the previous statement,
6450 put the same TMP_DATA_REF as its vectorized statement; otherwise
6451 get the next data-ref from RESULT_CHAIN. */
6452 if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
6453 break;
6458 /* Function vect_can_force_dr_alignment_p.
6460 Returns whether the alignment of a DECL can be forced to be aligned
6461 on ALIGNMENT bit boundary. */
6463 bool
6464 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
6466 if (!VAR_P (decl))
6467 return false;
6469 if (decl_in_symtab_p (decl)
6470 && !symtab_node::get (decl)->can_increase_alignment_p ())
6471 return false;
6473 if (TREE_STATIC (decl))
6474 return (alignment <= MAX_OFILE_ALIGNMENT);
6475 else
6476 return (alignment <= MAX_STACK_ALIGNMENT);
6480 /* Return whether the data reference DR is supported with respect to its
6481 alignment.
6482 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
6483 if it is aligned, i.e., check if it is possible to vectorize it with different
6484 alignment. */
6486 enum dr_alignment_support
6487 vect_supportable_dr_alignment (struct data_reference *dr,
6488 bool check_aligned_accesses)
6490 gimple *stmt = DR_STMT (dr);
6491 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6492 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6493 machine_mode mode = TYPE_MODE (vectype);
6494 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6495 struct loop *vect_loop = NULL;
6496 bool nested_in_vect_loop = false;
6498 if (aligned_access_p (dr) && !check_aligned_accesses)
6499 return dr_aligned;
6501 /* For now assume all conditional loads/stores support unaligned
6502 access without any special code. */
6503 if (is_gimple_call (stmt)
6504 && gimple_call_internal_p (stmt)
6505 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
6506 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
6507 return dr_unaligned_supported;
6509 if (loop_vinfo)
6511 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
6512 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
6515 /* Possibly unaligned access. */
6517 /* We can choose between using the implicit realignment scheme (generating
6518 a misaligned_move stmt) and the explicit realignment scheme (generating
6519 aligned loads with a REALIGN_LOAD). There are two variants to the
6520 explicit realignment scheme: optimized, and unoptimized.
6521 We can optimize the realignment only if the step between consecutive
6522 vector loads is equal to the vector size. Since the vector memory
6523 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
6524 is guaranteed that the misalignment amount remains the same throughout the
6525 execution of the vectorized loop. Therefore, we can create the
6526 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
6527 at the loop preheader.
6529 However, in the case of outer-loop vectorization, when vectorizing a
6530 memory access in the inner-loop nested within the LOOP that is now being
6531 vectorized, while it is guaranteed that the misalignment of the
6532 vectorized memory access will remain the same in different outer-loop
6533 iterations, it is *not* guaranteed that it will remain the same throughout
6534 the execution of the inner-loop. This is because the inner-loop advances
6535 with the original scalar step (and not in steps of VS). If the inner-loop
6536 step happens to be a multiple of VS, then the misalignment remains fixed
6537 and we can use the optimized realignment scheme. For example:
6539 for (i=0; i<N; i++)
6540 for (j=0; j<M; j++)
6541 s += a[i+j];
6543 When vectorizing the i-loop in the above example, the step between
6544 consecutive vector loads is 1, and so the misalignment does not remain
6545 fixed across the execution of the inner-loop, and the realignment cannot
6546 be optimized (as illustrated in the following pseudo vectorized loop):
6548 for (i=0; i<N; i+=4)
6549 for (j=0; j<M; j++){
6550 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
6551 // when j is {0,1,2,3,4,5,6,7,...} respectively.
6552 // (assuming that we start from an aligned address).
6555 We therefore have to use the unoptimized realignment scheme:
6557 for (i=0; i<N; i+=4)
6558 for (j=k; j<M; j+=4)
6559 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
6560 // that the misalignment of the initial address is
6561 // 0).
6563 The loop can then be vectorized as follows:
6565 for (k=0; k<4; k++){
6566 rt = get_realignment_token (&vp[k]);
6567 for (i=0; i<N; i+=4){
6568 v1 = vp[i+k];
6569 for (j=k; j<M; j+=4){
6570 v2 = vp[i+j+VS-1];
6571 va = REALIGN_LOAD <v1,v2,rt>;
6572 vs += va;
6573 v1 = v2;
6576 } */
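/* A tiny standalone sketch, not GCC code, of the misalignment sequences
   discussed above, measured in elements and assuming &vp[0] is aligned: with
   the scalar inner-loop step the misalignment of &vp[i+j] cycles through
   0 1 2 3 ..., whereas after strip-mining the j-loop by VS it stays fixed at
   k (shown for k == 1):

     #include <stdio.h>

     enum { VS = 4, M = 8 };

     int
     main (void)
     {
       printf ("j step 1       :");
       for (int j = 0; j < M; j++)
         printf (" %d", j % VS);
       printf ("\nj step VS, k==1:");
       for (int j = 1; j < M; j += VS)
         printf (" %d", j % VS);
       printf ("\n");
       return 0;
     }  */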
6578 if (DR_IS_READ (dr))
6580 bool is_packed = false;
6581 tree type = (TREE_TYPE (DR_REF (dr)));
6583 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
6584 && (!targetm.vectorize.builtin_mask_for_load
6585 || targetm.vectorize.builtin_mask_for_load ()))
6587 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6589 /* If we are doing SLP then the accesses need not have the
6590 same alignment, instead it depends on the SLP group size. */
6591 if (loop_vinfo
6592 && STMT_SLP_TYPE (stmt_info)
6593 && !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6594 * GROUP_SIZE (vinfo_for_stmt
6595 (GROUP_FIRST_ELEMENT (stmt_info))),
6596 TYPE_VECTOR_SUBPARTS (vectype)))
6598 else if (!loop_vinfo
6599 || (nested_in_vect_loop
6600 && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
6601 GET_MODE_SIZE (TYPE_MODE (vectype)))))
6602 return dr_explicit_realign;
6603 else
6604 return dr_explicit_realign_optimized;
6606 if (!known_alignment_for_access_p (dr))
6607 is_packed = not_size_aligned (DR_REF (dr));
6609 if (targetm.vectorize.support_vector_misalignment
6610 (mode, type, DR_MISALIGNMENT (dr), is_packed))
6611 /* Can't software pipeline the loads, but can at least do them. */
6612 return dr_unaligned_supported;
6614 else
6616 bool is_packed = false;
6617 tree type = (TREE_TYPE (DR_REF (dr)));
6619 if (!known_alignment_for_access_p (dr))
6620 is_packed = not_size_aligned (DR_REF (dr));
6622 if (targetm.vectorize.support_vector_misalignment
6623 (mode, type, DR_MISALIGNMENT (dr), is_packed))
6624 return dr_unaligned_supported;
6627 /* Unsupported. */
6628 return dr_unaligned_unsupported;