2018-05-29 Richard Biener <rguenther@suse.de>
[official-gcc.git] / gcc / tree-vect-data-refs.c
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2018 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "memmodel.h"
32 #include "tm_p.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "cgraph.h"
36 #include "dumpfile.h"
37 #include "alias.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "tree-eh.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop.h"
47 #include "cfgloop.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
50 #include "expr.h"
51 #include "builtins.h"
52 #include "params.h"
53 #include "tree-cfg.h"
54 #include "tree-hash-traits.h"
55 #include "vec-perm-indices.h"
56 #include "internal-fn.h"
58 /* Return true if load- or store-lanes optab OPTAB is implemented for
59 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
61 static bool
62 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
63 tree vectype, unsigned HOST_WIDE_INT count)
65 machine_mode mode, array_mode;
66 bool limit_p;
68 mode = TYPE_MODE (vectype);
69 if (!targetm.array_mode (mode, count).exists (&array_mode))
71 poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
72 limit_p = !targetm.array_mode_supported_p (mode, count);
73 if (!int_mode_for_size (bits, limit_p).exists (&array_mode))
75 if (dump_enabled_p ())
76 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
77 "no array mode for %s["
78 HOST_WIDE_INT_PRINT_DEC "]\n",
79 GET_MODE_NAME (mode), count);
80 return false;
84 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
86 if (dump_enabled_p ())
87 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
88 "cannot use %s<%s><%s>\n", name,
89 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
90 return false;
93 if (dump_enabled_p ())
94 dump_printf_loc (MSG_NOTE, vect_location,
95 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
96 GET_MODE_NAME (mode));
98 return true;
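/* Illustrative sketch added for exposition, not part of the original file
   (the function name below is made up).  This is the kind of access pattern
   a vec_load_lanes/vec_store_lanes optab covers: a group of COUNT
   interleaved elements, here COUNT == 2, which a target such as AArch64 can
   load with a single lanes instruction (e.g. ld2) when the check above
   succeeds.  */

void
example_two_lane_load (int *even, int *odd, const int *in, int n)
{
  for (int i = 0; i < n; i++)
    {
      even[i] = in[2 * i];      /* lane 0 of the group.  */
      odd[i] = in[2 * i + 1];   /* lane 1 of the group.  */
    }
}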
102 /* Return the smallest scalar part of STMT.
103 This is used to determine the vectype of the stmt. We generally set the
104 vectype according to the type of the result (lhs). For stmts whose
105 result-type is different than the type of the arguments (e.g., demotion,
106 promotion), vectype will be reset appropriately (later). Note that we have
107 to visit the smallest datatype in this function, because that determines the
108 VF. If the smallest datatype in the loop is present only as the rhs of a
109 promotion operation - we'd miss it.
110 Such a case, where a variable of this datatype does not appear in the lhs
111 anywhere in the loop, can only occur if it's an invariant: e.g.:
112 'int_x = (int) short_inv', which we'd expect to have been optimized away by
113 invariant motion. However, we cannot rely on invariant motion to always
114 take invariants out of the loop, and so in the case of promotion we also
115 have to check the rhs.
116 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
117 types. */
119 tree
120 vect_get_smallest_scalar_type (gimple *stmt, HOST_WIDE_INT *lhs_size_unit,
121 HOST_WIDE_INT *rhs_size_unit)
123 tree scalar_type = gimple_expr_type (stmt);
124 HOST_WIDE_INT lhs, rhs;
126 /* During the analysis phase, this function is called on arbitrary
127 statements that might not have scalar results. */
128 if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
129 return scalar_type;
131 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
133 if (is_gimple_assign (stmt)
134 && (gimple_assign_cast_p (stmt)
135 || gimple_assign_rhs_code (stmt) == DOT_PROD_EXPR
136 || gimple_assign_rhs_code (stmt) == WIDEN_SUM_EXPR
137 || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
138 || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
139 || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
141 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
143 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
144 if (rhs < lhs)
145 scalar_type = rhs_type;
148 *lhs_size_unit = lhs;
149 *rhs_size_unit = rhs;
150 return scalar_type;
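/* Illustrative sketch added for exposition, not part of the original file
   (the function name below is made up).  Here the smallest scalar type,
   short, appears only as the rhs of a promotion, yet it is what determines
   the vectorization factor, which is exactly the case the comment above
   describes.  */

void
example_promotion (int *out, const short *in, int n)
{
  for (int i = 0; i < n; i++)
    out[i] = (int) in[i];   /* rhs type (short) is smaller than the lhs.  */
}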
154 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
155 tested at run-time. Return TRUE if DDR was successfully inserted.
156 Return false if versioning is not supported. */
158 static bool
159 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
161 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
163 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
164 return false;
166 if (!runtime_alias_check_p (ddr, loop,
167 optimize_loop_nest_for_speed_p (loop)))
168 return false;
170 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
171 return true;
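/* Illustrative sketch added for exposition, not part of the original file
   (the function name below is made up).  P and Q may alias, so this loop can
   only be vectorized under a run-time alias check of the kind versioning
   emits, conceptually "p + n <= q || q + n <= p".  */

void
example_may_alias (int *p, const int *q, int n)
{
  for (int i = 0; i < n; i++)
    p[i] = q[i] + 1;
}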
174 /* Record that loop LOOP_VINFO needs to check that VALUE is nonzero. */
176 static void
177 vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value)
179 vec<tree> checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo);
180 for (unsigned int i = 0; i < checks.length(); ++i)
181 if (checks[i] == value)
182 return;
184 if (dump_enabled_p ())
186 dump_printf_loc (MSG_NOTE, vect_location, "need run-time check that ");
187 dump_generic_expr (MSG_NOTE, TDF_SLIM, value);
188 dump_printf (MSG_NOTE, " is nonzero\n");
190 LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value);
193 /* Return true if we know that the order of vectorized STMT_A and
194 vectorized STMT_B will be the same as the order of STMT_A and STMT_B.
195 At least one of the statements is a write. */
197 static bool
198 vect_preserves_scalar_order_p (gimple *stmt_a, gimple *stmt_b)
200 stmt_vec_info stmtinfo_a = vinfo_for_stmt (stmt_a);
201 stmt_vec_info stmtinfo_b = vinfo_for_stmt (stmt_b);
203 /* Single statements are always kept in their original order. */
204 if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
205 && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
206 return true;
208 /* STMT_A and STMT_B belong to overlapping groups. All loads in a
209 group are emitted at the position of the first scalar load and all
210 stores in a group are emitted at the position of the last scalar store.
211 Thus writes will happen no earlier than their current position
212 (but could happen later) while reads will happen no later than their
213 current position (but could happen earlier). Reordering is therefore
214 only possible if the first access is a write. */
215 gimple *earlier_stmt = get_earlier_stmt (stmt_a, stmt_b);
216 return !DR_IS_WRITE (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt)));
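/* Illustrative sketch added for exposition, not part of the original file
   (the function name below is made up).  The loads of a[2*i] and a[2*i+1]
   form one load group and the stores of b[...] form one store group.  The
   vector load is emitted at the first scalar load and the vector store at
   the last scalar store, so stores can only sink and loads can only hoist;
   the original order can therefore only be broken when the earlier of the
   two accesses is a write, which is what the check above rejects.  */

void
example_grouped_access (int *b, const int *a, int n)
{
  for (int i = 0; i < n; i++)
    {
      b[2 * i] = a[2 * i];
      b[2 * i + 1] = a[2 * i + 1];
    }
}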
219 /* A subroutine of vect_analyze_data_ref_dependence. Handle
220 DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
221 distances. These distances are conservatively correct but they don't
222 reflect a guaranteed dependence.
224 Return true if this function does all the work necessary to avoid
225 an alias or false if the caller should use the dependence distances
226 to limit the vectorization factor in the usual way. LOOP_DEPTH is
227 the depth of the loop described by LOOP_VINFO and the other arguments
228 are as for vect_analyze_data_ref_dependence. */
230 static bool
231 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
232 loop_vec_info loop_vinfo,
233 int loop_depth, unsigned int *max_vf)
235 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
236 lambda_vector dist_v;
237 unsigned int i;
238 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
240 int dist = dist_v[loop_depth];
241 if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
243 /* If the user asserted safelen >= DIST consecutive iterations
244 can be executed concurrently, assume independence.
246 ??? An alternative would be to add the alias check even
247 in this case, and vectorize the fallback loop with the
248 maximum VF set to safelen. However, if the user has
249 explicitly given a length, it's less likely that that
250 would be a win. */
251 if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
253 if ((unsigned int) loop->safelen < *max_vf)
254 *max_vf = loop->safelen;
255 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
256 continue;
259 /* For dependence distances of 2 or more, we have the option
260 of limiting VF or checking for an alias at runtime.
261 Prefer to check at runtime if we can, to avoid limiting
262 the VF unnecessarily when the bases are in fact independent.
264 Note that the alias checks will be removed if the VF ends up
265 being small enough. */
266 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
269 return true;
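/* Illustrative sketch added for exposition, not part of the original file
   (the function name below is made up).  With the OpenMP safelen clause the
   user asserts that up to 8 consecutive iterations may execute concurrently,
   so a conservative dependence distance can be ignored for any VF <= 8,
   which is the case handled above.  */

void
example_safelen (int *a, int k, int n)
{
#pragma omp simd safelen(8)
  for (int i = 0; i < n; i++)
    a[i + k] = a[i] + 1;
}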
273 /* Function vect_analyze_data_ref_dependence.
275 Return TRUE if there (might) exist a dependence between a memory-reference
276 DRA and a memory-reference DRB.  When versioning for alias is able to
277 check the dependence at run time, return FALSE.  Adjust *MAX_VF according to
278 the data dependence. */
280 static bool
281 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
282 loop_vec_info loop_vinfo,
283 unsigned int *max_vf)
285 unsigned int i;
286 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
287 struct data_reference *dra = DDR_A (ddr);
288 struct data_reference *drb = DDR_B (ddr);
289 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
290 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
291 lambda_vector dist_v;
292 unsigned int loop_depth;
294 /* In loop analysis all data references should be vectorizable. */
295 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
296 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
297 gcc_unreachable ();
299 /* Independent data accesses. */
300 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
301 return false;
303 if (dra == drb
304 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
305 return false;
307 /* We do not have to consider dependences between accesses that belong
308 to the same group, unless the stride could be smaller than the
309 group size. */
310 if (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
311 && (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
312 == DR_GROUP_FIRST_ELEMENT (stmtinfo_b))
313 && !STMT_VINFO_STRIDED_P (stmtinfo_a))
314 return false;
316 /* Even if we have an anti-dependence then, as the vectorized loop covers at
317 least two scalar iterations, there is always also a true dependence.
318 As the vectorizer does not re-order loads and stores we can ignore
319 the anti-dependence if TBAA can disambiguate both DRs similar to the
320 case with known negative distance anti-dependences (positive
321 distance anti-dependences would violate TBAA constraints). */
322 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
323 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
324 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
325 get_alias_set (DR_REF (drb))))
326 return false;
328 /* Unknown data dependence. */
329 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
331 /* If user asserted safelen consecutive iterations can be
332 executed concurrently, assume independence. */
333 if (loop->safelen >= 2)
335 if ((unsigned int) loop->safelen < *max_vf)
336 *max_vf = loop->safelen;
337 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
338 return false;
341 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
342 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
344 if (dump_enabled_p ())
346 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
347 "versioning for alias not supported for: "
348 "can't determine dependence between ");
349 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
350 DR_REF (dra));
351 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
352 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
353 DR_REF (drb));
354 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
356 return true;
359 if (dump_enabled_p ())
361 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
362 "versioning for alias required: "
363 "can't determine dependence between ");
364 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
365 DR_REF (dra));
366 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
367 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
368 DR_REF (drb));
369 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
372 /* Add to list of ddrs that need to be tested at run-time. */
373 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
376 /* Known data dependence. */
377 if (DDR_NUM_DIST_VECTS (ddr) == 0)
379 /* If user asserted safelen consecutive iterations can be
380 executed concurrently, assume independence. */
381 if (loop->safelen >= 2)
383 if ((unsigned int) loop->safelen < *max_vf)
384 *max_vf = loop->safelen;
385 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
386 return false;
389 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
390 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
392 if (dump_enabled_p ())
394 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
395 "versioning for alias not supported for: "
396 "bad dist vector for ");
397 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
398 DR_REF (dra));
399 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
400 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
401 DR_REF (drb));
402 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
404 return true;
407 if (dump_enabled_p ())
409 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
410 "versioning for alias required: "
411 "bad dist vector for ");
412 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
413 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
414 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
415 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
417 /* Add to list of ddrs that need to be tested at run-time. */
418 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
421 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
423 if (DDR_COULD_BE_INDEPENDENT_P (ddr)
424 && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
425 loop_depth, max_vf))
426 return false;
428 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
430 int dist = dist_v[loop_depth];
432 if (dump_enabled_p ())
433 dump_printf_loc (MSG_NOTE, vect_location,
434 "dependence distance = %d.\n", dist);
436 if (dist == 0)
438 if (dump_enabled_p ())
440 dump_printf_loc (MSG_NOTE, vect_location,
441 "dependence distance == 0 between ");
442 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
443 dump_printf (MSG_NOTE, " and ");
444 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
445 dump_printf (MSG_NOTE, "\n");
448 /* When we perform grouped accesses and perform implicit CSE
449 by detecting equal accesses and doing disambiguation with
450 runtime alias tests like for
451 .. = a[i];
452 .. = a[i+1];
453 a[i] = ..;
454 a[i+1] = ..;
455 *p = ..;
456 .. = a[i];
457 .. = a[i+1];
458 where we will end up loading { a[i], a[i+1] } once, make
459 sure that inserting group loads before the first load and
460 stores after the last store will do the right thing.
461 Similar for groups like
462 a[i] = ...;
463 ... = a[i];
464 a[i+1] = ...;
465 where loads from the group interleave with the store. */
466 if (!vect_preserves_scalar_order_p (DR_STMT (dra), DR_STMT (drb)))
468 if (dump_enabled_p ())
469 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
470 "READ_WRITE dependence in interleaving.\n");
471 return true;
474 if (loop->safelen < 2)
476 tree indicator = dr_zero_step_indicator (dra);
477 if (TREE_CODE (indicator) != INTEGER_CST)
478 vect_check_nonzero_value (loop_vinfo, indicator);
479 else if (integer_zerop (indicator))
481 if (dump_enabled_p ())
482 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
483 "access also has a zero step\n");
484 return true;
487 continue;
490 if (dist > 0 && DDR_REVERSED_P (ddr))
492 /* If DDR_REVERSED_P the order of the data-refs in DDR was
493 reversed (to make distance vector positive), and the actual
494 distance is negative. */
495 if (dump_enabled_p ())
496 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
497 "dependence distance negative.\n");
498 /* Record a negative dependence distance to later limit the
499 amount of stmt copying / unrolling we can perform.
500 Only need to handle read-after-write dependence. */
501 if (DR_IS_READ (drb)
502 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
503 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
504 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
505 continue;
508 unsigned int abs_dist = abs (dist);
509 if (abs_dist >= 2 && abs_dist < *max_vf)
511 /* The dependence distance requires reduction of the maximal
512 vectorization factor. */
513 *max_vf = abs (dist);
514 if (dump_enabled_p ())
515 dump_printf_loc (MSG_NOTE, vect_location,
516 "adjusting maximal vectorization factor to %i\n",
517 *max_vf);
520 if (abs_dist >= *max_vf)
522 /* Dependence distance does not create dependence, as far as
523 vectorization is concerned, in this case. */
524 if (dump_enabled_p ())
525 dump_printf_loc (MSG_NOTE, vect_location,
526 "dependence distance >= VF.\n");
527 continue;
530 if (dump_enabled_p ())
532 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
533 "not vectorized, possible dependence "
534 "between data-refs ");
535 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
536 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
537 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
538 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
541 return true;
544 return false;
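/* Illustrative sketch added for exposition, not part of the original file
   (the function name below is made up).  The read of a[i] and the write of
   a[i + 4] have dependence distance 4, so the analysis above limits the
   maximal vectorization factor to 4 (or defers to a run-time alias check
   when possible).  */

void
example_distance_four (int *a, int n)
{
  for (int i = 0; i < n - 4; i++)
    a[i + 4] = a[i] + 1;
}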
547 /* Function vect_analyze_data_ref_dependences.
549 Examine all the data references in the loop, and make sure there do not
550 exist any data dependences between them. Set *MAX_VF according to
551 the maximum vectorization factor the data dependences allow. */
553 bool
554 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
555 unsigned int *max_vf)
557 unsigned int i;
558 struct data_dependence_relation *ddr;
560 if (dump_enabled_p ())
561 dump_printf_loc (MSG_NOTE, vect_location,
562 "=== vect_analyze_data_ref_dependences ===\n");
564 LOOP_VINFO_DDRS (loop_vinfo)
565 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
566 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
567 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
568 /* We need read-read dependences to compute STMT_VINFO_SAME_ALIGN_REFS. */
569 if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
570 &LOOP_VINFO_DDRS (loop_vinfo),
571 LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
572 return false;
574 /* For epilogues we either have no aliases or alias versioning
575 was applied to the original loop.  Therefore we may just get max_vf
576 using the VF of the original loop.  */
577 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
578 *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
579 else
580 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
581 if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
582 return false;
584 return true;
588 /* Function vect_slp_analyze_data_ref_dependence.
590 Return TRUE if there (might) exist a dependence between a memory-reference
591 DRA and a memory-reference DRB.  Unlike the loop analysis there is no
592 *MAX_VF to adjust and no run-time alias versioning: any possible
593 dependence makes the SLP instance unvectorizable.  */
595 static bool
596 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
598 struct data_reference *dra = DDR_A (ddr);
599 struct data_reference *drb = DDR_B (ddr);
601 /* We need to check dependences of statements marked as unvectorizable
602 as well, they still can prohibit vectorization. */
604 /* Independent data accesses. */
605 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
606 return false;
608 if (dra == drb)
609 return false;
611 /* Read-read is OK. */
612 if (DR_IS_READ (dra) && DR_IS_READ (drb))
613 return false;
615 /* If dra and drb are part of the same interleaving chain consider
616 them independent. */
617 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
618 && (DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
619 == DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
620 return false;
622 /* Unknown data dependence. */
623 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
625 if (dump_enabled_p ())
627 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
628 "can't determine dependence between ");
629 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
630 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
631 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
632 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
635 else if (dump_enabled_p ())
637 dump_printf_loc (MSG_NOTE, vect_location,
638 "determined dependence between ");
639 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
640 dump_printf (MSG_NOTE, " and ");
641 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
642 dump_printf (MSG_NOTE, "\n");
645 return true;
649 /* Analyze dependences involved in the transform of SLP NODE. STORES
650 contain the vector of scalar stores of this instance if we are
651 disambiguating the loads. */
653 static bool
654 vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
655 vec<gimple *> stores, gimple *last_store)
657 /* This walks over all stmts involved in the SLP load/store done
658 in NODE verifying we can sink them up to the last stmt in the
659 group. */
660 gimple *last_access = vect_find_last_scalar_stmt_in_slp (node);
661 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
663 gimple *access = SLP_TREE_SCALAR_STMTS (node)[k];
664 if (access == last_access)
665 continue;
666 data_reference *dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (access));
667 ao_ref ref;
668 bool ref_initialized_p = false;
669 for (gimple_stmt_iterator gsi = gsi_for_stmt (access);
670 gsi_stmt (gsi) != last_access; gsi_next (&gsi))
672 gimple *stmt = gsi_stmt (gsi);
673 if (! gimple_vuse (stmt)
674 || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
675 continue;
677 /* If we couldn't record a (single) data reference for this
678 stmt we have to resort to the alias oracle. */
679 data_reference *dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
680 if (!dr_b)
682 /* We are moving a store or sinking a load - this means
683 we cannot use TBAA for disambiguation. */
684 if (!ref_initialized_p)
685 ao_ref_init (&ref, DR_REF (dr_a));
686 if (stmt_may_clobber_ref_p_1 (stmt, &ref, false)
687 || ref_maybe_used_by_stmt_p (stmt, &ref, false))
688 return false;
689 continue;
692 bool dependent = false;
693 /* If we run into a store of this same instance (we've just
694 marked those) then delay dependence checking until we run
695 into the last store because this is where it will have
696 been sunk to (and we verify if we can do that as well). */
697 if (gimple_visited_p (stmt))
699 if (stmt != last_store)
700 continue;
701 unsigned i;
702 gimple *store;
703 FOR_EACH_VEC_ELT (stores, i, store)
705 data_reference *store_dr
706 = STMT_VINFO_DATA_REF (vinfo_for_stmt (store));
707 ddr_p ddr = initialize_data_dependence_relation
708 (dr_a, store_dr, vNULL);
709 dependent = vect_slp_analyze_data_ref_dependence (ddr);
710 free_dependence_relation (ddr);
711 if (dependent)
712 break;
715 else
717 ddr_p ddr = initialize_data_dependence_relation (dr_a,
718 dr_b, vNULL);
719 dependent = vect_slp_analyze_data_ref_dependence (ddr);
720 free_dependence_relation (ddr);
722 if (dependent)
723 return false;
726 return true;
730 /* Function vect_slp_analyze_instance_dependence.
732 Examine all the data references in the SLP instance and make sure there
733 do not exist any data dependences that would prevent moving the grouped
734 loads and stores to their vectorized positions.  */
736 bool
737 vect_slp_analyze_instance_dependence (slp_instance instance)
739 if (dump_enabled_p ())
740 dump_printf_loc (MSG_NOTE, vect_location,
741 "=== vect_slp_analyze_instance_dependence ===\n");
743 /* The stores of this instance are at the root of the SLP tree. */
744 slp_tree store = SLP_INSTANCE_TREE (instance);
745 if (! STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (store)[0])))
746 store = NULL;
748 /* Verify we can sink stores to the vectorized stmt insert location. */
749 gimple *last_store = NULL;
750 if (store)
752 if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
753 return false;
755 /* Mark stores in this instance and remember the last one. */
756 last_store = vect_find_last_scalar_stmt_in_slp (store);
757 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
758 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], true);
761 bool res = true;
763 /* Verify we can sink loads to the vectorized stmt insert location,
764 special-casing stores of this instance. */
765 slp_tree load;
766 unsigned int i;
767 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
768 if (! vect_slp_analyze_node_dependences (instance, load,
769 store
770 ? SLP_TREE_SCALAR_STMTS (store)
771 : vNULL, last_store))
773 res = false;
774 break;
777 /* Unset the visited flag. */
778 if (store)
779 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
780 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], false);
782 return res;
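/* Illustrative sketch added for exposition, not part of the original file
   (the function name below is made up).  For the two-element SLP store
   group {out[0], out[1]} the vectorized store is emitted at the position of
   the last scalar store, so the analysis above has to verify that the first
   store can be sunk past every intervening statement.  Here it cannot: the
   intervening read of out[0] depends on that store.  */

void
example_slp_sink (int *out, const int *in)
{
  out[0] = in[0];
  int tmp = out[0];   /* intervening use of the first store.  */
  out[1] = in[1] + tmp;
}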
785 /* Record in VINFO the base alignment guarantee given by DRB. STMT is
786 the statement that contains DRB, which is useful for recording in the
787 dump file. */
789 static void
790 vect_record_base_alignment (vec_info *vinfo, gimple *stmt,
791 innermost_loop_behavior *drb)
793 bool existed;
794 innermost_loop_behavior *&entry
795 = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
796 if (!existed || entry->base_alignment < drb->base_alignment)
798 entry = drb;
799 if (dump_enabled_p ())
801 dump_printf_loc (MSG_NOTE, vect_location,
802 "recording new base alignment for ");
803 dump_generic_expr (MSG_NOTE, TDF_SLIM, drb->base_address);
804 dump_printf (MSG_NOTE, "\n");
805 dump_printf_loc (MSG_NOTE, vect_location,
806 " alignment: %d\n", drb->base_alignment);
807 dump_printf_loc (MSG_NOTE, vect_location,
808 " misalignment: %d\n", drb->base_misalignment);
809 dump_printf_loc (MSG_NOTE, vect_location,
810 " based on: ");
811 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
816 /* If the region we're going to vectorize is reached, all unconditional
817 data references occur at least once. We can therefore pool the base
818 alignment guarantees from each unconditional reference. Do this by
819 going through all the data references in VINFO and checking whether
820 the containing statement makes the reference unconditionally. If so,
821 record the alignment of the base address in VINFO so that it can be
822 used for all other references with the same base. */
824 void
825 vect_record_base_alignments (vec_info *vinfo)
827 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
828 struct loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
829 data_reference *dr;
830 unsigned int i;
831 FOR_EACH_VEC_ELT (vinfo->datarefs, i, dr)
833 gimple *stmt = DR_STMT (dr);
834 if (!DR_IS_CONDITIONAL_IN_STMT (dr)
835 && STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (stmt)))
838 vect_record_base_alignment (vinfo, stmt, &DR_INNERMOST (dr));
840 /* If DR is nested in the loop that is being vectorized, we can also
841 record the alignment of the base wrt the outer loop. */
842 if (loop && nested_in_vect_loop_p (loop, stmt))
844 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
845 vect_record_base_alignment
846 (vinfo, stmt, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
852 /* Return the target alignment for the vectorized form of DR. */
854 static unsigned int
855 vect_calculate_target_alignment (struct data_reference *dr)
857 gimple *stmt = DR_STMT (dr);
858 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
859 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
860 return targetm.vectorize.preferred_vector_alignment (vectype);
863 /* Function vect_compute_data_ref_alignment
865 Compute the misalignment of the data reference DR.
867 Output:
868 1. If during the misalignment computation it is found that the data reference
869 cannot be vectorized then false is returned.
870 2. DR_MISALIGNMENT (DR) is defined.
872 FOR NOW: No analysis is actually performed. Misalignment is calculated
873 only for trivial cases. TODO. */
875 bool
876 vect_compute_data_ref_alignment (struct data_reference *dr)
878 gimple *stmt = DR_STMT (dr);
879 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
880 vec_base_alignments *base_alignments = &stmt_info->vinfo->base_alignments;
881 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
882 struct loop *loop = NULL;
883 tree ref = DR_REF (dr);
884 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
886 if (dump_enabled_p ())
887 dump_printf_loc (MSG_NOTE, vect_location,
888 "vect_compute_data_ref_alignment:\n");
890 if (loop_vinfo)
891 loop = LOOP_VINFO_LOOP (loop_vinfo);
893 /* Initialize misalignment to unknown. */
894 SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
896 innermost_loop_behavior *drb = vect_dr_behavior (dr);
897 bool step_preserves_misalignment_p;
899 unsigned HOST_WIDE_INT vector_alignment
900 = vect_calculate_target_alignment (dr) / BITS_PER_UNIT;
901 DR_TARGET_ALIGNMENT (dr) = vector_alignment;
903 /* No step for BB vectorization. */
904 if (!loop)
906 gcc_assert (integer_zerop (drb->step));
907 step_preserves_misalignment_p = true;
910 /* In case the dataref is in an inner-loop of the loop that is being
911 vectorized (LOOP), we use the base and misalignment information
912 relative to the outer-loop (LOOP). This is ok only if the misalignment
913 stays the same throughout the execution of the inner-loop, which is why
914 we have to check that the stride of the dataref in the inner-loop evenly
915 divides by the vector alignment. */
916 else if (nested_in_vect_loop_p (loop, stmt))
918 step_preserves_misalignment_p
919 = (DR_STEP_ALIGNMENT (dr) % vector_alignment) == 0;
921 if (dump_enabled_p ())
923 if (step_preserves_misalignment_p)
924 dump_printf_loc (MSG_NOTE, vect_location,
925 "inner step divides the vector alignment.\n");
926 else
927 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
928 "inner step doesn't divide the vector"
929 " alignment.\n");
933 /* Similarly we can only use base and misalignment information relative to
934 an innermost loop if the misalignment stays the same throughout the
935 execution of the loop. As above, this is the case if the stride of
936 the dataref evenly divides by the alignment. */
937 else
939 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
940 step_preserves_misalignment_p
941 = multiple_p (DR_STEP_ALIGNMENT (dr) * vf, vector_alignment);
943 if (!step_preserves_misalignment_p && dump_enabled_p ())
944 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
945 "step doesn't divide the vector alignment.\n");
948 unsigned int base_alignment = drb->base_alignment;
949 unsigned int base_misalignment = drb->base_misalignment;
951 /* Calculate the maximum of the pooled base address alignment and the
952 alignment that we can compute for DR itself. */
953 innermost_loop_behavior **entry = base_alignments->get (drb->base_address);
954 if (entry && base_alignment < (*entry)->base_alignment)
956 base_alignment = (*entry)->base_alignment;
957 base_misalignment = (*entry)->base_misalignment;
960 if (drb->offset_alignment < vector_alignment
961 || !step_preserves_misalignment_p
962 /* We need to know whether the step wrt the vectorized loop is
963 negative when computing the starting misalignment below. */
964 || TREE_CODE (drb->step) != INTEGER_CST)
966 if (dump_enabled_p ())
968 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
969 "Unknown alignment for access: ");
970 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
971 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
973 return true;
976 if (base_alignment < vector_alignment)
978 unsigned int max_alignment;
979 tree base = get_base_for_alignment (drb->base_address, &max_alignment);
980 if (max_alignment < vector_alignment
981 || !vect_can_force_dr_alignment_p (base,
982 vector_alignment * BITS_PER_UNIT))
984 if (dump_enabled_p ())
986 dump_printf_loc (MSG_NOTE, vect_location,
987 "can't force alignment of ref: ");
988 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
989 dump_printf (MSG_NOTE, "\n");
991 return true;
994 /* Force the alignment of the decl.
995 NOTE: This is the only change to the code we make during
996 the analysis phase, before deciding to vectorize the loop. */
997 if (dump_enabled_p ())
999 dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
1000 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
1001 dump_printf (MSG_NOTE, "\n");
1004 DR_VECT_AUX (dr)->base_decl = base;
1005 DR_VECT_AUX (dr)->base_misaligned = true;
1006 base_misalignment = 0;
1008 poly_int64 misalignment
1009 = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();
1011 /* If this is a backward running DR then first access in the larger
1012 vectype actually is N-1 elements before the address in the DR.
1013 Adjust misalign accordingly. */
1014 if (tree_int_cst_sgn (drb->step) < 0)
1015 /* PLUS because STEP is negative. */
1016 misalignment += ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1017 * TREE_INT_CST_LOW (drb->step));
1019 unsigned int const_misalignment;
1020 if (!known_misalignment (misalignment, vector_alignment,
1021 &const_misalignment))
1023 if (dump_enabled_p ())
1025 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1026 "Non-constant misalignment for access: ");
1027 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
1028 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1030 return true;
1033 SET_DR_MISALIGNMENT (dr, const_misalignment);
1035 if (dump_enabled_p ())
1037 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1038 "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
1039 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
1040 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1043 return true;
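/* Illustrative sketch added for exposition, not part of the original file
   (the helper below is made up, not a vectorizer API).  The constant
   misalignment computed above is simply the base misalignment plus the byte
   offset of the access, reduced modulo the target alignment (assumed to be
   a power of two).  */

static unsigned int
example_misalignment (unsigned long long byte_offset_from_base,
                      unsigned int base_misalignment,
                      unsigned int target_align)
{
  return (unsigned int) ((base_misalignment + byte_offset_from_base)
                         & (target_align - 1));
}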
1046 /* Function vect_update_misalignment_for_peel.
1047 Sets DR's misalignment
1048 - to 0 if it has the same alignment as DR_PEEL,
1049 - to the misalignment computed using NPEEL if DR's misalignment is known,
1050 - to -1 (unknown) otherwise.
1052 DR - the data reference whose misalignment is to be adjusted.
1053 DR_PEEL - the data reference whose misalignment is being made
1054 zero in the vector loop by the peel.
1055 NPEEL - the number of iterations in the peel loop if the misalignment
1056 of DR_PEEL is known at compile time. */
1058 static void
1059 vect_update_misalignment_for_peel (struct data_reference *dr,
1060 struct data_reference *dr_peel, int npeel)
1062 unsigned int i;
1063 vec<dr_p> same_aligned_drs;
1064 struct data_reference *current_dr;
1065 int dr_size = vect_get_scalar_dr_size (dr);
1066 int dr_peel_size = vect_get_scalar_dr_size (dr_peel);
1067 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
1068 stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));
1070 /* For interleaved data accesses the step in the loop must be multiplied by
1071 the size of the interleaving group. */
1072 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1073 dr_size *= DR_GROUP_SIZE (vinfo_for_stmt (DR_GROUP_FIRST_ELEMENT (stmt_info)));
1074 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
1075 dr_peel_size *= DR_GROUP_SIZE (peel_stmt_info);
1077 /* It can be assumed that the data refs with the same alignment as dr_peel
1078 are aligned in the vector loop. */
1079 same_aligned_drs
1080 = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
1081 FOR_EACH_VEC_ELT (same_aligned_drs, i, current_dr)
1083 if (current_dr != dr)
1084 continue;
1085 gcc_assert (!known_alignment_for_access_p (dr)
1086 || !known_alignment_for_access_p (dr_peel)
1087 || (DR_MISALIGNMENT (dr) / dr_size
1088 == DR_MISALIGNMENT (dr_peel) / dr_peel_size));
1089 SET_DR_MISALIGNMENT (dr, 0);
1090 return;
1093 if (known_alignment_for_access_p (dr)
1094 && known_alignment_for_access_p (dr_peel))
1096 bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
1097 int misal = DR_MISALIGNMENT (dr);
1098 misal += negative ? -npeel * dr_size : npeel * dr_size;
1099 misal &= DR_TARGET_ALIGNMENT (dr) - 1;
1100 SET_DR_MISALIGNMENT (dr, misal);
1101 return;
1104 if (dump_enabled_p ())
1105 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1106 "to unknown (-1).\n");
1107 SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
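/* Illustrative sketch added for exposition, not part of the original file
   (the helper below is made up).  For a forward, unit-stride access the
   known-alignment update above amounts to the arithmetic below, modulo the
   target alignment (assumed to be a power of two).  */

static unsigned int
example_misalign_after_peel (unsigned int misal, unsigned int npeel,
                             unsigned int dr_size, unsigned int target_align)
{
  return (misal + npeel * dr_size) & (target_align - 1);
}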
1111 /* Function verify_data_ref_alignment
1113 Return TRUE if DR can be handled with respect to alignment. */
1115 static bool
1116 verify_data_ref_alignment (data_reference_p dr)
1118 enum dr_alignment_support supportable_dr_alignment
1119 = vect_supportable_dr_alignment (dr, false);
1120 if (!supportable_dr_alignment)
1122 if (dump_enabled_p ())
1124 if (DR_IS_READ (dr))
1125 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1126 "not vectorized: unsupported unaligned load.");
1127 else
1128 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1129 "not vectorized: unsupported unaligned "
1130 "store.");
1132 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
1133 DR_REF (dr));
1134 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1136 return false;
1139 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
1140 dump_printf_loc (MSG_NOTE, vect_location,
1141 "Vectorizing an unaligned access.\n");
1143 return true;
1146 /* Function vect_verify_datarefs_alignment
1148 Return TRUE if all data references in the loop can be
1149 handled with respect to alignment. */
1151 bool
1152 vect_verify_datarefs_alignment (loop_vec_info vinfo)
1154 vec<data_reference_p> datarefs = vinfo->datarefs;
1155 struct data_reference *dr;
1156 unsigned int i;
1158 FOR_EACH_VEC_ELT (datarefs, i, dr)
1160 gimple *stmt = DR_STMT (dr);
1161 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1163 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1164 continue;
1166 /* For interleaving, only the alignment of the first access matters. */
1167 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1168 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1169 continue;
1171 /* Strided accesses perform only component accesses, alignment is
1172 irrelevant for them. */
1173 if (STMT_VINFO_STRIDED_P (stmt_info)
1174 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1175 continue;
1177 if (! verify_data_ref_alignment (dr))
1178 return false;
1181 return true;
1184 /* Given a memory reference EXP, return whether its alignment is less
1185 than its size. */
1187 static bool
1188 not_size_aligned (tree exp)
1190 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1191 return true;
1193 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1194 > get_object_alignment (exp));
1197 /* Function vector_alignment_reachable_p
1199 Return true if vector alignment for DR is reachable by peeling
1200 a few loop iterations. Return false otherwise. */
1202 static bool
1203 vector_alignment_reachable_p (struct data_reference *dr)
1205 gimple *stmt = DR_STMT (dr);
1206 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1207 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1209 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1211 /* For interleaved access we peel only if number of iterations in
1212 the prolog loop ({VF - misalignment}), is a multiple of the
1213 number of the interleaved accesses. */
1214 int elem_size, mis_in_elements;
1216 /* FORNOW: handle only known alignment. */
1217 if (!known_alignment_for_access_p (dr))
1218 return false;
1220 poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1221 poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
1222 elem_size = vector_element_size (vector_size, nelements);
1223 mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
1225 if (!multiple_p (nelements - mis_in_elements, DR_GROUP_SIZE (stmt_info)))
1226 return false;
1229 /* If misalignment is known at the compile time then allow peeling
1230 only if natural alignment is reachable through peeling. */
1231 if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
1233 HOST_WIDE_INT elmsize =
1234 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1235 if (dump_enabled_p ())
1237 dump_printf_loc (MSG_NOTE, vect_location,
1238 "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1239 dump_printf (MSG_NOTE,
1240 ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
1242 if (DR_MISALIGNMENT (dr) % elmsize)
1244 if (dump_enabled_p ())
1245 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1246 "data size does not divide the misalignment.\n");
1247 return false;
1251 if (!known_alignment_for_access_p (dr))
1253 tree type = TREE_TYPE (DR_REF (dr));
1254 bool is_packed = not_size_aligned (DR_REF (dr));
1255 if (dump_enabled_p ())
1256 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1257 "Unknown misalignment, %snaturally aligned\n",
1258 is_packed ? "not " : "");
1259 return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1262 return true;
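/* Illustrative sketch added for exposition, not part of the original file
   (the helper below is made up).  For a known byte misalignment MISAL of a
   forward access with element size DR_SIZE, the number of scalar iterations
   to peel so the access reaches TARGET_ALIGN is as below, assuming
   TARGET_ALIGN is a power of two, 0 <= MISAL < TARGET_ALIGN, and MISAL is a
   multiple of DR_SIZE (the condition checked above).  */

static unsigned int
example_peel_count (unsigned int misal, unsigned int dr_size,
                    unsigned int target_align)
{
  return ((target_align - misal) & (target_align - 1)) / dr_size;
}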
1266 /* Calculate the cost of the memory access represented by DR. */
1268 static void
1269 vect_get_data_access_cost (struct data_reference *dr,
1270 unsigned int *inside_cost,
1271 unsigned int *outside_cost,
1272 stmt_vector_for_cost *body_cost_vec,
1273 stmt_vector_for_cost *prologue_cost_vec)
1275 gimple *stmt = DR_STMT (dr);
1276 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1277 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1278 int ncopies;
1280 if (PURE_SLP_STMT (stmt_info))
1281 ncopies = 1;
1282 else
1283 ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1285 if (DR_IS_READ (dr))
1286 vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1287 prologue_cost_vec, body_cost_vec, false);
1288 else
1289 vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
1291 if (dump_enabled_p ())
1292 dump_printf_loc (MSG_NOTE, vect_location,
1293 "vect_get_data_access_cost: inside_cost = %d, "
1294 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1298 typedef struct _vect_peel_info
1300 struct data_reference *dr;
1301 int npeel;
1302 unsigned int count;
1303 } *vect_peel_info;
1305 typedef struct _vect_peel_extended_info
1307 struct _vect_peel_info peel_info;
1308 unsigned int inside_cost;
1309 unsigned int outside_cost;
1310 } *vect_peel_extended_info;
1313 /* Peeling hashtable helpers. */
1315 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1317 static inline hashval_t hash (const _vect_peel_info *);
1318 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1321 inline hashval_t
1322 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1324 return (hashval_t) peel_info->npeel;
1327 inline bool
1328 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1330 return (a->npeel == b->npeel);
1334 /* Insert DR into peeling hash table with NPEEL as key. */
1336 static void
1337 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1338 loop_vec_info loop_vinfo, struct data_reference *dr,
1339 int npeel)
1341 struct _vect_peel_info elem, *slot;
1342 _vect_peel_info **new_slot;
1343 bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1345 elem.npeel = npeel;
1346 slot = peeling_htab->find (&elem);
1347 if (slot)
1348 slot->count++;
1349 else
1351 slot = XNEW (struct _vect_peel_info);
1352 slot->npeel = npeel;
1353 slot->dr = dr;
1354 slot->count = 1;
1355 new_slot = peeling_htab->find_slot (slot, INSERT);
1356 *new_slot = slot;
1359 if (!supportable_dr_alignment
1360 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1361 slot->count += VECT_MAX_COST;
1365 /* Traverse peeling hash table to find peeling option that aligns maximum
1366 number of data accesses. */
1369 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1370 _vect_peel_extended_info *max)
1372 vect_peel_info elem = *slot;
1374 if (elem->count > max->peel_info.count
1375 || (elem->count == max->peel_info.count
1376 && max->peel_info.npeel > elem->npeel))
1378 max->peel_info.npeel = elem->npeel;
1379 max->peel_info.count = elem->count;
1380 max->peel_info.dr = elem->dr;
1383 return 1;
1386 /* Get the costs of peeling NPEEL iterations checking data access costs
1387 for all data refs. If UNKNOWN_MISALIGNMENT is true, we assume DR0's
1388 misalignment will be zero after peeling. */
1390 static void
1391 vect_get_peeling_costs_all_drs (vec<data_reference_p> datarefs,
1392 struct data_reference *dr0,
1393 unsigned int *inside_cost,
1394 unsigned int *outside_cost,
1395 stmt_vector_for_cost *body_cost_vec,
1396 stmt_vector_for_cost *prologue_cost_vec,
1397 unsigned int npeel,
1398 bool unknown_misalignment)
1400 unsigned i;
1401 data_reference *dr;
1403 FOR_EACH_VEC_ELT (datarefs, i, dr)
1405 gimple *stmt = DR_STMT (dr);
1406 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1407 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1408 continue;
1410 /* For interleaving, only the alignment of the first access
1411 matters. */
1412 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1413 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1414 continue;
1416 /* Strided accesses perform only component accesses, alignment is
1417 irrelevant for them. */
1418 if (STMT_VINFO_STRIDED_P (stmt_info)
1419 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1420 continue;
1422 int save_misalignment;
1423 save_misalignment = DR_MISALIGNMENT (dr);
1424 if (npeel == 0)
1426 else if (unknown_misalignment && dr == dr0)
1427 SET_DR_MISALIGNMENT (dr, 0);
1428 else
1429 vect_update_misalignment_for_peel (dr, dr0, npeel);
1430 vect_get_data_access_cost (dr, inside_cost, outside_cost,
1431 body_cost_vec, prologue_cost_vec);
1432 SET_DR_MISALIGNMENT (dr, save_misalignment);
1436 /* Traverse peeling hash table and calculate cost for each peeling option.
1437 Find the one with the lowest cost. */
1440 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1441 _vect_peel_extended_info *min)
1443 vect_peel_info elem = *slot;
1444 int dummy;
1445 unsigned int inside_cost = 0, outside_cost = 0;
1446 gimple *stmt = DR_STMT (elem->dr);
1447 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1448 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1449 stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1450 epilogue_cost_vec;
1452 prologue_cost_vec.create (2);
1453 body_cost_vec.create (2);
1454 epilogue_cost_vec.create (2);
1456 vect_get_peeling_costs_all_drs (LOOP_VINFO_DATAREFS (loop_vinfo),
1457 elem->dr, &inside_cost, &outside_cost,
1458 &body_cost_vec, &prologue_cost_vec,
1459 elem->npeel, false);
1461 body_cost_vec.release ();
1463 outside_cost += vect_get_known_peeling_cost
1464 (loop_vinfo, elem->npeel, &dummy,
1465 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1466 &prologue_cost_vec, &epilogue_cost_vec);
1468 /* Prologue and epilogue costs are added to the target model later.
1469 These costs depend only on the scalar iteration cost, the
1470 number of peeling iterations finally chosen, and the number of
1471 misaligned statements. So discard the information found here. */
1472 prologue_cost_vec.release ();
1473 epilogue_cost_vec.release ();
1475 if (inside_cost < min->inside_cost
1476 || (inside_cost == min->inside_cost
1477 && outside_cost < min->outside_cost))
1479 min->inside_cost = inside_cost;
1480 min->outside_cost = outside_cost;
1481 min->peel_info.dr = elem->dr;
1482 min->peel_info.npeel = elem->npeel;
1483 min->peel_info.count = elem->count;
1486 return 1;
1490 /* Choose best peeling option by traversing peeling hash table and either
1491 choosing an option with the lowest cost (if cost model is enabled) or the
1492 option that aligns as many accesses as possible. */
1494 static struct _vect_peel_extended_info
1495 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1496 loop_vec_info loop_vinfo)
1498 struct _vect_peel_extended_info res;
1500 res.peel_info.dr = NULL;
1502 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1504 res.inside_cost = INT_MAX;
1505 res.outside_cost = INT_MAX;
1506 peeling_htab->traverse <_vect_peel_extended_info *,
1507 vect_peeling_hash_get_lowest_cost> (&res);
1509 else
1511 res.peel_info.count = 0;
1512 peeling_htab->traverse <_vect_peel_extended_info *,
1513 vect_peeling_hash_get_most_frequent> (&res);
1514 res.inside_cost = 0;
1515 res.outside_cost = 0;
1518 return res;
1521 /* Return true if the new peeling NPEEL is supported. */
1523 static bool
1524 vect_peeling_supportable (loop_vec_info loop_vinfo, struct data_reference *dr0,
1525 unsigned npeel)
1527 unsigned i;
1528 struct data_reference *dr = NULL;
1529 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1530 gimple *stmt;
1531 stmt_vec_info stmt_info;
1532 enum dr_alignment_support supportable_dr_alignment;
1534 /* Ensure that all data refs can be vectorized after the peel. */
1535 FOR_EACH_VEC_ELT (datarefs, i, dr)
1537 int save_misalignment;
1539 if (dr == dr0)
1540 continue;
1542 stmt = DR_STMT (dr);
1543 stmt_info = vinfo_for_stmt (stmt);
1544 /* For interleaving, only the alignment of the first access
1545 matters. */
1546 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1547 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1548 continue;
1550 /* Strided accesses perform only component accesses, alignment is
1551 irrelevant for them. */
1552 if (STMT_VINFO_STRIDED_P (stmt_info)
1553 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1554 continue;
1556 save_misalignment = DR_MISALIGNMENT (dr);
1557 vect_update_misalignment_for_peel (dr, dr0, npeel);
1558 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1559 SET_DR_MISALIGNMENT (dr, save_misalignment);
1561 if (!supportable_dr_alignment)
1562 return false;
1565 return true;
1568 /* Function vect_enhance_data_refs_alignment
1570 This pass will use loop versioning and loop peeling in order to enhance
1571 the alignment of data references in the loop.
1573 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1574 original loop is to be vectorized. Any other loops that are created by
1575 the transformations performed in this pass are not supposed to be
1576 vectorized. This restriction will be relaxed.
1578 This pass will require a cost model to guide it whether to apply peeling
1579 or versioning or a combination of the two. For example, the scheme that
1580 Intel uses when given a loop with several memory accesses, is as follows:
1581 choose one memory access ('p') which alignment you want to force by doing
1582 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1583 other accesses are not necessarily aligned, or (2) use loop versioning to
1584 generate one loop in which all accesses are aligned, and another loop in
1585 which only 'p' is necessarily aligned.
1587 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1588 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1589 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1591 Devising a cost model is the most critical aspect of this work. It will
1592 guide us on which access to peel for, whether to use loop versioning, how
1593 many versions to create, etc. The cost model will probably consist of
1594 generic considerations as well as target specific considerations (on
1595 powerpc for example, misaligned stores are more painful than misaligned
1596 loads).
1598 Here are the general steps involved in alignment enhancements:
1600 -- original loop, before alignment analysis:
1601 for (i=0; i<N; i++){
1602 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1603 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1606 -- After vect_compute_data_refs_alignment:
1607 for (i=0; i<N; i++){
1608 x = q[i]; # DR_MISALIGNMENT(q) = 3
1609 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1612 -- Possibility 1: we do loop versioning:
1613 if (p is aligned) {
1614 for (i=0; i<N; i++){ # loop 1A
1615 x = q[i]; # DR_MISALIGNMENT(q) = 3
1616 p[i] = y; # DR_MISALIGNMENT(p) = 0
1619 else {
1620 for (i=0; i<N; i++){ # loop 1B
1621 x = q[i]; # DR_MISALIGNMENT(q) = 3
1622 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1626 -- Possibility 2: we do loop peeling:
1627 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1628 x = q[i];
1629 p[i] = y;
1631 for (i = 3; i < N; i++){ # loop 2A
1632 x = q[i]; # DR_MISALIGNMENT(q) = 0
1633 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1636 -- Possibility 3: combination of loop peeling and versioning:
1637 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1638 x = q[i];
1639 p[i] = y;
1641 if (p is aligned) {
1642 for (i = 3; i<N; i++){ # loop 3A
1643 x = q[i]; # DR_MISALIGNMENT(q) = 0
1644 p[i] = y; # DR_MISALIGNMENT(p) = 0
1647 else {
1648 for (i = 3; i<N; i++){ # loop 3B
1649 x = q[i]; # DR_MISALIGNMENT(q) = 0
1650 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1654 These loops are later passed to loop_transform to be vectorized. The
1655 vectorizer will use the alignment information to guide the transformation
1656 (whether to generate regular loads/stores, or with special handling for
1657 misalignment). */
1659 bool
1660 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1662 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1663 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1664 enum dr_alignment_support supportable_dr_alignment;
1665 struct data_reference *dr0 = NULL, *first_store = NULL;
1666 struct data_reference *dr;
1667 unsigned int i, j;
1668 bool do_peeling = false;
1669 bool do_versioning = false;
1670 bool stat;
1671 gimple *stmt;
1672 stmt_vec_info stmt_info;
1673 unsigned int npeel = 0;
1674 bool one_misalignment_known = false;
1675 bool one_misalignment_unknown = false;
1676 bool one_dr_unsupportable = false;
1677 struct data_reference *unsupportable_dr = NULL;
1678 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1679 unsigned possible_npeel_number = 1;
1680 tree vectype;
1681 unsigned int mis, same_align_drs_max = 0;
1682 hash_table<peel_info_hasher> peeling_htab (1);
1684 if (dump_enabled_p ())
1685 dump_printf_loc (MSG_NOTE, vect_location,
1686 "=== vect_enhance_data_refs_alignment ===\n");
1688 /* Reset data so we can safely be called multiple times. */
1689 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1690 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1692 /* While cost model enhancements are expected in the future, the high level
1693 view of the code at this time is as follows:
1695 A) If there is a misaligned access then see if peeling to align
1696 this access can make all data references satisfy
1697 vect_supportable_dr_alignment. If so, update data structures
1698 as needed and return true.
1700 B) If peeling wasn't possible and there is a data reference with an
1701 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1702 then see if loop versioning checks can be used to make all data
1703 references satisfy vect_supportable_dr_alignment. If so, update
1704 data structures as needed and return true.
1706 C) If neither peeling nor versioning were successful then return false if
1707 any data reference does not satisfy vect_supportable_dr_alignment.
1709 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1711 Note, Possibility 3 above (which is peeling and versioning together) is not
1712 being done at this time. */
1714 /* (1) Peeling to force alignment. */
1716 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1717 Considerations:
1718 + How many accesses will become aligned due to the peeling
1719 - How many accesses will become unaligned due to the peeling,
1720 and the cost of misaligned accesses.
1721 - The cost of peeling (the extra runtime checks, the increase
1722 in code size). */
1724 FOR_EACH_VEC_ELT (datarefs, i, dr)
1726 stmt = DR_STMT (dr);
1727 stmt_info = vinfo_for_stmt (stmt);
1729 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1730 continue;
1732 /* For interleaving, only the alignment of the first access
1733 matters. */
1734 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1735 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1736 continue;
1738 /* For invariant accesses there is nothing to enhance. */
1739 if (integer_zerop (DR_STEP (dr)))
1740 continue;
1742 /* Strided accesses perform only component accesses, alignment is
1743 irrelevant for them. */
1744 if (STMT_VINFO_STRIDED_P (stmt_info)
1745 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1746 continue;
1748 supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1749 do_peeling = vector_alignment_reachable_p (dr);
1750 if (do_peeling)
1752 if (known_alignment_for_access_p (dr))
1754 unsigned int npeel_tmp = 0;
1755 bool negative = tree_int_cst_compare (DR_STEP (dr),
1756 size_zero_node) < 0;
1758 vectype = STMT_VINFO_VECTYPE (stmt_info);
1759 unsigned int target_align = DR_TARGET_ALIGNMENT (dr);
1760 unsigned int dr_size = vect_get_scalar_dr_size (dr);
1761 mis = (negative ? DR_MISALIGNMENT (dr) : -DR_MISALIGNMENT (dr));
1762 if (DR_MISALIGNMENT (dr) != 0)
1763 npeel_tmp = (mis & (target_align - 1)) / dr_size;
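/* For example, with a 16-byte target alignment, 4-byte elements and
   DR_MISALIGNMENT (dr) == 8 for a forward access, MIS is -8, so
   NPEEL_TMP == ((-8) & 15) / 4 == 2: peeling two scalar iterations
   advances the access by 8 bytes and reaches the alignment boundary.  */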
1765 /* For multiple types, it is possible that the bigger type access
1766 will have more than one peeling option. E.g., a loop with two
1767 types: one of size (vector size / 4), and the other one of
1768 size (vector size / 8). Vectorization factor will be 8. If both
1769 accesses are misaligned by 3, the first one needs one scalar
1770 iteration to be aligned, and the second one needs 5. But the
1771 first one will be aligned also by peeling 5 scalar
1772 iterations, and in that case both accesses will be aligned.
1773 Hence, in addition to the immediate peeling amount, we also want
1774 to try adding a full vector's worth of elements, as long as we
1775 don't exceed the vectorization factor.
1776 We do this automatically for the cost model, since we calculate
1777 the cost of every peeling option. */
1778 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1780 poly_uint64 nscalars = (STMT_SLP_TYPE (stmt_info)
1781 ? vf * DR_GROUP_SIZE (stmt_info) : vf);
1782 possible_npeel_number
1783 = vect_get_num_vectors (nscalars, vectype);
1785 /* NPEEL_TMP is 0 when there is no misalignment, but in that
1786 case we also want to allow peeling NELEMENTS. */
1787 if (DR_MISALIGNMENT (dr) == 0)
1788 possible_npeel_number++;
1791 /* Save info about DR in the hash table. Also include peeling
1792 amounts according to the explanation above. */
1793 for (j = 0; j < possible_npeel_number; j++)
1795 vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1796 dr, npeel_tmp);
1797 npeel_tmp += target_align / dr_size;
1800 one_misalignment_known = true;
1802 else
1804 /* If we don't know any misalignment values, we prefer
1805 peeling for the data-ref that has the maximum number of data-refs
1806 with the same alignment, unless the target prefers to align
1807 stores over loads. */
1808 unsigned same_align_drs
1809 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1810 if (!dr0
1811 || same_align_drs_max < same_align_drs)
1813 same_align_drs_max = same_align_drs;
1814 dr0 = dr;
1816 /* For data-refs with the same number of related
1817 accesses prefer the one where the misalign
1818 computation will be invariant in the outermost loop. */
1819 else if (same_align_drs_max == same_align_drs)
1821 struct loop *ivloop0, *ivloop;
1822 ivloop0 = outermost_invariant_loop_for_expr
1823 (loop, DR_BASE_ADDRESS (dr0));
1824 ivloop = outermost_invariant_loop_for_expr
1825 (loop, DR_BASE_ADDRESS (dr));
1826 if ((ivloop && !ivloop0)
1827 || (ivloop && ivloop0
1828 && flow_loop_nested_p (ivloop, ivloop0)))
1829 dr0 = dr;
1832 one_misalignment_unknown = true;
1834 /* Check for data refs with unsupportable alignment that
1835 can be peeled. */
1836 if (!supportable_dr_alignment)
1838 one_dr_unsupportable = true;
1839 unsupportable_dr = dr;
1842 if (!first_store && DR_IS_WRITE (dr))
1843 first_store = dr;
1846 else
1848 if (!aligned_access_p (dr))
1850 if (dump_enabled_p ())
1851 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1852 "vector alignment may not be reachable\n");
1853 break;
1858 /* Check if we can possibly peel the loop. */
1859 if (!vect_can_advance_ivs_p (loop_vinfo)
1860 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1861 || loop->inner)
1862 do_peeling = false;
1864 struct _vect_peel_extended_info peel_for_known_alignment;
1865 struct _vect_peel_extended_info peel_for_unknown_alignment;
1866 struct _vect_peel_extended_info best_peel;
1868 peel_for_unknown_alignment.inside_cost = INT_MAX;
1869 peel_for_unknown_alignment.outside_cost = INT_MAX;
1870 peel_for_unknown_alignment.peel_info.count = 0;
1872 if (do_peeling
1873 && one_misalignment_unknown)
1875 /* Check whether the target prefers stores over loads, i.e., whether
1876 misaligned stores are more expensive than misaligned loads (taking
1877 drs with the same alignment into account). */
1878 unsigned int load_inside_cost = 0;
1879 unsigned int load_outside_cost = 0;
1880 unsigned int store_inside_cost = 0;
1881 unsigned int store_outside_cost = 0;
1882 unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;
1884 stmt_vector_for_cost dummy;
1885 dummy.create (2);
1886 vect_get_peeling_costs_all_drs (datarefs, dr0,
1887 &load_inside_cost,
1888 &load_outside_cost,
1889 &dummy, &dummy, estimated_npeels, true);
1890 dummy.release ();
1892 if (first_store)
1894 dummy.create (2);
1895 vect_get_peeling_costs_all_drs (datarefs, first_store,
1896 &store_inside_cost,
1897 &store_outside_cost,
1898 &dummy, &dummy,
1899 estimated_npeels, true);
1900 dummy.release ();
1902 else
1904 store_inside_cost = INT_MAX;
1905 store_outside_cost = INT_MAX;
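/* If peeling for the first store is cheaper than peeling for DR0
   (ties broken on the outside cost), peel for the store instead.  */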
1908 if (load_inside_cost > store_inside_cost
1909 || (load_inside_cost == store_inside_cost
1910 && load_outside_cost > store_outside_cost))
1912 dr0 = first_store;
1913 peel_for_unknown_alignment.inside_cost = store_inside_cost;
1914 peel_for_unknown_alignment.outside_cost = store_outside_cost;
1916 else
1918 peel_for_unknown_alignment.inside_cost = load_inside_cost;
1919 peel_for_unknown_alignment.outside_cost = load_outside_cost;
1922 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1923 prologue_cost_vec.create (2);
1924 epilogue_cost_vec.create (2);
1926 int dummy2;
1927 peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
1928 (loop_vinfo, estimated_npeels, &dummy2,
1929 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1930 &prologue_cost_vec, &epilogue_cost_vec);
1932 prologue_cost_vec.release ();
1933 epilogue_cost_vec.release ();
1935 peel_for_unknown_alignment.peel_info.count = 1
1936 + STMT_VINFO_SAME_ALIGN_REFS
1937 (vinfo_for_stmt (DR_STMT (dr0))).length ();
1940 peel_for_unknown_alignment.peel_info.npeel = 0;
1941 peel_for_unknown_alignment.peel_info.dr = dr0;
1943 best_peel = peel_for_unknown_alignment;
1945 peel_for_known_alignment.inside_cost = INT_MAX;
1946 peel_for_known_alignment.outside_cost = INT_MAX;
1947 peel_for_known_alignment.peel_info.count = 0;
1948 peel_for_known_alignment.peel_info.dr = NULL;
1950 if (do_peeling && one_misalignment_known)
1952 /* Peeling is possible, and every data access can be supported even
1953 when misaligned. So we try to choose the best possible peeling from
1954 the hash table. */
1955 peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
1956 (&peeling_htab, loop_vinfo);
1959 /* Compare costs of peeling for known and unknown alignment. */
1960 if (peel_for_known_alignment.peel_info.dr != NULL
1961 && peel_for_unknown_alignment.inside_cost
1962 >= peel_for_known_alignment.inside_cost)
1964 best_peel = peel_for_known_alignment;
1966 /* If the best peeling for known alignment has NPEEL == 0, perform no
1967 peeling at all except if there is an unsupportable dr that we can
1968 align. */
1969 if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
1970 do_peeling = false;
1973 /* If there is an unsupportable data ref, prefer it over all choices so far,
1974 since we would have to discard any chosen peeling unless it happened to
1975 align the unsupportable data ref as well. */
1976 if (one_dr_unsupportable)
1977 dr0 = unsupportable_dr;
1978 else if (do_peeling)
1980 /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
1981 TODO: Use nopeel_outside_cost or get rid of it? */
1982 unsigned nopeel_inside_cost = 0;
1983 unsigned nopeel_outside_cost = 0;
1985 stmt_vector_for_cost dummy;
1986 dummy.create (2);
1987 vect_get_peeling_costs_all_drs (datarefs, NULL, &nopeel_inside_cost,
1988 &nopeel_outside_cost, &dummy, &dummy,
1989 0, false);
1990 dummy.release ();
1992 /* Add epilogue costs. As we do not peel for alignment here, no prologue
1993 costs will be recorded. */
1994 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1995 prologue_cost_vec.create (2);
1996 epilogue_cost_vec.create (2);
1998 int dummy2;
1999 nopeel_outside_cost += vect_get_known_peeling_cost
2000 (loop_vinfo, 0, &dummy2,
2001 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2002 &prologue_cost_vec, &epilogue_cost_vec);
2004 prologue_cost_vec.release ();
2005 epilogue_cost_vec.release ();
2007 npeel = best_peel.peel_info.npeel;
2008 dr0 = best_peel.peel_info.dr;
2010 /* If not peeling at all is no more expensive than the best peeling
2011 found so far, don't perform any peeling. */
2012 if (nopeel_inside_cost <= best_peel.inside_cost)
2013 do_peeling = false;
2016 if (do_peeling)
2018 stmt = DR_STMT (dr0);
2019 stmt_info = vinfo_for_stmt (stmt);
2020 vectype = STMT_VINFO_VECTYPE (stmt_info);
2022 if (known_alignment_for_access_p (dr0))
2024 bool negative = tree_int_cst_compare (DR_STEP (dr0),
2025 size_zero_node) < 0;
2026 if (!npeel)
2028 /* Since it's known at compile time, compute the number of
2029 iterations in the peeled loop (the peeling factor) for use in
2030 updating DR_MISALIGNMENT values. The peeling factor is the
2031 vectorization factor minus the misalignment as an element
2032 count. */
2033 mis = negative ? DR_MISALIGNMENT (dr0) : -DR_MISALIGNMENT (dr0);
2034 unsigned int target_align = DR_TARGET_ALIGNMENT (dr0);
2035 npeel = ((mis & (target_align - 1))
2036 / vect_get_scalar_dr_size (dr0));
2039 /* For interleaved data access every iteration accesses all the
2040 members of the group, therefore we divide the number of iterations
2041 by the group size. */
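/* For example, with a group of size 4 each scalar iteration accesses all
   four members, so advancing by NPEEL elements takes only NPEEL / 4
   peeled iterations.  */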
2042 stmt_info = vinfo_for_stmt (DR_STMT (dr0));
2043 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2044 npeel /= DR_GROUP_SIZE (stmt_info);
2046 if (dump_enabled_p ())
2047 dump_printf_loc (MSG_NOTE, vect_location,
2048 "Try peeling by %d\n", npeel);
2051 /* Ensure that all datarefs can be vectorized after the peel. */
2052 if (!vect_peeling_supportable (loop_vinfo, dr0, npeel))
2053 do_peeling = false;
2055 /* Check if all datarefs are supportable and log. */
2056 if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
2058 stat = vect_verify_datarefs_alignment (loop_vinfo);
2059 if (!stat)
2060 do_peeling = false;
2061 else
2062 return stat;
2065 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
2066 if (do_peeling)
2068 unsigned max_allowed_peel
2069 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
2070 if (max_allowed_peel != (unsigned)-1)
2072 unsigned max_peel = npeel;
2073 if (max_peel == 0)
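/* NPEEL is zero here because the misalignment is only known at run
   time; the prologue may then peel anything up to one element short
   of a full target alignment's worth, so use that as the worst case.  */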
2075 unsigned int target_align = DR_TARGET_ALIGNMENT (dr0);
2076 max_peel = target_align / vect_get_scalar_dr_size (dr0) - 1;
2078 if (max_peel > max_allowed_peel)
2080 do_peeling = false;
2081 if (dump_enabled_p ())
2082 dump_printf_loc (MSG_NOTE, vect_location,
2083 "Disable peeling, max peels reached: %d\n", max_peel);
2088 /* Cost model #2 - if peeling may result in a remaining loop not
2089 iterating enough to be vectorized then do not peel. Since this
2090 is a cost heuristic rather than a correctness decision, use the
2091 most likely runtime value for variable vectorization factors. */
2092 if (do_peeling
2093 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2095 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2096 unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
2097 if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
2098 < assumed_vf + max_peel)
2099 do_peeling = false;
2102 if (do_peeling)
2104 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2105 If the misalignment of DR_i is identical to that of dr0 then set
2106 DR_MISALIGNMENT (DR_i) to zero. If the misalignments of DR_i and
2107 dr0 are known at compile time, then increment DR_MISALIGNMENT (DR_i)
2108 by the peeling factor times the element size of DR_i (MOD the
2109 vectorization factor times the size). Otherwise, the
2110 misalignment of DR_i must be set to unknown. */
2111 FOR_EACH_VEC_ELT (datarefs, i, dr)
2112 if (dr != dr0)
2114 /* Strided accesses perform only component accesses, alignment
2115 is irrelevant for them. */
2116 stmt_info = vinfo_for_stmt (DR_STMT (dr));
2117 if (STMT_VINFO_STRIDED_P (stmt_info)
2118 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2119 continue;
2121 vect_update_misalignment_for_peel (dr, dr0, npeel);
2124 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
2125 if (npeel)
2126 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2127 else
2128 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2129 = DR_MISALIGNMENT (dr0);
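/* With NPEEL zero the misalignment of DR0 is unknown, so the value
   stored here is negative (DR_MISALIGNMENT_UNKNOWN); the peeling code
   takes that to mean the prologue iteration count must be computed at
   run time.  */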
2130 SET_DR_MISALIGNMENT (dr0, 0);
2131 if (dump_enabled_p ())
2133 dump_printf_loc (MSG_NOTE, vect_location,
2134 "Alignment of access forced using peeling.\n");
2135 dump_printf_loc (MSG_NOTE, vect_location,
2136 "Peeling for alignment will be applied.\n");
2139 /* The inside-loop cost will be accounted for in vectorizable_load
2140 and vectorizable_store correctly with adjusted alignments.
2141 Drop the body_cst_vec on the floor here. */
2142 stat = vect_verify_datarefs_alignment (loop_vinfo);
2143 gcc_assert (stat);
2144 return stat;
2148 /* (2) Versioning to force alignment. */
2150 /* Try versioning if:
2151 1) optimize loop for speed
2152 2) there is at least one unsupported misaligned data ref with an unknown
2153 misalignment, and
2154 3) all misaligned data refs with a known misalignment are supported, and
2155 4) the number of runtime alignment checks is within reason. */
2157 do_versioning =
2158 optimize_loop_nest_for_speed_p (loop)
2159 && (!loop->inner); /* FORNOW */
2161 if (do_versioning)
2163 FOR_EACH_VEC_ELT (datarefs, i, dr)
2165 stmt = DR_STMT (dr);
2166 stmt_info = vinfo_for_stmt (stmt);
2168 /* For interleaving, only the alignment of the first access
2169 matters. */
2170 if (aligned_access_p (dr)
2171 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2172 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt))
2173 continue;
2175 if (STMT_VINFO_STRIDED_P (stmt_info))
2177 /* Strided loads perform only component accesses, alignment is
2178 irrelevant for them. */
2179 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
2180 continue;
2181 do_versioning = false;
2182 break;
2185 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
2187 if (!supportable_dr_alignment)
2189 gimple *stmt;
2190 int mask;
2191 tree vectype;
2193 if (known_alignment_for_access_p (dr)
2194 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2195 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
2197 do_versioning = false;
2198 break;
2201 stmt = DR_STMT (dr);
2202 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
2203 gcc_assert (vectype);
2205 /* At present we don't support versioning for alignment
2206 with variable VF, since there's no guarantee that the
2207 VF is a power of two. We could relax this if we added
2208 a way of enforcing a power-of-two size. */
2209 unsigned HOST_WIDE_INT size;
2210 if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
2212 do_versioning = false;
2213 break;
2216 /* The rightmost bits of an aligned address must be zeros.
2217 Construct the mask needed for this test. For example,
2218 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2219 mask must be 15 = 0xf. */
2220 mask = size - 1;
2222 /* FORNOW: use the same mask to test all potentially unaligned
2223 references in the loop. The vectorizer currently supports
2224 a single vector size, see the reference to
2225 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
2226 vectorization factor is computed. */
2227 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
2228 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
2229 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2230 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
2231 DR_STMT (dr));
2235 /* Versioning requires at least one misaligned data reference. */
2236 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2237 do_versioning = false;
2238 else if (!do_versioning)
2239 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2242 if (do_versioning)
2244 vec<gimple *> may_misalign_stmts
2245 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2246 gimple *stmt;
2248 /* It can now be assumed that the data references in the statements
2249 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2250 of the loop being vectorized. */
2251 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
2253 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2254 dr = STMT_VINFO_DATA_REF (stmt_info);
2255 SET_DR_MISALIGNMENT (dr, 0);
2256 if (dump_enabled_p ())
2257 dump_printf_loc (MSG_NOTE, vect_location,
2258 "Alignment of access forced using versioning.\n");
2261 if (dump_enabled_p ())
2262 dump_printf_loc (MSG_NOTE, vect_location,
2263 "Versioning for alignment will be applied.\n");
2265 /* Peeling and versioning can't be done together at this time. */
2266 gcc_assert (! (do_peeling && do_versioning));
2268 stat = vect_verify_datarefs_alignment (loop_vinfo);
2269 gcc_assert (stat);
2270 return stat;
2273 /* This point is reached if neither peeling nor versioning is being done. */
2274 gcc_assert (! (do_peeling || do_versioning));
2276 stat = vect_verify_datarefs_alignment (loop_vinfo);
2277 return stat;
2281 /* Function vect_find_same_alignment_drs.
2283 Update group and alignment relations according to the chosen
2284 vectorization factor. */
2286 static void
2287 vect_find_same_alignment_drs (struct data_dependence_relation *ddr)
2289 struct data_reference *dra = DDR_A (ddr);
2290 struct data_reference *drb = DDR_B (ddr);
2291 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2292 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2294 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
2295 return;
2297 if (dra == drb)
2298 return;
2300 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)
2301 || !operand_equal_p (DR_OFFSET (dra), DR_OFFSET (drb), 0)
2302 || !operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2303 return;
2305 /* Two references with distance zero have the same alignment. */
2306 poly_offset_int diff = (wi::to_poly_offset (DR_INIT (dra))
2307 - wi::to_poly_offset (DR_INIT (drb)));
2308 if (maybe_ne (diff, 0))
2310 /* Get the wider of the two alignments. */
2311 unsigned int align_a = (vect_calculate_target_alignment (dra)
2312 / BITS_PER_UNIT);
2313 unsigned int align_b = (vect_calculate_target_alignment (drb)
2314 / BITS_PER_UNIT);
2315 unsigned int max_align = MAX (align_a, align_b);
2317 /* Require the gap to be a multiple of the larger vector alignment. */
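/* For example, with 16-byte vector alignment two accesses 32 bytes apart
   necessarily share their misalignment, while accesses 20 bytes apart
   do not.  */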
2318 if (!multiple_p (diff, max_align))
2319 return;
2322 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2323 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
2324 if (dump_enabled_p ())
2326 dump_printf_loc (MSG_NOTE, vect_location,
2327 "accesses have the same alignment: ");
2328 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2329 dump_printf (MSG_NOTE, " and ");
2330 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2331 dump_printf (MSG_NOTE, "\n");
2336 /* Function vect_analyze_data_refs_alignment
2338 Analyze the alignment of the data-references in the loop.
2339 Return FALSE if a data reference is found that cannot be vectorized. */
2341 bool
2342 vect_analyze_data_refs_alignment (loop_vec_info vinfo)
2344 if (dump_enabled_p ())
2345 dump_printf_loc (MSG_NOTE, vect_location,
2346 "=== vect_analyze_data_refs_alignment ===\n");
2348 /* Mark groups of data references with same alignment using
2349 data dependence information. */
2350 vec<ddr_p> ddrs = vinfo->ddrs;
2351 struct data_dependence_relation *ddr;
2352 unsigned int i;
2354 FOR_EACH_VEC_ELT (ddrs, i, ddr)
2355 vect_find_same_alignment_drs (ddr);
2357 vec<data_reference_p> datarefs = vinfo->datarefs;
2358 struct data_reference *dr;
2360 vect_record_base_alignments (vinfo);
2361 FOR_EACH_VEC_ELT (datarefs, i, dr)
2363 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
2364 if (STMT_VINFO_VECTORIZABLE (stmt_info)
2365 && !vect_compute_data_ref_alignment (dr))
2367 /* Strided accesses perform only component accesses, misalignment
2368 information is irrelevant for them. */
2369 if (STMT_VINFO_STRIDED_P (stmt_info)
2370 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2371 continue;
2373 if (dump_enabled_p ())
2374 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2375 "not vectorized: can't calculate alignment "
2376 "for data ref.\n");
2378 return false;
2382 return true;
2386 /* Analyze alignment of DRs of stmts in NODE. */
2388 static bool
2389 vect_slp_analyze_and_verify_node_alignment (slp_tree node)
2391 /* We vectorize from the first scalar stmt in the node unless
2392 the node is permuted in which case we start from the first
2393 element in the group. */
2394 gimple *first_stmt = SLP_TREE_SCALAR_STMTS (node)[0];
2395 data_reference_p first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2396 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2397 first_stmt = DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_stmt));
2399 data_reference_p dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2400 if (! vect_compute_data_ref_alignment (dr)
2401 /* For creating the data-ref pointer we need alignment of the
2402 first element anyway. */
2403 || (dr != first_dr
2404 && ! vect_compute_data_ref_alignment (first_dr))
2405 || ! verify_data_ref_alignment (dr))
2407 if (dump_enabled_p ())
2408 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2409 "not vectorized: bad data alignment in basic "
2410 "block.\n");
2411 return false;
2414 return true;
2417 /* Function vect_slp_analyze_instance_alignment
2419 Analyze the alignment of the data-references in the SLP instance.
2420 Return FALSE if a data reference is found that cannot be vectorized. */
2422 bool
2423 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
2425 if (dump_enabled_p ())
2426 dump_printf_loc (MSG_NOTE, vect_location,
2427 "=== vect_slp_analyze_and_verify_instance_alignment ===\n");
2429 slp_tree node;
2430 unsigned i;
2431 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2432 if (! vect_slp_analyze_and_verify_node_alignment (node))
2433 return false;
2435 node = SLP_INSTANCE_TREE (instance);
2436 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]))
2437 && ! vect_slp_analyze_and_verify_node_alignment
2438 (SLP_INSTANCE_TREE (instance)))
2439 return false;
2441 return true;
2445 /* Analyze groups of accesses: check that DR belongs to a group of
2446 accesses of legal size, step, etc. Detect gaps, single element
2447 interleaving, and other special cases. Set grouped access info.
2448 Collect groups of strided stores for further use in SLP analysis.
2449 Worker for vect_analyze_group_access. */
2451 static bool
2452 vect_analyze_group_access_1 (struct data_reference *dr)
2454 tree step = DR_STEP (dr);
2455 tree scalar_type = TREE_TYPE (DR_REF (dr));
2456 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2457 gimple *stmt = DR_STMT (dr);
2458 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2459 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2460 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2461 HOST_WIDE_INT dr_step = -1;
2462 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2463 bool slp_impossible = false;
2465 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2466 size of the interleaving group (including gaps). */
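/* For example, with 4-byte elements and DR_STEP == 16 bytes, GROUPSIZE
   is 4, whether or not all four slots are actually accessed in the loop.  */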
2467 if (tree_fits_shwi_p (step))
2469 dr_step = tree_to_shwi (step);
2470 /* Check that STEP is a multiple of type size. Otherwise there is
2471 a non-element-sized gap at the end of the group which we
2472 cannot represent in DR_GROUP_GAP or DR_GROUP_SIZE.
2473 ??? As we can handle non-constant step fine here we should
2474 simply remove uses of DR_GROUP_GAP between the last and first
2475 element and instead rely on DR_STEP. DR_GROUP_SIZE then would
2476 simply not include that gap. */
2477 if ((dr_step % type_size) != 0)
2479 if (dump_enabled_p ())
2481 dump_printf_loc (MSG_NOTE, vect_location,
2482 "Step ");
2483 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2484 dump_printf (MSG_NOTE,
2485 " is not a multiple of the element size for ");
2486 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2487 dump_printf (MSG_NOTE, "\n");
2489 return false;
2491 groupsize = absu_hwi (dr_step) / type_size;
2493 else
2494 groupsize = 0;
2496 /* A non-consecutive access is possible only if it is part of an interleaving group. */
2497 if (!DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2499 /* Check whether this DR is part of an interleaving group in which only
2500 a single element of the group is accessed in the loop. */
2502 /* Gaps are supported only for loads. STEP must be a multiple of the type
2503 size. */
2504 if (DR_IS_READ (dr)
2505 && (dr_step % type_size) == 0
2506 && groupsize > 0)
2508 DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2509 DR_GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2510 DR_GROUP_GAP (stmt_info) = groupsize - 1;
2511 if (dump_enabled_p ())
2513 dump_printf_loc (MSG_NOTE, vect_location,
2514 "Detected single element interleaving ");
2515 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2516 dump_printf (MSG_NOTE, " step ");
2517 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2518 dump_printf (MSG_NOTE, "\n");
2521 return true;
2524 if (dump_enabled_p ())
2526 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2527 "not consecutive access ");
2528 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2531 if (bb_vinfo)
2533 /* Mark the statement as unvectorizable. */
2534 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2535 return true;
2538 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2539 STMT_VINFO_STRIDED_P (stmt_info) = true;
2540 return true;
2543 if (DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2545 /* First stmt in the interleaving chain. Check the chain. */
2546 gimple *next = DR_GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2547 struct data_reference *data_ref = dr;
2548 unsigned int count = 1;
2549 tree prev_init = DR_INIT (data_ref);
2550 gimple *prev = stmt;
2551 HOST_WIDE_INT diff, gaps = 0;
2553 /* By construction, all group members have INTEGER_CST DR_INITs. */
2554 while (next)
2556 /* Skip same data-refs. In case two or more stmts share a
2557 data-ref (supported only for loads), we vectorize only the first
2558 stmt, and the rest get their vectorized loads from the first
2559 one. */
2560 if (!tree_int_cst_compare (DR_INIT (data_ref),
2561 DR_INIT (STMT_VINFO_DATA_REF (
2562 vinfo_for_stmt (next)))))
2564 if (DR_IS_WRITE (data_ref))
2566 if (dump_enabled_p ())
2567 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2568 "Two store stmts share the same dr.\n");
2569 return false;
2572 if (dump_enabled_p ())
2573 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2574 "Two or more load stmts share the same dr.\n");
2576 /* For load use the same data-ref load. */
2577 DR_GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2579 prev = next;
2580 next = DR_GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2581 continue;
2584 prev = next;
2585 data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2587 /* All group members have the same STEP by construction. */
2588 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2590 /* Check that the distance between two accesses is equal to the type
2591 size. Otherwise, we have gaps. */
2592 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2593 - TREE_INT_CST_LOW (prev_init)) / type_size;
2594 if (diff != 1)
2596 /* FORNOW: SLP of accesses with gaps is not supported. */
2597 slp_impossible = true;
2598 if (DR_IS_WRITE (data_ref))
2600 if (dump_enabled_p ())
2601 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2602 "interleaved store with gaps\n");
2603 return false;
2606 gaps += diff - 1;
2609 last_accessed_element += diff;
2611 /* Store the gap from the previous member of the group. If there is no
2612 gap in the access, DR_GROUP_GAP is always 1. */
2613 DR_GROUP_GAP (vinfo_for_stmt (next)) = diff;
2615 prev_init = DR_INIT (data_ref);
2616 next = DR_GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2617 /* Count the number of data-refs in the chain. */
2618 count++;
2621 if (groupsize == 0)
2622 groupsize = count + gaps;
2624 /* This could be UINT_MAX but as we are generating code in a very
2625 inefficient way we have to cap earlier. See PR78699 for example. */
2626 if (groupsize > 4096)
2628 if (dump_enabled_p ())
2629 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2630 "group is too large\n");
2631 return false;
2634 /* Check that the size of the interleaving group is equal to count for
2635 stores, i.e., that there are no gaps. */
2636 if (groupsize != count
2637 && !DR_IS_READ (dr))
2639 if (dump_enabled_p ())
2640 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2641 "interleaved store with gaps\n");
2642 return false;
2645 /* If there is a gap after the last load in the group it is the
2646 difference between the groupsize and the last accessed
2647 element.
2648 When there is no gap, this difference should be 0. */
2649 DR_GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - last_accessed_element;
2651 DR_GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2652 if (dump_enabled_p ())
2654 dump_printf_loc (MSG_NOTE, vect_location,
2655 "Detected interleaving ");
2656 if (DR_IS_READ (dr))
2657 dump_printf (MSG_NOTE, "load ");
2658 else
2659 dump_printf (MSG_NOTE, "store ");
2660 dump_printf (MSG_NOTE, "of size %u starting with ",
2661 (unsigned)groupsize);
2662 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
2663 if (DR_GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
2664 dump_printf_loc (MSG_NOTE, vect_location,
2665 "There is a gap of %u elements after the group\n",
2666 DR_GROUP_GAP (vinfo_for_stmt (stmt)));
2669 /* SLP: create an SLP data structure for every interleaving group of
2670 stores for further analysis in vect_analyse_slp. */
2671 if (DR_IS_WRITE (dr) && !slp_impossible)
2673 if (loop_vinfo)
2674 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2675 if (bb_vinfo)
2676 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2680 return true;
2683 /* Analyze groups of accesses: check that DR belongs to a group of
2684 accesses of legal size, step, etc. Detect gaps, single element
2685 interleaving, and other special cases. Set grouped access info.
2686 Collect groups of strided stores for further use in SLP analysis. */
2688 static bool
2689 vect_analyze_group_access (struct data_reference *dr)
2691 if (!vect_analyze_group_access_1 (dr))
2693 /* Dissolve the group if present. */
2694 gimple *next;
2695 gimple *stmt = DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dr)));
2696 while (stmt)
2698 stmt_vec_info vinfo = vinfo_for_stmt (stmt);
2699 next = DR_GROUP_NEXT_ELEMENT (vinfo);
2700 DR_GROUP_FIRST_ELEMENT (vinfo) = NULL;
2701 DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2702 stmt = next;
2704 return false;
2706 return true;
2709 /* Analyze the access pattern of the data-reference DR.
2710 In case of non-consecutive accesses call vect_analyze_group_access() to
2711 analyze groups of accesses. */
2713 static bool
2714 vect_analyze_data_ref_access (struct data_reference *dr)
2716 tree step = DR_STEP (dr);
2717 tree scalar_type = TREE_TYPE (DR_REF (dr));
2718 gimple *stmt = DR_STMT (dr);
2719 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2720 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2721 struct loop *loop = NULL;
2723 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
2724 return true;
2726 if (loop_vinfo)
2727 loop = LOOP_VINFO_LOOP (loop_vinfo);
2729 if (loop_vinfo && !step)
2731 if (dump_enabled_p ())
2732 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2733 "bad data-ref access in loop\n");
2734 return false;
2737 /* Allow loads with zero step in inner-loop vectorization. */
2738 if (loop_vinfo && integer_zerop (step))
2740 DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2741 if (!nested_in_vect_loop_p (loop, stmt))
2742 return DR_IS_READ (dr);
2743 /* Allow references with zero step for outer loops marked
2744 with pragma omp simd only - it guarantees absence of
2745 loop-carried dependencies between inner loop iterations. */
2746 if (loop->safelen < 2)
2748 if (dump_enabled_p ())
2749 dump_printf_loc (MSG_NOTE, vect_location,
2750 "zero step in inner loop of nest\n");
2751 return false;
2755 if (loop && nested_in_vect_loop_p (loop, stmt))
2757 /* Interleaved accesses are not yet supported within outer-loop
2758 vectorization for references in the inner-loop. */
2759 DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2761 /* For the rest of the analysis we use the outer-loop step. */
2762 step = STMT_VINFO_DR_STEP (stmt_info);
2763 if (integer_zerop (step))
2765 if (dump_enabled_p ())
2766 dump_printf_loc (MSG_NOTE, vect_location,
2767 "zero step in outer loop.\n");
2768 return DR_IS_READ (dr);
2772 /* Consecutive? */
2773 if (TREE_CODE (step) == INTEGER_CST)
2775 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2776 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2777 || (dr_step < 0
2778 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2780 /* Mark that it is not interleaving. */
2781 DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2782 return true;
2786 if (loop && nested_in_vect_loop_p (loop, stmt))
2788 if (dump_enabled_p ())
2789 dump_printf_loc (MSG_NOTE, vect_location,
2790 "grouped access in outer loop.\n");
2791 return false;
2795 /* Assume this is a DR handled by non-constant strided load case. */
2796 if (TREE_CODE (step) != INTEGER_CST)
2797 return (STMT_VINFO_STRIDED_P (stmt_info)
2798 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2799 || vect_analyze_group_access (dr)));
2801 /* Not consecutive access - check if it's a part of interleaving group. */
2802 return vect_analyze_group_access (dr);
2805 /* Compare two data-references DRA and DRB to group them into chunks
2806 suitable for grouping. */
2808 static int
2809 dr_group_sort_cmp (const void *dra_, const void *drb_)
2811 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2812 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2813 int cmp;
2815 /* Stabilize sort. */
2816 if (dra == drb)
2817 return 0;
2819 /* DRs in different loops never belong to the same group. */
2820 loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father;
2821 loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father;
2822 if (loopa != loopb)
2823 return loopa->num < loopb->num ? -1 : 1;
2825 /* Ordering of DRs according to base. */
2826 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2827 DR_BASE_ADDRESS (drb));
2828 if (cmp != 0)
2829 return cmp;
2831 /* And according to DR_OFFSET. */
2832 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2833 if (cmp != 0)
2834 return cmp;
2836 /* Put reads before writes. */
2837 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2838 return DR_IS_READ (dra) ? -1 : 1;
2840 /* Then sort by access size. */
2841 cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2842 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2843 if (cmp != 0)
2844 return cmp;
2846 /* And by step. */
2847 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2848 if (cmp != 0)
2849 return cmp;
2851 /* Then sort by DR_INIT. In case of identical DRs sort by stmt UID. */
2852 cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
2853 if (cmp == 0)
2854 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2855 return cmp;
2858 /* If OP is the result of a conversion, return the unconverted value,
2859 otherwise return null. */
2861 static tree
2862 strip_conversion (tree op)
2864 if (TREE_CODE (op) != SSA_NAME)
2865 return NULL_TREE;
2866 gimple *stmt = SSA_NAME_DEF_STMT (op);
2867 if (!is_gimple_assign (stmt)
2868 || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt)))
2869 return NULL_TREE;
2870 return gimple_assign_rhs1 (stmt);
2873 /* Return true if vectorizable_* routines can handle statements STMT1
2874 and STMT2 being in a single group. */
2876 static bool
2877 can_group_stmts_p (gimple *stmt1, gimple *stmt2)
2879 if (gimple_assign_single_p (stmt1))
2880 return gimple_assign_single_p (stmt2);
2882 if (is_gimple_call (stmt1) && gimple_call_internal_p (stmt1))
2884 /* Check for two masked loads or two masked stores. */
2885 if (!is_gimple_call (stmt2) || !gimple_call_internal_p (stmt2))
2886 return false;
2887 internal_fn ifn = gimple_call_internal_fn (stmt1);
2888 if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE)
2889 return false;
2890 if (ifn != gimple_call_internal_fn (stmt2))
2891 return false;
2893 /* Check that the masks are the same. Cope with casts of masks,
2894 like those created by build_mask_conversion. */
2895 tree mask1 = gimple_call_arg (stmt1, 2);
2896 tree mask2 = gimple_call_arg (stmt2, 2);
2897 if (!operand_equal_p (mask1, mask2, 0))
2899 mask1 = strip_conversion (mask1);
2900 if (!mask1)
2901 return false;
2902 mask2 = strip_conversion (mask2);
2903 if (!mask2)
2904 return false;
2905 if (!operand_equal_p (mask1, mask2, 0))
2906 return false;
2908 return true;
2911 return false;
2914 /* Function vect_analyze_data_ref_accesses.
2916 Analyze the access pattern of all the data references in the loop.
2918 FORNOW: the only access pattern that is considered vectorizable is a
2919 simple step 1 (consecutive) access.
2921 FORNOW: handle only arrays and pointer accesses. */
2923 bool
2924 vect_analyze_data_ref_accesses (vec_info *vinfo)
2926 unsigned int i;
2927 vec<data_reference_p> datarefs = vinfo->datarefs;
2928 struct data_reference *dr;
2930 if (dump_enabled_p ())
2931 dump_printf_loc (MSG_NOTE, vect_location,
2932 "=== vect_analyze_data_ref_accesses ===\n");
2934 if (datarefs.is_empty ())
2935 return true;
2937 /* Sort the array of datarefs to make building the interleaving chains
2938 linear. Don't modify the original vector's order; it is needed for
2939 determining which dependencies are reversed. */
2940 vec<data_reference_p> datarefs_copy = datarefs.copy ();
2941 datarefs_copy.qsort (dr_group_sort_cmp);
2943 /* Build the interleaving chains. */
2944 for (i = 0; i < datarefs_copy.length () - 1;)
2946 data_reference_p dra = datarefs_copy[i];
2947 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2948 stmt_vec_info lastinfo = NULL;
2949 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
2950 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a))
2952 ++i;
2953 continue;
2955 for (i = i + 1; i < datarefs_copy.length (); ++i)
2957 data_reference_p drb = datarefs_copy[i];
2958 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2959 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b)
2960 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
2961 break;
2963 /* ??? Imperfect sorting (non-compatible types, non-modulo
2964 accesses, same accesses) can lead to a group being artificially
2965 split here, as we don't just skip over those. If it really
2966 matters we can push those to a worklist and re-iterate
2967 over them. Then we can just skip ahead to the next DR here. */
2969 /* DRs in a different loop should not be put into the same
2970 interleaving group. */
2971 if (gimple_bb (DR_STMT (dra))->loop_father
2972 != gimple_bb (DR_STMT (drb))->loop_father)
2973 break;
2975 /* Check that the data-refs have the same first location (except init)
2976 and that they are both either stores or loads (not a load and a store,
2977 and not accesses that can_group_stmts_p cannot put in one group). */
2978 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2979 || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2980 DR_BASE_ADDRESS (drb)) != 0
2981 || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
2982 || !can_group_stmts_p (DR_STMT (dra), DR_STMT (drb)))
2983 break;
2985 /* Check that the data-refs have the same constant size. */
2986 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2987 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2988 if (!tree_fits_uhwi_p (sza)
2989 || !tree_fits_uhwi_p (szb)
2990 || !tree_int_cst_equal (sza, szb))
2991 break;
2993 /* Check that the data-refs have the same step. */
2994 if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
2995 break;
2997 /* Check the types are compatible.
2998 ??? We don't distinguish this during sorting. */
2999 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
3000 TREE_TYPE (DR_REF (drb))))
3001 break;
3003 /* Check that the DR_INITs are compile-time constants. */
3004 if (TREE_CODE (DR_INIT (dra)) != INTEGER_CST
3005 || TREE_CODE (DR_INIT (drb)) != INTEGER_CST)
3006 break;
3008 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
3009 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
3010 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
3011 HOST_WIDE_INT init_prev
3012 = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]));
3013 gcc_assert (init_a <= init_b
3014 && init_a <= init_prev
3015 && init_prev <= init_b);
3017 /* Do not place the same access in the interleaving chain twice. */
3018 if (init_b == init_prev)
3020 gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]))
3021 < gimple_uid (DR_STMT (drb)));
3022 /* ??? For now we simply "drop" the later reference which is
3023 otherwise the same rather than finishing off this group.
3024 In the end we'd want to re-process duplicates forming
3025 multiple groups from the refs, likely by just collecting
3026 all candidates (including duplicates and split points
3027 below) in a vector and then processing them together. */
3028 continue;
3031 /* If init_b == init_a + the size of the type * k, we have an
3032 interleaving, and DRA is accessed before DRB. */
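/* For example, 4-byte accesses with DR_INITs 0 and 8 can form a group
   (k == 2), while DR_INITs 0 and 6 cannot.  */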
3033 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
3034 if (type_size_a == 0
3035 || (init_b - init_a) % type_size_a != 0)
3036 break;
3038 /* If we have a store, the accesses are adjacent. This splits
3039 groups into chunks we support (we don't support vectorization
3040 of stores with gaps). */
3041 if (!DR_IS_READ (dra) && init_b - init_prev != type_size_a)
3042 break;
3044 /* Require the step (when constant and nonzero) to be greater than
3045 the difference between the data-refs' inits; this splits groups
3046 into suitable sizes. */
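/* For example, with DR_STEP == 8 and 4-byte elements, accesses at
   DR_INITs 0 and 4 stay in one group, but an access at DR_INIT 8 is
   already covered by the next iteration of the first one and must
   start a new group.  */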
3047 if (tree_fits_shwi_p (DR_STEP (dra)))
3049 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
3050 if (step != 0 && step <= (init_b - init_a))
3051 break;
3054 if (dump_enabled_p ())
3056 dump_printf_loc (MSG_NOTE, vect_location,
3057 "Detected interleaving ");
3058 if (DR_IS_READ (dra))
3059 dump_printf (MSG_NOTE, "load ");
3060 else
3061 dump_printf (MSG_NOTE, "store ");
3062 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
3063 dump_printf (MSG_NOTE, " and ");
3064 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
3065 dump_printf (MSG_NOTE, "\n");
3068 /* Link the found element into the group list. */
3069 if (!DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3071 DR_GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
3072 lastinfo = stmtinfo_a;
3074 DR_GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
3075 DR_GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
3076 lastinfo = stmtinfo_b;
3080 FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
3081 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
3082 && !vect_analyze_data_ref_access (dr))
3084 if (dump_enabled_p ())
3085 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3086 "not vectorized: complicated access pattern.\n");
3088 if (is_a <bb_vec_info> (vinfo))
3090 /* Mark the statement as not vectorizable. */
3091 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
3092 continue;
3094 else
3096 datarefs_copy.release ();
3097 return false;
3101 datarefs_copy.release ();
3102 return true;
3105 /* Function vect_vfa_segment_size.
3107 Input:
3108 DR: The data reference.
3109 LENGTH_FACTOR: segment length to consider.
3111 Return a value suitable for the dr_with_seg_len::seg_len field.
3112 This is the "distance travelled" by the pointer from the first
3113 iteration in the segment to the last. Note that it does not include
3114 the size of the access; in effect it only describes the first byte. */
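/* For example, with DR_STEP == 4 and LENGTH_FACTOR == 8 the result is
   4 * (8 - 1) == 28 bytes: the distance the pointer travels between the
   first and the last iteration of the segment.  */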
3116 static tree
3117 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
3119 length_factor = size_binop (MINUS_EXPR,
3120 fold_convert (sizetype, length_factor),
3121 size_one_node);
3122 return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr)),
3123 length_factor);
3126 /* Return a value that, when added to abs (vect_vfa_segment_size (dr)),
3127 gives the worst-case number of bytes covered by the segment. */
3129 static unsigned HOST_WIDE_INT
3130 vect_vfa_access_size (data_reference *dr)
3132 stmt_vec_info stmt_vinfo = vinfo_for_stmt (DR_STMT (dr));
3133 tree ref_type = TREE_TYPE (DR_REF (dr));
3134 unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type));
3135 unsigned HOST_WIDE_INT access_size = ref_size;
3136 if (DR_GROUP_FIRST_ELEMENT (stmt_vinfo))
3138 gcc_assert (DR_GROUP_FIRST_ELEMENT (stmt_vinfo) == DR_STMT (dr));
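/* The first statement of a group stands for the whole group, which
   accesses GROUP_SIZE - GROUP_GAP elements' worth of data (the trailing
   gap is never touched).  */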
3139 access_size *= DR_GROUP_SIZE (stmt_vinfo) - DR_GROUP_GAP (stmt_vinfo);
3141 if (STMT_VINFO_VEC_STMT (stmt_vinfo)
3142 && (vect_supportable_dr_alignment (dr, false)
3143 == dr_explicit_realign_optimized))
3145 /* We might access a full vector's worth. */
3146 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
3147 access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size;
3149 return access_size;
3152 /* Get the minimum alignment for all the scalar accesses that DR describes. */
3154 static unsigned int
3155 vect_vfa_align (const data_reference *dr)
3157 return TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr)));
3160 /* Function vect_no_alias_p.
3162 Given data references A and B with equal base and offset, see whether
3163 the alias relation can be decided at compilation time. Return 1 if
3164 it can and the references alias, 0 if it can and the references do
3165 not alias, and -1 if we cannot decide at compile time. SEGMENT_LENGTH_A,
3166 SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
3167 of dr_with_seg_len::{seg_len,access_size} for A and B. */
3169 static int
3170 vect_compile_time_alias (struct data_reference *a, struct data_reference *b,
3171 tree segment_length_a, tree segment_length_b,
3172 unsigned HOST_WIDE_INT access_size_a,
3173 unsigned HOST_WIDE_INT access_size_b)
3175 poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a));
3176 poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b));
3177 poly_uint64 const_length_a;
3178 poly_uint64 const_length_b;
3180 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3181 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3182 [a, a+12) */
3183 if (tree_int_cst_compare (DR_STEP (a), size_zero_node) < 0)
3185 const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
3186 offset_a = (offset_a + access_size_a) - const_length_a;
3188 else
3189 const_length_a = tree_to_poly_uint64 (segment_length_a);
3190 if (tree_int_cst_compare (DR_STEP (b), size_zero_node) < 0)
3192 const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
3193 offset_b = (offset_b + access_size_b) - const_length_b;
3195 else
3196 const_length_b = tree_to_poly_uint64 (segment_length_b);
3198 const_length_a += access_size_a;
3199 const_length_b += access_size_b;
3201 if (ranges_known_overlap_p (offset_a, const_length_a,
3202 offset_b, const_length_b))
3203 return 1;
3205 if (!ranges_maybe_overlap_p (offset_a, const_length_a,
3206 offset_b, const_length_b))
3207 return 0;
3209 return -1;
3212 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3213 in DDR is >= VF. */
3215 static bool
3216 dependence_distance_ge_vf (data_dependence_relation *ddr,
3217 unsigned int loop_depth, poly_uint64 vf)
3219 if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3220 || DDR_NUM_DIST_VECTS (ddr) == 0)
3221 return false;
3223 /* If the dependence is exact, we should have limited the VF instead. */
3224 gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3226 unsigned int i;
3227 lambda_vector dist_v;
3228 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3230 HOST_WIDE_INT dist = dist_v[loop_depth];
3231 if (dist != 0
3232 && !(dist > 0 && DDR_REVERSED_P (ddr))
3233 && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
3234 return false;
3237 if (dump_enabled_p ())
3239 dump_printf_loc (MSG_NOTE, vect_location,
3240 "dependence distance between ");
3241 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
3242 dump_printf (MSG_NOTE, " and ");
3243 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
3244 dump_printf (MSG_NOTE, " is >= VF\n");
3247 return true;
3250 /* Dump LOWER_BOUND using flags DUMP_KIND. Dumps are known to be enabled. */
3252 static void
3253 dump_lower_bound (int dump_kind, const vec_lower_bound &lower_bound)
3255 dump_printf (dump_kind, "%s (", lower_bound.unsigned_p ? "unsigned" : "abs");
3256 dump_generic_expr (dump_kind, TDF_SLIM, lower_bound.expr);
3257 dump_printf (dump_kind, ") >= ");
3258 dump_dec (dump_kind, lower_bound.min_value);
3261 /* Record that the vectorized loop requires the vec_lower_bound described
3262 by EXPR, UNSIGNED_P and MIN_VALUE. */
3264 static void
3265 vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p,
3266 poly_uint64 min_value)
3268 vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3269 for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3270 if (operand_equal_p (lower_bounds[i].expr, expr, 0))
3272 unsigned_p &= lower_bounds[i].unsigned_p;
3273 min_value = upper_bound (lower_bounds[i].min_value, min_value);
3274 if (lower_bounds[i].unsigned_p != unsigned_p
3275 || maybe_lt (lower_bounds[i].min_value, min_value))
3277 lower_bounds[i].unsigned_p = unsigned_p;
3278 lower_bounds[i].min_value = min_value;
3279 if (dump_enabled_p ())
3281 dump_printf_loc (MSG_NOTE, vect_location,
3282 "updating run-time check to ");
3283 dump_lower_bound (MSG_NOTE, lower_bounds[i]);
3284 dump_printf (MSG_NOTE, "\n");
3287 return;
3290 vec_lower_bound lower_bound (expr, unsigned_p, min_value);
3291 if (dump_enabled_p ())
3293 dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that ");
3294 dump_lower_bound (MSG_NOTE, lower_bound);
3295 dump_printf (MSG_NOTE, "\n");
3297 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (lower_bound);
3300 /* Return true if it's unlikely that the step of the vectorized form of DR
3301 will span fewer than GAP bytes. */
3303 static bool
3304 vect_small_gap_p (loop_vec_info loop_vinfo, data_reference *dr, poly_int64 gap)
3306 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
3307 HOST_WIDE_INT count
3308 = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
3309 if (DR_GROUP_FIRST_ELEMENT (stmt_info))
3310 count *= DR_GROUP_SIZE (vinfo_for_stmt (DR_GROUP_FIRST_ELEMENT (stmt_info)));
3311 return estimated_poly_value (gap) <= count * vect_get_scalar_dr_size (dr);
3314 /* Return true if we know that there is no alias between DR_A and DR_B
3315 when abs (DR_STEP (DR_A)) >= N for some N. When returning true, set
3316 *LOWER_BOUND_OUT to this N. */
3318 static bool
3319 vectorizable_with_step_bound_p (data_reference *dr_a, data_reference *dr_b,
3320 poly_uint64 *lower_bound_out)
3322 /* Check that there is a constant gap of known sign between DR_A
3323 and DR_B. */
3324 poly_int64 init_a, init_b;
3325 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0)
3326 || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0)
3327 || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)
3328 || !poly_int_tree_p (DR_INIT (dr_a), &init_a)
3329 || !poly_int_tree_p (DR_INIT (dr_b), &init_b)
3330 || !ordered_p (init_a, init_b))
3331 return false;
3333 /* Sort DR_A and DR_B by the address they access. */
3334 if (maybe_lt (init_b, init_a))
3336 std::swap (init_a, init_b);
3337 std::swap (dr_a, dr_b);
3340 /* If the two accesses could be dependent within a scalar iteration,
3341 make sure that we'd retain their order. */
3342 if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_a), init_b)
3343 && !vect_preserves_scalar_order_p (DR_STMT (dr_a), DR_STMT (dr_b)))
3344 return false;
3346 /* There is no alias if abs (DR_STEP) is greater than or equal to
3347 the bytes spanned by the combination of the two accesses. */
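/* For example, two 4-byte accesses at byte offsets 0 and 12 span 16
   bytes together, so any |DR_STEP| >= 16 rules out an alias across
   iterations.  */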
3348 *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_b) - init_a;
3349 return true;
3352 /* Function vect_prune_runtime_alias_test_list.
3354 Prune a list of ddrs to be tested at run-time by versioning for alias.
3355 Merge several alias checks into one if possible.
3356 Return FALSE if the resulting list of ddrs is longer than allowed by
3357 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
3359 bool
3360 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3362 typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3363 hash_set <tree_pair_hash> compared_objects;
3365 vec<ddr_p> may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3366 vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3367 = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3368 vec<vec_object_pair> &check_unequal_addrs
3369 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3370 poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3371 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3373 ddr_p ddr;
3374 unsigned int i;
3375 tree length_factor;
3377 if (dump_enabled_p ())
3378 dump_printf_loc (MSG_NOTE, vect_location,
3379 "=== vect_prune_runtime_alias_test_list ===\n");
3381 /* Step values are irrelevant for aliasing if the number of vector
3382 iterations is equal to the number of scalar iterations (which can
3383 happen for fully-SLP loops). */
3384 bool ignore_step_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U);
3386 if (!ignore_step_p)
3388 /* Convert the checks for nonzero steps into bound tests. */
3389 tree value;
3390 FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value)
3391 vect_check_lower_bound (loop_vinfo, value, true, 1);
3394 if (may_alias_ddrs.is_empty ())
3395 return true;
3397 comp_alias_ddrs.create (may_alias_ddrs.length ());
3399 unsigned int loop_depth
3400 = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3401 LOOP_VINFO_LOOP_NEST (loop_vinfo));
3403 /* First, we collect all data ref pairs for aliasing checks. */
3404 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3406 int comp_res;
3407 poly_uint64 lower_bound;
3408 struct data_reference *dr_a, *dr_b;
3409 gimple *dr_group_first_a, *dr_group_first_b;
3410 tree segment_length_a, segment_length_b;
3411 unsigned HOST_WIDE_INT access_size_a, access_size_b;
3412 unsigned int align_a, align_b;
3413 gimple *stmt_a, *stmt_b;
3415 /* Ignore the alias if the VF we chose ended up being no greater
3416 than the dependence distance. */
3417 if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3418 continue;
3420 if (DDR_OBJECT_A (ddr))
3422 vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3423 if (!compared_objects.add (new_pair))
3425 if (dump_enabled_p ())
3427 dump_printf_loc (MSG_NOTE, vect_location, "checking that ");
3428 dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.first);
3429 dump_printf (MSG_NOTE, " and ");
3430 dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.second);
3431 dump_printf (MSG_NOTE, " have different addresses\n");
3433 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3435 continue;
3438 dr_a = DDR_A (ddr);
3439 stmt_a = DR_STMT (DDR_A (ddr));
3441 dr_b = DDR_B (ddr);
3442 stmt_b = DR_STMT (DDR_B (ddr));
3444 /* Skip the pair if inter-iteration dependencies are irrelevant
3445 and intra-iteration dependencies are guaranteed to be honored. */
3446 if (ignore_step_p
3447 && (vect_preserves_scalar_order_p (stmt_a, stmt_b)
3448 || vectorizable_with_step_bound_p (dr_a, dr_b, &lower_bound)))
3450 if (dump_enabled_p ())
3452 dump_printf_loc (MSG_NOTE, vect_location,
3453 "no need for alias check between ");
3454 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
3455 dump_printf (MSG_NOTE, " and ");
3456 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
3457 dump_printf (MSG_NOTE, " when VF is 1\n");
3459 continue;
3462 /* See whether we can handle the alias using a bounds check on
3463 the step, and whether that's likely to be the best approach.
3464 (It might not be, for example, if the minimum step is much larger
3465 than the number of bytes handled by one vector iteration.) */
3466 if (!ignore_step_p
3467 && TREE_CODE (DR_STEP (dr_a)) != INTEGER_CST
3468 && vectorizable_with_step_bound_p (dr_a, dr_b, &lower_bound)
3469 && (vect_small_gap_p (loop_vinfo, dr_a, lower_bound)
3470 || vect_small_gap_p (loop_vinfo, dr_b, lower_bound)))
3472 bool unsigned_p = dr_known_forward_stride_p (dr_a);
3473 if (dump_enabled_p ())
3475 dump_printf_loc (MSG_NOTE, vect_location, "no alias between ");
3476 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
3477 dump_printf (MSG_NOTE, " and ");
3478 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
3479 dump_printf (MSG_NOTE, " when the step ");
3480 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_STEP (dr_a));
3481 dump_printf (MSG_NOTE, " is outside ");
3482 if (unsigned_p)
3483 dump_printf (MSG_NOTE, "[0");
3484 else
3486 dump_printf (MSG_NOTE, "(");
3487 dump_dec (MSG_NOTE, poly_int64 (-lower_bound));
3489 dump_printf (MSG_NOTE, ", ");
3490 dump_dec (MSG_NOTE, lower_bound);
3491 dump_printf (MSG_NOTE, ")\n");
3493 vect_check_lower_bound (loop_vinfo, DR_STEP (dr_a), unsigned_p,
3494 lower_bound);
3495 continue;
3498 dr_group_first_a = DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
3499 if (dr_group_first_a)
3501 stmt_a = dr_group_first_a;
3502 dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
3505 dr_group_first_b = DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
3506 if (dr_group_first_b)
3508 stmt_b = dr_group_first_b;
3509 dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
3512 if (ignore_step_p)
3514 segment_length_a = size_zero_node;
3515 segment_length_b = size_zero_node;
3517 else
3519 if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
3520 length_factor = scalar_loop_iters;
3521 else
3522 length_factor = size_int (vect_factor);
3523 segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
3524 segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
3526 access_size_a = vect_vfa_access_size (dr_a);
3527 access_size_b = vect_vfa_access_size (dr_b);
3528 align_a = vect_vfa_align (dr_a);
3529 align_b = vect_vfa_align (dr_b);
3531 comp_res = data_ref_compare_tree (DR_BASE_ADDRESS (dr_a),
3532 DR_BASE_ADDRESS (dr_b));
3533 if (comp_res == 0)
3534 comp_res = data_ref_compare_tree (DR_OFFSET (dr_a),
3535 DR_OFFSET (dr_b));
3537 /* See whether the alias is known at compilation time. */
3538 if (comp_res == 0
3539 && TREE_CODE (DR_STEP (dr_a)) == INTEGER_CST
3540 && TREE_CODE (DR_STEP (dr_b)) == INTEGER_CST
3541 && poly_int_tree_p (segment_length_a)
3542 && poly_int_tree_p (segment_length_b))
3544 int res = vect_compile_time_alias (dr_a, dr_b,
3545 segment_length_a,
3546 segment_length_b,
3547 access_size_a,
3548 access_size_b);
3549 if (res >= 0 && dump_enabled_p ())
3551 dump_printf_loc (MSG_NOTE, vect_location,
3552 "can tell at compile time that ");
3553 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
3554 dump_printf (MSG_NOTE, " and ");
3555 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
3556 if (res == 0)
3557 dump_printf (MSG_NOTE, " do not alias\n");
3558 else
3559 dump_printf (MSG_NOTE, " alias\n");
3562 if (res == 0)
3563 continue;
3565 if (res == 1)
3567 if (dump_enabled_p ())
3568 dump_printf_loc (MSG_NOTE, vect_location,
3569 "not vectorized: compilation time alias.\n");
3570 return false;
3574 dr_with_seg_len_pair_t dr_with_seg_len_pair
3575 (dr_with_seg_len (dr_a, segment_length_a, access_size_a, align_a),
3576 dr_with_seg_len (dr_b, segment_length_b, access_size_b, align_b));
3578 /* Canonicalize pairs by sorting the two DR members. */
3579 if (comp_res > 0)
3580 std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
3582 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3585 prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);
3587 unsigned int count = (comp_alias_ddrs.length ()
3588 + check_unequal_addrs.length ());
3590 dump_printf_loc (MSG_NOTE, vect_location,
3591 "improved number of alias checks from %d to %d\n",
3592 may_alias_ddrs.length (), count);
3593 if ((int) count > PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
3595 if (dump_enabled_p ())
3596 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3597 "number of versioning for alias "
3598 "run-time tests exceeds %d "
3599 "(--param vect-max-version-for-alias-checks)\n",
3600 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
3601 return false;
3604 return true;
3607 /* Check whether we can use an internal function for a gather load
3608 or scatter store. READ_P is true for loads and false for stores.
3609 MASKED_P is true if the load or store is conditional. MEMORY_TYPE is
3610 the type of the memory elements being loaded or stored. OFFSET_BITS
3611 is the number of bits in each scalar offset and OFFSET_SIGN is the
3612 sign of the offset. SCALE is the amount by which the offset should
3613 be multiplied *after* it has been converted to address width.
3615 Return true if the function is supported, storing the function
3616 id in *IFN_OUT and the type of a vector element in *ELEMENT_TYPE_OUT. */
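/* For example (illustrative only): with a V4SI gather, where both the
   vector elements and the memory elements are 32 bits wide, a 16-bit
   scalar offset can be extended to 32 bits and is acceptable, whereas
   a 64-bit scalar offset is rejected because truncating it would not
   be safe.  */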
3618 bool
3619 vect_gather_scatter_fn_p (bool read_p, bool masked_p, tree vectype,
3620 tree memory_type, unsigned int offset_bits,
3621 signop offset_sign, int scale,
3622 internal_fn *ifn_out, tree *element_type_out)
3624 unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
3625 unsigned int element_bits = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (vectype)));
3626 if (offset_bits > element_bits)
3627 /* Internal functions require the offset to be the same width as
3628 the vector elements. We can extend narrower offsets, but it isn't
3629 safe to truncate wider offsets. */
3630 return false;
3632 if (element_bits != memory_bits)
3633 /* For now the vector elements must be the same width as the
3634 memory elements. */
3635 return false;
3637 /* Work out which function we need. */
3638 internal_fn ifn;
3639 if (read_p)
3640 ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
3641 else
3642 ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
3644 /* Test whether the target supports this combination. */
3645 if (!internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
3646 offset_sign, scale))
3647 return false;
3649 *ifn_out = ifn;
3650 *element_type_out = TREE_TYPE (vectype);
3651 return true;
3654 /* CALL is a call to an internal gather load or scatter store function.
3655 Describe the operation in INFO. */
3657 static void
3658 vect_describe_gather_scatter_call (gcall *call, gather_scatter_info *info)
3660 stmt_vec_info stmt_info = vinfo_for_stmt (call);
3661 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3662 data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3664 info->ifn = gimple_call_internal_fn (call);
3665 info->decl = NULL_TREE;
3666 info->base = gimple_call_arg (call, 0);
3667 info->offset = gimple_call_arg (call, 1);
3668 info->offset_dt = vect_unknown_def_type;
3669 info->offset_vectype = NULL_TREE;
3670 info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
3671 info->element_type = TREE_TYPE (vectype);
3672 info->memory_type = TREE_TYPE (DR_REF (dr));
3675 /* Return true if a non-affine read or write in STMT is suitable for a
3676 gather load or scatter store. Describe the operation in *INFO if so. */
3678 bool
3679 vect_check_gather_scatter (gimple *stmt, loop_vec_info loop_vinfo,
3680 gather_scatter_info *info)
3682 HOST_WIDE_INT scale = 1;
3683 poly_int64 pbitpos, pbitsize;
3684 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3685 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3686 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3687 tree offtype = NULL_TREE;
3688 tree decl = NULL_TREE, base, off;
3689 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3690 tree memory_type = TREE_TYPE (DR_REF (dr));
3691 machine_mode pmode;
3692 int punsignedp, reversep, pvolatilep = 0;
3693 internal_fn ifn;
3694 tree element_type;
3695 bool masked_p = false;
3697 /* See whether this is already a call to a gather/scatter internal function.
3698 If not, see whether it's a masked load or store. */
3699 gcall *call = dyn_cast <gcall *> (stmt);
3700 if (call && gimple_call_internal_p (call))
3702 ifn = gimple_call_internal_fn (stmt);
3703 if (internal_gather_scatter_fn_p (ifn))
3705 vect_describe_gather_scatter_call (call, info);
3706 return true;
3708 masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE);
3711 /* True if we should aim to use internal functions rather than
3712 built-in functions. */
3713 bool use_ifn_p = (DR_IS_READ (dr)
3714 ? supports_vec_gather_load_p ()
3715 : supports_vec_scatter_store_p ());
3717 base = DR_REF (dr);
3718 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3719 see if we can use the def stmt of the address. */
3720 if (masked_p
3721 && TREE_CODE (base) == MEM_REF
3722 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3723 && integer_zerop (TREE_OPERAND (base, 1))
3724 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3726 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3727 if (is_gimple_assign (def_stmt)
3728 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3729 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3732 /* The gather and scatter builtins need an address of the form
3733 loop_invariant + vector * {1, 2, 4, 8}
3735 or loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3736 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3737 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3738 multiplications and additions in it. To get a vector, we need
3739 a single SSA_NAME that will be defined in the loop and will
3740 contain everything that is not loop invariant and that can be
3741 vectorized. The following code attempts to find such a preexisting
3742 SSA_NAME OFF and put the loop invariants into a tree BASE
3743 that can be gimplified before the loop. */
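/* For example (purely illustrative): for a gather-like read
   a[k + idx[i]] with loop-invariant A and K, the code below would
   typically end up with &a plus K scaled by the element size in BASE,
   the SSA_NAME holding idx[i] in OFF, and the element size in SCALE;
   the exact split depends on how the address was gimplified.  */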
3744 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3745 &punsignedp, &reversep, &pvolatilep);
3746 gcc_assert (base && !reversep);
3747 poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
3749 if (TREE_CODE (base) == MEM_REF)
3751 if (!integer_zerop (TREE_OPERAND (base, 1)))
3753 if (off == NULL_TREE)
3754 off = wide_int_to_tree (sizetype, mem_ref_offset (base));
3755 else
3756 off = size_binop (PLUS_EXPR, off,
3757 fold_convert (sizetype, TREE_OPERAND (base, 1)));
3759 base = TREE_OPERAND (base, 0);
3761 else
3762 base = build_fold_addr_expr (base);
3764 if (off == NULL_TREE)
3765 off = size_zero_node;
3767 /* If base is not loop invariant, then either off is 0, in which case we
3768 start with just the constant offset in the loop invariant BASE and
3769 continue with base as OFF, or we give up.
3770 We could handle that case by gimplifying the addition of base + off
3771 into some SSA_NAME and using that as OFF, but for now punt. */
3772 if (!expr_invariant_in_loop_p (loop, base))
3774 if (!integer_zerop (off))
3775 return false;
3776 off = base;
3777 base = size_int (pbytepos);
3779 /* Otherwise put base + constant offset into the loop invariant BASE
3780 and continue with OFF. */
3781 else
3783 base = fold_convert (sizetype, base);
3784 base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
3787 /* OFF at this point may be either a SSA_NAME or some tree expression
3788 from get_inner_reference. Try to peel off loop invariants from it
3789 into BASE as long as possible. */
3790 STRIP_NOPS (off);
3791 while (offtype == NULL_TREE)
3793 enum tree_code code;
3794 tree op0, op1, add = NULL_TREE;
3796 if (TREE_CODE (off) == SSA_NAME)
3798 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3800 if (expr_invariant_in_loop_p (loop, off))
3801 return false;
3803 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3804 break;
3806 op0 = gimple_assign_rhs1 (def_stmt);
3807 code = gimple_assign_rhs_code (def_stmt);
3808 op1 = gimple_assign_rhs2 (def_stmt);
3810 else
3812 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3813 return false;
3814 code = TREE_CODE (off);
3815 extract_ops_from_tree (off, &code, &op0, &op1);
3817 switch (code)
3819 case POINTER_PLUS_EXPR:
3820 case PLUS_EXPR:
3821 if (expr_invariant_in_loop_p (loop, op0))
3823 add = op0;
3824 off = op1;
3825 do_add:
3826 add = fold_convert (sizetype, add);
3827 if (scale != 1)
3828 add = size_binop (MULT_EXPR, add, size_int (scale));
3829 base = size_binop (PLUS_EXPR, base, add);
3830 continue;
3832 if (expr_invariant_in_loop_p (loop, op1))
3834 add = op1;
3835 off = op0;
3836 goto do_add;
3838 break;
3839 case MINUS_EXPR:
3840 if (expr_invariant_in_loop_p (loop, op1))
3842 add = fold_convert (sizetype, op1);
3843 add = size_binop (MINUS_EXPR, size_zero_node, add);
3844 off = op0;
3845 goto do_add;
3847 break;
3848 case MULT_EXPR:
3849 if (scale == 1 && tree_fits_shwi_p (op1))
3851 int new_scale = tree_to_shwi (op1);
3852 /* Only treat this as a scaling operation if the target
3853 supports it. */
3854 if (use_ifn_p
3855 && !vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p,
3856 vectype, memory_type, 1,
3857 TYPE_SIGN (TREE_TYPE (op0)),
3858 new_scale, &ifn,
3859 &element_type))
3860 break;
3861 scale = new_scale;
3862 off = op0;
3863 continue;
3865 break;
3866 case SSA_NAME:
3867 off = op0;
3868 continue;
3869 CASE_CONVERT:
3870 if (!POINTER_TYPE_P (TREE_TYPE (op0))
3871 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3872 break;
3873 if (TYPE_PRECISION (TREE_TYPE (op0))
3874 == TYPE_PRECISION (TREE_TYPE (off)))
3876 off = op0;
3877 continue;
3880 /* The internal functions need the offset to be the same width
3881 as the elements of VECTYPE. Don't include operations that
3882 cast the offset from that width to a different width. */
3883 if (use_ifn_p
3884 && (int_size_in_bytes (TREE_TYPE (vectype))
3885 == int_size_in_bytes (TREE_TYPE (off))))
3886 break;
3888 if (TYPE_PRECISION (TREE_TYPE (op0))
3889 < TYPE_PRECISION (TREE_TYPE (off)))
3891 off = op0;
3892 offtype = TREE_TYPE (off);
3893 STRIP_NOPS (off);
3894 continue;
3896 break;
3897 default:
3898 break;
3900 break;
3903 /* If at the end OFF still isn't a SSA_NAME or isn't
3904 defined in the loop, punt. */
3905 if (TREE_CODE (off) != SSA_NAME
3906 || expr_invariant_in_loop_p (loop, off))
3907 return false;
3909 if (offtype == NULL_TREE)
3910 offtype = TREE_TYPE (off);
3912 if (use_ifn_p)
3914 if (!vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p, vectype,
3915 memory_type, TYPE_PRECISION (offtype),
3916 TYPE_SIGN (offtype), scale, &ifn,
3917 &element_type))
3918 return false;
3920 else
3922 if (DR_IS_READ (dr))
3924 if (targetm.vectorize.builtin_gather)
3925 decl = targetm.vectorize.builtin_gather (vectype, offtype, scale);
3927 else
3929 if (targetm.vectorize.builtin_scatter)
3930 decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale);
3933 if (!decl)
3934 return false;
3936 ifn = IFN_LAST;
3937 element_type = TREE_TYPE (vectype);
3940 info->ifn = ifn;
3941 info->decl = decl;
3942 info->base = base;
3943 info->offset = off;
3944 info->offset_dt = vect_unknown_def_type;
3945 info->offset_vectype = NULL_TREE;
3946 info->scale = scale;
3947 info->element_type = element_type;
3948 info->memory_type = memory_type;
3949 return true;
3952 /* Find the data references in STMT, analyze them with respect to LOOP and
3953 append them to DATAREFS. Return false if datarefs in this stmt cannot
3954 be handled. */
3956 bool
3957 vect_find_stmt_data_reference (loop_p loop, gimple *stmt,
3958 vec<data_reference_p> *datarefs)
3960 /* We can ignore clobbers for dataref analysis - they are removed during
3961 loop vectorization and BB vectorization checks dependences with a
3962 stmt walk. */
3963 if (gimple_clobber_p (stmt))
3964 return true;
3966 if (gimple_has_volatile_ops (stmt))
3968 if (dump_enabled_p ())
3970 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3971 "not vectorized: volatile type ");
3972 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3974 return false;
3977 if (stmt_can_throw_internal (stmt))
3979 if (dump_enabled_p ())
3981 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3982 "not vectorized: statement can throw an "
3983 "exception ");
3984 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3986 return false;
3989 auto_vec<data_reference_p, 2> refs;
3990 if (!find_data_references_in_stmt (loop, stmt, &refs))
3991 return false;
3993 if (refs.is_empty ())
3994 return true;
3996 if (refs.length () > 1)
3998 if (dump_enabled_p ())
4000 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4001 "not vectorized: more than one data ref "
4002 "in stmt: ");
4003 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4005 return false;
4008 if (gcall *call = dyn_cast <gcall *> (stmt))
4009 if (!gimple_call_internal_p (call)
4010 || (gimple_call_internal_fn (call) != IFN_MASK_LOAD
4011 && gimple_call_internal_fn (call) != IFN_MASK_STORE))
4013 if (dump_enabled_p ())
4015 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4016 "not vectorized: dr in a call ");
4017 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4019 return false;
4022 data_reference_p dr = refs.pop ();
4023 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
4024 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
4026 if (dump_enabled_p ())
4028 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4029 "not vectorized: statement is bitfield "
4030 "access ");
4031 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4033 return false;
4036 if (DR_BASE_ADDRESS (dr)
4037 && TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
4039 if (dump_enabled_p ())
4040 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4041 "not vectorized: base addr of dr is a "
4042 "constant\n");
4043 return false;
4046 datarefs->safe_push (dr);
4047 return true;
4050 /* Function vect_analyze_data_refs.
4052 Find all the data references in the loop or basic block.
4054 The general structure of the analysis of data refs in the vectorizer is as
4055 follows:
4056 1- vect_analyze_data_refs(loop/bb): call
4057 compute_data_dependences_for_loop/bb to find and analyze all data-refs
4058 in the loop/bb and their dependences.
4059 2- vect_analyze_dependences(): apply dependence testing using ddrs.
4060 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
4061 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
4065 bool
4066 vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf)
4068 struct loop *loop = NULL;
4069 unsigned int i;
4070 struct data_reference *dr;
4071 tree scalar_type;
4073 if (dump_enabled_p ())
4074 dump_printf_loc (MSG_NOTE, vect_location,
4075 "=== vect_analyze_data_refs ===\n");
4077 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4078 loop = LOOP_VINFO_LOOP (loop_vinfo);
4080 /* Go through the data-refs, check that the analysis succeeded. Update
4081 pointer from stmt_vec_info struct to DR and vectype. */
4083 vec<data_reference_p> datarefs = vinfo->datarefs;
4084 FOR_EACH_VEC_ELT (datarefs, i, dr)
4086 gimple *stmt;
4087 stmt_vec_info stmt_info;
4088 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
4089 bool simd_lane_access = false;
4090 poly_uint64 vf;
4092 gcc_assert (DR_REF (dr));
4093 stmt = DR_STMT (dr);
4094 stmt_info = vinfo_for_stmt (stmt);
4096 /* Check that analysis of the data-ref succeeded. */
4097 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
4098 || !DR_STEP (dr))
4100 bool maybe_gather
4101 = DR_IS_READ (dr)
4102 && !TREE_THIS_VOLATILE (DR_REF (dr))
4103 && (targetm.vectorize.builtin_gather != NULL
4104 || supports_vec_gather_load_p ());
4105 bool maybe_scatter
4106 = DR_IS_WRITE (dr)
4107 && !TREE_THIS_VOLATILE (DR_REF (dr))
4108 && (targetm.vectorize.builtin_scatter != NULL
4109 || supports_vec_scatter_store_p ());
4110 bool maybe_simd_lane_access
4111 = is_a <loop_vec_info> (vinfo) && loop->simduid;
4113 /* If target supports vector gather loads or scatter stores, or if
4114 this might be a SIMD lane access, see whether they can be used. */
4115 if (is_a <loop_vec_info> (vinfo)
4116 && (maybe_gather || maybe_scatter || maybe_simd_lane_access)
4117 && !nested_in_vect_loop_p (loop, stmt))
4119 struct data_reference *newdr
4120 = create_data_ref (NULL, loop_containing_stmt (stmt),
4121 DR_REF (dr), stmt, !maybe_scatter,
4122 DR_IS_CONDITIONAL_IN_STMT (dr));
4123 gcc_assert (newdr != NULL && DR_REF (newdr));
4124 if (DR_BASE_ADDRESS (newdr)
4125 && DR_OFFSET (newdr)
4126 && DR_INIT (newdr)
4127 && DR_STEP (newdr)
4128 && integer_zerop (DR_STEP (newdr)))
4130 if (maybe_simd_lane_access)
4132 tree off = DR_OFFSET (newdr);
4133 STRIP_NOPS (off);
4134 if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
4135 && TREE_CODE (off) == MULT_EXPR
4136 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
4138 tree step = TREE_OPERAND (off, 1);
4139 off = TREE_OPERAND (off, 0);
4140 STRIP_NOPS (off);
4141 if (CONVERT_EXPR_P (off)
4142 && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
4143 0)))
4144 < TYPE_PRECISION (TREE_TYPE (off)))
4145 off = TREE_OPERAND (off, 0);
4146 if (TREE_CODE (off) == SSA_NAME)
4148 gimple *def = SSA_NAME_DEF_STMT (off);
4149 tree reft = TREE_TYPE (DR_REF (newdr));
4150 if (is_gimple_call (def)
4151 && gimple_call_internal_p (def)
4152 && (gimple_call_internal_fn (def)
4153 == IFN_GOMP_SIMD_LANE))
4155 tree arg = gimple_call_arg (def, 0);
4156 gcc_assert (TREE_CODE (arg) == SSA_NAME);
4157 arg = SSA_NAME_VAR (arg);
4158 if (arg == loop->simduid
4159 /* For now. */
4160 && tree_int_cst_equal
4161 (TYPE_SIZE_UNIT (reft),
4162 step))
4164 DR_OFFSET (newdr) = ssize_int (0);
4165 DR_STEP (newdr) = step;
4166 DR_OFFSET_ALIGNMENT (newdr)
4167 = BIGGEST_ALIGNMENT;
4168 DR_STEP_ALIGNMENT (newdr)
4169 = highest_pow2_factor (step);
4170 dr = newdr;
4171 simd_lane_access = true;
4177 if (!simd_lane_access && (maybe_gather || maybe_scatter))
4179 dr = newdr;
4180 if (maybe_gather)
4181 gatherscatter = GATHER;
4182 else
4183 gatherscatter = SCATTER;
4186 if (gatherscatter == SG_NONE && !simd_lane_access)
4187 free_data_ref (newdr);
4190 if (gatherscatter == SG_NONE && !simd_lane_access)
4192 if (dump_enabled_p ())
4194 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4195 "not vectorized: data ref analysis "
4196 "failed ");
4197 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4199 if (is_a <bb_vec_info> (vinfo))
4201 /* In BB vectorization the ref can still participate
4202 in dependence analysis, we just can't vectorize it. */
4203 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4204 continue;
4206 return false;
4210 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == ADDR_EXPR
4211 && VAR_P (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0))
4212 && DECL_NONALIASED (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0)))
4214 if (dump_enabled_p ())
4216 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4217 "not vectorized: base object not addressable "
4218 "for stmt: ");
4219 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4221 if (is_a <bb_vec_info> (vinfo))
4223 /* In BB vectorization the ref can still participate
4224 in dependence analysis, we just can't vectorize it. */
4225 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4226 continue;
4228 return false;
4231 if (is_a <loop_vec_info> (vinfo)
4232 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
4234 if (nested_in_vect_loop_p (loop, stmt))
4236 if (dump_enabled_p ())
4238 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4239 "not vectorized: not suitable for strided "
4240 "load ");
4241 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4243 return false;
4245 STMT_VINFO_STRIDED_P (stmt_info) = true;
4248 /* Update DR field in stmt_vec_info struct. */
4250 /* If the dataref is in an inner-loop of the loop that is considered for
4251 vectorization, we also want to analyze the access relative to
4252 the outer-loop (DR contains information only relative to the
4253 inner-most enclosing loop). We do that by building a reference to the
4254 first location accessed by the inner-loop, and analyze it relative to
4255 the outer-loop. */
4256 if (loop && nested_in_vect_loop_p (loop, stmt))
4258 /* Build a reference to the first location accessed by the
4259 inner loop: *(BASE + INIT + OFFSET). By construction,
4260 this address must be invariant in the inner loop, so we
4261 can consider it as being used in the outer loop. */
4262 tree base = unshare_expr (DR_BASE_ADDRESS (dr));
4263 tree offset = unshare_expr (DR_OFFSET (dr));
4264 tree init = unshare_expr (DR_INIT (dr));
4265 tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
4266 init, offset);
4267 tree init_addr = fold_build_pointer_plus (base, init_offset);
4268 tree init_ref = build_fold_indirect_ref (init_addr);
4270 if (dump_enabled_p ())
4272 dump_printf_loc (MSG_NOTE, vect_location,
4273 "analyze in outer loop: ");
4274 dump_generic_expr (MSG_NOTE, TDF_SLIM, init_ref);
4275 dump_printf (MSG_NOTE, "\n");
4278 if (!dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
4279 init_ref, loop))
4280 /* dr_analyze_innermost already explained the failure. */
4281 return false;
4283 if (dump_enabled_p ())
4285 dump_printf_loc (MSG_NOTE, vect_location,
4286 "\touter base_address: ");
4287 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4288 STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
4289 dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
4290 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4291 STMT_VINFO_DR_OFFSET (stmt_info));
4292 dump_printf (MSG_NOTE,
4293 "\n\touter constant offset from base address: ");
4294 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4295 STMT_VINFO_DR_INIT (stmt_info));
4296 dump_printf (MSG_NOTE, "\n\touter step: ");
4297 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4298 STMT_VINFO_DR_STEP (stmt_info));
4299 dump_printf (MSG_NOTE, "\n\touter base alignment: %d\n",
4300 STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info));
4301 dump_printf (MSG_NOTE, "\n\touter base misalignment: %d\n",
4302 STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info));
4303 dump_printf (MSG_NOTE, "\n\touter offset alignment: %d\n",
4304 STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info));
4305 dump_printf (MSG_NOTE, "\n\touter step alignment: %d\n",
4306 STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
4310 gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
4311 STMT_VINFO_DATA_REF (stmt_info) = dr;
4312 if (simd_lane_access)
4314 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
4315 free_data_ref (datarefs[i]);
4316 datarefs[i] = dr;
4319 /* Set vectype for STMT. */
4320 scalar_type = TREE_TYPE (DR_REF (dr));
4321 STMT_VINFO_VECTYPE (stmt_info)
4322 = get_vectype_for_scalar_type (scalar_type);
4323 if (!STMT_VINFO_VECTYPE (stmt_info))
4325 if (dump_enabled_p ())
4327 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4328 "not vectorized: no vectype for stmt: ");
4329 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4330 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
4331 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
4332 scalar_type);
4333 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
4336 if (is_a <bb_vec_info> (vinfo))
4338 /* No vector type is fine, the ref can still participate
4339 in dependence analysis, we just can't vectorize it. */
4340 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4341 continue;
4344 if (gatherscatter != SG_NONE || simd_lane_access)
4346 STMT_VINFO_DATA_REF (stmt_info) = NULL;
4347 if (gatherscatter != SG_NONE)
4348 free_data_ref (dr);
4350 return false;
4352 else
4354 if (dump_enabled_p ())
4356 dump_printf_loc (MSG_NOTE, vect_location,
4357 "got vectype for stmt: ");
4358 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
4359 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4360 STMT_VINFO_VECTYPE (stmt_info));
4361 dump_printf (MSG_NOTE, "\n");
4365 /* Adjust the minimal vectorization factor according to the
4366 vector type. */
4367 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
4368 *min_vf = upper_bound (*min_vf, vf);
4370 if (gatherscatter != SG_NONE)
4372 gather_scatter_info gs_info;
4373 if (!vect_check_gather_scatter (stmt, as_a <loop_vec_info> (vinfo),
4374 &gs_info)
4375 || !get_vectype_for_scalar_type (TREE_TYPE (gs_info.offset)))
4377 STMT_VINFO_DATA_REF (stmt_info) = NULL;
4378 free_data_ref (dr);
4379 if (dump_enabled_p ())
4381 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4382 (gatherscatter == GATHER) ?
4383 "not vectorized: not suitable for gather "
4384 "load " :
4385 "not vectorized: not suitable for scatter "
4386 "store ");
4387 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4389 return false;
4392 free_data_ref (datarefs[i]);
4393 datarefs[i] = dr;
4394 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
4398 /* We used to stop processing and prune the list here. Verify we no
4399 longer need to. */
4400 gcc_assert (i == datarefs.length ());
4402 return true;
4406 /* Function vect_get_new_vect_var.
4408 Returns a name for a new variable. The current naming scheme builds the
4409 name of a vectorizer-generated variable from a prefix ("vect_", "vectp_",
4410 "stmp_" or "mask_", depending on the value of VAR_KIND), followed by NAME
4411 if NAME is provided. */
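/* For example (illustrative only):
     vect_get_new_vect_var (vectype, vect_simple_var, "x")
   creates a temporary register named after "vect_x", while passing a
   NULL NAME yields one named after the bare prefix "vect".  */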
4413 tree
4414 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
4416 const char *prefix;
4417 tree new_vect_var;
4419 switch (var_kind)
4421 case vect_simple_var:
4422 prefix = "vect";
4423 break;
4424 case vect_scalar_var:
4425 prefix = "stmp";
4426 break;
4427 case vect_mask_var:
4428 prefix = "mask";
4429 break;
4430 case vect_pointer_var:
4431 prefix = "vectp";
4432 break;
4433 default:
4434 gcc_unreachable ();
4437 if (name)
4439 char* tmp = concat (prefix, "_", name, NULL);
4440 new_vect_var = create_tmp_reg (type, tmp);
4441 free (tmp);
4443 else
4444 new_vect_var = create_tmp_reg (type, prefix);
4446 return new_vect_var;
4449 /* Like vect_get_new_vect_var but return an SSA name. */
4451 tree
4452 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
4454 const char *prefix;
4455 tree new_vect_var;
4457 switch (var_kind)
4459 case vect_simple_var:
4460 prefix = "vect";
4461 break;
4462 case vect_scalar_var:
4463 prefix = "stmp";
4464 break;
4465 case vect_pointer_var:
4466 prefix = "vectp";
4467 break;
4468 default:
4469 gcc_unreachable ();
4472 if (name)
4474 char* tmp = concat (prefix, "_", name, NULL);
4475 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4476 free (tmp);
4478 else
4479 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4481 return new_vect_var;
4484 /* Duplicate ptr info and set alignment/misalignment on NAME from DR. */
4486 static void
4487 vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr)
4489 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr));
4490 int misalign = DR_MISALIGNMENT (dr);
4491 if (misalign == DR_MISALIGNMENT_UNKNOWN)
4492 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4493 else
4494 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name),
4495 DR_TARGET_ALIGNMENT (dr), misalign);
4498 /* Function vect_create_addr_base_for_vector_ref.
4500 Create an expression that computes the address of the first memory location
4501 that will be accessed for a data reference.
4503 Input:
4504 STMT: The statement containing the data reference.
4505 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4506 OFFSET: Optional. If supplied, it is added to the initial address.
4507 LOOP: Specify relative to which loop-nest should the address be computed.
4508 For example, when the dataref is in an inner-loop nested in an
4509 outer-loop that is now being vectorized, LOOP can be either the
4510 outer-loop, or the inner-loop. The first memory location accessed
4511 by the following dataref ('in' points to short):
4513 for (i=0; i<N; i++)
4514 for (j=0; j<M; j++)
4515 s += in[i+j]
4517 is as follows:
4518 if LOOP=i_loop: &in (relative to i_loop)
4519 if LOOP=j_loop: &in+i*2B (relative to j_loop)
4520 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
4521 initial address. Unlike OFFSET, which is number of elements to
4522 be added, BYTE_OFFSET is measured in bytes.
4524 Output:
4525 1. Return an SSA_NAME whose value is the address of the memory location of
4526 the first vector of the data reference.
4527 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4528 these statement(s) which define the returned SSA_NAME.
4530 FORNOW: We are only handling array accesses with step 1. */
4532 tree
4533 vect_create_addr_base_for_vector_ref (gimple *stmt,
4534 gimple_seq *new_stmt_list,
4535 tree offset,
4536 tree byte_offset)
4538 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4539 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4540 const char *base_name;
4541 tree addr_base;
4542 tree dest;
4543 gimple_seq seq = NULL;
4544 tree vect_ptr_type;
4545 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
4546 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4547 innermost_loop_behavior *drb = vect_dr_behavior (dr);
4549 tree data_ref_base = unshare_expr (drb->base_address);
4550 tree base_offset = unshare_expr (drb->offset);
4551 tree init = unshare_expr (drb->init);
4553 if (loop_vinfo)
4554 base_name = get_name (data_ref_base);
4555 else
4557 base_offset = ssize_int (0);
4558 init = ssize_int (0);
4559 base_name = get_name (DR_REF (dr));
4562 /* Create base_offset */
4563 base_offset = size_binop (PLUS_EXPR,
4564 fold_convert (sizetype, base_offset),
4565 fold_convert (sizetype, init));
4567 if (offset)
4569 offset = fold_build2 (MULT_EXPR, sizetype,
4570 fold_convert (sizetype, offset), step);
4571 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4572 base_offset, offset);
4574 if (byte_offset)
4576 byte_offset = fold_convert (sizetype, byte_offset);
4577 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4578 base_offset, byte_offset);
4581 /* base + base_offset */
4582 if (loop_vinfo)
4583 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4584 else
4586 addr_base = build1 (ADDR_EXPR,
4587 build_pointer_type (TREE_TYPE (DR_REF (dr))),
4588 unshare_expr (DR_REF (dr)));
4591 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4592 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4593 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4594 gimple_seq_add_seq (new_stmt_list, seq);
4596 if (DR_PTR_INFO (dr)
4597 && TREE_CODE (addr_base) == SSA_NAME
4598 && !SSA_NAME_PTR_INFO (addr_base))
4600 vect_duplicate_ssa_name_ptr_info (addr_base, dr);
4601 if (offset || byte_offset)
4602 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4605 if (dump_enabled_p ())
4607 dump_printf_loc (MSG_NOTE, vect_location, "created ");
4608 dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
4609 dump_printf (MSG_NOTE, "\n");
4612 return addr_base;
4616 /* Function vect_create_data_ref_ptr.
4618 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4619 location accessed in the loop by STMT, along with the def-use update
4620 chain to appropriately advance the pointer through the loop iterations.
4621 Also set aliasing information for the pointer. This pointer is used by
4622 the callers to this function to create a memory reference expression for
4623 vector load/store access.
4625 Input:
4626 1. STMT: a stmt that references memory. Expected to be of the form
4627 GIMPLE_ASSIGN <name, data-ref> or
4628 GIMPLE_ASSIGN <data-ref, name>.
4629 2. AGGR_TYPE: the type of the reference, which should be either a vector
4630 or an array.
4631 3. AT_LOOP: the loop where the vector memref is to be created.
4632 4. OFFSET (optional): an offset to be added to the initial address accessed
4633 by the data-ref in STMT.
4634 5. BSI: location where the new stmts are to be placed if there is no loop
4635 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4636 pointing to the initial address.
4637 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4638 to the initial address accessed by the data-ref in STMT. This is
4639 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4640 in bytes.
4641 8. IV_STEP (optional, defaults to NULL): the amount that should be added
4642 to the IV during each iteration of the loop. NULL says to move
4643 by one copy of AGGR_TYPE up or down, depending on the step of the
4644 data reference.
4646 Output:
4647 1. Declare a new ptr to vector_type, and have it point to the base of the
4648 data reference (initial address accessed by the data reference).
4649 For example, for vector of type V8HI, the following code is generated:
4651 v8hi *ap;
4652 ap = (v8hi *)initial_address;
4654 if OFFSET is not supplied:
4655 initial_address = &a[init];
4656 if OFFSET is supplied:
4657 initial_address = &a[init + OFFSET];
4658 if BYTE_OFFSET is supplied:
4659 initial_address = &a[init] + BYTE_OFFSET;
4661 Return the initial_address in INITIAL_ADDRESS.
4663 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4664 update the pointer in each iteration of the loop.
4666 Return the increment stmt that updates the pointer in PTR_INCR.
4668 3. Set INV_P to true if the access pattern of the data reference in the
4669 vectorized loop is invariant. Set it to false otherwise.
4671 4. Return the pointer. */
4673 tree
4674 vect_create_data_ref_ptr (gimple *stmt, tree aggr_type, struct loop *at_loop,
4675 tree offset, tree *initial_address,
4676 gimple_stmt_iterator *gsi, gimple **ptr_incr,
4677 bool only_init, bool *inv_p, tree byte_offset,
4678 tree iv_step)
4680 const char *base_name;
4681 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4682 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4683 struct loop *loop = NULL;
4684 bool nested_in_vect_loop = false;
4685 struct loop *containing_loop = NULL;
4686 tree aggr_ptr_type;
4687 tree aggr_ptr;
4688 tree new_temp;
4689 gimple_seq new_stmt_list = NULL;
4690 edge pe = NULL;
4691 basic_block new_bb;
4692 tree aggr_ptr_init;
4693 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4694 tree aptr;
4695 gimple_stmt_iterator incr_gsi;
4696 bool insert_after;
4697 tree indx_before_incr, indx_after_incr;
4698 gimple *incr;
4699 tree step;
4700 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4702 gcc_assert (iv_step != NULL_TREE
4703 || TREE_CODE (aggr_type) == ARRAY_TYPE
4704 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4706 if (loop_vinfo)
4708 loop = LOOP_VINFO_LOOP (loop_vinfo);
4709 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4710 containing_loop = (gimple_bb (stmt))->loop_father;
4711 pe = loop_preheader_edge (loop);
4713 else
4715 gcc_assert (bb_vinfo);
4716 only_init = true;
4717 *ptr_incr = NULL;
4720 /* Check the step (evolution) of the load in LOOP, and record
4721 whether it's invariant. */
4722 step = vect_dr_behavior (dr)->step;
4723 if (integer_zerop (step))
4724 *inv_p = true;
4725 else
4726 *inv_p = false;
4728 /* Create an expression for the first address accessed by this load
4729 in LOOP. */
4730 base_name = get_name (DR_BASE_ADDRESS (dr));
4732 if (dump_enabled_p ())
4734 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4735 dump_printf_loc (MSG_NOTE, vect_location,
4736 "create %s-pointer variable to type: ",
4737 get_tree_code_name (TREE_CODE (aggr_type)));
4738 dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4739 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4740 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4741 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4742 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4743 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4744 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4745 else
4746 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4747 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4748 dump_printf (MSG_NOTE, "\n");
4751 /* (1) Create the new aggregate-pointer variable.
4752 Vector and array types inherit the alias set of their component
4753 type by default so we need to use a ref-all pointer if the data
4754 reference does not conflict with the created aggregated data
4755 reference because it is not addressable. */
4756 bool need_ref_all = false;
4757 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4758 get_alias_set (DR_REF (dr))))
4759 need_ref_all = true;
4760 /* Likewise for any of the data references in the stmt group. */
4761 else if (DR_GROUP_SIZE (stmt_info) > 1)
4763 gimple *orig_stmt = DR_GROUP_FIRST_ELEMENT (stmt_info);
4766 stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
4767 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4768 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4769 get_alias_set (DR_REF (sdr))))
4771 need_ref_all = true;
4772 break;
4774 orig_stmt = DR_GROUP_NEXT_ELEMENT (sinfo);
4776 while (orig_stmt);
4778 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4779 need_ref_all);
4780 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4783 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4784 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4785 def-use update cycles for the pointer: one relative to the outer-loop
4786 (LOOP), which is what steps (3) and (4) below do. The other is relative
4787 to the inner-loop (which is the inner-most loop containing the dataref),
4788 and this is done by step (5) below.
4790 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4791 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4792 redundant. Steps (3),(4) create the following:
4794 vp0 = &base_addr;
4795 LOOP: vp1 = phi(vp0,vp2)
4798 vp2 = vp1 + step
4799 goto LOOP
4801 If there is an inner-loop nested in loop, then step (5) will also be
4802 applied, and an additional update in the inner-loop will be created:
4804 vp0 = &base_addr;
4805 LOOP: vp1 = phi(vp0,vp2)
4807 inner: vp3 = phi(vp1,vp4)
4808 vp4 = vp3 + inner_step
4809 if () goto inner
4811 vp2 = vp1 + step
4812 if () goto LOOP */
4814 /* (2) Calculate the initial address of the aggregate-pointer, and set
4815 the aggregate-pointer to point to it before the loop. */
4817 /* Create: &(base[init_val+offset]) + byte_offset in the loop preheader. */
4819 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
4820 offset, byte_offset);
4821 if (new_stmt_list)
4823 if (pe)
4825 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4826 gcc_assert (!new_bb);
4828 else
4829 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4832 *initial_address = new_temp;
4833 aggr_ptr_init = new_temp;
4835 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4836 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4837 inner-loop nested in LOOP (during outer-loop vectorization). */
4839 /* No update in loop is required. */
4840 if (only_init && (!loop_vinfo || at_loop == loop))
4841 aptr = aggr_ptr_init;
4842 else
4844 if (iv_step == NULL_TREE)
4846 /* The step of the aggregate pointer is the type size. */
4847 iv_step = TYPE_SIZE_UNIT (aggr_type);
4848 /* One exception to the above is when the scalar step of the load in
4849 LOOP is zero. In this case the step here is also zero. */
4850 if (*inv_p)
4851 iv_step = size_zero_node;
4852 else if (tree_int_cst_sgn (step) == -1)
4853 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4856 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4858 create_iv (aggr_ptr_init,
4859 fold_convert (aggr_ptr_type, iv_step),
4860 aggr_ptr, loop, &incr_gsi, insert_after,
4861 &indx_before_incr, &indx_after_incr);
4862 incr = gsi_stmt (incr_gsi);
4863 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4865 /* Copy the points-to information if it exists. */
4866 if (DR_PTR_INFO (dr))
4868 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr);
4869 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr);
4871 if (ptr_incr)
4872 *ptr_incr = incr;
4874 aptr = indx_before_incr;
4877 if (!nested_in_vect_loop || only_init)
4878 return aptr;
4881 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4882 nested in LOOP, if it exists. */
4884 gcc_assert (nested_in_vect_loop);
4885 if (!only_init)
4887 standard_iv_increment_position (containing_loop, &incr_gsi,
4888 &insert_after);
4889 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4890 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4891 &indx_after_incr);
4892 incr = gsi_stmt (incr_gsi);
4893 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4895 /* Copy the points-to information if it exists. */
4896 if (DR_PTR_INFO (dr))
4898 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr);
4899 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr);
4901 if (ptr_incr)
4902 *ptr_incr = incr;
4904 return indx_before_incr;
4906 else
4907 gcc_unreachable ();
4911 /* Function bump_vector_ptr
4913 Increment a pointer (to a vector type) by vector-size. If requested,
4914 i.e. if PTR-INCR is given, then also connect the new increment stmt
4915 to the existing def-use update-chain of the pointer, by modifying
4916 the PTR_INCR as illustrated below:
4918 The pointer def-use update-chain before this function:
4919 DATAREF_PTR = phi (p_0, p_2)
4920 ....
4921 PTR_INCR: p_2 = DATAREF_PTR + step
4923 The pointer def-use update-chain after this function:
4924 DATAREF_PTR = phi (p_0, p_2)
4925 ....
4926 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4927 ....
4928 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4930 Input:
4931 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4932 in the loop.
4933 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4934 the loop. The increment amount across iterations is expected
4935 to be vector_size.
4936 BSI - location where the new update stmt is to be placed.
4937 STMT - the original scalar memory-access stmt that is being vectorized.
4938 BUMP - optional. The offset by which to bump the pointer. If not given,
4939 the offset is assumed to be vector_size.
4941 Output: Return NEW_DATAREF_PTR as illustrated above.
4945 tree
4946 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4947 gimple *stmt, tree bump)
4949 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4950 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4951 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4952 tree update = TYPE_SIZE_UNIT (vectype);
4953 gassign *incr_stmt;
4954 ssa_op_iter iter;
4955 use_operand_p use_p;
4956 tree new_dataref_ptr;
4958 if (bump)
4959 update = bump;
4961 if (TREE_CODE (dataref_ptr) == SSA_NAME)
4962 new_dataref_ptr = copy_ssa_name (dataref_ptr);
4963 else
4964 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4965 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4966 dataref_ptr, update);
4967 vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4969 /* Copy the points-to information if it exists. */
4970 if (DR_PTR_INFO (dr))
4972 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4973 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4976 if (!ptr_incr)
4977 return new_dataref_ptr;
4979 /* Update the vector-pointer's cross-iteration increment. */
4980 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4982 tree use = USE_FROM_PTR (use_p);
4984 if (use == dataref_ptr)
4985 SET_USE (use_p, new_dataref_ptr);
4986 else
4987 gcc_assert (operand_equal_p (use, update, 0));
4990 return new_dataref_ptr;
4994 /* Copy memory reference info such as base/clique from the SRC reference
4995 to the DEST MEM_REF. */
4997 void
4998 vect_copy_ref_info (tree dest, tree src)
5000 if (TREE_CODE (dest) != MEM_REF)
5001 return;
5003 tree src_base = src;
5004 while (handled_component_p (src_base))
5005 src_base = TREE_OPERAND (src_base, 0);
5006 if (TREE_CODE (src_base) != MEM_REF
5007 && TREE_CODE (src_base) != TARGET_MEM_REF)
5008 return;
5010 MR_DEPENDENCE_CLIQUE (dest) = MR_DEPENDENCE_CLIQUE (src_base);
5011 MR_DEPENDENCE_BASE (dest) = MR_DEPENDENCE_BASE (src_base);
5015 /* Function vect_create_destination_var.
5017 Create a new temporary of type VECTYPE. */
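/* For example (illustrative only): given the SSA name x_7 and a
   non-boolean vector type, this returns a vector temporary named after
   "vect_x_7"; with a NULL VECTYPE it returns a scalar temporary named
   after "stmp_x_7".  */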
5019 tree
5020 vect_create_destination_var (tree scalar_dest, tree vectype)
5022 tree vec_dest;
5023 const char *name;
5024 char *new_name;
5025 tree type;
5026 enum vect_var_kind kind;
5028 kind = vectype
5029 ? VECTOR_BOOLEAN_TYPE_P (vectype)
5030 ? vect_mask_var
5031 : vect_simple_var
5032 : vect_scalar_var;
5033 type = vectype ? vectype : TREE_TYPE (scalar_dest);
5035 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
5037 name = get_name (scalar_dest);
5038 if (name)
5039 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
5040 else
5041 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
5042 vec_dest = vect_get_new_vect_var (type, kind, new_name);
5043 free (new_name);
5045 return vec_dest;
5048 /* Function vect_grouped_store_supported.
5050 Returns TRUE if interleave high and interleave low permutations
5051 are supported, and FALSE otherwise. */
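/* For example (illustrative of the power-of-two case only): with
   8-element vectors the interleave_high selector tested below is
   { 0, 8, 1, 9, 2, 10, 3, 11 } and the interleave_low selector is
   { 4, 12, 5, 13, 6, 14, 7, 15 }.  */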
5053 bool
5054 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
5056 machine_mode mode = TYPE_MODE (vectype);
5058 /* vect_permute_store_chain requires the group size to be equal to 3 or
5059 be a power of two. */
5060 if (count != 3 && exact_log2 (count) == -1)
5062 if (dump_enabled_p ())
5063 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5064 "the size of the group of accesses"
5065 " is not a power of 2 or not equal to 3\n");
5066 return false;
5069 /* Check that the permutation is supported. */
5070 if (VECTOR_MODE_P (mode))
5072 unsigned int i;
5073 if (count == 3)
5075 unsigned int j0 = 0, j1 = 0, j2 = 0;
5076 unsigned int i, j;
5078 unsigned int nelt;
5079 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5081 if (dump_enabled_p ())
5082 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5083 "cannot handle groups of 3 stores for"
5084 " variable-length vectors\n");
5085 return false;
5088 vec_perm_builder sel (nelt, nelt, 1);
5089 sel.quick_grow (nelt);
5090 vec_perm_indices indices;
5091 for (j = 0; j < 3; j++)
5093 int nelt0 = ((3 - j) * nelt) % 3;
5094 int nelt1 = ((3 - j) * nelt + 1) % 3;
5095 int nelt2 = ((3 - j) * nelt + 2) % 3;
5096 for (i = 0; i < nelt; i++)
5098 if (3 * i + nelt0 < nelt)
5099 sel[3 * i + nelt0] = j0++;
5100 if (3 * i + nelt1 < nelt)
5101 sel[3 * i + nelt1] = nelt + j1++;
5102 if (3 * i + nelt2 < nelt)
5103 sel[3 * i + nelt2] = 0;
5105 indices.new_vector (sel, 2, nelt);
5106 if (!can_vec_perm_const_p (mode, indices))
5108 if (dump_enabled_p ())
5109 dump_printf (MSG_MISSED_OPTIMIZATION,
5110 "permutation op not supported by target.\n");
5111 return false;
5114 for (i = 0; i < nelt; i++)
5116 if (3 * i + nelt0 < nelt)
5117 sel[3 * i + nelt0] = 3 * i + nelt0;
5118 if (3 * i + nelt1 < nelt)
5119 sel[3 * i + nelt1] = 3 * i + nelt1;
5120 if (3 * i + nelt2 < nelt)
5121 sel[3 * i + nelt2] = nelt + j2++;
5123 indices.new_vector (sel, 2, nelt);
5124 if (!can_vec_perm_const_p (mode, indices))
5126 if (dump_enabled_p ())
5127 dump_printf (MSG_MISSED_OPTIMIZATION,
5128 "permutation op not supported by target.\n");
5129 return false;
5132 return true;
5134 else
5136 /* If length is not equal to 3 then only power of 2 is supported. */
5137 gcc_assert (pow2p_hwi (count));
5138 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5140 /* The encoding has 2 interleaved stepped patterns. */
5141 vec_perm_builder sel (nelt, 2, 3);
5142 sel.quick_grow (6);
5143 for (i = 0; i < 3; i++)
5145 sel[i * 2] = i;
5146 sel[i * 2 + 1] = i + nelt;
5148 vec_perm_indices indices (sel, 2, nelt);
5149 if (can_vec_perm_const_p (mode, indices))
5151 for (i = 0; i < 6; i++)
5152 sel[i] += exact_div (nelt, 2);
5153 indices.new_vector (sel, 2, nelt);
5154 if (can_vec_perm_const_p (mode, indices))
5155 return true;
5160 if (dump_enabled_p ())
5161 dump_printf (MSG_MISSED_OPTIMIZATION,
5162 "permutation op not supported by target.\n");
5163 return false;
5167 /* Return TRUE if vec_{mask_}store_lanes is available for COUNT vectors of
5168 type VECTYPE. MASKED_P says whether the masked form is needed. */
5170 bool
5171 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5172 bool masked_p)
5174 if (masked_p)
5175 return vect_lanes_optab_supported_p ("vec_mask_store_lanes",
5176 vec_mask_store_lanes_optab,
5177 vectype, count);
5178 else
5179 return vect_lanes_optab_supported_p ("vec_store_lanes",
5180 vec_store_lanes_optab,
5181 vectype, count);
5185 /* Function vect_permute_store_chain.
5187 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
5188 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
5189 the data correctly for the stores. Return the final references for stores
5190 in RESULT_CHAIN.
5192 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5193 The input is 4 vectors each containing 8 elements. We assign a number to
5194 each element, the input sequence is:
5196 1st vec: 0 1 2 3 4 5 6 7
5197 2nd vec: 8 9 10 11 12 13 14 15
5198 3rd vec: 16 17 18 19 20 21 22 23
5199 4th vec: 24 25 26 27 28 29 30 31
5201 The output sequence should be:
5203 1st vec: 0 8 16 24 1 9 17 25
5204 2nd vec: 2 10 18 26 3 11 19 27
5205 3rd vec: 4 12 20 28 5 13 21 29
5206 4th vec: 6 14 22 30 7 15 23 31
5208 i.e., we interleave the contents of the four vectors in their order.
5210 We use interleave_high/low instructions to create such output. The input of
5211 each interleave_high/low operation is two vectors:
5212 1st vec 2nd vec
5213 0 1 2 3 4 5 6 7
5214 the even elements of the result vector are obtained left-to-right from the
5215 high/low elements of the first vector. The odd elements of the result are
5216 obtained left-to-right from the high/low elements of the second vector.
5217 The output of interleave_high will be: 0 4 1 5
5218 and of interleave_low: 2 6 3 7
5221 The permutation is done in log LENGTH stages. In each stage interleave_high
5222 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5223 where the first argument is taken from the first half of DR_CHAIN and the
5224 second argument from its second half.
5225 In our example,
5227 I1: interleave_high (1st vec, 3rd vec)
5228 I2: interleave_low (1st vec, 3rd vec)
5229 I3: interleave_high (2nd vec, 4th vec)
5230 I4: interleave_low (2nd vec, 4th vec)
5232 The output for the first stage is:
5234 I1: 0 16 1 17 2 18 3 19
5235 I2: 4 20 5 21 6 22 7 23
5236 I3: 8 24 9 25 10 26 11 27
5237 I4: 12 28 13 29 14 30 15 31
5239 The output of the second stage, i.e. the final result is:
5241 I1: 0 8 16 24 1 9 17 25
5242 I2: 2 10 18 26 3 11 19 27
5243 I3: 4 12 20 28 5 13 21 29
5244 I4: 6 14 22 30 7 15 23 31. */
5246 void
5247 vect_permute_store_chain (vec<tree> dr_chain,
5248 unsigned int length,
5249 gimple *stmt,
5250 gimple_stmt_iterator *gsi,
5251 vec<tree> *result_chain)
5253 tree vect1, vect2, high, low;
5254 gimple *perm_stmt;
5255 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5256 tree perm_mask_low, perm_mask_high;
5257 tree data_ref;
5258 tree perm3_mask_low, perm3_mask_high;
5259 unsigned int i, j, n, log_length = exact_log2 (length);
5261 result_chain->quick_grow (length);
5262 memcpy (result_chain->address (), dr_chain.address (),
5263 length * sizeof (tree));
5265 if (length == 3)
5267 /* vect_grouped_store_supported ensures that this is constant. */
5268 unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5269 unsigned int j0 = 0, j1 = 0, j2 = 0;
5271 vec_perm_builder sel (nelt, nelt, 1);
5272 sel.quick_grow (nelt);
5273 vec_perm_indices indices;
5274 for (j = 0; j < 3; j++)
5276 int nelt0 = ((3 - j) * nelt) % 3;
5277 int nelt1 = ((3 - j) * nelt + 1) % 3;
5278 int nelt2 = ((3 - j) * nelt + 2) % 3;
5280 for (i = 0; i < nelt; i++)
5282 if (3 * i + nelt0 < nelt)
5283 sel[3 * i + nelt0] = j0++;
5284 if (3 * i + nelt1 < nelt)
5285 sel[3 * i + nelt1] = nelt + j1++;
5286 if (3 * i + nelt2 < nelt)
5287 sel[3 * i + nelt2] = 0;
5289 indices.new_vector (sel, 2, nelt);
5290 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5292 for (i = 0; i < nelt; i++)
5294 if (3 * i + nelt0 < nelt)
5295 sel[3 * i + nelt0] = 3 * i + nelt0;
5296 if (3 * i + nelt1 < nelt)
5297 sel[3 * i + nelt1] = 3 * i + nelt1;
5298 if (3 * i + nelt2 < nelt)
5299 sel[3 * i + nelt2] = nelt + j2++;
5301 indices.new_vector (sel, 2, nelt);
5302 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5304 vect1 = dr_chain[0];
5305 vect2 = dr_chain[1];
5307 /* Create interleaving stmt:
5308 low = VEC_PERM_EXPR <vect1, vect2,
5309 {j, nelt, *, j + 1, nelt + j + 1, *,
5310 j + 2, nelt + j + 2, *, ...}> */
5311 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5312 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5313 vect2, perm3_mask_low);
5314 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5316 vect1 = data_ref;
5317 vect2 = dr_chain[2];
5318 /* Create interleaving stmt:
5319 low = VEC_PERM_EXPR <vect1, vect2,
5320 {0, 1, nelt + j, 3, 4, nelt + j + 1,
5321 6, 7, nelt + j + 2, ...}> */
5322 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5323 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5324 vect2, perm3_mask_high);
5325 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5326 (*result_chain)[j] = data_ref;
5329 else
5331 /* If length is not equal to 3 then only a power of 2 is supported. */
5332 gcc_assert (pow2p_hwi (length));
5334 /* The encoding has 2 interleaved stepped patterns. */
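      /* Only the seed elements {0, nelt, 1, nelt + 1, 2, nelt + 2} are
	 recorded (2 patterns of 3 elements each); vec_perm_indices
	 extrapolates the two stepped series to the full (possibly
	 variable) vector length, e.g. {0, 8, 1, 9, 2, 10, 3, 11} for
	 8 elements.  */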
5335 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5336 vec_perm_builder sel (nelt, 2, 3);
5337 sel.quick_grow (6);
5338 for (i = 0; i < 3; i++)
5340 sel[i * 2] = i;
5341 sel[i * 2 + 1] = i + nelt;
5343 vec_perm_indices indices (sel, 2, nelt);
5344 perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5346 for (i = 0; i < 6; i++)
5347 sel[i] += exact_div (nelt, 2);
5348 indices.new_vector (sel, 2, nelt);
5349 perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5351 for (i = 0, n = log_length; i < n; i++)
5353 for (j = 0; j < length/2; j++)
5355 vect1 = dr_chain[j];
5356 vect2 = dr_chain[j+length/2];
5358 /* Create interleaving stmt:
5359 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
5360 ...}> */
5361 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
5362 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
5363 vect2, perm_mask_high);
5364 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5365 (*result_chain)[2*j] = high;
5367 /* Create interleaving stmt:
5368 low = VEC_PERM_EXPR <vect1, vect2,
5369 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
5370 ...}> */
5371 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
5372 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
5373 vect2, perm_mask_low);
5374 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5375 (*result_chain)[2*j+1] = low;
5377 memcpy (dr_chain.address (), result_chain->address (),
5378 length * sizeof (tree));
5383 /* Function vect_setup_realignment
5385 This function is called when vectorizing an unaligned load using
5386 the dr_explicit_realign[_optimized] scheme.
5387 This function generates the following code at the loop prolog:
5389 p = initial_addr;
5390 x msq_init = *(floor(p)); # prolog load
5391 realignment_token = call target_builtin;
5392 loop:
5393 x msq = phi (msq_init, ---)
5395 The stmts marked with x are generated only for the case of
5396 dr_explicit_realign_optimized.
5398 The code above sets up a new (vector) pointer, pointing to the first
5399 location accessed by STMT, and a "floor-aligned" load using that pointer.
5400 It also generates code to compute the "realignment-token" (if the relevant
5401 target hook was defined), and creates a phi-node at the loop-header bb
5402 whose arguments are the result of the prolog-load (created by this
5403 function) and the result of a load that takes place in the loop (to be
5404 created by the caller to this function).
5406 For the case of dr_explicit_realign_optimized:
5407 The caller to this function uses the phi-result (msq) to create the
5408 realignment code inside the loop, and sets up the missing phi argument,
5409 as follows:
5410 loop:
5411 msq = phi (msq_init, lsq)
5412 lsq = *(floor(p')); # load in loop
5413 result = realign_load (msq, lsq, realignment_token);
5415 For the case of dr_explicit_realign:
5416 loop:
5417 msq = *(floor(p)); # load in loop
5418 p' = p + (VS-1);
5419 lsq = *(floor(p')); # load in loop
5420 result = realign_load (msq, lsq, realignment_token);
5422 Input:
5423 STMT - (scalar) load stmt to be vectorized. This load accesses
5424 a memory location that may be unaligned.
5425 GSI - place where new code is to be inserted.
5426 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5427 is used.
5429 Output:
5430 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5431 target hook, if defined.
5432 Return value - the result of the loop-header phi node. */
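/* A minimal numeric sketch (illustration only, not target-specific code):
   assume a V4SI access whose address p is misaligned by one element within
   an aligned 16-byte chunk, i.e. *p should yield a0 a1 a2 a3.  Then

     msq = *(floor(p))           loads  x  a0 a1 a2
     lsq = *(floor(p + VS - 1))  loads  a3 a4 a5 a6

   and REALIGN_LOAD <msq, lsq, realignment_token> selects the elements
   a0 a1 a2 a3 from the concatenation of msq and lsq, the token encoding
   the misalignment of p.  */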
5434 tree
5435 vect_setup_realignment (gimple *stmt, gimple_stmt_iterator *gsi,
5436 tree *realignment_token,
5437 enum dr_alignment_support alignment_support_scheme,
5438 tree init_addr,
5439 struct loop **at_loop)
5441 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5442 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5443 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5444 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
5445 struct loop *loop = NULL;
5446 edge pe = NULL;
5447 tree scalar_dest = gimple_assign_lhs (stmt);
5448 tree vec_dest;
5449 gimple *inc;
5450 tree ptr;
5451 tree data_ref;
5452 basic_block new_bb;
5453 tree msq_init = NULL_TREE;
5454 tree new_temp;
5455 gphi *phi_stmt;
5456 tree msq = NULL_TREE;
5457 gimple_seq stmts = NULL;
5458 bool inv_p;
5459 bool compute_in_loop = false;
5460 bool nested_in_vect_loop = false;
5461 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
5462 struct loop *loop_for_initial_load = NULL;
5464 if (loop_vinfo)
5466 loop = LOOP_VINFO_LOOP (loop_vinfo);
5467 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
5470 gcc_assert (alignment_support_scheme == dr_explicit_realign
5471 || alignment_support_scheme == dr_explicit_realign_optimized);
5473 /* We need to generate three things:
5474 1. the misalignment computation
5475 2. the extra vector load (for the optimized realignment scheme).
5476 3. the phi node for the two vectors from which the realignment is
5477 done (for the optimized realignment scheme). */
5479 /* 1. Determine where to generate the misalignment computation.
5481 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5482 calculation will be generated by this function, outside the loop (in the
5483 preheader). Otherwise, INIT_ADDR had already been computed for us by the
5484 caller, inside the loop.
5486 Background: If the misalignment remains fixed throughout the iterations of
5487 the loop, then both realignment schemes are applicable, and also the
5488 misalignment computation can be done outside LOOP. This is because we are
5489 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5490 are a multiple of VS (the Vector Size), and therefore the misalignment in
5491 different vectorized LOOP iterations is always the same.
5492 The problem arises only if the memory access is in an inner-loop nested
5493 inside LOOP, which is now being vectorized using outer-loop vectorization.
5494 This is the only case when the misalignment of the memory access may not
5495 remain fixed throughout the iterations of the inner-loop (as explained in
5496 detail in vect_supportable_dr_alignment). In this case, not only is the
5497 optimized realignment scheme not applicable, but also the misalignment
5498 computation (and generation of the realignment token that is passed to
5499 REALIGN_LOAD) have to be done inside the loop.
5501 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5502 or not, which in turn determines if the misalignment is computed inside
5503 the inner-loop, or outside LOOP. */
5505 if (init_addr != NULL_TREE || !loop_vinfo)
5507 compute_in_loop = true;
5508 gcc_assert (alignment_support_scheme == dr_explicit_realign);
5512 /* 2. Determine where to generate the extra vector load.
5514 For the optimized realignment scheme, instead of generating two vector
5515 loads in each iteration, we generate a single extra vector load in the
5516 preheader of the loop, and in each iteration reuse the result of the
5517 vector load from the previous iteration. In case the memory access is in
5518 an inner-loop nested inside LOOP, which is now being vectorized using
5519 outer-loop vectorization, we need to determine whether this initial vector
5520 load should be generated at the preheader of the inner-loop, or can be
5521 generated at the preheader of LOOP. If the memory access has no evolution
5522 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5523 to be generated inside LOOP (in the preheader of the inner-loop). */
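  /* For example (illustration only): when vectorizing the outer i-loop of
       for (i) for (j) ... a[i][j] ...
     the access evolves in LOOP (nonzero outer-loop DR step), so the initial
     vector load has to be placed in the inner loop's preheader; an access
     like a[j] that is invariant in i can be loaded once in LOOP's
     preheader.  */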
5525 if (nested_in_vect_loop)
5527 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5528 bool invariant_in_outerloop =
5529 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5530 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5532 else
5533 loop_for_initial_load = loop;
5534 if (at_loop)
5535 *at_loop = loop_for_initial_load;
5537 if (loop_for_initial_load)
5538 pe = loop_preheader_edge (loop_for_initial_load);
5540 /* 3. For the case of the optimized realignment, create the first vector
5541 load at the loop preheader. */
5543 if (alignment_support_scheme == dr_explicit_realign_optimized)
5545 /* Create msq_init = *(floor(p1)) in the loop preheader */
5546 gassign *new_stmt;
5548 gcc_assert (!compute_in_loop);
5549 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5550 ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
5551 NULL_TREE, &init_addr, NULL, &inc,
5552 true, &inv_p);
5553 if (TREE_CODE (ptr) == SSA_NAME)
5554 new_temp = copy_ssa_name (ptr);
5555 else
5556 new_temp = make_ssa_name (TREE_TYPE (ptr));
5557 unsigned int align = DR_TARGET_ALIGNMENT (dr);
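      /* E.g. with DR_TARGET_ALIGNMENT == 16 the statement built below
	 computes new_temp = ptr & -16, i.e. floor (p) rounded down to the
	 target alignment (illustrative value; the alignment is per-DR).  */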
5558 new_stmt = gimple_build_assign
5559 (new_temp, BIT_AND_EXPR, ptr,
5560 build_int_cst (TREE_TYPE (ptr), -(HOST_WIDE_INT) align));
5561 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5562 gcc_assert (!new_bb);
5563 data_ref
5564 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5565 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5566 vect_copy_ref_info (data_ref, DR_REF (dr));
5567 new_stmt = gimple_build_assign (vec_dest, data_ref);
5568 new_temp = make_ssa_name (vec_dest, new_stmt);
5569 gimple_assign_set_lhs (new_stmt, new_temp);
5570 if (pe)
5572 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5573 gcc_assert (!new_bb);
5575 else
5576 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5578 msq_init = gimple_assign_lhs (new_stmt);
5581 /* 4. Create realignment token using a target builtin, if available.
5582 It is done either inside the containing loop, or before LOOP (as
5583 determined above). */
5585 if (targetm.vectorize.builtin_mask_for_load)
5587 gcall *new_stmt;
5588 tree builtin_decl;
5590 /* Compute INIT_ADDR - the initial addressed accessed by this memref. */
5591 if (!init_addr)
5593 /* Generate the INIT_ADDR computation outside LOOP. */
5594 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
5595 NULL_TREE);
5596 if (loop)
5598 pe = loop_preheader_edge (loop);
5599 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5600 gcc_assert (!new_bb);
5602 else
5603 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5606 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5607 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5608 vec_dest =
5609 vect_create_destination_var (scalar_dest,
5610 gimple_call_return_type (new_stmt));
5611 new_temp = make_ssa_name (vec_dest, new_stmt);
5612 gimple_call_set_lhs (new_stmt, new_temp);
5614 if (compute_in_loop)
5615 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5616 else
5618 /* Generate the misalignment computation outside LOOP. */
5619 pe = loop_preheader_edge (loop);
5620 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5621 gcc_assert (!new_bb);
5624 *realignment_token = gimple_call_lhs (new_stmt);
5626 /* The result of the CALL_EXPR to this builtin is determined from
5627 the value of the parameter and no global variables are touched
5628 which makes the builtin a "const" function. Requiring the
5629 builtin to have the "const" attribute makes it unnecessary
5630 to call mark_call_clobbered. */
5631 gcc_assert (TREE_READONLY (builtin_decl));
5634 if (alignment_support_scheme == dr_explicit_realign)
5635 return msq;
5637 gcc_assert (!compute_in_loop);
5638 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5641 /* 5. Create msq = phi <msq_init, lsq> in loop */
5643 pe = loop_preheader_edge (containing_loop);
5644 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5645 msq = make_ssa_name (vec_dest);
5646 phi_stmt = create_phi_node (msq, containing_loop->header);
5647 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5649 return msq;
5653 /* Function vect_grouped_load_supported.
5655 COUNT is the size of the load group (the number of statements plus the
5656 number of gaps). SINGLE_ELEMENT_P is true if there is actually
5657 only one statement, with a gap of COUNT - 1.
5659 Returns true if a suitable permute exists. */
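/* For example, a group accessing a[3*i] and a[3*i + 2] has two statements
   and one gap, so COUNT == 3.  (Illustrative access pattern only.)  */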
5661 bool
5662 vect_grouped_load_supported (tree vectype, bool single_element_p,
5663 unsigned HOST_WIDE_INT count)
5665 machine_mode mode = TYPE_MODE (vectype);
5667 /* If this is single-element interleaving with an element distance
5668 that leaves unused vector loads around, punt - we at least create
5669 very sub-optimal code in that case (and blow up memory,
5670 see PR65518). */
5671 if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype)))
5673 if (dump_enabled_p ())
5674 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5675 "single-element interleaving not supported "
5676 "for not adjacent vector loads\n");
5677 return false;
5680 /* vect_permute_load_chain requires the group size to be equal to 3 or
5681 be a power of two. */
5682 if (count != 3 && exact_log2 (count) == -1)
5684 if (dump_enabled_p ())
5685 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5686 "the size of the group of accesses"
5687 " is not a power of 2 or not equal to 3\n");
5688 return false;
5691 /* Check that the permutation is supported. */
5692 if (VECTOR_MODE_P (mode))
5694 unsigned int i, j;
5695 if (count == 3)
5697 unsigned int nelt;
5698 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5700 if (dump_enabled_p ())
5701 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5702 "cannot handle groups of 3 loads for"
5703 " variable-length vectors\n");
5704 return false;
5707 vec_perm_builder sel (nelt, nelt, 1);
5708 sel.quick_grow (nelt);
5709 vec_perm_indices indices;
5710 unsigned int k;
5711 for (k = 0; k < 3; k++)
5713 for (i = 0; i < nelt; i++)
5714 if (3 * i + k < 2 * nelt)
5715 sel[i] = 3 * i + k;
5716 else
5717 sel[i] = 0;
5718 indices.new_vector (sel, 2, nelt);
5719 if (!can_vec_perm_const_p (mode, indices))
5721 if (dump_enabled_p ())
5722 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5723 "shuffle of 3 loads is not supported by"
5724 " target\n");
5725 return false;
5727 for (i = 0, j = 0; i < nelt; i++)
5728 if (3 * i + k < 2 * nelt)
5729 sel[i] = i;
5730 else
5731 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5732 indices.new_vector (sel, 2, nelt);
5733 if (!can_vec_perm_const_p (mode, indices))
5735 if (dump_enabled_p ())
5736 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5737 "shuffle of 3 loads is not supported by"
5738 " target\n");
5739 return false;
5742 return true;
5744 else
5746 /* If count is not equal to 3 then only a power of 2 is supported. */
5747 gcc_assert (pow2p_hwi (count));
5748 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5750 /* The encoding has a single stepped pattern. */
5751 vec_perm_builder sel (nelt, 1, 3);
5752 sel.quick_grow (3);
5753 for (i = 0; i < 3; i++)
5754 sel[i] = i * 2;
5755 vec_perm_indices indices (sel, 2, nelt);
5756 if (can_vec_perm_const_p (mode, indices))
5758 for (i = 0; i < 3; i++)
5759 sel[i] = i * 2 + 1;
5760 indices.new_vector (sel, 2, nelt);
5761 if (can_vec_perm_const_p (mode, indices))
5762 return true;
5767 if (dump_enabled_p ())
5768 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5769 "extract even/odd not supported by target\n");
5770 return false;
5773 /* Return TRUE if vec_{mask_}load_lanes is available for COUNT vectors of
5774 type VECTYPE. MASKED_P says whether the masked form is needed. */
5776 bool
5777 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5778 bool masked_p)
5780 if (masked_p)
5781 return vect_lanes_optab_supported_p ("vec_mask_load_lanes",
5782 vec_mask_load_lanes_optab,
5783 vectype, count);
5784 else
5785 return vect_lanes_optab_supported_p ("vec_load_lanes",
5786 vec_load_lanes_optab,
5787 vectype, count);
5790 /* Function vect_permute_load_chain.
5792 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5793 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5794 the input data correctly. Return the final references for loads in
5795 RESULT_CHAIN.
5797 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5798 The input is 4 vectors each containing 8 elements. We assign a number to each
5799 element, the input sequence is:
5801 1st vec: 0 1 2 3 4 5 6 7
5802 2nd vec: 8 9 10 11 12 13 14 15
5803 3rd vec: 16 17 18 19 20 21 22 23
5804 4th vec: 24 25 26 27 28 29 30 31
5806 The output sequence should be:
5808 1st vec: 0 4 8 12 16 20 24 28
5809 2nd vec: 1 5 9 13 17 21 25 29
5810 3rd vec: 2 6 10 14 18 22 26 30
5811 4th vec: 3 7 11 15 19 23 27 31
5813 i.e., the first output vector should contain the first elements of each
5814 interleaving group, etc.
5816 We use extract_even/odd instructions to create such output. The input of
5817 each extract_even/odd operation is two vectors
5818 1st vec 2nd vec
5819 0 1 2 3 4 5 6 7
5821 and the output is the vector of extracted even/odd elements. The output of
5822 extract_even will be: 0 2 4 6
5823 and of extract_odd: 1 3 5 7
5826 The permutation is done in log LENGTH stages. In each stage extract_even
5827 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5828 their order. In our example,
5830 E1: extract_even (1st vec, 2nd vec)
5831 E2: extract_odd (1st vec, 2nd vec)
5832 E3: extract_even (3rd vec, 4th vec)
5833 E4: extract_odd (3rd vec, 4th vec)
5835 The output for the first stage will be:
5837 E1: 0 2 4 6 8 10 12 14
5838 E2: 1 3 5 7 9 11 13 15
5839 E3: 16 18 20 22 24 26 28 30
5840 E4: 17 19 21 23 25 27 29 31
5842 In order to proceed and create the correct sequence for the next stage (or
5843 for the correct output, if the second stage is the last one, as in our
5844 example), we first put the output of extract_even operation and then the
5845 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5846 The input for the second stage is:
5848 1st vec (E1): 0 2 4 6 8 10 12 14
5849 2nd vec (E3): 16 18 20 22 24 26 28 30
5850 3rd vec (E2): 1 3 5 7 9 11 13 15
5851 4th vec (E4): 17 19 21 23 25 27 29 31
5853 The output of the second stage:
5855 E1: 0 4 8 12 16 20 24 28
5856 E2: 2 6 10 14 18 22 26 30
5857 E3: 1 5 9 13 17 21 25 29
5858 E4: 3 7 11 15 19 23 27 31
5860 And RESULT_CHAIN after reordering:
5862 1st vec (E1): 0 4 8 12 16 20 24 28
5863 2nd vec (E3): 1 5 9 13 17 21 25 29
5864 3rd vec (E2): 2 6 10 14 18 22 26 30
5865 4th vec (E4): 3 7 11 15 19 23 27 31. */
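/* For illustration only, assuming 8-element vectors: the extract_even and
   extract_odd operations below use the constant masks

       perm_mask_even:  { 0, 2, 4, 6, 8, 10, 12, 14 }
       perm_mask_odd:   { 1, 3, 5, 7, 9, 11, 13, 15 }

   For LENGTH == 3 and k == 0 the two masks built below are
   { 0, 3, 6, 9, 12, 15, 0, 0 } (the trailing zeros are don't-care slots)
   and { 0, 1, 2, 3, 4, 5, 10, 13 }; applied to (1st vec, 2nd vec) and then
   to (that result, 3rd vec) they yield 0 3 6 9 12 15 18 21, i.e. every
   third element starting at 0.  */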
5867 static void
5868 vect_permute_load_chain (vec<tree> dr_chain,
5869 unsigned int length,
5870 gimple *stmt,
5871 gimple_stmt_iterator *gsi,
5872 vec<tree> *result_chain)
5874 tree data_ref, first_vect, second_vect;
5875 tree perm_mask_even, perm_mask_odd;
5876 tree perm3_mask_low, perm3_mask_high;
5877 gimple *perm_stmt;
5878 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5879 unsigned int i, j, log_length = exact_log2 (length);
5881 result_chain->quick_grow (length);
5882 memcpy (result_chain->address (), dr_chain.address (),
5883 length * sizeof (tree));
5885 if (length == 3)
5887 /* vect_grouped_load_supported ensures that this is constant. */
5888 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5889 unsigned int k;
5891 vec_perm_builder sel (nelt, nelt, 1);
5892 sel.quick_grow (nelt);
5893 vec_perm_indices indices;
5894 for (k = 0; k < 3; k++)
5896 for (i = 0; i < nelt; i++)
5897 if (3 * i + k < 2 * nelt)
5898 sel[i] = 3 * i + k;
5899 else
5900 sel[i] = 0;
5901 indices.new_vector (sel, 2, nelt);
5902 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5904 for (i = 0, j = 0; i < nelt; i++)
5905 if (3 * i + k < 2 * nelt)
5906 sel[i] = i;
5907 else
5908 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5909 indices.new_vector (sel, 2, nelt);
5910 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5912 first_vect = dr_chain[0];
5913 second_vect = dr_chain[1];
5915 /* Create interleaving stmt (low part of):
5916 low = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5917 ...}> */
5918 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5919 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5920 second_vect, perm3_mask_low);
5921 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5923 /* Create interleaving stmt (high part of):
5924 high = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5925 ...}> */
5926 first_vect = data_ref;
5927 second_vect = dr_chain[2];
5928 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5929 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5930 second_vect, perm3_mask_high);
5931 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5932 (*result_chain)[k] = data_ref;
5935 else
5937 /* If length is not equal to 3 then only a power of 2 is supported. */
5938 gcc_assert (pow2p_hwi (length));
5940 /* The encoding has a single stepped pattern. */
5941 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5942 vec_perm_builder sel (nelt, 1, 3);
5943 sel.quick_grow (3);
5944 for (i = 0; i < 3; ++i)
5945 sel[i] = i * 2;
5946 vec_perm_indices indices (sel, 2, nelt);
5947 perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);
5949 for (i = 0; i < 3; ++i)
5950 sel[i] = i * 2 + 1;
5951 indices.new_vector (sel, 2, nelt);
5952 perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
5954 for (i = 0; i < log_length; i++)
5956 for (j = 0; j < length; j += 2)
5958 first_vect = dr_chain[j];
5959 second_vect = dr_chain[j+1];
5961 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5962 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5963 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5964 first_vect, second_vect,
5965 perm_mask_even);
5966 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5967 (*result_chain)[j/2] = data_ref;
5969 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5970 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5971 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5972 first_vect, second_vect,
5973 perm_mask_odd);
5974 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5975 (*result_chain)[j/2+length/2] = data_ref;
5977 memcpy (dr_chain.address (), result_chain->address (),
5978 length * sizeof (tree));
5983 /* Function vect_shift_permute_load_chain.
5985 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
5986 sequence of stmts to reorder the input data accordingly.
5987 Return the final references for loads in RESULT_CHAIN.
5988 Return true if successful, false otherwise.
5990 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5991 The input is 3 vectors each containing 8 elements. We assign a
5992 number to each element, the input sequence is:
5994 1st vec: 0 1 2 3 4 5 6 7
5995 2nd vec: 8 9 10 11 12 13 14 15
5996 3rd vec: 16 17 18 19 20 21 22 23
5998 The output sequence should be:
6000 1st vec: 0 3 6 9 12 15 18 21
6001 2nd vec: 1 4 7 10 13 16 19 22
6002 3rd vec: 2 5 8 11 14 17 20 23
6004 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
6006 First we shuffle all 3 vectors to get correct elements order:
6008 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
6009 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
6010 3rd vec: (16 19 22) (17 20 23) (18 21)
6012 Next we unite and shift vector 3 times:
6014 1st step:
6015 shift right by 6 the concatenation of:
6016 "1st vec" and "2nd vec"
6017 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
6018 "2nd vec" and "3rd vec"
6019 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
6020 "3rd vec" and "1st vec"
6021 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
6022 | New vectors |
6024 So that now new vectors are:
6026 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
6027 2nd vec: (10 13) (16 19 22) (17 20 23)
6028 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
6030 2nd step:
6031 shift right by 5 the concatenation of:
6032 "1st vec" and "3rd vec"
6033 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
6034 "2nd vec" and "1st vec"
6035 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
6036 "3rd vec" and "2nd vec"
6037 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
6038 | New vectors |
6040 So that now new vectors are:
6042 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
6043 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
6044 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
6046 3rd step:
6047 shift right by 5 the concatenation of:
6048 "1st vec" and "1st vec"
6049 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
6050 shift right by 3 the concatenation of:
6051 "2nd vec" and "2nd vec"
6052 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
6053 | New vectors |
6055 So that now all vectors are READY:
6056 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
6057 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
6058 3rd vec: ( 1 4 7) (10 13) (16 19 22)
6060 This algorithm is faster than the one in vect_permute_load_chain if:
6061 1. "shift of a concatenation" is faster than general permutation.
6062 This is usually so.
6063 2. The TARGET machine can't execute vector instructions in parallel.
6064 This is because each step of the algorithm depends on the previous one.
6065 The algorithm in vect_permute_load_chain is much more parallel.
6067 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
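   As an illustration of the power-of-two path below (not part of the
   original description), assume 8-element vectors and LENGTH == 2, with
   input vectors 0..7 and 8..15 holding a 2-field interleaving:

     shuffle 1st vec with {0 2 4 6 1 3 5 7}   ->  0 2 4 6 1 3 5 7
     shuffle 2nd vec with {1 3 5 7 0 2 4 6}   ->  9 11 13 15 8 10 12 14
     shift  (mask {4 5 6 7 8 9 10 11})        ->  1 3 5 7 9 11 13 15
     select (mask {0 1 2 3 12 13 14 15})      ->  0 2 4 6 8 10 12 14

   so the even-indexed and odd-indexed elements end up in separate
   vectors, as required.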
6070 static bool
6071 vect_shift_permute_load_chain (vec<tree> dr_chain,
6072 unsigned int length,
6073 gimple *stmt,
6074 gimple_stmt_iterator *gsi,
6075 vec<tree> *result_chain)
6077 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
6078 tree perm2_mask1, perm2_mask2, perm3_mask;
6079 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
6080 gimple *perm_stmt;
6082 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
6083 unsigned int i;
6084 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6085 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6087 unsigned HOST_WIDE_INT nelt, vf;
6088 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt)
6089 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
6090 /* Not supported for variable-length vectors. */
6091 return false;
6093 vec_perm_builder sel (nelt, nelt, 1);
6094 sel.quick_grow (nelt);
6096 result_chain->quick_grow (length);
6097 memcpy (result_chain->address (), dr_chain.address (),
6098 length * sizeof (tree));
6100 if (pow2p_hwi (length) && vf > 4)
6102 unsigned int j, log_length = exact_log2 (length);
6103 for (i = 0; i < nelt / 2; ++i)
6104 sel[i] = i * 2;
6105 for (i = 0; i < nelt / 2; ++i)
6106 sel[nelt / 2 + i] = i * 2 + 1;
6107 vec_perm_indices indices (sel, 2, nelt);
6108 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6110 if (dump_enabled_p ())
6111 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6112 "shuffle of 2 fields structure is not \
6113 supported by target\n");
6114 return false;
6116 perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);
6118 for (i = 0; i < nelt / 2; ++i)
6119 sel[i] = i * 2 + 1;
6120 for (i = 0; i < nelt / 2; ++i)
6121 sel[nelt / 2 + i] = i * 2;
6122 indices.new_vector (sel, 2, nelt);
6123 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6125 if (dump_enabled_p ())
6126 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6127 "shuffle of 2 fields structure is not \
6128 supported by target\n");
6129 return false;
6131 perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);
6133 /* Generating permutation constant to shift all elements.
6134 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
6135 for (i = 0; i < nelt; i++)
6136 sel[i] = nelt / 2 + i;
6137 indices.new_vector (sel, 2, nelt);
6138 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6140 if (dump_enabled_p ())
6141 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6142 "shift permutation is not supported by target\n");
6143 return false;
6145 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6147 /* Generating permutation constant to select vector from 2.
6148 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
6149 for (i = 0; i < nelt / 2; i++)
6150 sel[i] = i;
6151 for (i = nelt / 2; i < nelt; i++)
6152 sel[i] = nelt + i;
6153 indices.new_vector (sel, 2, nelt);
6154 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6156 if (dump_enabled_p ())
6157 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6158 "select is not supported by target\n");
6159 return false;
6161 select_mask = vect_gen_perm_mask_checked (vectype, indices);
6163 for (i = 0; i < log_length; i++)
6165 for (j = 0; j < length; j += 2)
6167 first_vect = dr_chain[j];
6168 second_vect = dr_chain[j + 1];
6170 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6171 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6172 first_vect, first_vect,
6173 perm2_mask1);
6174 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6175 vect[0] = data_ref;
6177 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6178 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6179 second_vect, second_vect,
6180 perm2_mask2);
6181 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6182 vect[1] = data_ref;
6184 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
6185 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6186 vect[0], vect[1], shift1_mask);
6187 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6188 (*result_chain)[j/2 + length/2] = data_ref;
6190 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
6191 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6192 vect[0], vect[1], select_mask);
6193 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6194 (*result_chain)[j/2] = data_ref;
6196 memcpy (dr_chain.address (), result_chain->address (),
6197 length * sizeof (tree));
6199 return true;
6201 if (length == 3 && vf > 2)
6203 unsigned int k = 0, l = 0;
6205 /* Generating permutation constant to get all elements in right order.
6206 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
6207 for (i = 0; i < nelt; i++)
6209 if (3 * k + (l % 3) >= nelt)
6211 k = 0;
6212 l += (3 - (nelt % 3));
6214 sel[i] = 3 * k + (l % 3);
6215 k++;
6217 vec_perm_indices indices (sel, 2, nelt);
6218 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6220 if (dump_enabled_p ())
6221 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6222 "shuffle of 3 fields structure is not \
6223 supported by target\n");
6224 return false;
6226 perm3_mask = vect_gen_perm_mask_checked (vectype, indices);
6228 /* Generating permutation constant to shift all elements.
6229 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
6230 for (i = 0; i < nelt; i++)
6231 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
6232 indices.new_vector (sel, 2, nelt);
6233 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6235 if (dump_enabled_p ())
6236 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6237 "shift permutation is not supported by target\n");
6238 return false;
6240 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6242 /* Generating permutation constant to shift all elements.
6243 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6244 for (i = 0; i < nelt; i++)
6245 sel[i] = 2 * (nelt / 3) + 1 + i;
6246 indices.new_vector (sel, 2, nelt);
6247 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6249 if (dump_enabled_p ())
6250 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6251 "shift permutation is not supported by target\n");
6252 return false;
6254 shift2_mask = vect_gen_perm_mask_checked (vectype, indices);
6256 /* Generating permutation constant to shift all elements.
6257 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
6258 for (i = 0; i < nelt; i++)
6259 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
6260 indices.new_vector (sel, 2, nelt);
6261 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6263 if (dump_enabled_p ())
6264 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6265 "shift permutation is not supported by target\n");
6266 return false;
6268 shift3_mask = vect_gen_perm_mask_checked (vectype, indices);
6270 /* Generating permutation constant to shift all elements.
6271 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6272 for (i = 0; i < nelt; i++)
6273 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
6274 indices.new_vector (sel, 2, nelt);
6275 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6277 if (dump_enabled_p ())
6278 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6279 "shift permutation is not supported by target\n");
6280 return false;
6282 shift4_mask = vect_gen_perm_mask_checked (vectype, indices);
6284 for (k = 0; k < 3; k++)
6286 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
6287 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6288 dr_chain[k], dr_chain[k],
6289 perm3_mask);
6290 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6291 vect[k] = data_ref;
6294 for (k = 0; k < 3; k++)
6296 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
6297 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6298 vect[k % 3], vect[(k + 1) % 3],
6299 shift1_mask);
6300 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6301 vect_shift[k] = data_ref;
6304 for (k = 0; k < 3; k++)
6306 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
6307 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6308 vect_shift[(4 - k) % 3],
6309 vect_shift[(3 - k) % 3],
6310 shift2_mask);
6311 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6312 vect[k] = data_ref;
6315 (*result_chain)[3 - (nelt % 3)] = vect[2];
6317 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
6318 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
6319 vect[0], shift3_mask);
6320 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6321 (*result_chain)[nelt % 3] = data_ref;
6323 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
6324 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
6325 vect[1], shift4_mask);
6326 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6327 (*result_chain)[0] = data_ref;
6328 return true;
6330 return false;
6333 /* Function vect_transform_grouped_load.
6335 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
6336 to perform their permutation and ascribe the resulting vectorized statements to
6337 the scalar statements.
6340 void
6341 vect_transform_grouped_load (gimple *stmt, vec<tree> dr_chain, int size,
6342 gimple_stmt_iterator *gsi)
6344 machine_mode mode;
6345 vec<tree> result_chain = vNULL;
6347 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
6348 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
6349 vectors, that are ready for vector computation. */
6350 result_chain.create (size);
6352 /* If the reassociation width for the vector type is 2 or greater, the target
6353 machine can execute 2 or more vector instructions in parallel. Otherwise
6354 try to get a chain for the load group using vect_shift_permute_load_chain. */
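  /* In other words (a summary of the call below): the shift-based variant
     is attempted only when the reassociation width for VEC_PERM_EXPR is not
     greater than 1 and SIZE is not a power of two; if it cannot be used,
     fall back to vect_permute_load_chain.  */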
6355 mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
6356 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
6357 || pow2p_hwi (size)
6358 || !vect_shift_permute_load_chain (dr_chain, size, stmt,
6359 gsi, &result_chain))
6360 vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
6361 vect_record_grouped_load_vectors (stmt, result_chain);
6362 result_chain.release ();
6365 /* RESULT_CHAIN contains the output of a group of grouped loads that were
6366 generated as part of the vectorization of STMT. Assign the statement
6367 for each vector to the associated scalar statement. */
6369 void
6370 vect_record_grouped_load_vectors (gimple *stmt, vec<tree> result_chain)
6372 gimple *first_stmt = DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
6373 gimple *next_stmt, *new_stmt;
6374 unsigned int i, gap_count;
6375 tree tmp_data_ref;
6377 /* Put a permuted data-ref in the VECTORIZED_STMT field.
6378 Since we scan the chain starting from its first node, their order
6379 corresponds to the order of data-refs in RESULT_CHAIN. */
6380 next_stmt = first_stmt;
6381 gap_count = 1;
6382 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
6384 if (!next_stmt)
6385 break;
6387 /* Skip the gaps. Loads created for the gaps will be removed by dead
6388 code elimination pass later. No need to check for the first stmt in
6389 the group, since it always exists.
6390 DR_GROUP_GAP is the number of steps in elements from the previous
6391 access (if there is no gap DR_GROUP_GAP is 1). We skip loads that
6392 correspond to the gaps. */
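      /* For example (illustrative access pattern): in a group accessing
	 a[3*i] and a[3*i + 2], DR_GROUP_GAP of the second statement is 2,
	 so one entry of RESULT_CHAIN (the permuted vector corresponding to
	 the unused element a[3*i + 1]) is skipped before the second
	 statement gets its vectorized statement.  */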
6393 if (next_stmt != first_stmt
6394 && gap_count < DR_GROUP_GAP (vinfo_for_stmt (next_stmt)))
6396 gap_count++;
6397 continue;
6400 while (next_stmt)
6402 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
6403 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
6404 copies, and we put the new vector statement in the first available
6405 RELATED_STMT. */
6406 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
6407 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
6408 else
6410 if (!DR_GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
6412 gimple *prev_stmt =
6413 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
6414 gimple *rel_stmt =
6415 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
6416 while (rel_stmt)
6418 prev_stmt = rel_stmt;
6419 rel_stmt =
6420 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
6423 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
6424 new_stmt;
6428 next_stmt = DR_GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
6429 gap_count = 1;
6430 /* If NEXT_STMT accesses the same DR as the previous statement,
6431 put the same TMP_DATA_REF as its vectorized statement; otherwise
6432 get the next data-ref from RESULT_CHAIN. */
6433 if (!next_stmt || !DR_GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
6434 break;
6439 /* Function vect_can_force_dr_alignment_p.
6441 Returns whether the alignment of a DECL can be forced to be aligned
6442 on ALIGNMENT bit boundary. */
6444 bool
6445 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
6447 if (!VAR_P (decl))
6448 return false;
6450 if (decl_in_symtab_p (decl)
6451 && !symtab_node::get (decl)->can_increase_alignment_p ())
6452 return false;
6454 if (TREE_STATIC (decl))
6455 return (alignment <= MAX_OFILE_ALIGNMENT);
6456 else
6457 return (alignment <= MAX_STACK_ALIGNMENT);
6461 /* Return whether the data reference DR is supported with respect to its
6462 alignment.
6463 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
6464 it is aligned, i.e., check if it is possible to vectorize it with different
6465 alignment. */
6467 enum dr_alignment_support
6468 vect_supportable_dr_alignment (struct data_reference *dr,
6469 bool check_aligned_accesses)
6471 gimple *stmt = DR_STMT (dr);
6472 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6473 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6474 machine_mode mode = TYPE_MODE (vectype);
6475 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6476 struct loop *vect_loop = NULL;
6477 bool nested_in_vect_loop = false;
6479 if (aligned_access_p (dr) && !check_aligned_accesses)
6480 return dr_aligned;
6482 /* For now assume all conditional loads/stores support unaligned
6483 access without any special code. */
6484 if (is_gimple_call (stmt)
6485 && gimple_call_internal_p (stmt)
6486 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
6487 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
6488 return dr_unaligned_supported;
6490 if (loop_vinfo)
6492 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
6493 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
6496 /* Possibly unaligned access. */
6498 /* We can choose between using the implicit realignment scheme (generating
6499 a misaligned_move stmt) and the explicit realignment scheme (generating
6500 aligned loads with a REALIGN_LOAD). There are two variants to the
6501 explicit realignment scheme: optimized, and unoptimized.
6502 We can optimize the realignment only if the step between consecutive
6503 vector loads is equal to the vector size. Since the vector memory
6504 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
6505 is guaranteed that the misalignment amount remains the same throughout the
6506 execution of the vectorized loop. Therefore, we can create the
6507 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
6508 at the loop preheader.
6510 However, in the case of outer-loop vectorization, when vectorizing a
6511 memory access in the inner-loop nested within the LOOP that is now being
6512 vectorized, while it is guaranteed that the misalignment of the
6513 vectorized memory access will remain the same in different outer-loop
6514 iterations, it is *not* guaranteed that it will remain the same throughout
6515 the execution of the inner-loop. This is because the inner-loop advances
6516 with the original scalar step (and not in steps of VS). If the inner-loop
6517 step happens to be a multiple of VS, then the misalignment remains fixed
6518 and we can use the optimized realignment scheme. For example:
6520 for (i=0; i<N; i++)
6521 for (j=0; j<M; j++)
6522 s += a[i+j];
6524 When vectorizing the i-loop in the above example, the step between
6525 consecutive vector loads is 1, and so the misalignment does not remain
6526 fixed across the execution of the inner-loop, and the realignment cannot
6527 be optimized (as illustrated in the following pseudo vectorized loop):
6529 for (i=0; i<N; i+=4)
6530 for (j=0; j<M; j++){
6531 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
6532 // when j is {0,1,2,3,4,5,6,7,...} respectively.
6533 // (assuming that we start from an aligned address).
6536 We therefore have to use the unoptimized realignment scheme:
6538 for (i=0; i<N; i+=4)
6539 for (j=k; j<M; j+=4)
6540 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
6541 // that the misalignment of the initial address is
6542 // 0).
6544 The loop can then be vectorized as follows:
6546 for (k=0; k<4; k++){
6547 rt = get_realignment_token (&vp[k]);
6548 for (i=0; i<N; i+=4){
6549 v1 = vp[i+k];
6550 for (j=k; j<M; j+=4){
6551 v2 = vp[i+j+VS-1];
6552 va = REALIGN_LOAD <v1,v2,rt>;
6553 vs += va;
6554 v1 = v2;
6557 } */
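  /* In outline (a summary of the code below, not an additional rule):
     - aligned accesses (unless CHECK_ALIGNED_ACCESSES) return dr_aligned;
     - masked loads/stores are assumed dr_unaligned_supported;
     - misaligned reads may use dr_explicit_realign or, when the misalignment
       is known to stay fixed, dr_explicit_realign_optimized, provided the
       target has vec_realign_load and (if needed) builtin_mask_for_load;
     - otherwise the target hook support_vector_misalignment decides between
       dr_unaligned_supported and dr_unaligned_unsupported.  */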
6559 if (DR_IS_READ (dr))
6561 bool is_packed = false;
6562 tree type = (TREE_TYPE (DR_REF (dr)));
6564 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
6565 && (!targetm.vectorize.builtin_mask_for_load
6566 || targetm.vectorize.builtin_mask_for_load ()))
6568 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6570 /* If we are doing SLP then the accesses need not have the
6571 same alignment, instead it depends on the SLP group size. */
6572 if (loop_vinfo
6573 && STMT_SLP_TYPE (stmt_info)
6574 && !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6575 * DR_GROUP_SIZE (vinfo_for_stmt
6576 (DR_GROUP_FIRST_ELEMENT (stmt_info))),
6577 TYPE_VECTOR_SUBPARTS (vectype)))
6579 else if (!loop_vinfo
6580 || (nested_in_vect_loop
6581 && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
6582 GET_MODE_SIZE (TYPE_MODE (vectype)))))
6583 return dr_explicit_realign;
6584 else
6585 return dr_explicit_realign_optimized;
6587 if (!known_alignment_for_access_p (dr))
6588 is_packed = not_size_aligned (DR_REF (dr));
6590 if (targetm.vectorize.support_vector_misalignment
6591 (mode, type, DR_MISALIGNMENT (dr), is_packed))
6592 /* Can't software pipeline the loads, but can at least do them. */
6593 return dr_unaligned_supported;
6595 else
6597 bool is_packed = false;
6598 tree type = (TREE_TYPE (DR_REF (dr)));
6600 if (!known_alignment_for_access_p (dr))
6601 is_packed = not_size_aligned (DR_REF (dr));
6603 if (targetm.vectorize.support_vector_misalignment
6604 (mode, type, DR_MISALIGNMENT (dr), is_packed))
6605 return dr_unaligned_supported;
6608 /* Unsupported. */
6609 return dr_unaligned_unsupported;