[official-gcc.git] / gcc / tree-vect-data-refs.c
blob 65b2366c86dab4c21e934dab345d6ec0ce654a88
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2018 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "memmodel.h"
32 #include "tm_p.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "cgraph.h"
36 #include "dumpfile.h"
37 #include "alias.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "tree-eh.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop.h"
47 #include "cfgloop.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
50 #include "expr.h"
51 #include "builtins.h"
52 #include "params.h"
53 #include "tree-cfg.h"
54 #include "tree-hash-traits.h"
55 #include "vec-perm-indices.h"
56 #include "internal-fn.h"
58 /* Return true if load- or store-lanes optab OPTAB is implemented for
59 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
61 static bool
62 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
63 tree vectype, unsigned HOST_WIDE_INT count)
65 machine_mode mode, array_mode;
66 bool limit_p;
68 mode = TYPE_MODE (vectype);
69 if (!targetm.array_mode (mode, count).exists (&array_mode))
71 poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
72 limit_p = !targetm.array_mode_supported_p (mode, count);
73 if (!int_mode_for_size (bits, limit_p).exists (&array_mode))
75 if (dump_enabled_p ())
76 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
77 "no array mode for %s["
78 HOST_WIDE_INT_PRINT_DEC "]\n",
79 GET_MODE_NAME (mode), count);
80 return false;
84 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
86 if (dump_enabled_p ())
87 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
88 "cannot use %s<%s><%s>\n", name,
89 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
90 return false;
93 if (dump_enabled_p ())
94 dump_printf_loc (MSG_NOTE, vect_location,
95 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
96 GET_MODE_NAME (mode));
98 return true;
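/* Illustrative note (the concrete modes are only an assumption): for a
   128-bit vector mode such as V4SImode and COUNT == 2, the code above
   first asks the target for a 2-element array mode, falls back to a
   plain 256-bit integer mode if the target allows it, and then requires
   the optab (e.g. vec_load_lanes) to be implemented for that
   (array mode, vector mode) pair.  */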
102 /* Return the smallest scalar part of STMT.
103 This is used to determine the vectype of the stmt. We generally set the
104 vectype according to the type of the result (lhs). For stmts whose
105 result-type is different than the type of the arguments (e.g., demotion,
106 promotion), vectype will be reset appropriately (later). Note that we have
107 to visit the smallest datatype in this function, because that determines the
108 VF. If the smallest datatype in the loop is present only as the rhs of a
109 promotion operation, we'd miss it.
110 Such a case, where a variable of this datatype does not appear in the lhs
111 anywhere in the loop, can only occur if it's an invariant: e.g.:
112 'int_x = (int) short_inv', which we'd expect to have been optimized away by
113 invariant motion. However, we cannot rely on invariant motion to always
114 take invariants out of the loop, and so in the case of promotion we also
115 have to check the rhs.
116 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
117 types. */
119 tree
120 vect_get_smallest_scalar_type (gimple *stmt, HOST_WIDE_INT *lhs_size_unit,
121 HOST_WIDE_INT *rhs_size_unit)
123 tree scalar_type = gimple_expr_type (stmt);
124 HOST_WIDE_INT lhs, rhs;
126 /* During the analysis phase, this function is called on arbitrary
127 statements that might not have scalar results. */
128 if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
129 return scalar_type;
131 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
133 if (is_gimple_assign (stmt)
134 && (gimple_assign_cast_p (stmt)
135 || gimple_assign_rhs_code (stmt) == DOT_PROD_EXPR
136 || gimple_assign_rhs_code (stmt) == WIDEN_SUM_EXPR
137 || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
138 || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
139 || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
141 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
143 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
144 if (rhs < lhs)
145 scalar_type = rhs_type;
148 *lhs_size_unit = lhs;
149 *rhs_size_unit = rhs;
150 return scalar_type;
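/* A small (assumed) example of the above: for the promotion

       int_x = (int) short_y;

   the statement's result type has size 4 while the rhs type has size 2,
   so the function returns the short type with *LHS_SIZE_UNIT == 4 and
   *RHS_SIZE_UNIT == 2; the smaller rhs type is what determines the VF.  */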
154 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
155 tested at run-time. Return TRUE if DDR was successfully inserted.
156 Return false if versioning is not supported. */
158 static bool
159 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
161 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
163 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
164 return false;
166 if (!runtime_alias_check_p (ddr, loop,
167 optimize_loop_nest_for_speed_p (loop)))
168 return false;
170 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
171 return true;
174 /* Record that loop LOOP_VINFO needs to check that VALUE is nonzero. */
176 static void
177 vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value)
179 vec<tree> checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo);
180 for (unsigned int i = 0; i < checks.length(); ++i)
181 if (checks[i] == value)
182 return;
184 if (dump_enabled_p ())
186 dump_printf_loc (MSG_NOTE, vect_location, "need run-time check that ");
187 dump_generic_expr (MSG_NOTE, TDF_SLIM, value);
188 dump_printf (MSG_NOTE, " is nonzero\n");
190 LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value);
193 /* Return true if we know that the order of vectorized STMT_A and
194 vectorized STMT_B will be the same as the order of STMT_A and STMT_B.
195 At least one of the statements is a write. */
197 static bool
198 vect_preserves_scalar_order_p (gimple *stmt_a, gimple *stmt_b)
200 stmt_vec_info stmtinfo_a = vinfo_for_stmt (stmt_a);
201 stmt_vec_info stmtinfo_b = vinfo_for_stmt (stmt_b);
203 /* Single statements are always kept in their original order. */
204 if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
205 && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
206 return true;
208 /* STMT_A and STMT_B belong to overlapping groups. All loads in a
209 group are emitted at the position of the first scalar load and all
210 stores in a group are emitted at the position of the last scalar store.
211 Thus writes will happen no earlier than their current position
212 (but could happen later) while reads will happen no later than their
213 current position (but could happen earlier). Reordering is therefore
214 only possible if the first access is a write. */
215 gimple *earlier_stmt = get_earlier_stmt (stmt_a, stmt_b);
216 return !DR_IS_WRITE (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt)));
219 /* A subroutine of vect_analyze_data_ref_dependence. Handle
220 DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
221 distances. These distances are conservatively correct but they don't
222 reflect a guaranteed dependence.
224 Return true if this function does all the work necessary to avoid
225 an alias or false if the caller should use the dependence distances
226 to limit the vectorization factor in the usual way. LOOP_DEPTH is
227 the depth of the loop described by LOOP_VINFO and the other arguments
228 are as for vect_analyze_data_ref_dependence. */
230 static bool
231 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
232 loop_vec_info loop_vinfo,
233 int loop_depth, unsigned int *max_vf)
235 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
236 lambda_vector dist_v;
237 unsigned int i;
238 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
240 int dist = dist_v[loop_depth];
241 if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
243 /* If the user asserted safelen >= DIST consecutive iterations
244 can be executed concurrently, assume independence.
246 ??? An alternative would be to add the alias check even
247 in this case, and vectorize the fallback loop with the
248 maximum VF set to safelen. However, if the user has
249 explicitly given a length, it's less likely that that
250 would be a win. */
251 if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
253 if ((unsigned int) loop->safelen < *max_vf)
254 *max_vf = loop->safelen;
255 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
256 continue;
259 /* For dependence distances of 2 or more, we have the option
260 of limiting VF or checking for an alias at runtime.
261 Prefer to check at runtime if we can, to avoid limiting
262 the VF unnecessarily when the bases are in fact independent.
264 Note that the alias checks will be removed if the VF ends up
265 being small enough. */
266 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
269 return true;
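/* Sketch of how a user-asserted safelen interacts with a dependence
   distance (hypothetical loop, for illustration only):

       #pragma omp simd safelen(8)
       for (i = 0; i < n; i++)
         a[i + 4] = a[i] + 1;

   The distance of 4 is <= safelen, so it is treated as not creating a
   dependence and *MAX_VF is capped at 8 if it was larger.  */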
273 /* Function vect_analyze_data_ref_dependence.
275 Return TRUE if there (might) exist a dependence between a memory-reference
276 DRA and a memory-reference DRB. If versioning for alias can check the
277 dependence at run-time instead, return FALSE. Adjust *MAX_VF according to
278 the data dependence. */
280 static bool
281 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
282 loop_vec_info loop_vinfo,
283 unsigned int *max_vf)
285 unsigned int i;
286 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
287 struct data_reference *dra = DDR_A (ddr);
288 struct data_reference *drb = DDR_B (ddr);
289 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
290 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
291 lambda_vector dist_v;
292 unsigned int loop_depth;
294 /* In loop analysis all data references should be vectorizable. */
295 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
296 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
297 gcc_unreachable ();
299 /* Independent data accesses. */
300 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
301 return false;
303 if (dra == drb
304 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
305 return false;
307 /* We do not have to consider dependences between accesses that belong
308 to the same group, unless the stride could be smaller than the
309 group size. */
310 if (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
311 && (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
312 == DR_GROUP_FIRST_ELEMENT (stmtinfo_b))
313 && !STMT_VINFO_STRIDED_P (stmtinfo_a))
314 return false;
316 /* Even if we have an anti-dependence then, as the vectorized loop covers at
317 least two scalar iterations, there is always also a true dependence.
318 As the vectorizer does not re-order loads and stores we can ignore
319 the anti-dependence if TBAA can disambiguate both DRs similar to the
320 case with known negative distance anti-dependences (positive
321 distance anti-dependences would violate TBAA constraints). */
322 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
323 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
324 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
325 get_alias_set (DR_REF (drb))))
326 return false;
328 /* Unknown data dependence. */
329 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
331 /* If user asserted safelen consecutive iterations can be
332 executed concurrently, assume independence. */
333 if (loop->safelen >= 2)
335 if ((unsigned int) loop->safelen < *max_vf)
336 *max_vf = loop->safelen;
337 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
338 return false;
341 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
342 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
344 if (dump_enabled_p ())
346 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
347 "versioning for alias not supported for: "
348 "can't determine dependence between ");
349 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
350 DR_REF (dra));
351 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
352 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
353 DR_REF (drb));
354 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
356 return true;
359 if (dump_enabled_p ())
361 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
362 "versioning for alias required: "
363 "can't determine dependence between ");
364 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
365 DR_REF (dra));
366 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
367 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
368 DR_REF (drb));
369 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
372 /* Add to list of ddrs that need to be tested at run-time. */
373 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
376 /* Known data dependence. */
377 if (DDR_NUM_DIST_VECTS (ddr) == 0)
379 /* If user asserted safelen consecutive iterations can be
380 executed concurrently, assume independence. */
381 if (loop->safelen >= 2)
383 if ((unsigned int) loop->safelen < *max_vf)
384 *max_vf = loop->safelen;
385 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
386 return false;
389 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
390 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
392 if (dump_enabled_p ())
394 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
395 "versioning for alias not supported for: "
396 "bad dist vector for ");
397 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
398 DR_REF (dra));
399 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
400 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
401 DR_REF (drb));
402 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
404 return true;
407 if (dump_enabled_p ())
409 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
410 "versioning for alias required: "
411 "bad dist vector for ");
412 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
413 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
414 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
415 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
417 /* Add to list of ddrs that need to be tested at run-time. */
418 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
421 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
423 if (DDR_COULD_BE_INDEPENDENT_P (ddr)
424 && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
425 loop_depth, max_vf))
426 return false;
428 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
430 int dist = dist_v[loop_depth];
432 if (dump_enabled_p ())
433 dump_printf_loc (MSG_NOTE, vect_location,
434 "dependence distance = %d.\n", dist);
436 if (dist == 0)
438 if (dump_enabled_p ())
440 dump_printf_loc (MSG_NOTE, vect_location,
441 "dependence distance == 0 between ");
442 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
443 dump_printf (MSG_NOTE, " and ");
444 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
445 dump_printf (MSG_NOTE, "\n");
448 /* When we perform grouped accesses and perform implicit CSE
449 by detecting equal accesses and doing disambiguation with
450 runtime alias tests like for
451 .. = a[i];
452 .. = a[i+1];
453 a[i] = ..;
454 a[i+1] = ..;
455 *p = ..;
456 .. = a[i];
457 .. = a[i+1];
458 where we will end up loading { a[i], a[i+1] } once, make
459 sure that inserting group loads before the first load and
460 stores after the last store will do the right thing.
461 Similar for groups like
462 a[i] = ...;
463 ... = a[i];
464 a[i+1] = ...;
465 where loads from the group interleave with the store. */
466 if (!vect_preserves_scalar_order_p (DR_STMT (dra), DR_STMT (drb)))
468 if (dump_enabled_p ())
469 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
470 "READ_WRITE dependence in interleaving.\n");
471 return true;
474 if (loop->safelen < 2)
476 tree indicator = dr_zero_step_indicator (dra);
477 if (TREE_CODE (indicator) != INTEGER_CST)
478 vect_check_nonzero_value (loop_vinfo, indicator);
479 else if (integer_zerop (indicator))
481 if (dump_enabled_p ())
482 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
483 "access also has a zero step\n");
484 return true;
487 continue;
490 if (dist > 0 && DDR_REVERSED_P (ddr))
492 /* If DDR_REVERSED_P the order of the data-refs in DDR was
493 reversed (to make distance vector positive), and the actual
494 distance is negative. */
495 if (dump_enabled_p ())
496 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
497 "dependence distance negative.\n");
498 /* Record a negative dependence distance to later limit the
499 amount of stmt copying / unrolling we can perform.
500 Only need to handle read-after-write dependence. */
501 if (DR_IS_READ (drb)
502 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
503 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
504 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
505 continue;
508 unsigned int abs_dist = abs (dist);
509 if (abs_dist >= 2 && abs_dist < *max_vf)
511 /* The dependence distance requires reduction of the maximal
512 vectorization factor. */
513 *max_vf = abs (dist);
514 if (dump_enabled_p ())
515 dump_printf_loc (MSG_NOTE, vect_location,
516 "adjusting maximal vectorization factor to %i\n",
517 *max_vf);
520 if (abs_dist >= *max_vf)
522 /* Dependence distance does not create dependence, as far as
523 vectorization is concerned, in this case. */
524 if (dump_enabled_p ())
525 dump_printf_loc (MSG_NOTE, vect_location,
526 "dependence distance >= VF.\n");
527 continue;
530 if (dump_enabled_p ())
532 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
533 "not vectorized, possible dependence "
534 "between data-refs ");
535 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
536 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
537 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
538 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
541 return true;
544 return false;
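/* Worked example for the distance handling above (values assumed):
   a loop containing

       a[i + 3] = a[i] * 2;

   has a dependence distance of 3.  With an initial *MAX_VF of 8 the
   distance is >= 2 and < 8, so *MAX_VF is reduced to 3; vectorizing
   with VF <= 3 is then safe because each vector iteration only reads
   elements written at least three scalar iterations earlier.  */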
547 /* Function vect_analyze_data_ref_dependences.
549 Examine all the data references in the loop, and make sure there do not
550 exist any data dependences between them. Set *MAX_VF according to
551 the maximum vectorization factor the data dependences allow. */
553 bool
554 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
555 unsigned int *max_vf)
557 unsigned int i;
558 struct data_dependence_relation *ddr;
560 if (dump_enabled_p ())
561 dump_printf_loc (MSG_NOTE, vect_location,
562 "=== vect_analyze_data_ref_dependences ===\n");
564 LOOP_VINFO_DDRS (loop_vinfo)
565 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
566 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
567 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
568 /* We need read-read dependences to compute STMT_VINFO_SAME_ALIGN_REFS. */
569 if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
570 &LOOP_VINFO_DDRS (loop_vinfo),
571 LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
572 return false;
574 /* For epilogues we either have no aliases or alias versioning
575 was applied to the original loop. Therefore we can simply take max_vf
576 from the VF of the original loop. */
577 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
578 *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
579 else
580 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
581 if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
582 return false;
584 return true;
588 /* Function vect_slp_analyze_data_ref_dependence.
590 Return TRUE if there (might) exist a dependence between a memory-reference
591 DRA and a memory-reference DRB. Unlike the loop case there is no
592 run-time alias versioning here, so a possible dependence simply makes
593 the SLP instance unvectorizable. */
595 static bool
596 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
598 struct data_reference *dra = DDR_A (ddr);
599 struct data_reference *drb = DDR_B (ddr);
601 /* We need to check dependences of statements marked as unvectorizable
602 as well; they can still prohibit vectorization. */
604 /* Independent data accesses. */
605 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
606 return false;
608 if (dra == drb)
609 return false;
611 /* Read-read is OK. */
612 if (DR_IS_READ (dra) && DR_IS_READ (drb))
613 return false;
615 /* If dra and drb are part of the same interleaving chain consider
616 them independent. */
617 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
618 && (DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
619 == DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
620 return false;
622 /* Unknown data dependence. */
623 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
625 if (dump_enabled_p ())
627 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
628 "can't determine dependence between ");
629 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
630 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
631 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
632 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
635 else if (dump_enabled_p ())
637 dump_printf_loc (MSG_NOTE, vect_location,
638 "determined dependence between ");
639 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
640 dump_printf (MSG_NOTE, " and ");
641 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
642 dump_printf (MSG_NOTE, "\n");
645 return true;
649 /* Analyze dependences involved in the transform of SLP NODE. STORES
650 contains the vector of scalar stores of this instance if we are
651 disambiguating the loads. */
653 static bool
654 vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
655 vec<gimple *> stores, gimple *last_store)
657 /* This walks over all stmts involved in the SLP load/store done
658 in NODE verifying we can sink them up to the last stmt in the
659 group. */
660 gimple *last_access = vect_find_last_scalar_stmt_in_slp (node);
661 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
663 gimple *access = SLP_TREE_SCALAR_STMTS (node)[k];
664 if (access == last_access)
665 continue;
666 data_reference *dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (access));
667 for (gimple_stmt_iterator gsi = gsi_for_stmt (access);
668 gsi_stmt (gsi) != last_access; gsi_next (&gsi))
670 gimple *stmt = gsi_stmt (gsi);
671 if (! gimple_vuse (stmt)
672 || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
673 continue;
675 /* If we couldn't record a (single) data reference for this
676 stmt we have to give up. */
677 /* ??? Here and below if dependence analysis fails we can resort
678 to the alias oracle which can handle more kinds of stmts. */
679 data_reference *dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
680 if (!dr_b)
681 return false;
683 bool dependent = false;
684 /* If we run into a store of this same instance (we've just
685 marked those) then delay dependence checking until we run
686 into the last store because this is where it will have
687 been sunk to (and we verify if we can do that as well). */
688 if (gimple_visited_p (stmt))
690 if (stmt != last_store)
691 continue;
692 unsigned i;
693 gimple *store;
694 FOR_EACH_VEC_ELT (stores, i, store)
696 data_reference *store_dr
697 = STMT_VINFO_DATA_REF (vinfo_for_stmt (store));
698 ddr_p ddr = initialize_data_dependence_relation
699 (dr_a, store_dr, vNULL);
700 dependent = vect_slp_analyze_data_ref_dependence (ddr);
701 free_dependence_relation (ddr);
702 if (dependent)
703 break;
706 else
708 ddr_p ddr = initialize_data_dependence_relation (dr_a,
709 dr_b, vNULL);
710 dependent = vect_slp_analyze_data_ref_dependence (ddr);
711 free_dependence_relation (ddr);
713 if (dependent)
714 return false;
717 return true;
721 /* Function vect_slp_analyze_instance_dependence.
723 Examine all the data references in the SLP instance, and make sure there
724 do not exist any data dependences between them that would prevent the
725 instance from being vectorized. */
727 bool
728 vect_slp_analyze_instance_dependence (slp_instance instance)
730 if (dump_enabled_p ())
731 dump_printf_loc (MSG_NOTE, vect_location,
732 "=== vect_slp_analyze_instance_dependence ===\n");
734 /* The stores of this instance are at the root of the SLP tree. */
735 slp_tree store = SLP_INSTANCE_TREE (instance);
736 if (! STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (store)[0])))
737 store = NULL;
739 /* Verify we can sink stores to the vectorized stmt insert location. */
740 gimple *last_store = NULL;
741 if (store)
743 if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
744 return false;
746 /* Mark stores in this instance and remember the last one. */
747 last_store = vect_find_last_scalar_stmt_in_slp (store);
748 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
749 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], true);
752 bool res = true;
754 /* Verify we can sink loads to the vectorized stmt insert location,
755 special-casing stores of this instance. */
756 slp_tree load;
757 unsigned int i;
758 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
759 if (! vect_slp_analyze_node_dependences (instance, load,
760 store
761 ? SLP_TREE_SCALAR_STMTS (store)
762 : vNULL, last_store))
764 res = false;
765 break;
768 /* Unset the visited flag. */
769 if (store)
770 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
771 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], false);
773 return res;
776 /* Record in VINFO the base alignment guarantee given by DRB. STMT is
777 the statement that contains DRB, which is useful for recording in the
778 dump file. */
780 static void
781 vect_record_base_alignment (vec_info *vinfo, gimple *stmt,
782 innermost_loop_behavior *drb)
784 bool existed;
785 innermost_loop_behavior *&entry
786 = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
787 if (!existed || entry->base_alignment < drb->base_alignment)
789 entry = drb;
790 if (dump_enabled_p ())
792 dump_printf_loc (MSG_NOTE, vect_location,
793 "recording new base alignment for ");
794 dump_generic_expr (MSG_NOTE, TDF_SLIM, drb->base_address);
795 dump_printf (MSG_NOTE, "\n");
796 dump_printf_loc (MSG_NOTE, vect_location,
797 " alignment: %d\n", drb->base_alignment);
798 dump_printf_loc (MSG_NOTE, vect_location,
799 " misalignment: %d\n", drb->base_misalignment);
800 dump_printf_loc (MSG_NOTE, vect_location,
801 " based on: ");
802 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
807 /* If the region we're going to vectorize is reached, all unconditional
808 data references occur at least once. We can therefore pool the base
809 alignment guarantees from each unconditional reference. Do this by
810 going through all the data references in VINFO and checking whether
811 the containing statement makes the reference unconditionally. If so,
812 record the alignment of the base address in VINFO so that it can be
813 used for all other references with the same base. */
815 void
816 vect_record_base_alignments (vec_info *vinfo)
818 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
819 struct loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
820 data_reference *dr;
821 unsigned int i;
822 FOR_EACH_VEC_ELT (vinfo->datarefs, i, dr)
823 if (!DR_IS_CONDITIONAL_IN_STMT (dr))
825 gimple *stmt = DR_STMT (dr);
826 vect_record_base_alignment (vinfo, stmt, &DR_INNERMOST (dr));
828 /* If DR is nested in the loop that is being vectorized, we can also
829 record the alignment of the base wrt the outer loop. */
830 if (loop && nested_in_vect_loop_p (loop, stmt))
832 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
833 vect_record_base_alignment
834 (vinfo, stmt, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
839 /* Return the target alignment for the vectorized form of DR. */
841 static unsigned int
842 vect_calculate_target_alignment (struct data_reference *dr)
844 gimple *stmt = DR_STMT (dr);
845 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
846 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
847 return targetm.vectorize.preferred_vector_alignment (vectype);
850 /* Function vect_compute_data_ref_alignment
852 Compute the misalignment of the data reference DR.
854 Output:
855 1. If during the misalignment computation it is found that the data reference
856 cannot be vectorized then false is returned.
857 2. DR_MISALIGNMENT (DR) is defined.
859 FOR NOW: No analysis is actually performed. Misalignment is calculated
860 only for trivial cases. TODO. */
862 bool
863 vect_compute_data_ref_alignment (struct data_reference *dr)
865 gimple *stmt = DR_STMT (dr);
866 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
867 vec_base_alignments *base_alignments = &stmt_info->vinfo->base_alignments;
868 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
869 struct loop *loop = NULL;
870 tree ref = DR_REF (dr);
871 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
873 if (dump_enabled_p ())
874 dump_printf_loc (MSG_NOTE, vect_location,
875 "vect_compute_data_ref_alignment:\n");
877 if (loop_vinfo)
878 loop = LOOP_VINFO_LOOP (loop_vinfo);
880 /* Initialize misalignment to unknown. */
881 SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
883 innermost_loop_behavior *drb = vect_dr_behavior (dr);
884 bool step_preserves_misalignment_p;
886 unsigned HOST_WIDE_INT vector_alignment
887 = vect_calculate_target_alignment (dr) / BITS_PER_UNIT;
888 DR_TARGET_ALIGNMENT (dr) = vector_alignment;
890 /* No step for BB vectorization. */
891 if (!loop)
893 gcc_assert (integer_zerop (drb->step));
894 step_preserves_misalignment_p = true;
897 /* In case the dataref is in an inner-loop of the loop that is being
898 vectorized (LOOP), we use the base and misalignment information
899 relative to the outer-loop (LOOP). This is ok only if the misalignment
900 stays the same throughout the execution of the inner-loop, which is why
901 we have to check that the stride of the dataref in the inner-loop is a
902 multiple of the vector alignment. */
903 else if (nested_in_vect_loop_p (loop, stmt))
905 step_preserves_misalignment_p
906 = (DR_STEP_ALIGNMENT (dr) % vector_alignment) == 0;
908 if (dump_enabled_p ())
910 if (step_preserves_misalignment_p)
911 dump_printf_loc (MSG_NOTE, vect_location,
912 "inner step divides the vector alignment.\n");
913 else
914 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
915 "inner step doesn't divide the vector"
916 " alignment.\n");
920 /* Similarly we can only use base and misalignment information relative to
921 an innermost loop if the misalignment stays the same throughout the
922 execution of the loop. As above, this is the case if the stride of
923 the dataref is a multiple of the alignment. */
924 else
926 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
927 step_preserves_misalignment_p
928 = multiple_p (DR_STEP_ALIGNMENT (dr) * vf, vector_alignment);
930 if (!step_preserves_misalignment_p && dump_enabled_p ())
931 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
932 "step doesn't divide the vector alignment.\n");
935 unsigned int base_alignment = drb->base_alignment;
936 unsigned int base_misalignment = drb->base_misalignment;
938 /* Calculate the maximum of the pooled base address alignment and the
939 alignment that we can compute for DR itself. */
940 innermost_loop_behavior **entry = base_alignments->get (drb->base_address);
941 if (entry && base_alignment < (*entry)->base_alignment)
943 base_alignment = (*entry)->base_alignment;
944 base_misalignment = (*entry)->base_misalignment;
947 if (drb->offset_alignment < vector_alignment
948 || !step_preserves_misalignment_p
949 /* We need to know whether the step wrt the vectorized loop is
950 negative when computing the starting misalignment below. */
951 || TREE_CODE (drb->step) != INTEGER_CST)
953 if (dump_enabled_p ())
955 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
956 "Unknown alignment for access: ");
957 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
958 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
960 return true;
963 if (base_alignment < vector_alignment)
965 unsigned int max_alignment;
966 tree base = get_base_for_alignment (drb->base_address, &max_alignment);
967 if (max_alignment < vector_alignment
968 || !vect_can_force_dr_alignment_p (base,
969 vector_alignment * BITS_PER_UNIT))
971 if (dump_enabled_p ())
973 dump_printf_loc (MSG_NOTE, vect_location,
974 "can't force alignment of ref: ");
975 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
976 dump_printf (MSG_NOTE, "\n");
978 return true;
981 /* Force the alignment of the decl.
982 NOTE: This is the only change to the code we make during
983 the analysis phase, before deciding to vectorize the loop. */
984 if (dump_enabled_p ())
986 dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
987 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
988 dump_printf (MSG_NOTE, "\n");
991 DR_VECT_AUX (dr)->base_decl = base;
992 DR_VECT_AUX (dr)->base_misaligned = true;
993 base_misalignment = 0;
995 poly_int64 misalignment
996 = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();
998 /* If this is a backward running DR then the first access in the larger
999 vectype actually is N-1 elements before the address in the DR.
1000 Adjust the misalignment accordingly. */
1001 if (tree_int_cst_sgn (drb->step) < 0)
1002 /* PLUS because STEP is negative. */
1003 misalignment += ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1004 * TREE_INT_CST_LOW (drb->step));
1006 unsigned int const_misalignment;
1007 if (!known_misalignment (misalignment, vector_alignment,
1008 &const_misalignment))
1010 if (dump_enabled_p ())
1012 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1013 "Non-constant misalignment for access: ");
1014 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
1015 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1017 return true;
1020 SET_DR_MISALIGNMENT (dr, const_misalignment);
1022 if (dump_enabled_p ())
1024 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1025 "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
1026 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
1027 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1030 return true;
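/* A small worked example of the misalignment computation above (all
   numbers are only illustrative): with a target vector alignment of 16
   bytes, a base known to be 16-byte aligned (base misalignment 0) and a
   constant DR_INIT of 20 bytes, the misalignment is (0 + 20) mod 16,
   so DR_MISALIGNMENT (dr) is set to 4.  */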
1033 /* Function vect_update_misalignment_for_peel.
1034 Sets DR's misalignment
1035 - to 0 if it has the same alignment as DR_PEEL,
1036 - to the misalignment computed using NPEEL if DR's alignment is known,
1037 - to -1 (unknown) otherwise.
1039 DR - the data reference whose misalignment is to be adjusted.
1040 DR_PEEL - the data reference whose misalignment is being made
1041 zero in the vector loop by the peel.
1042 NPEEL - the number of iterations in the peel loop if the misalignment
1043 of DR_PEEL is known at compile time. */
1045 static void
1046 vect_update_misalignment_for_peel (struct data_reference *dr,
1047 struct data_reference *dr_peel, int npeel)
1049 unsigned int i;
1050 vec<dr_p> same_aligned_drs;
1051 struct data_reference *current_dr;
1052 int dr_size = vect_get_scalar_dr_size (dr);
1053 int dr_peel_size = vect_get_scalar_dr_size (dr_peel);
1054 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
1055 stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));
1057 /* For interleaved data accesses the step in the loop must be multiplied by
1058 the size of the interleaving group. */
1059 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1060 dr_size *= DR_GROUP_SIZE (vinfo_for_stmt (DR_GROUP_FIRST_ELEMENT (stmt_info)));
1061 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
1062 dr_peel_size *= DR_GROUP_SIZE (peel_stmt_info);
1064 /* It can be assumed that the data refs with the same alignment as dr_peel
1065 are aligned in the vector loop. */
1066 same_aligned_drs
1067 = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
1068 FOR_EACH_VEC_ELT (same_aligned_drs, i, current_dr)
1070 if (current_dr != dr)
1071 continue;
1072 gcc_assert (!known_alignment_for_access_p (dr)
1073 || !known_alignment_for_access_p (dr_peel)
1074 || (DR_MISALIGNMENT (dr) / dr_size
1075 == DR_MISALIGNMENT (dr_peel) / dr_peel_size));
1076 SET_DR_MISALIGNMENT (dr, 0);
1077 return;
1080 if (known_alignment_for_access_p (dr)
1081 && known_alignment_for_access_p (dr_peel))
1083 bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
1084 int misal = DR_MISALIGNMENT (dr);
1085 misal += negative ? -npeel * dr_size : npeel * dr_size;
1086 misal &= DR_TARGET_ALIGNMENT (dr) - 1;
1087 SET_DR_MISALIGNMENT (dr, misal);
1088 return;
1091 if (dump_enabled_p ())
1092 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1093 "to unknown (-1).\n");
1094 SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
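/* For illustration (values assumed): if DR is known to be misaligned by
   4 bytes, its scalar element size is 4, NPEEL is 3 and the step is
   positive, the update above yields (4 + 3 * 4) & (16 - 1) == 0 for a
   16-byte DR_TARGET_ALIGNMENT, i.e. DR becomes aligned after the peel.  */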
1098 /* Function verify_data_ref_alignment
1100 Return TRUE if DR can be handled with respect to alignment. */
1102 static bool
1103 verify_data_ref_alignment (data_reference_p dr)
1105 enum dr_alignment_support supportable_dr_alignment
1106 = vect_supportable_dr_alignment (dr, false);
1107 if (!supportable_dr_alignment)
1109 if (dump_enabled_p ())
1111 if (DR_IS_READ (dr))
1112 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1113 "not vectorized: unsupported unaligned load.");
1114 else
1115 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1116 "not vectorized: unsupported unaligned "
1117 "store.");
1119 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
1120 DR_REF (dr));
1121 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1123 return false;
1126 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
1127 dump_printf_loc (MSG_NOTE, vect_location,
1128 "Vectorizing an unaligned access.\n");
1130 return true;
1133 /* Function vect_verify_datarefs_alignment
1135 Return TRUE if all data references in the loop can be
1136 handled with respect to alignment. */
1138 bool
1139 vect_verify_datarefs_alignment (loop_vec_info vinfo)
1141 vec<data_reference_p> datarefs = vinfo->datarefs;
1142 struct data_reference *dr;
1143 unsigned int i;
1145 FOR_EACH_VEC_ELT (datarefs, i, dr)
1147 gimple *stmt = DR_STMT (dr);
1148 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1150 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1151 continue;
1153 /* For interleaving, only the alignment of the first access matters. */
1154 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1155 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1156 continue;
1158 /* Strided accesses perform only component accesses, alignment is
1159 irrelevant for them. */
1160 if (STMT_VINFO_STRIDED_P (stmt_info)
1161 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1162 continue;
1164 if (! verify_data_ref_alignment (dr))
1165 return false;
1168 return true;
1171 /* Given a memory reference EXP, return whether its alignment is less
1172 than its size. */
1174 static bool
1175 not_size_aligned (tree exp)
1177 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1178 return true;
1180 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1181 > get_object_alignment (exp));
1184 /* Function vector_alignment_reachable_p
1186 Return true if vector alignment for DR is reachable by peeling
1187 a few loop iterations. Return false otherwise. */
1189 static bool
1190 vector_alignment_reachable_p (struct data_reference *dr)
1192 gimple *stmt = DR_STMT (dr);
1193 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1194 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1196 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1198 /* For interleaved accesses we peel only if the number of iterations
1199 in the prologue loop (VF - misalignment) is a multiple of the
1200 number of interleaved accesses. */
1201 int elem_size, mis_in_elements;
1203 /* FORNOW: handle only known alignment. */
1204 if (!known_alignment_for_access_p (dr))
1205 return false;
1207 poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1208 poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
1209 elem_size = vector_element_size (vector_size, nelements);
1210 mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
1212 if (!multiple_p (nelements - mis_in_elements, DR_GROUP_SIZE (stmt_info)))
1213 return false;
1216 /* If misalignment is known at compile time then allow peeling
1217 only if natural alignment is reachable through peeling. */
1218 if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
1220 HOST_WIDE_INT elmsize =
1221 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1222 if (dump_enabled_p ())
1224 dump_printf_loc (MSG_NOTE, vect_location,
1225 "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1226 dump_printf (MSG_NOTE,
1227 ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
1229 if (DR_MISALIGNMENT (dr) % elmsize)
1231 if (dump_enabled_p ())
1232 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1233 "data size does not divide the misalignment.\n");
1234 return false;
1238 if (!known_alignment_for_access_p (dr))
1240 tree type = TREE_TYPE (DR_REF (dr));
1241 bool is_packed = not_size_aligned (DR_REF (dr));
1242 if (dump_enabled_p ())
1243 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1244 "Unknown misalignment, %snaturally aligned\n",
1245 is_packed ? "not " : "");
1246 return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1249 return true;
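/* Example of the known-misalignment check above (assumed values): with
   4-byte elements, a misalignment of 6 bytes gives 6 % 4 != 0, so no
   number of peeled scalar iterations can make the access aligned and
   false is returned; a misalignment of 8 bytes would be fine.  */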
1253 /* Calculate the cost of the memory access represented by DR. */
1255 static void
1256 vect_get_data_access_cost (struct data_reference *dr,
1257 unsigned int *inside_cost,
1258 unsigned int *outside_cost,
1259 stmt_vector_for_cost *body_cost_vec,
1260 stmt_vector_for_cost *prologue_cost_vec)
1262 gimple *stmt = DR_STMT (dr);
1263 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1264 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1265 int ncopies;
1267 if (PURE_SLP_STMT (stmt_info))
1268 ncopies = 1;
1269 else
1270 ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1272 if (DR_IS_READ (dr))
1273 vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1274 prologue_cost_vec, body_cost_vec, false);
1275 else
1276 vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
1278 if (dump_enabled_p ())
1279 dump_printf_loc (MSG_NOTE, vect_location,
1280 "vect_get_data_access_cost: inside_cost = %d, "
1281 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1285 typedef struct _vect_peel_info
1287 struct data_reference *dr;
1288 int npeel;
1289 unsigned int count;
1290 } *vect_peel_info;
1292 typedef struct _vect_peel_extended_info
1294 struct _vect_peel_info peel_info;
1295 unsigned int inside_cost;
1296 unsigned int outside_cost;
1297 } *vect_peel_extended_info;
1300 /* Peeling hashtable helpers. */
1302 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1304 static inline hashval_t hash (const _vect_peel_info *);
1305 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1308 inline hashval_t
1309 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1311 return (hashval_t) peel_info->npeel;
1314 inline bool
1315 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1317 return (a->npeel == b->npeel);
1321 /* Insert DR into peeling hash table with NPEEL as key. */
1323 static void
1324 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1325 loop_vec_info loop_vinfo, struct data_reference *dr,
1326 int npeel)
1328 struct _vect_peel_info elem, *slot;
1329 _vect_peel_info **new_slot;
1330 bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1332 elem.npeel = npeel;
1333 slot = peeling_htab->find (&elem);
1334 if (slot)
1335 slot->count++;
1336 else
1338 slot = XNEW (struct _vect_peel_info);
1339 slot->npeel = npeel;
1340 slot->dr = dr;
1341 slot->count = 1;
1342 new_slot = peeling_htab->find_slot (slot, INSERT);
1343 *new_slot = slot;
1346 if (!supportable_dr_alignment
1347 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1348 slot->count += VECT_MAX_COST;
1352 /* Traverse peeling hash table to find peeling option that aligns maximum
1353 number of data accesses. */
1356 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1357 _vect_peel_extended_info *max)
1359 vect_peel_info elem = *slot;
1361 if (elem->count > max->peel_info.count
1362 || (elem->count == max->peel_info.count
1363 && max->peel_info.npeel > elem->npeel))
1365 max->peel_info.npeel = elem->npeel;
1366 max->peel_info.count = elem->count;
1367 max->peel_info.dr = elem->dr;
1370 return 1;
1373 /* Get the costs of peeling NPEEL iterations checking data access costs
1374 for all data refs. If UNKNOWN_MISALIGNMENT is true, we assume DR0's
1375 misalignment will be zero after peeling. */
1377 static void
1378 vect_get_peeling_costs_all_drs (vec<data_reference_p> datarefs,
1379 struct data_reference *dr0,
1380 unsigned int *inside_cost,
1381 unsigned int *outside_cost,
1382 stmt_vector_for_cost *body_cost_vec,
1383 stmt_vector_for_cost *prologue_cost_vec,
1384 unsigned int npeel,
1385 bool unknown_misalignment)
1387 unsigned i;
1388 data_reference *dr;
1390 FOR_EACH_VEC_ELT (datarefs, i, dr)
1392 gimple *stmt = DR_STMT (dr);
1393 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1394 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1395 continue;
1397 /* For interleaving, only the alignment of the first access
1398 matters. */
1399 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1400 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1401 continue;
1403 /* Strided accesses perform only component accesses, alignment is
1404 irrelevant for them. */
1405 if (STMT_VINFO_STRIDED_P (stmt_info)
1406 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1407 continue;
1409 int save_misalignment;
1410 save_misalignment = DR_MISALIGNMENT (dr);
1411 if (npeel == 0)
1413 else if (unknown_misalignment && dr == dr0)
1414 SET_DR_MISALIGNMENT (dr, 0);
1415 else
1416 vect_update_misalignment_for_peel (dr, dr0, npeel);
1417 vect_get_data_access_cost (dr, inside_cost, outside_cost,
1418 body_cost_vec, prologue_cost_vec);
1419 SET_DR_MISALIGNMENT (dr, save_misalignment);
1423 /* Traverse peeling hash table and calculate cost for each peeling option.
1424 Find the one with the lowest cost. */
1427 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1428 _vect_peel_extended_info *min)
1430 vect_peel_info elem = *slot;
1431 int dummy;
1432 unsigned int inside_cost = 0, outside_cost = 0;
1433 gimple *stmt = DR_STMT (elem->dr);
1434 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1435 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1436 stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1437 epilogue_cost_vec;
1439 prologue_cost_vec.create (2);
1440 body_cost_vec.create (2);
1441 epilogue_cost_vec.create (2);
1443 vect_get_peeling_costs_all_drs (LOOP_VINFO_DATAREFS (loop_vinfo),
1444 elem->dr, &inside_cost, &outside_cost,
1445 &body_cost_vec, &prologue_cost_vec,
1446 elem->npeel, false);
1448 body_cost_vec.release ();
1450 outside_cost += vect_get_known_peeling_cost
1451 (loop_vinfo, elem->npeel, &dummy,
1452 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1453 &prologue_cost_vec, &epilogue_cost_vec);
1455 /* Prologue and epilogue costs are added to the target model later.
1456 These costs depend only on the scalar iteration cost, the
1457 number of peeling iterations finally chosen, and the number of
1458 misaligned statements. So discard the information found here. */
1459 prologue_cost_vec.release ();
1460 epilogue_cost_vec.release ();
1462 if (inside_cost < min->inside_cost
1463 || (inside_cost == min->inside_cost
1464 && outside_cost < min->outside_cost))
1466 min->inside_cost = inside_cost;
1467 min->outside_cost = outside_cost;
1468 min->peel_info.dr = elem->dr;
1469 min->peel_info.npeel = elem->npeel;
1470 min->peel_info.count = elem->count;
1473 return 1;
1477 /* Choose best peeling option by traversing peeling hash table and either
1478 choosing an option with the lowest cost (if cost model is enabled) or the
1479 option that aligns as many accesses as possible. */
1481 static struct _vect_peel_extended_info
1482 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1483 loop_vec_info loop_vinfo)
1485 struct _vect_peel_extended_info res;
1487 res.peel_info.dr = NULL;
1489 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1491 res.inside_cost = INT_MAX;
1492 res.outside_cost = INT_MAX;
1493 peeling_htab->traverse <_vect_peel_extended_info *,
1494 vect_peeling_hash_get_lowest_cost> (&res);
1496 else
1498 res.peel_info.count = 0;
1499 peeling_htab->traverse <_vect_peel_extended_info *,
1500 vect_peeling_hash_get_most_frequent> (&res);
1501 res.inside_cost = 0;
1502 res.outside_cost = 0;
1505 return res;
1508 /* Return true if the new peeling NPEEL is supported. */
1510 static bool
1511 vect_peeling_supportable (loop_vec_info loop_vinfo, struct data_reference *dr0,
1512 unsigned npeel)
1514 unsigned i;
1515 struct data_reference *dr = NULL;
1516 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1517 gimple *stmt;
1518 stmt_vec_info stmt_info;
1519 enum dr_alignment_support supportable_dr_alignment;
1521 /* Ensure that all data refs can be vectorized after the peel. */
1522 FOR_EACH_VEC_ELT (datarefs, i, dr)
1524 int save_misalignment;
1526 if (dr == dr0)
1527 continue;
1529 stmt = DR_STMT (dr);
1530 stmt_info = vinfo_for_stmt (stmt);
1531 /* For interleaving, only the alignment of the first access
1532 matters. */
1533 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1534 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1535 continue;
1537 /* Strided accesses perform only component accesses, alignment is
1538 irrelevant for them. */
1539 if (STMT_VINFO_STRIDED_P (stmt_info)
1540 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1541 continue;
1543 save_misalignment = DR_MISALIGNMENT (dr);
1544 vect_update_misalignment_for_peel (dr, dr0, npeel);
1545 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1546 SET_DR_MISALIGNMENT (dr, save_misalignment);
1548 if (!supportable_dr_alignment)
1549 return false;
1552 return true;
1555 /* Function vect_enhance_data_refs_alignment
1557 This pass will use loop versioning and loop peeling in order to enhance
1558 the alignment of data references in the loop.
1560 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1561 original loop is to be vectorized. Any other loops that are created by
1562 the transformations performed in this pass are not supposed to be
1563 vectorized. This restriction will be relaxed.
1565 This pass will require a cost model to guide it whether to apply peeling
1566 or versioning or a combination of the two. For example, the scheme that
1567 Intel uses when given a loop with several memory accesses is as follows:
1568 choose one memory access ('p') whose alignment you want to force by doing
1569 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1570 other accesses are not necessarily aligned, or (2) use loop versioning to
1571 generate one loop in which all accesses are aligned, and another loop in
1572 which only 'p' is necessarily aligned.
1574 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1575 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1576 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1578 Devising a cost model is the most critical aspect of this work. It will
1579 guide us on which access to peel for, whether to use loop versioning, how
1580 many versions to create, etc. The cost model will probably consist of
1581 generic considerations as well as target specific considerations (on
1582 powerpc for example, misaligned stores are more painful than misaligned
1583 loads).
1585 Here are the general steps involved in alignment enhancements:
1587 -- original loop, before alignment analysis:
1588 for (i=0; i<N; i++){
1589 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1590 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1593 -- After vect_compute_data_refs_alignment:
1594 for (i=0; i<N; i++){
1595 x = q[i]; # DR_MISALIGNMENT(q) = 3
1596 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1599 -- Possibility 1: we do loop versioning:
1600 if (p is aligned) {
1601 for (i=0; i<N; i++){ # loop 1A
1602 x = q[i]; # DR_MISALIGNMENT(q) = 3
1603 p[i] = y; # DR_MISALIGNMENT(p) = 0
1606 else {
1607 for (i=0; i<N; i++){ # loop 1B
1608 x = q[i]; # DR_MISALIGNMENT(q) = 3
1609 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1613 -- Possibility 2: we do loop peeling:
1614 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1615 x = q[i];
1616 p[i] = y;
1618 for (i = 3; i < N; i++){ # loop 2A
1619 x = q[i]; # DR_MISALIGNMENT(q) = 0
1620 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1623 -- Possibility 3: combination of loop peeling and versioning:
1624 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1625 x = q[i];
1626 p[i] = y;
1628 if (p is aligned) {
1629 for (i = 3; i<N; i++){ # loop 3A
1630 x = q[i]; # DR_MISALIGNMENT(q) = 0
1631 p[i] = y; # DR_MISALIGNMENT(p) = 0
1634 else {
1635 for (i = 3; i<N; i++){ # loop 3B
1636 x = q[i]; # DR_MISALIGNMENT(q) = 0
1637 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1641 These loops are later passed to loop_transform to be vectorized. The
1642 vectorizer will use the alignment information to guide the transformation
1643 (whether to generate regular loads/stores, or with special handling for
1644 misalignment). */
1646 bool
1647 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1649 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1650 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1651 enum dr_alignment_support supportable_dr_alignment;
1652 struct data_reference *dr0 = NULL, *first_store = NULL;
1653 struct data_reference *dr;
1654 unsigned int i, j;
1655 bool do_peeling = false;
1656 bool do_versioning = false;
1657 bool stat;
1658 gimple *stmt;
1659 stmt_vec_info stmt_info;
1660 unsigned int npeel = 0;
1661 bool one_misalignment_known = false;
1662 bool one_misalignment_unknown = false;
1663 bool one_dr_unsupportable = false;
1664 struct data_reference *unsupportable_dr = NULL;
1665 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1666 unsigned possible_npeel_number = 1;
1667 tree vectype;
1668 unsigned int mis, same_align_drs_max = 0;
1669 hash_table<peel_info_hasher> peeling_htab (1);
1671 if (dump_enabled_p ())
1672 dump_printf_loc (MSG_NOTE, vect_location,
1673 "=== vect_enhance_data_refs_alignment ===\n");
1675 /* Reset data so we can safely be called multiple times. */
1676 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1677 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1679 /* While cost model enhancements are expected in the future, the high level
1680 view of the code at this time is as follows:
1682 A) If there is a misaligned access then see if peeling to align
1683 this access can make all data references satisfy
1684 vect_supportable_dr_alignment. If so, update data structures
1685 as needed and return true.
1687 B) If peeling wasn't possible and there is a data reference with an
1688 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1689 then see if loop versioning checks can be used to make all data
1690 references satisfy vect_supportable_dr_alignment. If so, update
1691 data structures as needed and return true.
1693 C) If neither peeling nor versioning were successful then return false if
1694 any data reference does not satisfy vect_supportable_dr_alignment.
1696 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1698 Note, Possibility 3 above (which is peeling and versioning together) is not
1699 being done at this time. */
1701 /* (1) Peeling to force alignment. */
1703 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1704 Considerations:
1705 + How many accesses will become aligned due to the peeling
1706 - How many accesses will become unaligned due to the peeling,
1707 and the cost of misaligned accesses.
1708 - The cost of peeling (the extra runtime checks, the increase
1709 in code size). */
1711 FOR_EACH_VEC_ELT (datarefs, i, dr)
1713 stmt = DR_STMT (dr);
1714 stmt_info = vinfo_for_stmt (stmt);
1716 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1717 continue;
1719 /* For interleaving, only the alignment of the first access
1720 matters. */
1721 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1722 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1723 continue;
1725 /* For invariant accesses there is nothing to enhance. */
1726 if (integer_zerop (DR_STEP (dr)))
1727 continue;
1729 /* Strided accesses perform only component accesses, alignment is
1730 irrelevant for them. */
1731 if (STMT_VINFO_STRIDED_P (stmt_info)
1732 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1733 continue;
1735 supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1736 do_peeling = vector_alignment_reachable_p (dr);
1737 if (do_peeling)
1739 if (known_alignment_for_access_p (dr))
1741 unsigned int npeel_tmp = 0;
1742 bool negative = tree_int_cst_compare (DR_STEP (dr),
1743 size_zero_node) < 0;
1745 vectype = STMT_VINFO_VECTYPE (stmt_info);
1746 unsigned int target_align = DR_TARGET_ALIGNMENT (dr);
1747 unsigned int dr_size = vect_get_scalar_dr_size (dr);
1748 mis = (negative ? DR_MISALIGNMENT (dr) : -DR_MISALIGNMENT (dr));
1749 if (DR_MISALIGNMENT (dr) != 0)
1750 npeel_tmp = (mis & (target_align - 1)) / dr_size;
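/* Illustrative example (numbers not from the original source): with a
   16-byte target alignment, 4-byte elements (dr_size == 4) and a positive
   step, a data-ref with DR_MISALIGNMENT == 4 gives
     mis = -4, mis & (target_align - 1) = 12, npeel_tmp = 12 / 4 = 3,
   i.e. peeling 3 scalar iterations advances the address by 12 bytes and
   makes the access 16-byte aligned.  */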
1752 /* For multiple types, it is possible that the bigger type access
1753 will have more than one peeling option. E.g., a loop with two
1754 types: one of size (vector size / 4), and the other one of
1755 size (vector size / 8). The vectorization factor will be 8. If both
1756 accesses are misaligned by 3, the first one needs one scalar
1757 iteration to be aligned, and the second one needs 5. But the
1758 first one will be aligned also by peeling 5 scalar
1759 iterations, and in that case both accesses will be aligned.
1760 Hence, except for the immediate peeling amount, we also want
1761 to try to add full vector size, while we don't exceed
1762 vectorization factor.
1763 We do this automatically for the cost model, since we calculate
1764 the cost for every peeling option. */
1765 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1767 poly_uint64 nscalars = (STMT_SLP_TYPE (stmt_info)
1768 ? vf * DR_GROUP_SIZE (stmt_info) : vf);
1769 possible_npeel_number
1770 = vect_get_num_vectors (nscalars, vectype);
1772 /* NPEEL_TMP is 0 when there is no misalignment, but also
1773 allow peeling NELEMENTS. */
1774 if (DR_MISALIGNMENT (dr) == 0)
1775 possible_npeel_number++;
1778 /* Save info about DR in the hash table. Also include peeling
1779 amounts according to the explanation above. */
1780 for (j = 0; j < possible_npeel_number; j++)
1782 vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1783 dr, npeel_tmp);
1784 npeel_tmp += target_align / dr_size;
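/* Continuing the illustrative example above: with target_align 16 and
   dr_size 4, each further candidate adds target_align / dr_size = 4
   scalar iterations, so the peeling amounts recorded are 3, 7, 11, ...
   up to POSSIBLE_NPEEL_NUMBER entries, each of which leaves this
   data-ref 16-byte aligned.  */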
1787 one_misalignment_known = true;
1789 else
1791 /* If we don't know any misalignment values, we prefer
1792 peeling for the data-ref that has the maximum number of data-refs
1793 with the same alignment, unless the target prefers to align
1794 stores over loads. */
1795 unsigned same_align_drs
1796 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1797 if (!dr0
1798 || same_align_drs_max < same_align_drs)
1800 same_align_drs_max = same_align_drs;
1801 dr0 = dr;
1803 /* For data-refs with the same number of related
1804 accesses prefer the one where the misalign
1805 computation will be invariant in the outermost loop. */
1806 else if (same_align_drs_max == same_align_drs)
1808 struct loop *ivloop0, *ivloop;
1809 ivloop0 = outermost_invariant_loop_for_expr
1810 (loop, DR_BASE_ADDRESS (dr0));
1811 ivloop = outermost_invariant_loop_for_expr
1812 (loop, DR_BASE_ADDRESS (dr));
1813 if ((ivloop && !ivloop0)
1814 || (ivloop && ivloop0
1815 && flow_loop_nested_p (ivloop, ivloop0)))
1816 dr0 = dr;
1819 one_misalignment_unknown = true;
1821 /* Check for data refs with unsupportable alignment that
1822 can be peeled. */
1823 if (!supportable_dr_alignment)
1825 one_dr_unsupportable = true;
1826 unsupportable_dr = dr;
1829 if (!first_store && DR_IS_WRITE (dr))
1830 first_store = dr;
1833 else
1835 if (!aligned_access_p (dr))
1837 if (dump_enabled_p ())
1838 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1839 "vector alignment may not be reachable\n");
1840 break;
1845 /* Check if we can possibly peel the loop. */
1846 if (!vect_can_advance_ivs_p (loop_vinfo)
1847 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1848 || loop->inner)
1849 do_peeling = false;
1851 struct _vect_peel_extended_info peel_for_known_alignment;
1852 struct _vect_peel_extended_info peel_for_unknown_alignment;
1853 struct _vect_peel_extended_info best_peel;
1855 peel_for_unknown_alignment.inside_cost = INT_MAX;
1856 peel_for_unknown_alignment.outside_cost = INT_MAX;
1857 peel_for_unknown_alignment.peel_info.count = 0;
1859 if (do_peeling
1860 && one_misalignment_unknown)
1862 /* Check whether the target prefers stores over loads, i.e., whether
1863 misaligned stores are more expensive than misaligned loads (taking
1864 drs with the same alignment into account). */
1865 unsigned int load_inside_cost = 0;
1866 unsigned int load_outside_cost = 0;
1867 unsigned int store_inside_cost = 0;
1868 unsigned int store_outside_cost = 0;
1869 unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;
1871 stmt_vector_for_cost dummy;
1872 dummy.create (2);
1873 vect_get_peeling_costs_all_drs (datarefs, dr0,
1874 &load_inside_cost,
1875 &load_outside_cost,
1876 &dummy, &dummy, estimated_npeels, true);
1877 dummy.release ();
1879 if (first_store)
1881 dummy.create (2);
1882 vect_get_peeling_costs_all_drs (datarefs, first_store,
1883 &store_inside_cost,
1884 &store_outside_cost,
1885 &dummy, &dummy,
1886 estimated_npeels, true);
1887 dummy.release ();
1889 else
1891 store_inside_cost = INT_MAX;
1892 store_outside_cost = INT_MAX;
1895 if (load_inside_cost > store_inside_cost
1896 || (load_inside_cost == store_inside_cost
1897 && load_outside_cost > store_outside_cost))
1899 dr0 = first_store;
1900 peel_for_unknown_alignment.inside_cost = store_inside_cost;
1901 peel_for_unknown_alignment.outside_cost = store_outside_cost;
1903 else
1905 peel_for_unknown_alignment.inside_cost = load_inside_cost;
1906 peel_for_unknown_alignment.outside_cost = load_outside_cost;
1909 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1910 prologue_cost_vec.create (2);
1911 epilogue_cost_vec.create (2);
1913 int dummy2;
1914 peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
1915 (loop_vinfo, estimated_npeels, &dummy2,
1916 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1917 &prologue_cost_vec, &epilogue_cost_vec);
1919 prologue_cost_vec.release ();
1920 epilogue_cost_vec.release ();
1922 peel_for_unknown_alignment.peel_info.count = 1
1923 + STMT_VINFO_SAME_ALIGN_REFS
1924 (vinfo_for_stmt (DR_STMT (dr0))).length ();
1927 peel_for_unknown_alignment.peel_info.npeel = 0;
1928 peel_for_unknown_alignment.peel_info.dr = dr0;
1930 best_peel = peel_for_unknown_alignment;
1932 peel_for_known_alignment.inside_cost = INT_MAX;
1933 peel_for_known_alignment.outside_cost = INT_MAX;
1934 peel_for_known_alignment.peel_info.count = 0;
1935 peel_for_known_alignment.peel_info.dr = NULL;
1937 if (do_peeling && one_misalignment_known)
1939 /* Peeling is possible, but every data access is supported even when
1940 misaligned. So we try to choose the best possible peeling from
1941 the hash table. */
1942 peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
1943 (&peeling_htab, loop_vinfo);
1946 /* Compare costs of peeling for known and unknown alignment. */
1947 if (peel_for_known_alignment.peel_info.dr != NULL
1948 && peel_for_unknown_alignment.inside_cost
1949 >= peel_for_known_alignment.inside_cost)
1951 best_peel = peel_for_known_alignment;
1953 /* If the best peeling for known alignment has NPEEL == 0, perform no
1954 peeling at all except if there is an unsupportable dr that we can
1955 align. */
1956 if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
1957 do_peeling = false;
1960 /* If there is an unsupportable data ref, prefer this over all choices so far
1961 since we'd have to discard a chosen peeling except when it accidentally
1962 aligned the unsupportable data ref. */
1963 if (one_dr_unsupportable)
1964 dr0 = unsupportable_dr;
1965 else if (do_peeling)
1967 /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
1968 TODO: Use nopeel_outside_cost or get rid of it? */
1969 unsigned nopeel_inside_cost = 0;
1970 unsigned nopeel_outside_cost = 0;
1972 stmt_vector_for_cost dummy;
1973 dummy.create (2);
1974 vect_get_peeling_costs_all_drs (datarefs, NULL, &nopeel_inside_cost,
1975 &nopeel_outside_cost, &dummy, &dummy,
1976 0, false);
1977 dummy.release ();
1979 /* Add epilogue costs. As we do not peel for alignment here, no prologue
1980 costs will be recorded. */
1981 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1982 prologue_cost_vec.create (2);
1983 epilogue_cost_vec.create (2);
1985 int dummy2;
1986 nopeel_outside_cost += vect_get_known_peeling_cost
1987 (loop_vinfo, 0, &dummy2,
1988 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1989 &prologue_cost_vec, &epilogue_cost_vec);
1991 prologue_cost_vec.release ();
1992 epilogue_cost_vec.release ();
1994 npeel = best_peel.peel_info.npeel;
1995 dr0 = best_peel.peel_info.dr;
1997 /* If not peeling is no more expensive than the best peeling we
1998 have found so far, don't perform any peeling. */
1999 if (nopeel_inside_cost <= best_peel.inside_cost)
2000 do_peeling = false;
2003 if (do_peeling)
2005 stmt = DR_STMT (dr0);
2006 stmt_info = vinfo_for_stmt (stmt);
2007 vectype = STMT_VINFO_VECTYPE (stmt_info);
2009 if (known_alignment_for_access_p (dr0))
2011 bool negative = tree_int_cst_compare (DR_STEP (dr0),
2012 size_zero_node) < 0;
2013 if (!npeel)
2015 /* Since it's known at compile time, compute the number of
2016 iterations in the peeled loop (the peeling factor) for use in
2017 updating DR_MISALIGNMENT values. The peeling factor is the
2018 vectorization factor minus the misalignment as an element
2019 count. */
2020 mis = negative ? DR_MISALIGNMENT (dr0) : -DR_MISALIGNMENT (dr0);
2021 unsigned int target_align = DR_TARGET_ALIGNMENT (dr0);
2022 npeel = ((mis & (target_align - 1))
2023 / vect_get_scalar_dr_size (dr0));
2026 /* For interleaved data access every iteration accesses all the
2027 members of the group, therefore we divide the number of iterations
2028 by the group size. */
2029 stmt_info = vinfo_for_stmt (DR_STMT (dr0));
2030 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2031 npeel /= DR_GROUP_SIZE (stmt_info);
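/* For example (illustrative only): if the group has 4 interleaved
   members and NPEEL was computed above as 8 scalar elements, peeling
   8 / 4 = 2 loop iterations is what skips those 8 elements.  */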
2033 if (dump_enabled_p ())
2034 dump_printf_loc (MSG_NOTE, vect_location,
2035 "Try peeling by %d\n", npeel);
2038 /* Ensure that all datarefs can be vectorized after the peel. */
2039 if (!vect_peeling_supportable (loop_vinfo, dr0, npeel))
2040 do_peeling = false;
2042 /* Check if all datarefs are supportable and log. */
2043 if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
2045 stat = vect_verify_datarefs_alignment (loop_vinfo);
2046 if (!stat)
2047 do_peeling = false;
2048 else
2049 return stat;
2052 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
2053 if (do_peeling)
2055 unsigned max_allowed_peel
2056 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
2057 if (max_allowed_peel != (unsigned)-1)
2059 unsigned max_peel = npeel;
2060 if (max_peel == 0)
2062 unsigned int target_align = DR_TARGET_ALIGNMENT (dr0);
2063 max_peel = target_align / vect_get_scalar_dr_size (dr0) - 1;
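/* E.g. with a 16-byte target alignment and 4-byte elements the
   run-time peel count can be at most 16 / 4 - 1 = 3 iterations.  */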
2065 if (max_peel > max_allowed_peel)
2067 do_peeling = false;
2068 if (dump_enabled_p ())
2069 dump_printf_loc (MSG_NOTE, vect_location,
2070 "Disable peeling, max peels reached: %d\n", max_peel);
2075 /* Cost model #2 - if peeling may result in a remaining loop not
2076 iterating enough to be vectorized then do not peel. Since this
2077 is a cost heuristic rather than a correctness decision, use the
2078 most likely runtime value for variable vectorization factors. */
2079 if (do_peeling
2080 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2082 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2083 unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
2084 if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
2085 < assumed_vf + max_peel)
2086 do_peeling = false;
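/* Illustrative example: with an assumed VF of 4 and an unknown peel
   count (npeel == 0), max_peel is 3, so peeling is kept only when the
   loop is known to run at least 4 + 3 = 7 iterations.  */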
2089 if (do_peeling)
2091 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2092 If the misalignment of DR_i is identical to that of dr0 then set
2093 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
2094 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2095 by the peeling factor times the element size of DR_i (MOD the
2096 vectorization factor times the size). Otherwise, the
2097 misalignment of DR_i must be set to unknown. */
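/* A small worked example with hypothetical numbers, following the rule
   described above: if DR_i has 4-byte elements, a known misalignment
   of 8, NPEEL is 3 and the modulus is 16, the updated misalignment is
   (8 + 3 * 4) mod 16 = 4.  */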
2098 FOR_EACH_VEC_ELT (datarefs, i, dr)
2099 if (dr != dr0)
2101 /* Strided accesses perform only component accesses, alignment
2102 is irrelevant for them. */
2103 stmt_info = vinfo_for_stmt (DR_STMT (dr));
2104 if (STMT_VINFO_STRIDED_P (stmt_info)
2105 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2106 continue;
2108 vect_update_misalignment_for_peel (dr, dr0, npeel);
2111 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
2112 if (npeel)
2113 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2114 else
2115 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2116 = DR_MISALIGNMENT (dr0);
2117 SET_DR_MISALIGNMENT (dr0, 0);
2118 if (dump_enabled_p ())
2120 dump_printf_loc (MSG_NOTE, vect_location,
2121 "Alignment of access forced using peeling.\n");
2122 dump_printf_loc (MSG_NOTE, vect_location,
2123 "Peeling for alignment will be applied.\n");
2126 /* The inside-loop cost will be accounted for in vectorizable_load
2127 and vectorizable_store correctly with adjusted alignments.
2128 Drop the body_cst_vec on the floor here. */
2129 stat = vect_verify_datarefs_alignment (loop_vinfo);
2130 gcc_assert (stat);
2131 return stat;
2135 /* (2) Versioning to force alignment. */
2137 /* Try versioning if:
2138 1) the loop is being optimized for speed, and
2139 2) there is at least one unsupported misaligned data ref with an unknown
2140 misalignment, and
2141 3) all misaligned data refs with a known misalignment are supported, and
2142 4) the number of runtime alignment checks is within reason. */
2144 do_versioning =
2145 optimize_loop_nest_for_speed_p (loop)
2146 && (!loop->inner); /* FORNOW */
2148 if (do_versioning)
2150 FOR_EACH_VEC_ELT (datarefs, i, dr)
2152 stmt = DR_STMT (dr);
2153 stmt_info = vinfo_for_stmt (stmt);
2155 /* For interleaving, only the alignment of the first access
2156 matters. */
2157 if (aligned_access_p (dr)
2158 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2159 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt))
2160 continue;
2162 if (STMT_VINFO_STRIDED_P (stmt_info))
2164 /* Strided loads perform only component accesses, alignment is
2165 irrelevant for them. */
2166 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
2167 continue;
2168 do_versioning = false;
2169 break;
2172 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
2174 if (!supportable_dr_alignment)
2176 gimple *stmt;
2177 int mask;
2178 tree vectype;
2180 if (known_alignment_for_access_p (dr)
2181 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2182 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
2184 do_versioning = false;
2185 break;
2188 stmt = DR_STMT (dr);
2189 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
2190 gcc_assert (vectype);
2192 /* At present we don't support versioning for alignment
2193 with variable VF, since there's no guarantee that the
2194 VF is a power of two. We could relax this if we added
2195 a way of enforcing a power-of-two size. */
2196 unsigned HOST_WIDE_INT size;
2197 if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
2199 do_versioning = false;
2200 break;
2203 /* The rightmost bits of an aligned address must be zeros.
2204 Construct the mask needed for this test. For example,
2205 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2206 mask must be 15 = 0xf. */
2207 mask = size - 1;
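/* Conceptually (a sketch only, not necessarily the exact code
   generated elsewhere), the versioning guard then tests something like
     (((uintptr_t) addr1 | (uintptr_t) addr2 | ...) & mask) == 0
   so the vectorized version runs only when every recorded address is
   suitably aligned.  */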
2209 /* FORNOW: use the same mask to test all potentially unaligned
2210 references in the loop. The vectorizer currently supports
2211 a single vector size, see the reference to
2212 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
2213 vectorization factor is computed. */
2214 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
2215 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
2216 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2217 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
2218 DR_STMT (dr));
2222 /* Versioning requires at least one misaligned data reference. */
2223 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2224 do_versioning = false;
2225 else if (!do_versioning)
2226 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2229 if (do_versioning)
2231 vec<gimple *> may_misalign_stmts
2232 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2233 gimple *stmt;
2235 /* It can now be assumed that the data references in the statements
2236 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2237 of the loop being vectorized. */
2238 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
2240 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2241 dr = STMT_VINFO_DATA_REF (stmt_info);
2242 SET_DR_MISALIGNMENT (dr, 0);
2243 if (dump_enabled_p ())
2244 dump_printf_loc (MSG_NOTE, vect_location,
2245 "Alignment of access forced using versioning.\n");
2248 if (dump_enabled_p ())
2249 dump_printf_loc (MSG_NOTE, vect_location,
2250 "Versioning for alignment will be applied.\n");
2252 /* Peeling and versioning can't be done together at this time. */
2253 gcc_assert (! (do_peeling && do_versioning));
2255 stat = vect_verify_datarefs_alignment (loop_vinfo);
2256 gcc_assert (stat);
2257 return stat;
2260 /* This point is reached if neither peeling nor versioning is being done. */
2261 gcc_assert (! (do_peeling || do_versioning));
2263 stat = vect_verify_datarefs_alignment (loop_vinfo);
2264 return stat;
2268 /* Function vect_find_same_alignment_drs.
2270 Update group and alignment relations according to the chosen
2271 vectorization factor. */
2273 static void
2274 vect_find_same_alignment_drs (struct data_dependence_relation *ddr)
2276 struct data_reference *dra = DDR_A (ddr);
2277 struct data_reference *drb = DDR_B (ddr);
2278 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2279 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2281 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
2282 return;
2284 if (dra == drb)
2285 return;
2287 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)
2288 || !operand_equal_p (DR_OFFSET (dra), DR_OFFSET (drb), 0)
2289 || !operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2290 return;
2292 /* Two references with distance zero have the same alignment. */
2293 poly_offset_int diff = (wi::to_poly_offset (DR_INIT (dra))
2294 - wi::to_poly_offset (DR_INIT (drb)));
2295 if (maybe_ne (diff, 0))
2297 /* Get the wider of the two alignments. */
2298 unsigned int align_a = (vect_calculate_target_alignment (dra)
2299 / BITS_PER_UNIT);
2300 unsigned int align_b = (vect_calculate_target_alignment (drb)
2301 / BITS_PER_UNIT);
2302 unsigned int max_align = MAX (align_a, align_b);
2304 /* Require the gap to be a multiple of the larger vector alignment. */
2305 if (!multiple_p (diff, max_align))
2306 return;
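/* Example (illustrative): with a 16-byte vector alignment on both refs,
   a gap of 32 bytes keeps the two refs in the same alignment class,
   while a gap of 24 bytes does not, since 24 is not a multiple of 16.  */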
2309 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2310 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
2311 if (dump_enabled_p ())
2313 dump_printf_loc (MSG_NOTE, vect_location,
2314 "accesses have the same alignment: ");
2315 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2316 dump_printf (MSG_NOTE, " and ");
2317 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2318 dump_printf (MSG_NOTE, "\n");
2323 /* Function vect_analyze_data_refs_alignment
2325 Analyze the alignment of the data-references in the loop.
2326 Return FALSE if a data reference is found that cannot be vectorized. */
2328 bool
2329 vect_analyze_data_refs_alignment (loop_vec_info vinfo)
2331 if (dump_enabled_p ())
2332 dump_printf_loc (MSG_NOTE, vect_location,
2333 "=== vect_analyze_data_refs_alignment ===\n");
2335 /* Mark groups of data references with the same alignment using
2336 data dependence information. */
2337 vec<ddr_p> ddrs = vinfo->ddrs;
2338 struct data_dependence_relation *ddr;
2339 unsigned int i;
2341 FOR_EACH_VEC_ELT (ddrs, i, ddr)
2342 vect_find_same_alignment_drs (ddr);
2344 vec<data_reference_p> datarefs = vinfo->datarefs;
2345 struct data_reference *dr;
2347 vect_record_base_alignments (vinfo);
2348 FOR_EACH_VEC_ELT (datarefs, i, dr)
2350 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
2351 if (STMT_VINFO_VECTORIZABLE (stmt_info)
2352 && !vect_compute_data_ref_alignment (dr))
2354 /* Strided accesses perform only component accesses, misalignment
2355 information is irrelevant for them. */
2356 if (STMT_VINFO_STRIDED_P (stmt_info)
2357 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2358 continue;
2360 if (dump_enabled_p ())
2361 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2362 "not vectorized: can't calculate alignment "
2363 "for data ref.\n");
2365 return false;
2369 return true;
2373 /* Analyze alignment of DRs of stmts in NODE. */
2375 static bool
2376 vect_slp_analyze_and_verify_node_alignment (slp_tree node)
2378 /* We vectorize from the first scalar stmt in the node unless
2379 the node is permuted in which case we start from the first
2380 element in the group. */
2381 gimple *first_stmt = SLP_TREE_SCALAR_STMTS (node)[0];
2382 data_reference_p first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2383 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2384 first_stmt = DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_stmt));
2386 data_reference_p dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2387 if (! vect_compute_data_ref_alignment (dr)
2388 /* For creating the data-ref pointer we need alignment of the
2389 first element anyway. */
2390 || (dr != first_dr
2391 && ! vect_compute_data_ref_alignment (first_dr))
2392 || ! verify_data_ref_alignment (dr))
2394 if (dump_enabled_p ())
2395 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2396 "not vectorized: bad data alignment in basic "
2397 "block.\n");
2398 return false;
2401 return true;
2404 /* Function vect_slp_analyze_and_verify_instance_alignment
2406 Analyze the alignment of the data-references in the SLP instance.
2407 Return FALSE if a data reference is found that cannot be vectorized. */
2409 bool
2410 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
2412 if (dump_enabled_p ())
2413 dump_printf_loc (MSG_NOTE, vect_location,
2414 "=== vect_slp_analyze_and_verify_instance_alignment ===\n");
2416 slp_tree node;
2417 unsigned i;
2418 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2419 if (! vect_slp_analyze_and_verify_node_alignment (node))
2420 return false;
2422 node = SLP_INSTANCE_TREE (instance);
2423 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]))
2424 && ! vect_slp_analyze_and_verify_node_alignment
2425 (SLP_INSTANCE_TREE (instance)))
2426 return false;
2428 return true;
2432 /* Analyze groups of accesses: check that DR belongs to a group of
2433 accesses of legal size, step, etc. Detect gaps, single element
2434 interleaving, and other special cases. Set grouped access info.
2435 Collect groups of strided stores for further use in SLP analysis.
2436 Worker for vect_analyze_group_access. */
2438 static bool
2439 vect_analyze_group_access_1 (struct data_reference *dr)
2441 tree step = DR_STEP (dr);
2442 tree scalar_type = TREE_TYPE (DR_REF (dr));
2443 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2444 gimple *stmt = DR_STMT (dr);
2445 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2446 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2447 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2448 HOST_WIDE_INT dr_step = -1;
2449 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2450 bool slp_impossible = false;
2452 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2453 size of the interleaving group (including gaps). */
2454 if (tree_fits_shwi_p (step))
2456 dr_step = tree_to_shwi (step);
2457 /* Check that STEP is a multiple of type size. Otherwise there is
2458 a non-element-sized gap at the end of the group which we
2459 cannot represent in DR_GROUP_GAP or DR_GROUP_SIZE.
2460 ??? As we can handle non-constant step fine here we should
2461 simply remove uses of DR_GROUP_GAP between the last and first
2462 element and instead rely on DR_STEP. DR_GROUP_SIZE then would
2463 simply not include that gap. */
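/* For instance (illustrative numbers): a step of 24 bytes over 8-byte
   elements gives a group size of 24 / 8 = 3, whereas a step of 20 bytes
   over 8-byte elements is rejected below because the 4-byte remainder
   cannot be represented as a whole-element gap.  */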
2464 if ((dr_step % type_size) != 0)
2466 if (dump_enabled_p ())
2468 dump_printf_loc (MSG_NOTE, vect_location,
2469 "Step ");
2470 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2471 dump_printf (MSG_NOTE,
2472 " is not a multiple of the element size for ");
2473 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2474 dump_printf (MSG_NOTE, "\n");
2476 return false;
2478 groupsize = absu_hwi (dr_step) / type_size;
2480 else
2481 groupsize = 0;
2483 /* A non-consecutive access is possible only if it is part of an interleaving group. */
2484 if (!DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2486 /* Check whether this DR is part of an interleaving group, and is a single
2487 element of the group that is accessed in the loop. */
2489 /* Gaps are supported only for loads. STEP must be a multiple of the type
2490 size. */
2491 if (DR_IS_READ (dr)
2492 && (dr_step % type_size) == 0
2493 && groupsize > 0)
2495 DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2496 DR_GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2497 DR_GROUP_GAP (stmt_info) = groupsize - 1;
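/* E.g. (illustrative): a load of a[4*i] from an array of 4-byte ints
   with a 16-byte step forms a single-element group of size 4 with a
   gap of 3 unused elements per iteration.  */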
2498 if (dump_enabled_p ())
2500 dump_printf_loc (MSG_NOTE, vect_location,
2501 "Detected single element interleaving ");
2502 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2503 dump_printf (MSG_NOTE, " step ");
2504 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2505 dump_printf (MSG_NOTE, "\n");
2508 return true;
2511 if (dump_enabled_p ())
2513 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2514 "not consecutive access ");
2515 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2518 if (bb_vinfo)
2520 /* Mark the statement as unvectorizable. */
2521 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2522 return true;
2525 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2526 STMT_VINFO_STRIDED_P (stmt_info) = true;
2527 return true;
2530 if (DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2532 /* First stmt in the interleaving chain. Check the chain. */
2533 gimple *next = DR_GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2534 struct data_reference *data_ref = dr;
2535 unsigned int count = 1;
2536 tree prev_init = DR_INIT (data_ref);
2537 gimple *prev = stmt;
2538 HOST_WIDE_INT diff, gaps = 0;
2540 /* By construction, all group members have INTEGER_CST DR_INITs. */
2541 while (next)
2543 /* Skip same data-refs. If two or more stmts share a data-ref
2544 (supported only for loads), we vectorize only the first
2545 stmt, and the rest get their vectorized loads from the first
2546 one. */
2547 if (!tree_int_cst_compare (DR_INIT (data_ref),
2548 DR_INIT (STMT_VINFO_DATA_REF (
2549 vinfo_for_stmt (next)))))
2551 if (DR_IS_WRITE (data_ref))
2553 if (dump_enabled_p ())
2554 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2555 "Two store stmts share the same dr.\n");
2556 return false;
2559 if (dump_enabled_p ())
2560 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2561 "Two or more load stmts share the same dr.\n");
2563 /* For loads, reuse the data-ref load of the first stmt. */
2564 DR_GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2566 prev = next;
2567 next = DR_GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2568 continue;
2571 prev = next;
2572 data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2574 /* All group members have the same STEP by construction. */
2575 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2577 /* Check that the distance between two accesses is equal to the type
2578 size. Otherwise, we have gaps. */
2579 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2580 - TREE_INT_CST_LOW (prev_init)) / type_size;
2581 if (diff != 1)
2583 /* FORNOW: SLP of accesses with gaps is not supported. */
2584 slp_impossible = true;
2585 if (DR_IS_WRITE (data_ref))
2587 if (dump_enabled_p ())
2588 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2589 "interleaved store with gaps\n");
2590 return false;
2593 gaps += diff - 1;
2596 last_accessed_element += diff;
2598 /* Store the gap from the previous member of the group. If there is no
2599 gap in the access, DR_GROUP_GAP is always 1. */
2600 DR_GROUP_GAP (vinfo_for_stmt (next)) = diff;
2602 prev_init = DR_INIT (data_ref);
2603 next = DR_GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2604 /* Count the number of data-refs in the chain. */
2605 count++;
2608 if (groupsize == 0)
2609 groupsize = count + gaps;
2611 /* This could be UINT_MAX but as we are generating code in a very
2612 inefficient way we have to cap earlier. See PR78699 for example. */
2613 if (groupsize > 4096)
2615 if (dump_enabled_p ())
2616 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2617 "group is too large\n");
2618 return false;
2621 /* Check that the size of the interleaving is equal to count for stores,
2622 i.e., that there are no gaps. */
2623 if (groupsize != count
2624 && !DR_IS_READ (dr))
2626 if (dump_enabled_p ())
2627 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2628 "interleaved store with gaps\n");
2629 return false;
2632 /* If there is a gap after the last load in the group it is the
2633 difference between the groupsize and the last accessed
2634 element.
2635 When there is no gap, this difference should be 0. */
2636 DR_GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - last_accessed_element;
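/* Worked example (illustrative): loads of a[4*i], a[4*i + 1] and
   a[4*i + 3] with 4-byte elements and a 16-byte step give
   groupsize = 4, count = 3, gaps = 1 and last_accessed_element = 4;
   the walk above records DR_GROUP_GAP of 1 and 2 for the second and
   third members, and the trailing gap stored here is 4 - 4 = 0.  */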
2638 DR_GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2639 if (dump_enabled_p ())
2641 dump_printf_loc (MSG_NOTE, vect_location,
2642 "Detected interleaving ");
2643 if (DR_IS_READ (dr))
2644 dump_printf (MSG_NOTE, "load ");
2645 else
2646 dump_printf (MSG_NOTE, "store ");
2647 dump_printf (MSG_NOTE, "of size %u starting with ",
2648 (unsigned)groupsize);
2649 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
2650 if (DR_GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
2651 dump_printf_loc (MSG_NOTE, vect_location,
2652 "There is a gap of %u elements after the group\n",
2653 DR_GROUP_GAP (vinfo_for_stmt (stmt)));
2656 /* SLP: create an SLP data structure for every interleaving group of
2657 stores for further analysis in vect_analyze_slp. */
2658 if (DR_IS_WRITE (dr) && !slp_impossible)
2660 if (loop_vinfo)
2661 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2662 if (bb_vinfo)
2663 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2667 return true;
2670 /* Analyze groups of accesses: check that DR belongs to a group of
2671 accesses of legal size, step, etc. Detect gaps, single element
2672 interleaving, and other special cases. Set grouped access info.
2673 Collect groups of strided stores for further use in SLP analysis. */
2675 static bool
2676 vect_analyze_group_access (struct data_reference *dr)
2678 if (!vect_analyze_group_access_1 (dr))
2680 /* Dissolve the group if present. */
2681 gimple *next;
2682 gimple *stmt = DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dr)));
2683 while (stmt)
2685 stmt_vec_info vinfo = vinfo_for_stmt (stmt);
2686 next = DR_GROUP_NEXT_ELEMENT (vinfo);
2687 DR_GROUP_FIRST_ELEMENT (vinfo) = NULL;
2688 DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2689 stmt = next;
2691 return false;
2693 return true;
2696 /* Analyze the access pattern of the data-reference DR.
2697 In case of non-consecutive accesses call vect_analyze_group_access() to
2698 analyze groups of accesses. */
2700 static bool
2701 vect_analyze_data_ref_access (struct data_reference *dr)
2703 tree step = DR_STEP (dr);
2704 tree scalar_type = TREE_TYPE (DR_REF (dr));
2705 gimple *stmt = DR_STMT (dr);
2706 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2707 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2708 struct loop *loop = NULL;
2710 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
2711 return true;
2713 if (loop_vinfo)
2714 loop = LOOP_VINFO_LOOP (loop_vinfo);
2716 if (loop_vinfo && !step)
2718 if (dump_enabled_p ())
2719 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2720 "bad data-ref access in loop\n");
2721 return false;
2724 /* Allow loads with zero step in inner-loop vectorization. */
2725 if (loop_vinfo && integer_zerop (step))
2727 DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2728 if (!nested_in_vect_loop_p (loop, stmt))
2729 return DR_IS_READ (dr);
2730 /* Allow references with zero step for outer loops marked
2731 with pragma omp simd only - it guarantees absence of
2732 loop-carried dependencies between inner loop iterations. */
2733 if (loop->safelen < 2)
2735 if (dump_enabled_p ())
2736 dump_printf_loc (MSG_NOTE, vect_location,
2737 "zero step in inner loop of nest\n");
2738 return false;
2742 if (loop && nested_in_vect_loop_p (loop, stmt))
2744 /* Interleaved accesses are not yet supported within outer-loop
2745 vectorization for references in the inner-loop. */
2746 DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2748 /* For the rest of the analysis we use the outer-loop step. */
2749 step = STMT_VINFO_DR_STEP (stmt_info);
2750 if (integer_zerop (step))
2752 if (dump_enabled_p ())
2753 dump_printf_loc (MSG_NOTE, vect_location,
2754 "zero step in outer loop.\n");
2755 return DR_IS_READ (dr);
2759 /* Consecutive? */
2760 if (TREE_CODE (step) == INTEGER_CST)
2762 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2763 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2764 || (dr_step < 0
2765 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2767 /* Mark that it is not interleaving. */
2768 DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2769 return true;
2773 if (loop && nested_in_vect_loop_p (loop, stmt))
2775 if (dump_enabled_p ())
2776 dump_printf_loc (MSG_NOTE, vect_location,
2777 "grouped access in outer loop.\n");
2778 return false;
2782 /* Assume this is a DR handled by the non-constant strided load case. */
2783 if (TREE_CODE (step) != INTEGER_CST)
2784 return (STMT_VINFO_STRIDED_P (stmt_info)
2785 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2786 || vect_analyze_group_access (dr)));
2788 /* Not a consecutive access - check if it's part of an interleaving group. */
2789 return vect_analyze_group_access (dr);
2792 /* Comparison callback used to sort data-references DRA and DRB so that
2793 references which may form a group end up adjacent. */
2795 static int
2796 dr_group_sort_cmp (const void *dra_, const void *drb_)
2798 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2799 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2800 int cmp;
2802 /* Stabilize sort. */
2803 if (dra == drb)
2804 return 0;
2806 /* DRs in different loops never belong to the same group. */
2807 loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father;
2808 loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father;
2809 if (loopa != loopb)
2810 return loopa->num < loopb->num ? -1 : 1;
2812 /* Ordering of DRs according to base. */
2813 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2814 DR_BASE_ADDRESS (drb));
2815 if (cmp != 0)
2816 return cmp;
2818 /* And according to DR_OFFSET. */
2819 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2820 if (cmp != 0)
2821 return cmp;
2823 /* Put reads before writes. */
2824 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2825 return DR_IS_READ (dra) ? -1 : 1;
2827 /* Then sort by access size. */
2828 cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2829 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2830 if (cmp != 0)
2831 return cmp;
2833 /* And by step. */
2834 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2835 if (cmp != 0)
2836 return cmp;
2838 /* Then sort by DR_INIT. For otherwise identical DRs, sort by stmt UID. */
2839 cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
2840 if (cmp == 0)
2841 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2842 return cmp;
2845 /* If OP is the result of a conversion, return the unconverted value,
2846 otherwise return null. */
2848 static tree
2849 strip_conversion (tree op)
2851 if (TREE_CODE (op) != SSA_NAME)
2852 return NULL_TREE;
2853 gimple *stmt = SSA_NAME_DEF_STMT (op);
2854 if (!is_gimple_assign (stmt)
2855 || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt)))
2856 return NULL_TREE;
2857 return gimple_assign_rhs1 (stmt);
2860 /* Return true if vectorizable_* routines can handle statements STMT1
2861 and STMT2 being in a single group. */
2863 static bool
2864 can_group_stmts_p (gimple *stmt1, gimple *stmt2)
2866 if (gimple_assign_single_p (stmt1))
2867 return gimple_assign_single_p (stmt2);
2869 if (is_gimple_call (stmt1) && gimple_call_internal_p (stmt1))
2871 /* Check for two masked loads or two masked stores. */
2872 if (!is_gimple_call (stmt2) || !gimple_call_internal_p (stmt2))
2873 return false;
2874 internal_fn ifn = gimple_call_internal_fn (stmt1);
2875 if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE)
2876 return false;
2877 if (ifn != gimple_call_internal_fn (stmt2))
2878 return false;
2880 /* Check that the masks are the same. Cope with casts of masks,
2881 like those created by build_mask_conversion. */
2882 tree mask1 = gimple_call_arg (stmt1, 2);
2883 tree mask2 = gimple_call_arg (stmt2, 2);
2884 if (!operand_equal_p (mask1, mask2, 0))
2886 mask1 = strip_conversion (mask1);
2887 if (!mask1)
2888 return false;
2889 mask2 = strip_conversion (mask2);
2890 if (!mask2)
2891 return false;
2892 if (!operand_equal_p (mask1, mask2, 0))
2893 return false;
2895 return true;
2898 return false;
2901 /* Function vect_analyze_data_ref_accesses.
2903 Analyze the access pattern of all the data references in the loop.
2905 FORNOW: the only access pattern that is considered vectorizable is a
2906 simple step 1 (consecutive) access.
2908 FORNOW: handle only arrays and pointer accesses. */
2910 bool
2911 vect_analyze_data_ref_accesses (vec_info *vinfo)
2913 unsigned int i;
2914 vec<data_reference_p> datarefs = vinfo->datarefs;
2915 struct data_reference *dr;
2917 if (dump_enabled_p ())
2918 dump_printf_loc (MSG_NOTE, vect_location,
2919 "=== vect_analyze_data_ref_accesses ===\n");
2921 if (datarefs.is_empty ())
2922 return true;
2924 /* Sort the array of datarefs to make building the interleaving chains
2925 linear. Don't modify the original vector's order; it is needed for
2926 determining what dependencies are reversed. */
2927 vec<data_reference_p> datarefs_copy = datarefs.copy ();
2928 datarefs_copy.qsort (dr_group_sort_cmp);
2930 /* Build the interleaving chains. */
2931 for (i = 0; i < datarefs_copy.length () - 1;)
2933 data_reference_p dra = datarefs_copy[i];
2934 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2935 stmt_vec_info lastinfo = NULL;
2936 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
2937 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a))
2939 ++i;
2940 continue;
2942 for (i = i + 1; i < datarefs_copy.length (); ++i)
2944 data_reference_p drb = datarefs_copy[i];
2945 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2946 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b)
2947 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
2948 break;
2950 /* ??? Imperfect sorting (non-compatible types, non-modulo
2951 accesses, same accesses) can lead to a group being artificially
2952 split here as we don't just skip over those. If it really
2953 matters we can push those to a worklist and re-iterate
2954 over them. Then we can just skip ahead to the next DR here. */
2956 /* DRs in a different loop should not be put into the same
2957 interleaving group. */
2958 if (gimple_bb (DR_STMT (dra))->loop_father
2959 != gimple_bb (DR_STMT (drb))->loop_father)
2960 break;
2962 /* Check that the data-refs have the same first location (except init)
2963 and that they are both either loads or stores (not a load and a
2964 store, and not a mix of masked and unmasked accesses). */
2965 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2966 || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2967 DR_BASE_ADDRESS (drb)) != 0
2968 || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
2969 || !can_group_stmts_p (DR_STMT (dra), DR_STMT (drb)))
2970 break;
2972 /* Check that the data-refs have the same constant size. */
2973 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2974 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2975 if (!tree_fits_uhwi_p (sza)
2976 || !tree_fits_uhwi_p (szb)
2977 || !tree_int_cst_equal (sza, szb))
2978 break;
2980 /* Check that the data-refs have the same step. */
2981 if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
2982 break;
2984 /* Check the types are compatible.
2985 ??? We don't distinguish this during sorting. */
2986 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2987 TREE_TYPE (DR_REF (drb))))
2988 break;
2990 /* Check that the DR_INITs are compile-time constants. */
2991 if (TREE_CODE (DR_INIT (dra)) != INTEGER_CST
2992 || TREE_CODE (DR_INIT (drb)) != INTEGER_CST)
2993 break;
2995 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
2996 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2997 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2998 HOST_WIDE_INT init_prev
2999 = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]));
3000 gcc_assert (init_a <= init_b
3001 && init_a <= init_prev
3002 && init_prev <= init_b);
3004 /* Do not place the same access in the interleaving chain twice. */
3005 if (init_b == init_prev)
3007 gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]))
3008 < gimple_uid (DR_STMT (drb)));
3009 /* ??? For now we simply "drop" the later reference which is
3010 otherwise the same rather than finishing off this group.
3011 In the end we'd want to re-process duplicates forming
3012 multiple groups from the refs, likely by just collecting
3013 all candidates (including duplicates and split points
3014 below) in a vector and then processing them together. */
3015 continue;
3018 /* If init_b == init_a + the size of the type * k, we have an
3019 interleaving, and DRA is accessed before DRB. */
3020 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
3021 if (type_size_a == 0
3022 || (init_b - init_a) % type_size_a != 0)
3023 break;
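/* Illustrative example: 4-byte accesses with init_a == 0 and
   init_b == 8 pass this test (DRB starts 2 elements after DRA),
   whereas init_b == 6 would not, since 6 is not a multiple of the
   4-byte element size.  */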
3025 /* For stores, require the accesses to be adjacent. This splits
3026 groups into chunks we support (we don't support vectorization
3027 of stores with gaps). */
3028 if (!DR_IS_READ (dra) && init_b - init_prev != type_size_a)
3029 break;
3031 /* If the constant, nonzero step does not exceed the difference
3032 between the data-refs' inits, DRB falls into a later iteration's
3033 footprint; this splits groups into suitable sizes. */
3034 if (tree_fits_shwi_p (DR_STEP (dra)))
3036 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
3037 if (step != 0 && step <= (init_b - init_a))
3038 break;
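/* For example (illustrative): with a constant step of 8 bytes,
   init_a == 0 and init_b == 8, DRB already falls into the next
   iteration's footprint (8 <= 8), so the chain is split here.  */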
3041 if (dump_enabled_p ())
3043 dump_printf_loc (MSG_NOTE, vect_location,
3044 "Detected interleaving ");
3045 if (DR_IS_READ (dra))
3046 dump_printf (MSG_NOTE, "load ");
3047 else
3048 dump_printf (MSG_NOTE, "store ");
3049 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
3050 dump_printf (MSG_NOTE, " and ");
3051 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
3052 dump_printf (MSG_NOTE, "\n");
3055 /* Link the found element into the group list. */
3056 if (!DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3058 DR_GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
3059 lastinfo = stmtinfo_a;
3061 DR_GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
3062 DR_GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
3063 lastinfo = stmtinfo_b;
3067 FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
3068 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
3069 && !vect_analyze_data_ref_access (dr))
3071 if (dump_enabled_p ())
3072 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3073 "not vectorized: complicated access pattern.\n");
3075 if (is_a <bb_vec_info> (vinfo))
3077 /* Mark the statement as not vectorizable. */
3078 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
3079 continue;
3081 else
3083 datarefs_copy.release ();
3084 return false;
3088 datarefs_copy.release ();
3089 return true;
3092 /* Function vect_vfa_segment_size.
3094 Input:
3095 DR: The data reference.
3096 LENGTH_FACTOR: segment length to consider.
3098 Return a value suitable for the dr_with_seg_len::seg_len field.
3099 This is the "distance travelled" by the pointer from the first
3100 iteration in the segment to the last. Note that it does not include
3101 the size of the access; in effect it only describes the first byte. */
3103 static tree
3104 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
3106 length_factor = size_binop (MINUS_EXPR,
3107 fold_convert (sizetype, length_factor),
3108 size_one_node);
3109 return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr)),
3110 length_factor);
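/* For instance (illustrative): with DR_STEP == 4 and a length factor
   equal to a vectorization factor of 8, the segment length is
   4 * (8 - 1) = 28 bytes of pointer travel, excluding the size of the
   final access itself.  */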
3113 /* Return a value that, when added to abs (vect_vfa_segment_size (dr)),
3114 gives the worst-case number of bytes covered by the segment. */
3116 static unsigned HOST_WIDE_INT
3117 vect_vfa_access_size (data_reference *dr)
3119 stmt_vec_info stmt_vinfo = vinfo_for_stmt (DR_STMT (dr));
3120 tree ref_type = TREE_TYPE (DR_REF (dr));
3121 unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type));
3122 unsigned HOST_WIDE_INT access_size = ref_size;
3123 if (DR_GROUP_FIRST_ELEMENT (stmt_vinfo))
3125 gcc_assert (DR_GROUP_FIRST_ELEMENT (stmt_vinfo) == DR_STMT (dr));
3126 access_size *= DR_GROUP_SIZE (stmt_vinfo) - DR_GROUP_GAP (stmt_vinfo);
3128 if (STMT_VINFO_VEC_STMT (stmt_vinfo)
3129 && (vect_supportable_dr_alignment (dr, false)
3130 == dr_explicit_realign_optimized))
3132 /* We might access a full vector's worth. */
3133 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
3134 access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size;
3136 return access_size;
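/* Example (illustrative): for the first statement of a group of 4-byte
   accesses with DR_GROUP_SIZE == 4 and DR_GROUP_GAP == 1, the access
   size is 4 * (4 - 1) = 12 bytes; for dr_explicit_realign_optimized it
   is further extended by a full vector minus one scalar element.  */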
3139 /* Get the minimum alignment for all the scalar accesses that DR describes. */
3141 static unsigned int
3142 vect_vfa_align (const data_reference *dr)
3144 return TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr)));
3147 /* Function vect_compile_time_alias.
3149 Given data references A and B with equal base and offset, see whether
3150 the alias relation can be decided at compilation time. Return 1 if
3151 it can and the references alias, 0 if it can and the references do
3152 not alias, and -1 if we cannot decide at compile time. SEGMENT_LENGTH_A,
3153 SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
3154 of dr_with_seg_len::{seg_len,access_size} for A and B. */
3156 static int
3157 vect_compile_time_alias (struct data_reference *a, struct data_reference *b,
3158 tree segment_length_a, tree segment_length_b,
3159 unsigned HOST_WIDE_INT access_size_a,
3160 unsigned HOST_WIDE_INT access_size_b)
3162 poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a));
3163 poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b));
3164 poly_uint64 const_length_a;
3165 poly_uint64 const_length_b;
3167 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3168 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3169 [a, a+12) */
3170 if (tree_int_cst_compare (DR_STEP (a), size_zero_node) < 0)
3172 const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
3173 offset_a = (offset_a + access_size_a) - const_length_a;
3175 else
3176 const_length_a = tree_to_poly_uint64 (segment_length_a);
3177 if (tree_int_cst_compare (DR_STEP (b), size_zero_node) < 0)
3179 const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
3180 offset_b = (offset_b + access_size_b) - const_length_b;
3182 else
3183 const_length_b = tree_to_poly_uint64 (segment_length_b);
3185 const_length_a += access_size_a;
3186 const_length_b += access_size_b;
3188 if (ranges_known_overlap_p (offset_a, const_length_a,
3189 offset_b, const_length_b))
3190 return 1;
3192 if (!ranges_maybe_overlap_p (offset_a, const_length_a,
3193 offset_b, const_length_b))
3194 return 0;
3196 return -1;
3199 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3200 in DDR is >= VF. */
3202 static bool
3203 dependence_distance_ge_vf (data_dependence_relation *ddr,
3204 unsigned int loop_depth, poly_uint64 vf)
3206 if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3207 || DDR_NUM_DIST_VECTS (ddr) == 0)
3208 return false;
3210 /* If the dependence is exact, we should have limited the VF instead. */
3211 gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3213 unsigned int i;
3214 lambda_vector dist_v;
3215 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3217 HOST_WIDE_INT dist = dist_v[loop_depth];
3218 if (dist != 0
3219 && !(dist > 0 && DDR_REVERSED_P (ddr))
3220 && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
3221 return false;
3224 if (dump_enabled_p ())
3226 dump_printf_loc (MSG_NOTE, vect_location,
3227 "dependence distance between ");
3228 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
3229 dump_printf (MSG_NOTE, " and ");
3230 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
3231 dump_printf (MSG_NOTE, " is >= VF\n");
3234 return true;
3237 /* Dump LOWER_BOUND using flags DUMP_KIND. Dumps are known to be enabled. */
3239 static void
3240 dump_lower_bound (int dump_kind, const vec_lower_bound &lower_bound)
3242 dump_printf (dump_kind, "%s (", lower_bound.unsigned_p ? "unsigned" : "abs");
3243 dump_generic_expr (dump_kind, TDF_SLIM, lower_bound.expr);
3244 dump_printf (dump_kind, ") >= ");
3245 dump_dec (dump_kind, lower_bound.min_value);
3248 /* Record that the vectorized loop requires the vec_lower_bound described
3249 by EXPR, UNSIGNED_P and MIN_VALUE. */
3251 static void
3252 vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p,
3253 poly_uint64 min_value)
3255 vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3256 for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3257 if (operand_equal_p (lower_bounds[i].expr, expr, 0))
3259 unsigned_p &= lower_bounds[i].unsigned_p;
3260 min_value = upper_bound (lower_bounds[i].min_value, min_value);
3261 if (lower_bounds[i].unsigned_p != unsigned_p
3262 || maybe_lt (lower_bounds[i].min_value, min_value))
3264 lower_bounds[i].unsigned_p = unsigned_p;
3265 lower_bounds[i].min_value = min_value;
3266 if (dump_enabled_p ())
3268 dump_printf_loc (MSG_NOTE, vect_location,
3269 "updating run-time check to ");
3270 dump_lower_bound (MSG_NOTE, lower_bounds[i]);
3271 dump_printf (MSG_NOTE, "\n");
3274 return;
3277 vec_lower_bound lower_bound (expr, unsigned_p, min_value);
3278 if (dump_enabled_p ())
3280 dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that ");
3281 dump_lower_bound (MSG_NOTE, lower_bound);
3282 dump_printf (MSG_NOTE, "\n");
3284 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (lower_bound);
3287 /* Return true if it's unlikely that the step of the vectorized form of DR
3288 will span fewer than GAP bytes. */
3290 static bool
3291 vect_small_gap_p (loop_vec_info loop_vinfo, data_reference *dr, poly_int64 gap)
3293 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
3294 HOST_WIDE_INT count
3295 = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
3296 if (DR_GROUP_FIRST_ELEMENT (stmt_info))
3297 count *= DR_GROUP_SIZE (vinfo_for_stmt (DR_GROUP_FIRST_ELEMENT (stmt_info)));
3298 return estimated_poly_value (gap) <= count * vect_get_scalar_dr_size (dr);
3301 /* Return true if we know that there is no alias between DR_A and DR_B
3302 when abs (DR_STEP (DR_A)) >= N for some N. When returning true, set
3303 *LOWER_BOUND_OUT to this N. */
3305 static bool
3306 vectorizable_with_step_bound_p (data_reference *dr_a, data_reference *dr_b,
3307 poly_uint64 *lower_bound_out)
3309 /* Check that there is a constant gap of known sign between DR_A
3310 and DR_B. */
3311 poly_int64 init_a, init_b;
3312 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0)
3313 || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0)
3314 || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)
3315 || !poly_int_tree_p (DR_INIT (dr_a), &init_a)
3316 || !poly_int_tree_p (DR_INIT (dr_b), &init_b)
3317 || !ordered_p (init_a, init_b))
3318 return false;
3320 /* Sort DR_A and DR_B by the address they access. */
3321 if (maybe_lt (init_b, init_a))
3323 std::swap (init_a, init_b);
3324 std::swap (dr_a, dr_b);
3327 /* If the two accesses could be dependent within a scalar iteration,
3328 make sure that we'd retain their order. */
3329 if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_a), init_b)
3330 && !vect_preserves_scalar_order_p (DR_STMT (dr_a), DR_STMT (dr_b)))
3331 return false;
3333 /* There is no alias if abs (DR_STEP) is greater than or equal to
3334 the bytes spanned by the combination of the two accesses. */
3335 *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_b) - init_a;
3336 return true;
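/* Illustrative example: if DR_A starts at byte 0 and DR_B at byte 12
   of the same base, and DR_B accesses 4 bytes, the lower bound is
   12 + 4 - 0 = 16, i.e. the refs cannot alias whenever
   abs (DR_STEP) >= 16.  */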
3339 /* Function vect_prune_runtime_alias_test_list.
3341 Prune a list of ddrs to be tested at run-time by versioning for alias.
3342 Merge several alias checks into one if possible.
3343 Return FALSE if the resulting list of ddrs is longer than allowed by
3344 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
3346 bool
3347 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3349 typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3350 hash_set <tree_pair_hash> compared_objects;
3352 vec<ddr_p> may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3353 vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3354 = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3355 vec<vec_object_pair> &check_unequal_addrs
3356 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3357 poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3358 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3360 ddr_p ddr;
3361 unsigned int i;
3362 tree length_factor;
3364 if (dump_enabled_p ())
3365 dump_printf_loc (MSG_NOTE, vect_location,
3366 "=== vect_prune_runtime_alias_test_list ===\n");
3368 /* Step values are irrelevant for aliasing if the number of vector
3369 iterations is equal to the number of scalar iterations (which can
3370 happen for fully-SLP loops). */
3371 bool ignore_step_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U);
3373 if (!ignore_step_p)
3375 /* Convert the checks for nonzero steps into bound tests. */
3376 tree value;
3377 FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value)
3378 vect_check_lower_bound (loop_vinfo, value, true, 1);
3381 if (may_alias_ddrs.is_empty ())
3382 return true;
3384 comp_alias_ddrs.create (may_alias_ddrs.length ());
3386 unsigned int loop_depth
3387 = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3388 LOOP_VINFO_LOOP_NEST (loop_vinfo));
3390 /* First, we collect all data ref pairs for aliasing checks. */
3391 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3393 int comp_res;
3394 poly_uint64 lower_bound;
3395 struct data_reference *dr_a, *dr_b;
3396 gimple *dr_group_first_a, *dr_group_first_b;
3397 tree segment_length_a, segment_length_b;
3398 unsigned HOST_WIDE_INT access_size_a, access_size_b;
3399 unsigned int align_a, align_b;
3400 gimple *stmt_a, *stmt_b;
3402 /* Ignore the alias if the VF we chose ended up being no greater
3403 than the dependence distance. */
3404 if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3405 continue;
3407 if (DDR_OBJECT_A (ddr))
3409 vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3410 if (!compared_objects.add (new_pair))
3412 if (dump_enabled_p ())
3414 dump_printf_loc (MSG_NOTE, vect_location, "checking that ");
3415 dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.first);
3416 dump_printf (MSG_NOTE, " and ");
3417 dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.second);
3418 dump_printf (MSG_NOTE, " have different addresses\n");
3420 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3422 continue;
3425 dr_a = DDR_A (ddr);
3426 stmt_a = DR_STMT (DDR_A (ddr));
3428 dr_b = DDR_B (ddr);
3429 stmt_b = DR_STMT (DDR_B (ddr));
3431 /* Skip the pair if inter-iteration dependencies are irrelevant
3432 and intra-iteration dependencies are guaranteed to be honored. */
3433 if (ignore_step_p
3434 && (vect_preserves_scalar_order_p (stmt_a, stmt_b)
3435 || vectorizable_with_step_bound_p (dr_a, dr_b, &lower_bound)))
3437 if (dump_enabled_p ())
3439 dump_printf_loc (MSG_NOTE, vect_location,
3440 "no need for alias check between ");
3441 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
3442 dump_printf (MSG_NOTE, " and ");
3443 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
3444 dump_printf (MSG_NOTE, " when VF is 1\n");
3446 continue;
3449 /* See whether we can handle the alias using a bounds check on
3450 the step, and whether that's likely to be the best approach.
3451 (It might not be, for example, if the minimum step is much larger
3452 than the number of bytes handled by one vector iteration.) */
3453 if (!ignore_step_p
3454 && TREE_CODE (DR_STEP (dr_a)) != INTEGER_CST
3455 && vectorizable_with_step_bound_p (dr_a, dr_b, &lower_bound)
3456 && (vect_small_gap_p (loop_vinfo, dr_a, lower_bound)
3457 || vect_small_gap_p (loop_vinfo, dr_b, lower_bound)))
3459 bool unsigned_p = dr_known_forward_stride_p (dr_a);
3460 if (dump_enabled_p ())
3462 dump_printf_loc (MSG_NOTE, vect_location, "no alias between ");
3463 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
3464 dump_printf (MSG_NOTE, " and ");
3465 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
3466 dump_printf (MSG_NOTE, " when the step ");
3467 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_STEP (dr_a));
3468 dump_printf (MSG_NOTE, " is outside ");
3469 if (unsigned_p)
3470 dump_printf (MSG_NOTE, "[0");
3471 else
3473 dump_printf (MSG_NOTE, "(");
3474 dump_dec (MSG_NOTE, poly_int64 (-lower_bound));
3476 dump_printf (MSG_NOTE, ", ");
3477 dump_dec (MSG_NOTE, lower_bound);
3478 dump_printf (MSG_NOTE, ")\n");
3480 vect_check_lower_bound (loop_vinfo, DR_STEP (dr_a), unsigned_p,
3481 lower_bound);
3482 continue;
3485 dr_group_first_a = DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
3486 if (dr_group_first_a)
3488 stmt_a = dr_group_first_a;
3489 dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
3492 dr_group_first_b = DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
3493 if (dr_group_first_b)
3495 stmt_b = dr_group_first_b;
3496 dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
3499 if (ignore_step_p)
3501 segment_length_a = size_zero_node;
3502 segment_length_b = size_zero_node;
3504 else
3506 if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
3507 length_factor = scalar_loop_iters;
3508 else
3509 length_factor = size_int (vect_factor);
3510 segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
3511 segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
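/* Also record the size in bytes and the alignment of the individual
   accesses; these refine the segment overlap test.  */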
3513 access_size_a = vect_vfa_access_size (dr_a);
3514 access_size_b = vect_vfa_access_size (dr_b);
3515 align_a = vect_vfa_align (dr_a);
3516 align_b = vect_vfa_align (dr_b);
3518 comp_res = data_ref_compare_tree (DR_BASE_ADDRESS (dr_a),
3519 DR_BASE_ADDRESS (dr_b));
3520 if (comp_res == 0)
3521 comp_res = data_ref_compare_tree (DR_OFFSET (dr_a),
3522 DR_OFFSET (dr_b));
3524 /* See whether the alias is known at compilation time. */
3525 if (comp_res == 0
3526 && TREE_CODE (DR_STEP (dr_a)) == INTEGER_CST
3527 && TREE_CODE (DR_STEP (dr_b)) == INTEGER_CST
3528 && poly_int_tree_p (segment_length_a)
3529 && poly_int_tree_p (segment_length_b))
3531 int res = vect_compile_time_alias (dr_a, dr_b,
3532 segment_length_a,
3533 segment_length_b,
3534 access_size_a,
3535 access_size_b);
3536 if (res >= 0 && dump_enabled_p ())
3538 dump_printf_loc (MSG_NOTE, vect_location,
3539 "can tell at compile time that ");
3540 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
3541 dump_printf (MSG_NOTE, " and ");
3542 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
3543 if (res == 0)
3544 dump_printf (MSG_NOTE, " do not alias\n");
3545 else
3546 dump_printf (MSG_NOTE, " alias\n");
3549 if (res == 0)
3550 continue;
3552 if (res == 1)
3554 if (dump_enabled_p ())
3555 dump_printf_loc (MSG_NOTE, vect_location,
3556 "not vectorized: compilation time alias.\n");
3557 return false;
3561 dr_with_seg_len_pair_t dr_with_seg_len_pair
3562 (dr_with_seg_len (dr_a, segment_length_a, access_size_a, align_a),
3563 dr_with_seg_len (dr_b, segment_length_b, access_size_b, align_b));
3565 /* Canonicalize pairs by sorting the two DR members. */
3566 if (comp_res > 0)
3567 std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
3569 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3572 prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);
3574 unsigned int count = (comp_alias_ddrs.length ()
3575 + check_unequal_addrs.length ());
3577 dump_printf_loc (MSG_NOTE, vect_location,
3578 "improved number of alias checks from %d to %d\n",
3579 may_alias_ddrs.length (), count);
3580 if ((int) count > PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
3582 if (dump_enabled_p ())
3583 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3584 "number of versioning for alias "
3585 "run-time tests exceeds %d "
3586 "(--param vect-max-version-for-alias-checks)\n",
3587 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
3588 return false;
3591 return true;
3594 /* Check whether we can use an internal function for a gather load
3595 or scatter store. READ_P is true for loads and false for stores.
3596 MASKED_P is true if the load or store is conditional. MEMORY_TYPE is
3597 the type of the memory elements being loaded or stored. OFFSET_BITS
3598 is the number of bits in each scalar offset and OFFSET_SIGN is the
3599 sign of the offset. SCALE is the amount by which the offset should
3600 be multiplied *after* it has been converted to address width.
3602 Return true if the function is supported, storing the function
3603 id in *IFN_OUT and the type of a vector element in *ELEMENT_TYPE_OUT. */
3605 bool
3606 vect_gather_scatter_fn_p (bool read_p, bool masked_p, tree vectype,
3607 tree memory_type, unsigned int offset_bits,
3608 signop offset_sign, int scale,
3609 internal_fn *ifn_out, tree *element_type_out)
3611 unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
3612 unsigned int element_bits = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (vectype)));
3613 if (offset_bits > element_bits)
3614 /* Internal functions require the offset to be the same width as
3615 the vector elements. We can extend narrower offsets, but it isn't
3616 safe to truncate wider offsets. */
3617 return false;
3619 if (element_bits != memory_bits)
3620 /* For now the vector elements must be the same width as the
3621 memory elements. */
3622 return false;
3624 /* Work out which function we need. */
3625 internal_fn ifn;
3626 if (read_p)
3627 ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
3628 else
3629 ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
3631 /* Test whether the target supports this combination. */
3632 if (!internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
3633 offset_sign, scale))
3634 return false;
3636 *ifn_out = ifn;
3637 *element_type_out = TREE_TYPE (vectype);
3638 return true;
3641 /* CALL is a call to an internal gather load or scatter store function.
3642 Describe the operation in INFO. */
3644 static void
3645 vect_describe_gather_scatter_call (gcall *call, gather_scatter_info *info)
3647 stmt_vec_info stmt_info = vinfo_for_stmt (call);
3648 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3649 data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
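/* The internal gather/scatter functions take the base address, the vector
   of offsets and the scale as their first three arguments.  */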
3651 info->ifn = gimple_call_internal_fn (call);
3652 info->decl = NULL_TREE;
3653 info->base = gimple_call_arg (call, 0);
3654 info->offset = gimple_call_arg (call, 1);
3655 info->offset_dt = vect_unknown_def_type;
3656 info->offset_vectype = NULL_TREE;
3657 info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
3658 info->element_type = TREE_TYPE (vectype);
3659 info->memory_type = TREE_TYPE (DR_REF (dr));
3662 /* Return true if a non-affine read or write in STMT is suitable for a
3663 gather load or scatter store. Describe the operation in *INFO if so. */
3665 bool
3666 vect_check_gather_scatter (gimple *stmt, loop_vec_info loop_vinfo,
3667 gather_scatter_info *info)
3669 HOST_WIDE_INT scale = 1;
3670 poly_int64 pbitpos, pbitsize;
3671 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3672 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3673 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3674 tree offtype = NULL_TREE;
3675 tree decl = NULL_TREE, base, off;
3676 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3677 tree memory_type = TREE_TYPE (DR_REF (dr));
3678 machine_mode pmode;
3679 int punsignedp, reversep, pvolatilep = 0;
3680 internal_fn ifn;
3681 tree element_type;
3682 bool masked_p = false;
3684 /* See whether this is already a call to a gather/scatter internal function.
3685 If not, see whether it's a masked load or store. */
3686 gcall *call = dyn_cast <gcall *> (stmt);
3687 if (call && gimple_call_internal_p (call))
3689 ifn = gimple_call_internal_fn (stmt);
3690 if (internal_gather_scatter_fn_p (ifn))
3692 vect_describe_gather_scatter_call (call, info);
3693 return true;
3695 masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE);
3698 /* True if we should aim to use internal functions rather than
3699 built-in functions. */
3700 bool use_ifn_p = (DR_IS_READ (dr)
3701 ? supports_vec_gather_load_p ()
3702 : supports_vec_scatter_store_p ());
3704 base = DR_REF (dr);
3705 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3706 see if we can use the def stmt of the address. */
3707 if (masked_p
3708 && TREE_CODE (base) == MEM_REF
3709 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3710 && integer_zerop (TREE_OPERAND (base, 1))
3711 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3713 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3714 if (is_gimple_assign (def_stmt)
3715 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3716 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3697 3719 /* The gather and scatter builtins need an address of the form
3698 3720 loop_invariant + vector * {1, 2, 4, 8}
3699 3722 or loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3723 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3724 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3725 multiplications and additions in it. To get a vector, we need
3726 a single SSA_NAME that will be defined in the loop and will
3727 contain everything that is not loop invariant and that can be
3728 vectorized. The following code attempts to find such a preexisting
3729 SSA_NAME OFF and put the loop invariants into a tree BASE
3730 that can be gimplified before the loop. */
3731 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3732 &punsignedp, &reversep, &pvolatilep);
3733 gcc_assert (base && !reversep);
3734 poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
3736 if (TREE_CODE (base) == MEM_REF)
3738 if (!integer_zerop (TREE_OPERAND (base, 1)))
3740 if (off == NULL_TREE)
3741 off = wide_int_to_tree (sizetype, mem_ref_offset (base));
3742 else
3743 off = size_binop (PLUS_EXPR, off,
3744 fold_convert (sizetype, TREE_OPERAND (base, 1)));
3746 base = TREE_OPERAND (base, 0);
3748 else
3749 base = build_fold_addr_expr (base);
3751 if (off == NULL_TREE)
3752 off = size_zero_node;
3754 /* If base is not loop invariant, then either off is 0 and we start with
3755 just the constant offset in the loop invariant BASE, continuing with
3756 base as OFF, or we give up.
3757 We could handle the latter case by gimplifying the addition of base + off
3758 into some SSA_NAME and using that as off, but for now punt. */
3759 if (!expr_invariant_in_loop_p (loop, base))
3761 if (!integer_zerop (off))
3762 return false;
3763 off = base;
3764 base = size_int (pbytepos);
3766 /* Otherwise put base + constant offset into the loop invariant BASE
3767 and continue with OFF. */
3768 else
3770 base = fold_convert (sizetype, base);
3771 base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
3774 /* OFF at this point may be either a SSA_NAME or some tree expression
3775 from get_inner_reference. Try to peel off loop invariants from it
3776 into BASE as long as possible. */
3777 STRIP_NOPS (off);
3778 while (offtype == NULL_TREE)
3780 enum tree_code code;
3781 tree op0, op1, add = NULL_TREE;
3783 if (TREE_CODE (off) == SSA_NAME)
3785 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3787 if (expr_invariant_in_loop_p (loop, off))
3788 return false;
3790 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3791 break;
3793 op0 = gimple_assign_rhs1 (def_stmt);
3794 code = gimple_assign_rhs_code (def_stmt);
3795 op1 = gimple_assign_rhs2 (def_stmt);
3797 else
3799 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3800 return false;
3801 code = TREE_CODE (off);
3802 extract_ops_from_tree (off, &code, &op0, &op1);
3804 switch (code)
3806 case POINTER_PLUS_EXPR:
3807 case PLUS_EXPR:
3808 if (expr_invariant_in_loop_p (loop, op0))
3810 add = op0;
3811 off = op1;
3812 do_add:
3813 add = fold_convert (sizetype, add);
3814 if (scale != 1)
3815 add = size_binop (MULT_EXPR, add, size_int (scale));
3816 base = size_binop (PLUS_EXPR, base, add);
3817 continue;
3819 if (expr_invariant_in_loop_p (loop, op1))
3821 add = op1;
3822 off = op0;
3823 goto do_add;
3825 break;
3826 case MINUS_EXPR:
3827 if (expr_invariant_in_loop_p (loop, op1))
3829 add = fold_convert (sizetype, op1);
3830 add = size_binop (MINUS_EXPR, size_zero_node, add);
3831 off = op0;
3832 goto do_add;
3834 break;
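/* A multiplication of the offset by a compile-time constant can be folded
   into the gather/scatter scale, provided no scale has been absorbed yet
   and, for the internal functions, the target supports the resulting
   scale.  */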
3835 case MULT_EXPR:
3836 if (scale == 1 && tree_fits_shwi_p (op1))
3838 int new_scale = tree_to_shwi (op1);
3839 /* Only treat this as a scaling operation if the target
3840 supports it. */
3841 if (use_ifn_p
3842 && !vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p,
3843 vectype, memory_type, 1,
3844 TYPE_SIGN (TREE_TYPE (op0)),
3845 new_scale, &ifn,
3846 &element_type))
3847 break;
3848 scale = new_scale;
3849 off = op0;
3850 continue;
3852 break;
3853 case SSA_NAME:
3854 off = op0;
3855 continue;
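/* For conversions, look through casts that do not change the precision of
   the offset and, subject to the width restrictions below, through
   widening casts of a narrower offset.  */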
3856 CASE_CONVERT:
3857 if (!POINTER_TYPE_P (TREE_TYPE (op0))
3858 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3859 break;
3860 if (TYPE_PRECISION (TREE_TYPE (op0))
3861 == TYPE_PRECISION (TREE_TYPE (off)))
3863 off = op0;
3864 continue;
3867 /* The internal functions need the offset to be the same width
3868 as the elements of VECTYPE. Don't include operations that
3869 cast the offset from that width to a different width. */
3870 if (use_ifn_p
3871 && (int_size_in_bytes (TREE_TYPE (vectype))
3872 == int_size_in_bytes (TREE_TYPE (off))))
3873 break;
3875 if (TYPE_PRECISION (TREE_TYPE (op0))
3876 < TYPE_PRECISION (TREE_TYPE (off)))
3878 off = op0;
3879 offtype = TREE_TYPE (off);
3880 STRIP_NOPS (off);
3881 continue;
3883 break;
3884 default:
3885 break;
3887 break;
3890 /* If at the end OFF still isn't a SSA_NAME or isn't
3891 defined in the loop, punt. */
3892 if (TREE_CODE (off) != SSA_NAME
3893 || expr_invariant_in_loop_p (loop, off))
3894 return false;
3896 if (offtype == NULL_TREE)
3897 offtype = TREE_TYPE (off);
3899 if (use_ifn_p)
3901 if (!vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p, vectype,
3902 memory_type, TYPE_PRECISION (offtype),
3903 TYPE_SIGN (offtype), scale, &ifn,
3904 &element_type))
3905 return false;
3907 else
3909 if (DR_IS_READ (dr))
3911 if (targetm.vectorize.builtin_gather)
3912 decl = targetm.vectorize.builtin_gather (vectype, offtype, scale);
3914 else
3916 if (targetm.vectorize.builtin_scatter)
3917 decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale);
3920 if (!decl)
3921 return false;
3923 ifn = IFN_LAST;
3924 element_type = TREE_TYPE (vectype);
3927 info->ifn = ifn;
3928 info->decl = decl;
3929 info->base = base;
3930 info->offset = off;
3931 info->offset_dt = vect_unknown_def_type;
3932 info->offset_vectype = NULL_TREE;
3933 info->scale = scale;
3934 info->element_type = element_type;
3935 info->memory_type = memory_type;
3936 return true;
3939 /* Find the data references in STMT, analyze them with respect to LOOP and
3940 append them to DATAREFS. Return false if datarefs in this stmt cannot
3941 be handled. */
3943 bool
3944 vect_find_stmt_data_reference (loop_p loop, gimple *stmt,
3945 vec<data_reference_p> *datarefs)
3947 /* We can ignore clobbers for dataref analysis - they are removed during
3948 loop vectorization and BB vectorization checks dependences with a
3949 stmt walk. */
3950 if (gimple_clobber_p (stmt))
3951 return true;
3953 if (gimple_has_volatile_ops (stmt))
3955 if (dump_enabled_p ())
3957 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3958 "not vectorized: volatile type ");
3959 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3961 return false;
3964 if (stmt_can_throw_internal (stmt))
3966 if (dump_enabled_p ())
3968 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3969 "not vectorized: statement can throw an "
3970 "exception ");
3971 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3973 return false;
3976 auto_vec<data_reference_p, 2> refs;
3977 if (!find_data_references_in_stmt (loop, stmt, &refs))
3978 return false;
3980 if (refs.is_empty ())
3981 return true;
3983 if (refs.length () > 1)
3985 if (dump_enabled_p ())
3987 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3988 "not vectorized: more than one data ref "
3989 "in stmt: ");
3990 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3992 return false;
3995 if (gcall *call = dyn_cast <gcall *> (stmt))
3996 if (!gimple_call_internal_p (call)
3997 || (gimple_call_internal_fn (call) != IFN_MASK_LOAD
3998 && gimple_call_internal_fn (call) != IFN_MASK_STORE))
4000 if (dump_enabled_p ())
4002 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4003 "not vectorized: dr in a call ");
4004 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4006 return false;
4009 data_reference_p dr = refs.pop ();
4010 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
4011 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
4013 if (dump_enabled_p ())
4015 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4016 "not vectorized: statement is bitfield "
4017 "access ");
4018 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4020 return false;
4023 if (DR_BASE_ADDRESS (dr)
4024 && TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
4026 if (dump_enabled_p ())
4027 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4028 "not vectorized: base addr of dr is a "
4029 "constant\n");
4030 return false;
4033 datarefs->safe_push (dr);
4034 return true;
4037 /* Function vect_analyze_data_refs.
4039 Find all the data references in the loop or basic block.
4041 The general structure of the analysis of data refs in the vectorizer is as
4042 follows:
4043 1- vect_analyze_data_refs(loop/bb): call
4044 compute_data_dependences_for_loop/bb to find and analyze all data-refs
4045 in the loop/bb and their dependences.
4046 2- vect_analyze_dependences(): apply dependence testing using ddrs.
4047 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
4048 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
4052 bool
4053 vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf)
4055 struct loop *loop = NULL;
4056 unsigned int i;
4057 struct data_reference *dr;
4058 tree scalar_type;
4060 if (dump_enabled_p ())
4061 dump_printf_loc (MSG_NOTE, vect_location,
4062 "=== vect_analyze_data_refs ===\n");
4064 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4065 loop = LOOP_VINFO_LOOP (loop_vinfo);
4067 /* Go through the data-refs, check that the analysis succeeded. Update
4068 pointer from stmt_vec_info struct to DR and vectype. */
4070 vec<data_reference_p> datarefs = vinfo->datarefs;
4071 FOR_EACH_VEC_ELT (datarefs, i, dr)
4073 gimple *stmt;
4074 stmt_vec_info stmt_info;
4075 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
4076 bool simd_lane_access = false;
4077 poly_uint64 vf;
4079 gcc_assert (DR_REF (dr));
4080 stmt = DR_STMT (dr);
4081 stmt_info = vinfo_for_stmt (stmt);
4083 /* Check that analysis of the data-ref succeeded. */
4084 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
4085 || !DR_STEP (dr))
4087 bool maybe_gather
4088 = DR_IS_READ (dr)
4089 && !TREE_THIS_VOLATILE (DR_REF (dr))
4090 && (targetm.vectorize.builtin_gather != NULL
4091 || supports_vec_gather_load_p ());
4092 bool maybe_scatter
4093 = DR_IS_WRITE (dr)
4094 && !TREE_THIS_VOLATILE (DR_REF (dr))
4095 && (targetm.vectorize.builtin_scatter != NULL
4096 || supports_vec_scatter_store_p ());
4097 bool maybe_simd_lane_access
4098 = is_a <loop_vec_info> (vinfo) && loop->simduid;
4100 /* If target supports vector gather loads or scatter stores, or if
4101 this might be a SIMD lane access, see whether they can be used. */
4102 if (is_a <loop_vec_info> (vinfo)
4103 && (maybe_gather || maybe_scatter || maybe_simd_lane_access)
4104 && !nested_in_vect_loop_p (loop, stmt))
4106 struct data_reference *newdr
4107 = create_data_ref (NULL, loop_containing_stmt (stmt),
4108 DR_REF (dr), stmt, !maybe_scatter,
4109 DR_IS_CONDITIONAL_IN_STMT (dr));
4110 gcc_assert (newdr != NULL && DR_REF (newdr));
4111 if (DR_BASE_ADDRESS (newdr)
4112 && DR_OFFSET (newdr)
4113 && DR_INIT (newdr)
4114 && DR_STEP (newdr)
4115 && integer_zerop (DR_STEP (newdr)))
4117 if (maybe_simd_lane_access)
4119 tree off = DR_OFFSET (newdr);
4120 STRIP_NOPS (off);
4121 if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
4122 && TREE_CODE (off) == MULT_EXPR
4123 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
4125 tree step = TREE_OPERAND (off, 1);
4126 off = TREE_OPERAND (off, 0);
4127 STRIP_NOPS (off);
4128 if (CONVERT_EXPR_P (off)
4129 && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
4130 0)))
4131 < TYPE_PRECISION (TREE_TYPE (off)))
4132 off = TREE_OPERAND (off, 0);
4133 if (TREE_CODE (off) == SSA_NAME)
4135 gimple *def = SSA_NAME_DEF_STMT (off);
4136 tree reft = TREE_TYPE (DR_REF (newdr));
4137 if (is_gimple_call (def)
4138 && gimple_call_internal_p (def)
4139 && (gimple_call_internal_fn (def)
4140 == IFN_GOMP_SIMD_LANE))
4142 tree arg = gimple_call_arg (def, 0);
4143 gcc_assert (TREE_CODE (arg) == SSA_NAME);
4144 arg = SSA_NAME_VAR (arg);
4145 if (arg == loop->simduid
4146 /* For now. */
4147 && tree_int_cst_equal
4148 (TYPE_SIZE_UNIT (reft),
4149 step))
4151 DR_OFFSET (newdr) = ssize_int (0);
4152 DR_STEP (newdr) = step;
4153 DR_OFFSET_ALIGNMENT (newdr)
4154 = BIGGEST_ALIGNMENT;
4155 DR_STEP_ALIGNMENT (newdr)
4156 = highest_pow2_factor (step);
4157 dr = newdr;
4158 simd_lane_access = true;
4164 if (!simd_lane_access && (maybe_gather || maybe_scatter))
4166 dr = newdr;
4167 if (maybe_gather)
4168 gatherscatter = GATHER;
4169 else
4170 gatherscatter = SCATTER;
4173 if (gatherscatter == SG_NONE && !simd_lane_access)
4174 free_data_ref (newdr);
4177 if (gatherscatter == SG_NONE && !simd_lane_access)
4179 if (dump_enabled_p ())
4181 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4182 "not vectorized: data ref analysis "
4183 "failed ");
4184 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4187 if (is_a <bb_vec_info> (vinfo))
4188 break;
4190 return false;
4194 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == ADDR_EXPR
4195 && VAR_P (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0))
4196 && DECL_NONALIASED (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0)))
4198 if (dump_enabled_p ())
4200 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4201 "not vectorized: base object not addressable "
4202 "for stmt: ");
4203 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4205 if (is_a <bb_vec_info> (vinfo))
4207 /* In BB vectorization the ref can still participate
4208 in dependence analysis, we just can't vectorize it. */
4209 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4210 continue;
4212 return false;
4215 if (is_a <loop_vec_info> (vinfo)
4216 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
4218 if (nested_in_vect_loop_p (loop, stmt))
4220 if (dump_enabled_p ())
4222 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4223 "not vectorized: not suitable for strided "
4224 "load ");
4225 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4227 return false;
4229 STMT_VINFO_STRIDED_P (stmt_info) = true;
4232 /* Update DR field in stmt_vec_info struct. */
4234 /* If the dataref is in an inner-loop of the loop that is considered
4235 for vectorization, we also want to analyze the access relative to
4236 the outer-loop (DR contains information only relative to the
4237 inner-most enclosing loop). We do that by building a reference to the
4238 first location accessed by the inner-loop, and analyze it relative to
4239 the outer-loop. */
4240 if (loop && nested_in_vect_loop_p (loop, stmt))
4242 /* Build a reference to the first location accessed by the
4243 inner loop: *(BASE + INIT + OFFSET). By construction,
4244 this address must be invariant in the inner loop, so we
4245 can consider it as being used in the outer loop. */
4246 tree base = unshare_expr (DR_BASE_ADDRESS (dr));
4247 tree offset = unshare_expr (DR_OFFSET (dr));
4248 tree init = unshare_expr (DR_INIT (dr));
4249 tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
4250 init, offset);
4251 tree init_addr = fold_build_pointer_plus (base, init_offset);
4252 tree init_ref = build_fold_indirect_ref (init_addr);
4254 if (dump_enabled_p ())
4256 dump_printf_loc (MSG_NOTE, vect_location,
4257 "analyze in outer loop: ");
4258 dump_generic_expr (MSG_NOTE, TDF_SLIM, init_ref);
4259 dump_printf (MSG_NOTE, "\n");
4262 if (!dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
4263 init_ref, loop))
4264 /* dr_analyze_innermost already explained the failure. */
4265 return false;
4267 if (dump_enabled_p ())
4269 dump_printf_loc (MSG_NOTE, vect_location,
4270 "\touter base_address: ");
4271 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4272 STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
4273 dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
4274 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4275 STMT_VINFO_DR_OFFSET (stmt_info));
4276 dump_printf (MSG_NOTE,
4277 "\n\touter constant offset from base address: ");
4278 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4279 STMT_VINFO_DR_INIT (stmt_info));
4280 dump_printf (MSG_NOTE, "\n\touter step: ");
4281 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4282 STMT_VINFO_DR_STEP (stmt_info));
4283 dump_printf (MSG_NOTE, "\n\touter base alignment: %d\n",
4284 STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info));
4285 dump_printf (MSG_NOTE, "\n\touter base misalignment: %d\n",
4286 STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info));
4287 dump_printf (MSG_NOTE, "\n\touter offset alignment: %d\n",
4288 STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info));
4289 dump_printf (MSG_NOTE, "\n\touter step alignment: %d\n",
4290 STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
4294 gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
4295 STMT_VINFO_DATA_REF (stmt_info) = dr;
4296 if (simd_lane_access)
4298 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
4299 free_data_ref (datarefs[i]);
4300 datarefs[i] = dr;
4303 /* Set vectype for STMT. */
4304 scalar_type = TREE_TYPE (DR_REF (dr));
4305 STMT_VINFO_VECTYPE (stmt_info)
4306 = get_vectype_for_scalar_type (scalar_type);
4307 if (!STMT_VINFO_VECTYPE (stmt_info))
4309 if (dump_enabled_p ())
4311 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4312 "not vectorized: no vectype for stmt: ");
4313 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4314 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
4315 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
4316 scalar_type);
4317 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
4320 if (is_a <bb_vec_info> (vinfo))
4322 /* No vector type is fine, the ref can still participate
4323 in dependence analysis, we just can't vectorize it. */
4324 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4325 continue;
4328 if (gatherscatter != SG_NONE || simd_lane_access)
4330 STMT_VINFO_DATA_REF (stmt_info) = NULL;
4331 if (gatherscatter != SG_NONE)
4332 free_data_ref (dr);
4334 return false;
4336 else
4338 if (dump_enabled_p ())
4340 dump_printf_loc (MSG_NOTE, vect_location,
4341 "got vectype for stmt: ");
4342 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
4343 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4344 STMT_VINFO_VECTYPE (stmt_info));
4345 dump_printf (MSG_NOTE, "\n");
4349 /* Adjust the minimal vectorization factor according to the
4350 vector type. */
4351 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
4352 *min_vf = upper_bound (*min_vf, vf);
4354 if (gatherscatter != SG_NONE)
4356 gather_scatter_info gs_info;
4357 if (!vect_check_gather_scatter (stmt, as_a <loop_vec_info> (vinfo),
4358 &gs_info)
4359 || !get_vectype_for_scalar_type (TREE_TYPE (gs_info.offset)))
4361 STMT_VINFO_DATA_REF (stmt_info) = NULL;
4362 free_data_ref (dr);
4363 if (dump_enabled_p ())
4365 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4366 (gatherscatter == GATHER) ?
4367 "not vectorized: not suitable for gather "
4368 "load " :
4369 "not vectorized: not suitable for scatter "
4370 "store ");
4371 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4373 return false;
4376 free_data_ref (datarefs[i]);
4377 datarefs[i] = dr;
4378 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
4382 /* If we stopped analysis at the first dataref we could not analyze
4383 when trying to vectorize a basic-block, mark the rest of the datarefs
4384 as not vectorizable and truncate the vector of datarefs. That
4385 avoids spending useless time in analyzing their dependence. */
4386 if (i != datarefs.length ())
4388 gcc_assert (is_a <bb_vec_info> (vinfo));
4389 for (unsigned j = i; j < datarefs.length (); ++j)
4391 data_reference_p dr = datarefs[j];
4392 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
4393 free_data_ref (dr);
4395 datarefs.truncate (i);
4398 return true;
4402 /* Function vect_get_new_vect_var.
4404 Returns a new variable. The current naming scheme prepends the
4405 prefix "vect", "vectp", "stmp" or "mask" (depending on the value of
4406 VAR_KIND) to NAME if it is provided, and uses the prefix alone
4407 otherwise. */
4409 tree
4410 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
4412 const char *prefix;
4413 tree new_vect_var;
4415 switch (var_kind)
4417 case vect_simple_var:
4418 prefix = "vect";
4419 break;
4420 case vect_scalar_var:
4421 prefix = "stmp";
4422 break;
4423 case vect_mask_var:
4424 prefix = "mask";
4425 break;
4426 case vect_pointer_var:
4427 prefix = "vectp";
4428 break;
4429 default:
4430 gcc_unreachable ();
4433 if (name)
4435 char* tmp = concat (prefix, "_", name, NULL);
4436 new_vect_var = create_tmp_reg (type, tmp);
4437 free (tmp);
4439 else
4440 new_vect_var = create_tmp_reg (type, prefix);
4442 return new_vect_var;
4445 /* Like vect_get_new_vect_var but return an SSA name. */
4447 tree
4448 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
4450 const char *prefix;
4451 tree new_vect_var;
4453 switch (var_kind)
4455 case vect_simple_var:
4456 prefix = "vect";
4457 break;
4458 case vect_scalar_var:
4459 prefix = "stmp";
4460 break;
4461 case vect_pointer_var:
4462 prefix = "vectp";
4463 break;
4464 default:
4465 gcc_unreachable ();
4468 if (name)
4470 char* tmp = concat (prefix, "_", name, NULL);
4471 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4472 free (tmp);
4474 else
4475 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4477 return new_vect_var;
4480 /* Duplicate ptr info and set alignment/misalignment on NAME from DR. */
4482 static void
4483 vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr)
4485 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr));
4486 int misalign = DR_MISALIGNMENT (dr);
4487 if (misalign == DR_MISALIGNMENT_UNKNOWN)
4488 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4489 else
4490 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name),
4491 DR_TARGET_ALIGNMENT (dr), misalign);
4494 /* Function vect_create_addr_base_for_vector_ref.
4496 Create an expression that computes the address of the first memory location
4497 that will be accessed for a data reference.
4499 Input:
4500 STMT: The statement containing the data reference.
4501 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4502 OFFSET: Optional. If supplied, it is added to the initial address.
4503 LOOP: Specify relative to which loop-nest should the address be computed.
4504 For example, when the dataref is in an inner-loop nested in an
4505 outer-loop that is now being vectorized, LOOP can be either the
4506 outer-loop, or the inner-loop. The first memory location accessed
4507 by the following dataref ('in' points to short):
4509 for (i=0; i<N; i++)
4510 for (j=0; j<M; j++)
4511 s += in[i+j]
4513 is as follows:
4514 if LOOP=i_loop: &in (relative to i_loop)
4515 if LOOP=j_loop: &in+i*2B (relative to j_loop)
4516 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
4517 initial address. Unlike OFFSET, which is number of elements to
4518 be added, BYTE_OFFSET is measured in bytes.
4520 Output:
4521 1. Return an SSA_NAME whose value is the address of the memory location of
4522 the first vector of the data reference.
4523 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4524 these statement(s) which define the returned SSA_NAME.
4526 FORNOW: We are only handling array accesses with step 1. */
4528 tree
4529 vect_create_addr_base_for_vector_ref (gimple *stmt,
4530 gimple_seq *new_stmt_list,
4531 tree offset,
4532 tree byte_offset)
4534 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4535 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4536 const char *base_name;
4537 tree addr_base;
4538 tree dest;
4539 gimple_seq seq = NULL;
4540 tree vect_ptr_type;
4541 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
4542 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4543 innermost_loop_behavior *drb = vect_dr_behavior (dr);
4545 tree data_ref_base = unshare_expr (drb->base_address);
4546 tree base_offset = unshare_expr (drb->offset);
4547 tree init = unshare_expr (drb->init);
4549 if (loop_vinfo)
4550 base_name = get_name (data_ref_base);
4551 else
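/* For basic-block vectorization the address is taken directly from
   DR_REF below, so ignore the behavior's offset and init here.  */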
4553 base_offset = ssize_int (0);
4554 init = ssize_int (0);
4555 base_name = get_name (DR_REF (dr));
4558 /* Create base_offset */
4559 base_offset = size_binop (PLUS_EXPR,
4560 fold_convert (sizetype, base_offset),
4561 fold_convert (sizetype, init));
4563 if (offset)
4565 offset = fold_build2 (MULT_EXPR, sizetype,
4566 fold_convert (sizetype, offset), step);
4567 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4568 base_offset, offset);
4570 if (byte_offset)
4572 byte_offset = fold_convert (sizetype, byte_offset);
4573 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4574 base_offset, byte_offset);
4577 /* base + base_offset */
4578 if (loop_vinfo)
4579 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4580 else
4582 addr_base = build1 (ADDR_EXPR,
4583 build_pointer_type (TREE_TYPE (DR_REF (dr))),
4584 unshare_expr (DR_REF (dr)));
4587 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4588 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4589 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4590 gimple_seq_add_seq (new_stmt_list, seq);
4592 if (DR_PTR_INFO (dr)
4593 && TREE_CODE (addr_base) == SSA_NAME
4594 && !SSA_NAME_PTR_INFO (addr_base))
4596 vect_duplicate_ssa_name_ptr_info (addr_base, dr);
4597 if (offset || byte_offset)
4598 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4601 if (dump_enabled_p ())
4603 dump_printf_loc (MSG_NOTE, vect_location, "created ");
4604 dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
4605 dump_printf (MSG_NOTE, "\n");
4608 return addr_base;
4612 /* Function vect_create_data_ref_ptr.
4614 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4615 location accessed in the loop by STMT, along with the def-use update
4616 chain to appropriately advance the pointer through the loop iterations.
4617 Also set aliasing information for the pointer. This pointer is used by
4618 the callers to this function to create a memory reference expression for
4619 vector load/store access.
4621 Input:
4622 1. STMT: a stmt that references memory. Expected to be of the form
4623 GIMPLE_ASSIGN <name, data-ref> or
4624 GIMPLE_ASSIGN <data-ref, name>.
4625 2. AGGR_TYPE: the type of the reference, which should be either a vector
4626 or an array.
4627 3. AT_LOOP: the loop where the vector memref is to be created.
4628 4. OFFSET (optional): an offset to be added to the initial address accessed
4629 by the data-ref in STMT.
4630 5. BSI: location where the new stmts are to be placed if there is no loop
4631 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4632 pointing to the initial address.
4633 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4634 to the initial address accessed by the data-ref in STMT. This is
4635 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4636 in bytes.
4637 8. IV_STEP (optional, defaults to NULL): the amount that should be added
4638 to the IV during each iteration of the loop. NULL says to move
4639 by one copy of AGGR_TYPE up or down, depending on the step of the
4640 data reference.
4642 Output:
4643 1. Declare a new ptr to vector_type, and have it point to the base of the
4644 data reference (initial address accessed by the data reference).
4645 For example, for vector of type V8HI, the following code is generated:
4647 v8hi *ap;
4648 ap = (v8hi *)initial_address;
4650 if OFFSET is not supplied:
4651 initial_address = &a[init];
4652 if OFFSET is supplied:
4653 initial_address = &a[init + OFFSET];
4654 if BYTE_OFFSET is supplied:
4655 initial_address = &a[init] + BYTE_OFFSET;
4657 Return the initial_address in INITIAL_ADDRESS.
4659 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4660 update the pointer in each iteration of the loop.
4662 Return the increment stmt that updates the pointer in PTR_INCR.
4664 3. Set INV_P to true if the access pattern of the data reference in the
4665 vectorized loop is invariant. Set it to false otherwise.
4667 4. Return the pointer. */
4669 tree
4670 vect_create_data_ref_ptr (gimple *stmt, tree aggr_type, struct loop *at_loop,
4671 tree offset, tree *initial_address,
4672 gimple_stmt_iterator *gsi, gimple **ptr_incr,
4673 bool only_init, bool *inv_p, tree byte_offset,
4674 tree iv_step)
4676 const char *base_name;
4677 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4678 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4679 struct loop *loop = NULL;
4680 bool nested_in_vect_loop = false;
4681 struct loop *containing_loop = NULL;
4682 tree aggr_ptr_type;
4683 tree aggr_ptr;
4684 tree new_temp;
4685 gimple_seq new_stmt_list = NULL;
4686 edge pe = NULL;
4687 basic_block new_bb;
4688 tree aggr_ptr_init;
4689 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4690 tree aptr;
4691 gimple_stmt_iterator incr_gsi;
4692 bool insert_after;
4693 tree indx_before_incr, indx_after_incr;
4694 gimple *incr;
4695 tree step;
4696 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4698 gcc_assert (iv_step != NULL_TREE
4699 || TREE_CODE (aggr_type) == ARRAY_TYPE
4700 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4702 if (loop_vinfo)
4704 loop = LOOP_VINFO_LOOP (loop_vinfo);
4705 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4706 containing_loop = (gimple_bb (stmt))->loop_father;
4707 pe = loop_preheader_edge (loop);
4709 else
4711 gcc_assert (bb_vinfo);
4712 only_init = true;
4713 *ptr_incr = NULL;
4716 /* Check the step (evolution) of the load in LOOP, and record
4717 whether it's invariant. */
4718 step = vect_dr_behavior (dr)->step;
4719 if (integer_zerop (step))
4720 *inv_p = true;
4721 else
4722 *inv_p = false;
4724 /* Create an expression for the first address accessed by this load
4725 in LOOP. */
4726 base_name = get_name (DR_BASE_ADDRESS (dr));
4728 if (dump_enabled_p ())
4730 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4731 dump_printf_loc (MSG_NOTE, vect_location,
4732 "create %s-pointer variable to type: ",
4733 get_tree_code_name (TREE_CODE (aggr_type)));
4734 dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4735 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4736 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4737 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4738 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4739 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4740 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4741 else
4742 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4743 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4744 dump_printf (MSG_NOTE, "\n");
4747 /* (1) Create the new aggregate-pointer variable.
4748 Vector and array types inherit the alias set of their component
4749 type by default so we need to use a ref-all pointer if the data
4750 reference does not conflict with the created aggregated data
4751 reference because it is not addressable. */
4752 bool need_ref_all = false;
4753 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4754 get_alias_set (DR_REF (dr))))
4755 need_ref_all = true;
4756 /* Likewise for any of the data references in the stmt group. */
4757 else if (DR_GROUP_SIZE (stmt_info) > 1)
4759 gimple *orig_stmt = DR_GROUP_FIRST_ELEMENT (stmt_info);
4762 stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
4763 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4764 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4765 get_alias_set (DR_REF (sdr))))
4767 need_ref_all = true;
4768 break;
4770 orig_stmt = DR_GROUP_NEXT_ELEMENT (sinfo);
4772 while (orig_stmt);
4774 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4775 need_ref_all);
4776 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4779 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4780 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4781 def-use update cycles for the pointer: one relative to the outer-loop
4782 (LOOP), which is what steps (3) and (4) below do. The other is relative
4783 to the inner-loop (which is the inner-most loop containing the dataref),
4784 and this is done by step (5) below.
4786 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4787 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4788 redundant. Steps (3),(4) create the following:
4790 vp0 = &base_addr;
4791 LOOP: vp1 = phi(vp0,vp2)
4794 vp2 = vp1 + step
4795 goto LOOP
4797 If there is an inner-loop nested in loop, then step (5) will also be
4798 applied, and an additional update in the inner-loop will be created:
4800 vp0 = &base_addr;
4801 LOOP: vp1 = phi(vp0,vp2)
4803 inner: vp3 = phi(vp1,vp4)
4804 vp4 = vp3 + inner_step
4805 if () goto inner
4807 vp2 = vp1 + step
4808 if () goto LOOP */
4810 /* (2) Calculate the initial address of the aggregate-pointer, and set
4811 the aggregate-pointer to point to it before the loop. */
4813 /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader. */
4815 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
4816 offset, byte_offset);
4817 if (new_stmt_list)
4819 if (pe)
4821 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4822 gcc_assert (!new_bb);
4824 else
4825 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4828 *initial_address = new_temp;
4829 aggr_ptr_init = new_temp;
4831 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4832 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4833 inner-loop nested in LOOP (during outer-loop vectorization). */
4835 /* No update in loop is required. */
4836 if (only_init && (!loop_vinfo || at_loop == loop))
4837 aptr = aggr_ptr_init;
4838 else
4840 if (iv_step == NULL_TREE)
4842 /* The step of the aggregate pointer is the type size. */
4843 iv_step = TYPE_SIZE_UNIT (aggr_type);
4844 /* One exception to the above is when the scalar step of the load in
4845 LOOP is zero. In this case the step here is also zero. */
4846 if (*inv_p)
4847 iv_step = size_zero_node;
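/* Likewise, if the scalar step is negative, the pointer should move
   backwards by one AGGR_TYPE per iteration.  */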
4848 else if (tree_int_cst_sgn (step) == -1)
4849 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4852 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4854 create_iv (aggr_ptr_init,
4855 fold_convert (aggr_ptr_type, iv_step),
4856 aggr_ptr, loop, &incr_gsi, insert_after,
4857 &indx_before_incr, &indx_after_incr);
4858 incr = gsi_stmt (incr_gsi);
4859 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4861 /* Copy the points-to information if it exists. */
4862 if (DR_PTR_INFO (dr))
4864 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr);
4865 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr);
4867 if (ptr_incr)
4868 *ptr_incr = incr;
4870 aptr = indx_before_incr;
4873 if (!nested_in_vect_loop || only_init)
4874 return aptr;
4877 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4878 nested in LOOP, if exists. */
4880 gcc_assert (nested_in_vect_loop);
4881 if (!only_init)
4883 standard_iv_increment_position (containing_loop, &incr_gsi,
4884 &insert_after);
4885 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4886 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4887 &indx_after_incr);
4888 incr = gsi_stmt (incr_gsi);
4889 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4891 /* Copy the points-to information if it exists. */
4892 if (DR_PTR_INFO (dr))
4894 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr);
4895 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr);
4897 if (ptr_incr)
4898 *ptr_incr = incr;
4900 return indx_before_incr;
4902 else
4903 gcc_unreachable ();
4907 /* Function bump_vector_ptr
4909 Increment a pointer (to a vector type) by vector-size. If requested,
4910 i.e. if PTR-INCR is given, then also connect the new increment stmt
4911 to the existing def-use update-chain of the pointer, by modifying
4912 the PTR_INCR as illustrated below:
4914 The pointer def-use update-chain before this function:
4915 DATAREF_PTR = phi (p_0, p_2)
4916 ....
4917 PTR_INCR: p_2 = DATAREF_PTR + step
4919 The pointer def-use update-chain after this function:
4920 DATAREF_PTR = phi (p_0, p_2)
4921 ....
4922 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4923 ....
4924 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4926 Input:
4927 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4928 in the loop.
4929 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4930 the loop. The increment amount across iterations is expected
4931 to be vector_size.
4932 BSI - location where the new update stmt is to be placed.
4933 STMT - the original scalar memory-access stmt that is being vectorized.
4934 BUMP - optional. The offset by which to bump the pointer. If not given,
4935 the offset is assumed to be vector_size.
4937 Output: Return NEW_DATAREF_PTR as illustrated above.
4941 tree
4942 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4943 gimple *stmt, tree bump)
4945 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4946 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4947 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4948 tree update = TYPE_SIZE_UNIT (vectype);
4949 gassign *incr_stmt;
4950 ssa_op_iter iter;
4951 use_operand_p use_p;
4952 tree new_dataref_ptr;
4954 if (bump)
4955 update = bump;
4957 if (TREE_CODE (dataref_ptr) == SSA_NAME)
4958 new_dataref_ptr = copy_ssa_name (dataref_ptr);
4959 else
4960 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4961 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4962 dataref_ptr, update);
4963 vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4965 /* Copy the points-to information if it exists. */
4966 if (DR_PTR_INFO (dr))
4968 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4969 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4972 if (!ptr_incr)
4973 return new_dataref_ptr;
4975 /* Update the vector-pointer's cross-iteration increment. */
4976 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4978 tree use = USE_FROM_PTR (use_p);
4980 if (use == dataref_ptr)
4981 SET_USE (use_p, new_dataref_ptr);
4982 else
4983 gcc_assert (operand_equal_p (use, update, 0));
4986 return new_dataref_ptr;
4990 /* Copy memory reference info such as base/clique from the SRC reference
4991 to the DEST MEM_REF. */
4993 void
4994 vect_copy_ref_info (tree dest, tree src)
4996 if (TREE_CODE (dest) != MEM_REF)
4997 return;
4999 tree src_base = src;
5000 while (handled_component_p (src_base))
5001 src_base = TREE_OPERAND (src_base, 0);
5002 if (TREE_CODE (src_base) != MEM_REF
5003 && TREE_CODE (src_base) != TARGET_MEM_REF)
5004 return;
5006 MR_DEPENDENCE_CLIQUE (dest) = MR_DEPENDENCE_CLIQUE (src_base);
5007 MR_DEPENDENCE_BASE (dest) = MR_DEPENDENCE_BASE (src_base);
5011 /* Function vect_create_destination_var.
5013 Create a new temporary of type VECTYPE. */
5015 tree
5016 vect_create_destination_var (tree scalar_dest, tree vectype)
5018 tree vec_dest;
5019 const char *name;
5020 char *new_name;
5021 tree type;
5022 enum vect_var_kind kind;
5024 kind = vectype
5025 ? VECTOR_BOOLEAN_TYPE_P (vectype)
5026 ? vect_mask_var
5027 : vect_simple_var
5028 : vect_scalar_var;
5029 type = vectype ? vectype : TREE_TYPE (scalar_dest);
5031 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
5033 name = get_name (scalar_dest);
5034 if (name)
5035 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
5036 else
5037 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
5038 vec_dest = vect_get_new_vect_var (type, kind, new_name);
5039 free (new_name);
5041 return vec_dest;
5044 /* Function vect_grouped_store_supported.
5046 Returns TRUE if interleave high and interleave low permutations
5047 are supported, and FALSE otherwise. */
5049 bool
5050 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
5052 machine_mode mode = TYPE_MODE (vectype);
5054 /* vect_permute_store_chain requires the group size to be equal to 3 or
5055 be a power of two. */
5056 if (count != 3 && exact_log2 (count) == -1)
5058 if (dump_enabled_p ())
5059 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5060 "the size of the group of accesses"
5061 " is not a power of 2 or not eqaul to 3\n");
5062 return false;
5065 /* Check that the permutation is supported. */
5066 if (VECTOR_MODE_P (mode))
5068 unsigned int i;
5069 if (count == 3)
5071 unsigned int j0 = 0, j1 = 0, j2 = 0;
5072 unsigned int i, j;
5074 unsigned int nelt;
5075 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5077 if (dump_enabled_p ())
5078 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5079 "cannot handle groups of 3 stores for"
5080 " variable-length vectors\n");
5081 return false;
5084 vec_perm_builder sel (nelt, nelt, 1);
5085 sel.quick_grow (nelt);
5086 vec_perm_indices indices;
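/* These mirror the two permutations that vect_permute_store_chain
   generates for a group of three stores; lanes set to 0 below are
   don't-care slots that the second permutation overwrites.  */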
5087 for (j = 0; j < 3; j++)
5089 int nelt0 = ((3 - j) * nelt) % 3;
5090 int nelt1 = ((3 - j) * nelt + 1) % 3;
5091 int nelt2 = ((3 - j) * nelt + 2) % 3;
5092 for (i = 0; i < nelt; i++)
5094 if (3 * i + nelt0 < nelt)
5095 sel[3 * i + nelt0] = j0++;
5096 if (3 * i + nelt1 < nelt)
5097 sel[3 * i + nelt1] = nelt + j1++;
5098 if (3 * i + nelt2 < nelt)
5099 sel[3 * i + nelt2] = 0;
5101 indices.new_vector (sel, 2, nelt);
5102 if (!can_vec_perm_const_p (mode, indices))
5104 if (dump_enabled_p ())
5105 dump_printf (MSG_MISSED_OPTIMIZATION,
5106 "permutation op not supported by target.\n");
5107 return false;
5110 for (i = 0; i < nelt; i++)
5112 if (3 * i + nelt0 < nelt)
5113 sel[3 * i + nelt0] = 3 * i + nelt0;
5114 if (3 * i + nelt1 < nelt)
5115 sel[3 * i + nelt1] = 3 * i + nelt1;
5116 if (3 * i + nelt2 < nelt)
5117 sel[3 * i + nelt2] = nelt + j2++;
5119 indices.new_vector (sel, 2, nelt);
5120 if (!can_vec_perm_const_p (mode, indices))
5122 if (dump_enabled_p ())
5123 dump_printf (MSG_MISSED_OPTIMIZATION,
5124 "permutation op not supported by target.\n");
5125 return false;
5128 return true;
5130 else
5132 /* If length is not equal to 3 then only power of 2 is supported. */
5133 gcc_assert (pow2p_hwi (count));
5134 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5136 /* The encoding has 2 interleaved stepped patterns. */
5137 vec_perm_builder sel (nelt, 2, 3);
5138 sel.quick_grow (6);
5139 for (i = 0; i < 3; i++)
5141 sel[i * 2] = i;
5142 sel[i * 2 + 1] = i + nelt;
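/* This is the interleave-high selector { 0, nelt, 1, nelt + 1, 2, nelt + 2,
   ... }; adding nelt/2 to each element below yields the interleave-low
   selector.  */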
5144 vec_perm_indices indices (sel, 2, nelt);
5145 if (can_vec_perm_const_p (mode, indices))
5147 for (i = 0; i < 6; i++)
5148 sel[i] += exact_div (nelt, 2);
5149 indices.new_vector (sel, 2, nelt);
5150 if (can_vec_perm_const_p (mode, indices))
5151 return true;
5156 if (dump_enabled_p ())
5157 dump_printf (MSG_MISSED_OPTIMIZATION,
5158 "permutaion op not supported by target.\n");
5159 return false;
5163 /* Return TRUE if vec_{mask_}store_lanes is available for COUNT vectors of
5164 type VECTYPE. MASKED_P says whether the masked form is needed. */
5166 bool
5167 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5168 bool masked_p)
5170 if (masked_p)
5171 return vect_lanes_optab_supported_p ("vec_mask_store_lanes",
5172 vec_mask_store_lanes_optab,
5173 vectype, count);
5174 else
5175 return vect_lanes_optab_supported_p ("vec_store_lanes",
5176 vec_store_lanes_optab,
5177 vectype, count);
5181 /* Function vect_permute_store_chain.
5183 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
5184 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
5185 the data correctly for the stores. Return the final references for stores
5186 in RESULT_CHAIN.
5188 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5189 The input is 4 vectors each containing 8 elements. We assign a number to
5190 each element, the input sequence is:
5192 1st vec: 0 1 2 3 4 5 6 7
5193 2nd vec: 8 9 10 11 12 13 14 15
5194 3rd vec: 16 17 18 19 20 21 22 23
5195 4th vec: 24 25 26 27 28 29 30 31
5197 The output sequence should be:
5199 1st vec: 0 8 16 24 1 9 17 25
5200 2nd vec: 2 10 18 26 3 11 19 27
5201 3rd vec: 4 12 20 28 5 13 21 29
5202 4th vec: 6 14 22 30 7 15 23 31
5204 i.e., we interleave the contents of the four vectors in their order.
5206 We use interleave_high/low instructions to create such output. The input of
5207 each interleave_high/low operation is two vectors:
5208 1st vec 2nd vec
5209 0 1 2 3 4 5 6 7
5210 the even elements of the result vector are obtained left-to-right from the
5211 high/low elements of the first vector. The odd elements of the result are
5212 obtained left-to-right from the high/low elements of the second vector.
5213 The output of interleave_high will be: 0 4 1 5
5214 and of interleave_low: 2 6 3 7
5217 The permutation is done in log LENGTH stages. In each stage interleave_high
5218 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5219 where the first argument is taken from the first half of DR_CHAIN and the
5220 second argument from its second half.
5221 In our example,
5223 I1: interleave_high (1st vec, 3rd vec)
5224 I2: interleave_low (1st vec, 3rd vec)
5225 I3: interleave_high (2nd vec, 4th vec)
5226 I4: interleave_low (2nd vec, 4th vec)
5228 The output for the first stage is:
5230 I1: 0 16 1 17 2 18 3 19
5231 I2: 4 20 5 21 6 22 7 23
5232 I3: 8 24 9 25 10 26 11 27
5233 I4: 12 28 13 29 14 30 15 31
5235 The output of the second stage, i.e. the final result is:
5237 I1: 0 8 16 24 1 9 17 25
5238 I2: 2 10 18 26 3 11 19 27
5239 I3: 4 12 20 28 5 13 21 29
5240 I4: 6 14 22 30 7 15 23 31. */
5242 void
5243 vect_permute_store_chain (vec<tree> dr_chain,
5244 unsigned int length,
5245 gimple *stmt,
5246 gimple_stmt_iterator *gsi,
5247 vec<tree> *result_chain)
5249 tree vect1, vect2, high, low;
5250 gimple *perm_stmt;
5251 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5252 tree perm_mask_low, perm_mask_high;
5253 tree data_ref;
5254 tree perm3_mask_low, perm3_mask_high;
5255 unsigned int i, j, n, log_length = exact_log2 (length);
5257 result_chain->quick_grow (length);
5258 memcpy (result_chain->address (), dr_chain.address (),
5259 length * sizeof (tree));
5261 if (length == 3)
5263 /* vect_grouped_store_supported ensures that this is constant. */
5264 unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5265 unsigned int j0 = 0, j1 = 0, j2 = 0;
5267 vec_perm_builder sel (nelt, nelt, 1);
5268 sel.quick_grow (nelt);
5269 vec_perm_indices indices;
5270 for (j = 0; j < 3; j++)
5272 int nelt0 = ((3 - j) * nelt) % 3;
5273 int nelt1 = ((3 - j) * nelt + 1) % 3;
5274 int nelt2 = ((3 - j) * nelt + 2) % 3;
5276 for (i = 0; i < nelt; i++)
5278 if (3 * i + nelt0 < nelt)
5279 sel[3 * i + nelt0] = j0++;
5280 if (3 * i + nelt1 < nelt)
5281 sel[3 * i + nelt1] = nelt + j1++;
5282 if (3 * i + nelt2 < nelt)
5283 sel[3 * i + nelt2] = 0;
5285 indices.new_vector (sel, 2, nelt);
5286 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5288 for (i = 0; i < nelt; i++)
5290 if (3 * i + nelt0 < nelt)
5291 sel[3 * i + nelt0] = 3 * i + nelt0;
5292 if (3 * i + nelt1 < nelt)
5293 sel[3 * i + nelt1] = 3 * i + nelt1;
5294 if (3 * i + nelt2 < nelt)
5295 sel[3 * i + nelt2] = nelt + j2++;
5297 indices.new_vector (sel, 2, nelt);
5298 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5300 vect1 = dr_chain[0];
5301 vect2 = dr_chain[1];
5303 /* Create interleaving stmt:
5304 low = VEC_PERM_EXPR <vect1, vect2,
5305 {j, nelt, *, j + 1, nelt + j + 1, *,
5306 j + 2, nelt + j + 2, *, ...}> */
5307 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5308 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5309 vect2, perm3_mask_low);
5310 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5312 vect1 = data_ref;
5313 vect2 = dr_chain[2];
5314 /* Create interleaving stmt:
5315 low = VEC_PERM_EXPR <vect1, vect2,
5316 {0, 1, nelt + j, 3, 4, nelt + j + 1,
5317 6, 7, nelt + j + 2, ...}> */
5318 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5319 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5320 vect2, perm3_mask_high);
5321 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5322 (*result_chain)[j] = data_ref;
5325 else
5327 /* If length is not equal to 3 then only a power of 2 is supported. */
5328 gcc_assert (pow2p_hwi (length));
5330 /* The encoding has 2 interleaved stepped patterns. */
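/* That is, only the first 3 elements of each of the 2 patterns are given
   explicitly; the remaining selector elements are implied by continuing
   each pattern with its constant step, which is what lets the same
   description apply to longer and variable-length vectors.  */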
5331 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5332 vec_perm_builder sel (nelt, 2, 3);
5333 sel.quick_grow (6);
5334 for (i = 0; i < 3; i++)
5336 sel[i * 2] = i;
5337 sel[i * 2 + 1] = i + nelt;
5339 vec_perm_indices indices (sel, 2, nelt);
5340 perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5342 for (i = 0; i < 6; i++)
5343 sel[i] += exact_div (nelt, 2);
5344 indices.new_vector (sel, 2, nelt);
5345 perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5347 for (i = 0, n = log_length; i < n; i++)
5349 for (j = 0; j < length/2; j++)
5351 vect1 = dr_chain[j];
5352 vect2 = dr_chain[j+length/2];
5354 /* Create interleaving stmt:
5355 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
5356 ...}> */
5357 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
5358 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
5359 vect2, perm_mask_high);
5360 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5361 (*result_chain)[2*j] = high;
5363 /* Create interleaving stmt:
5364 low = VEC_PERM_EXPR <vect1, vect2,
5365 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
5366 ...}> */
5367 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
5368 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
5369 vect2, perm_mask_low);
5370 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5371 (*result_chain)[2*j+1] = low;
5373 memcpy (dr_chain.address (), result_chain->address (),
5374 length * sizeof (tree));
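/* Illustration for the interleaving scheme documented above: a standalone,
   host-side sketch (not part of the vectorizer) that models the
   VEC_PERM_EXPRs with plain array shuffles, assuming LENGTH == 4 and
   NELT == 8 as in the example.  Running it prints the four final vectors
   0 8 16 24 1 9 17 25 through 6 14 22 30 7 15 23 31.  */
#if 0
#include <stdio.h>
#include <string.h>

#define NELT 8
#define LENGTH 4

int
main (void)
{
  int chain[LENGTH][NELT], result[LENGTH][NELT];

  /* Number the input elements 0..31 as in the comment above.  */
  for (int i = 0; i < LENGTH; i++)
    for (int j = 0; j < NELT; j++)
      chain[i][j] = i * NELT + j;

  /* log2 (LENGTH) == 2 stages of interleave_high/interleave_low.  */
  for (int stage = 0; stage < 2; stage++)
    {
      for (int j = 0; j < LENGTH / 2; j++)
	{
	  int *v1 = chain[j], *v2 = chain[j + LENGTH / 2];
	  for (int k = 0; k < NELT / 2; k++)
	    {
	      /* interleave_high: selector {0, NELT, 1, NELT + 1, ...}.  */
	      result[2 * j][2 * k] = v1[k];
	      result[2 * j][2 * k + 1] = v2[k];
	      /* interleave_low: selector {NELT/2, 3*NELT/2, NELT/2+1, ...}.  */
	      result[2 * j + 1][2 * k] = v1[NELT / 2 + k];
	      result[2 * j + 1][2 * k + 1] = v2[NELT / 2 + k];
	    }
	}
      memcpy (chain, result, sizeof (chain));
    }

  for (int i = 0; i < LENGTH; i++)
    {
      for (int j = 0; j < NELT; j++)
	printf (" %2d", chain[i][j]);
      printf ("\n");
    }
  return 0;
}
#endif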
5379 /* Function vect_setup_realignment
5381 This function is called when vectorizing an unaligned load using
5382 the dr_explicit_realign[_optimized] scheme.
5383 This function generates the following code at the loop prolog:
5385 p = initial_addr;
5386 x msq_init = *(floor(p)); # prolog load
5387 realignment_token = call target_builtin;
5388 loop:
5389 x msq = phi (msq_init, ---)
5391 The stmts marked with x are generated only for the case of
5392 dr_explicit_realign_optimized.
5394 The code above sets up a new (vector) pointer, pointing to the first
5395 location accessed by STMT, and a "floor-aligned" load using that pointer.
5396 It also generates code to compute the "realignment-token" (if the relevant
5397 target hook was defined), and creates a phi-node at the loop-header bb
5398 whose arguments are the result of the prolog-load (created by this
5399 function) and the result of a load that takes place in the loop (to be
5400 created by the caller to this function).
5402 For the case of dr_explicit_realign_optimized:
5403 The caller to this function uses the phi-result (msq) to create the
5404 realignment code inside the loop, and sets up the missing phi argument,
5405 as follows:
5406 loop:
5407 msq = phi (msq_init, lsq)
5408 lsq = *(floor(p')); # load in loop
5409 result = realign_load (msq, lsq, realignment_token);
5411 For the case of dr_explicit_realign:
5412 loop:
5413 msq = *(floor(p)); # load in loop
5414 p' = p + (VS-1);
5415 lsq = *(floor(p')); # load in loop
5416 result = realign_load (msq, lsq, realignment_token);
5418 Input:
5419 STMT - (scalar) load stmt to be vectorized. This load accesses
5420 a memory location that may be unaligned.
5421 GSI - place where new code is to be inserted.
5422 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5423 is used.
5425 Output:
5426 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5427 target hook, if defined.
5428 Return value - the result of the loop-header phi node. */
5430 tree
5431 vect_setup_realignment (gimple *stmt, gimple_stmt_iterator *gsi,
5432 tree *realignment_token,
5433 enum dr_alignment_support alignment_support_scheme,
5434 tree init_addr,
5435 struct loop **at_loop)
5437 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5438 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5439 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5440 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
5441 struct loop *loop = NULL;
5442 edge pe = NULL;
5443 tree scalar_dest = gimple_assign_lhs (stmt);
5444 tree vec_dest;
5445 gimple *inc;
5446 tree ptr;
5447 tree data_ref;
5448 basic_block new_bb;
5449 tree msq_init = NULL_TREE;
5450 tree new_temp;
5451 gphi *phi_stmt;
5452 tree msq = NULL_TREE;
5453 gimple_seq stmts = NULL;
5454 bool inv_p;
5455 bool compute_in_loop = false;
5456 bool nested_in_vect_loop = false;
5457 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
5458 struct loop *loop_for_initial_load = NULL;
5460 if (loop_vinfo)
5462 loop = LOOP_VINFO_LOOP (loop_vinfo);
5463 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
5466 gcc_assert (alignment_support_scheme == dr_explicit_realign
5467 || alignment_support_scheme == dr_explicit_realign_optimized);
5469 /* We need to generate three things:
5470 1. the misalignment computation
5471 2. the extra vector load (for the optimized realignment scheme).
5472 3. the phi node for the two vectors from which the realignment is
5473 done (for the optimized realignment scheme). */
5475 /* 1. Determine where to generate the misalignment computation.
5477 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5478 calculation will be generated by this function, outside the loop (in the
5479 preheader). Otherwise, INIT_ADDR has already been computed for us by the
5480 caller, inside the loop.
5482 Background: If the misalignment remains fixed throughout the iterations of
5483 the loop, then both realignment schemes are applicable, and also the
5484 misalignment computation can be done outside LOOP. This is because we are
5485 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5486 are a multiple of VS (the Vector Size), and therefore the misalignment in
5487 different vectorized LOOP iterations is always the same.
5488 The problem arises only if the memory access is in an inner-loop nested
5489 inside LOOP, which is now being vectorized using outer-loop vectorization.
5490 This is the only case when the misalignment of the memory access may not
5491 remain fixed throughout the iterations of the inner-loop (as explained in
5492 detail in vect_supportable_dr_alignment). In this case, not only is the
5493 optimized realignment scheme not applicable, but also the misalignment
5494 computation (and generation of the realignment token that is passed to
5495 REALIGN_LOAD) has to be done inside the loop.
5497 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5498 or not, which in turn determines if the misalignment is computed inside
5499 the inner-loop, or outside LOOP. */
5501 if (init_addr != NULL_TREE || !loop_vinfo)
5503 compute_in_loop = true;
5504 gcc_assert (alignment_support_scheme == dr_explicit_realign);
5508 /* 2. Determine where to generate the extra vector load.
5510 For the optimized realignment scheme, instead of generating two vector
5511 loads in each iteration, we generate a single extra vector load in the
5512 preheader of the loop, and in each iteration reuse the result of the
5513 vector load from the previous iteration. In case the memory access is in
5514 an inner-loop nested inside LOOP, which is now being vectorized using
5515 outer-loop vectorization, we need to determine whether this initial vector
5516 load should be generated at the preheader of the inner-loop, or can be
5517 generated at the preheader of LOOP. If the memory access has no evolution
5518 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5519 to be generated inside LOOP (in the preheader of the inner-loop). */
5521 if (nested_in_vect_loop)
5523 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5524 bool invariant_in_outerloop =
5525 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5526 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5528 else
5529 loop_for_initial_load = loop;
5530 if (at_loop)
5531 *at_loop = loop_for_initial_load;
5533 if (loop_for_initial_load)
5534 pe = loop_preheader_edge (loop_for_initial_load);
5536 /* 3. For the case of the optimized realignment, create the first vector
5537 load at the loop preheader. */
5539 if (alignment_support_scheme == dr_explicit_realign_optimized)
5541 /* Create msq_init = *(floor(p1)) in the loop preheader */
5542 gassign *new_stmt;
5544 gcc_assert (!compute_in_loop);
5545 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5546 ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
5547 NULL_TREE, &init_addr, NULL, &inc,
5548 true, &inv_p);
5549 if (TREE_CODE (ptr) == SSA_NAME)
5550 new_temp = copy_ssa_name (ptr);
5551 else
5552 new_temp = make_ssa_name (TREE_TYPE (ptr));
5553 unsigned int align = DR_TARGET_ALIGNMENT (dr);
5554 new_stmt = gimple_build_assign
5555 (new_temp, BIT_AND_EXPR, ptr,
5556 build_int_cst (TREE_TYPE (ptr), -(HOST_WIDE_INT) align));
5557 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5558 gcc_assert (!new_bb);
5559 data_ref
5560 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5561 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5562 vect_copy_ref_info (data_ref, DR_REF (dr));
5563 new_stmt = gimple_build_assign (vec_dest, data_ref);
5564 new_temp = make_ssa_name (vec_dest, new_stmt);
5565 gimple_assign_set_lhs (new_stmt, new_temp);
5566 if (pe)
5568 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5569 gcc_assert (!new_bb);
5571 else
5572 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5574 msq_init = gimple_assign_lhs (new_stmt);
5577 /* 4. Create realignment token using a target builtin, if available.
5578 It is done either inside the containing loop, or before LOOP (as
5579 determined above). */
5581 if (targetm.vectorize.builtin_mask_for_load)
5583 gcall *new_stmt;
5584 tree builtin_decl;
5586 /* Compute INIT_ADDR - the initial address accessed by this memref. */
5587 if (!init_addr)
5589 /* Generate the INIT_ADDR computation outside LOOP. */
5590 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
5591 NULL_TREE);
5592 if (loop)
5594 pe = loop_preheader_edge (loop);
5595 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5596 gcc_assert (!new_bb);
5598 else
5599 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5602 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5603 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5604 vec_dest =
5605 vect_create_destination_var (scalar_dest,
5606 gimple_call_return_type (new_stmt));
5607 new_temp = make_ssa_name (vec_dest, new_stmt);
5608 gimple_call_set_lhs (new_stmt, new_temp);
5610 if (compute_in_loop)
5611 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5612 else
5614 /* Generate the misalignment computation outside LOOP. */
5615 pe = loop_preheader_edge (loop);
5616 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5617 gcc_assert (!new_bb);
5620 *realignment_token = gimple_call_lhs (new_stmt);
5622 /* The result of the CALL_EXPR to this builtin is determined from
5623 the value of the parameter, and no global variables are touched,
5624 which makes the builtin a "const" function. Requiring the
5625 builtin to have the "const" attribute makes it unnecessary
5626 to call mark_call_clobbered. */
5627 gcc_assert (TREE_READONLY (builtin_decl));
5630 if (alignment_support_scheme == dr_explicit_realign)
5631 return msq;
5633 gcc_assert (!compute_in_loop);
5634 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5637 /* 5. Create msq = phi <msq_init, lsq> in loop */
5639 pe = loop_preheader_edge (containing_loop);
5640 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5641 msq = make_ssa_name (vec_dest);
5642 phi_stmt = create_phi_node (msq, containing_loop->header);
5643 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5645 return msq;
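/* Illustration for the realignment scheme set up above: a standalone,
   host-side sketch (not part of the vectorizer).  It models a VS-byte
   unaligned load as two aligned loads plus a byte select, with VS == 16
   assumed; the helper realign_load and the offset OFF are stand-ins for
   the REALIGN_LOAD operation and the target's realignment token (real
   targets typically use a permute mask from the builtin_mask_for_load
   hook instead of a plain offset).  */
#if 0
#include <stdio.h>
#include <stdint.h>
#include <string.h>

#define VS 16	/* assumed vector size in bytes */

/* Scalar stand-in for REALIGN_LOAD <msq, lsq, token>.  */
static void
realign_load (const uint8_t *msq, const uint8_t *lsq, unsigned off,
	      uint8_t *res)
{
  for (unsigned i = 0; i < VS; i++)
    res[i] = (off + i < VS) ? msq[off + i] : lsq[off + i - VS];
}

int
main (void)
{
  static uint8_t buf[4 * VS] __attribute__ ((aligned (VS)));
  for (unsigned i = 0; i < sizeof buf; i++)
    buf[i] = i;

  const uint8_t *p = buf + 5;			/* misaligned access */
  unsigned off = (uintptr_t) p & (VS - 1);	/* "realignment token" */
  const uint8_t *floor_p = p - off;		/* floor (p) */
  const uint8_t *floor_q			/* floor (p + VS - 1) */
    = (const uint8_t *) ((uintptr_t) (p + VS - 1) & ~(uintptr_t) (VS - 1));

  uint8_t msq[VS], lsq[VS], res[VS];
  memcpy (msq, floor_p, VS);	/* msq = *(floor (p)) */
  memcpy (lsq, floor_q, VS);	/* lsq = *(floor (p + VS - 1)) */
  realign_load (msq, lsq, off, res);

  for (unsigned i = 0; i < VS; i++)
    printf (" %d", res[i]);	/* prints 5 .. 20 */
  printf ("\n");
  return 0;
}
#endif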
5649 /* Function vect_grouped_load_supported.
5651 COUNT is the size of the load group (the number of statements plus the
5652 number of gaps). SINGLE_ELEMENT_P is true if there is actually
5653 only one statement, with a gap of COUNT - 1.
5655 Returns true if a suitable permute exists. */
5657 bool
5658 vect_grouped_load_supported (tree vectype, bool single_element_p,
5659 unsigned HOST_WIDE_INT count)
5661 machine_mode mode = TYPE_MODE (vectype);
5663 /* If this is single-element interleaving with an element distance
5664 that leaves unused vector loads around, punt - we at least create
5665 very sub-optimal code in that case (and blow up memory,
5666 see PR65518). */
5667 if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype)))
5669 if (dump_enabled_p ())
5670 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5671 "single-element interleaving not supported "
5672 "for not adjacent vector loads\n");
5673 return false;
5676 /* vect_permute_load_chain requires the group size to be equal to 3 or
5677 be a power of two. */
5678 if (count != 3 && exact_log2 (count) == -1)
5680 if (dump_enabled_p ())
5681 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5682 "the size of the group of accesses"
5683 " is not a power of 2 or not equal to 3\n");
5684 return false;
5687 /* Check that the permutation is supported. */
5688 if (VECTOR_MODE_P (mode))
5690 unsigned int i, j;
5691 if (count == 3)
5693 unsigned int nelt;
5694 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5696 if (dump_enabled_p ())
5697 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5698 "cannot handle groups of 3 loads for"
5699 " variable-length vectors\n");
5700 return false;
5703 vec_perm_builder sel (nelt, nelt, 1);
5704 sel.quick_grow (nelt);
5705 vec_perm_indices indices;
5706 unsigned int k;
5707 for (k = 0; k < 3; k++)
5709 for (i = 0; i < nelt; i++)
5710 if (3 * i + k < 2 * nelt)
5711 sel[i] = 3 * i + k;
5712 else
5713 sel[i] = 0;
5714 indices.new_vector (sel, 2, nelt);
5715 if (!can_vec_perm_const_p (mode, indices))
5717 if (dump_enabled_p ())
5718 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5719 "shuffle of 3 loads is not supported by"
5720 " target\n");
5721 return false;
5723 for (i = 0, j = 0; i < nelt; i++)
5724 if (3 * i + k < 2 * nelt)
5725 sel[i] = i;
5726 else
5727 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5728 indices.new_vector (sel, 2, nelt);
5729 if (!can_vec_perm_const_p (mode, indices))
5731 if (dump_enabled_p ())
5732 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5733 "shuffle of 3 loads is not supported by"
5734 " target\n");
5735 return false;
5738 return true;
5740 else
5742 /* If count is not equal to 3 then only a power of 2 is supported. */
5743 gcc_assert (pow2p_hwi (count));
5744 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5746 /* The encoding has a single stepped pattern. */
5747 vec_perm_builder sel (nelt, 1, 3);
5748 sel.quick_grow (3);
5749 for (i = 0; i < 3; i++)
5750 sel[i] = i * 2;
5751 vec_perm_indices indices (sel, 2, nelt);
5752 if (can_vec_perm_const_p (mode, indices))
5754 for (i = 0; i < 3; i++)
5755 sel[i] = i * 2 + 1;
5756 indices.new_vector (sel, 2, nelt);
5757 if (can_vec_perm_const_p (mode, indices))
5758 return true;
5763 if (dump_enabled_p ())
5764 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5765 "extract even/odd not supported by target\n");
5766 return false;
5769 /* Return TRUE if vec_{masked_}load_lanes is available for COUNT vectors of
5770 type VECTYPE. MASKED_P says whether the masked form is needed. */
5772 bool
5773 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5774 bool masked_p)
5776 if (masked_p)
5777 return vect_lanes_optab_supported_p ("vec_mask_load_lanes",
5778 vec_mask_load_lanes_optab,
5779 vectype, count);
5780 else
5781 return vect_lanes_optab_supported_p ("vec_load_lanes",
5782 vec_load_lanes_optab,
5783 vectype, count);
5786 /* Function vect_permute_load_chain.
5788 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5789 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5790 the input data correctly. Return the final references for loads in
5791 RESULT_CHAIN.
5793 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5794 The input is 4 vectors each containing 8 elements. We assign a number to each
5795 element, the input sequence is:
5797 1st vec: 0 1 2 3 4 5 6 7
5798 2nd vec: 8 9 10 11 12 13 14 15
5799 3rd vec: 16 17 18 19 20 21 22 23
5800 4th vec: 24 25 26 27 28 29 30 31
5802 The output sequence should be:
5804 1st vec: 0 4 8 12 16 20 24 28
5805 2nd vec: 1 5 9 13 17 21 25 29
5806 3rd vec: 2 6 10 14 18 22 26 30
5807 4th vec: 3 7 11 15 19 23 27 31
5809 i.e., the first output vector should contain the first elements of each
5810 interleaving group, etc.
5812 We use extract_even/odd instructions to create such output. The input of
5813 each extract_even/odd operation is two vectors
5814 1st vec 2nd vec
5815 0 1 2 3 4 5 6 7
5817 and the output is the vector of extracted even/odd elements. The output of
5818 extract_even will be: 0 2 4 6
5819 and of extract_odd: 1 3 5 7
5822 The permutation is done in log2 (LENGTH) stages. In each stage extract_even
5823 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5824 their order. In our example,
5826 E1: extract_even (1st vec, 2nd vec)
5827 E2: extract_odd (1st vec, 2nd vec)
5828 E3: extract_even (3rd vec, 4th vec)
5829 E4: extract_odd (3rd vec, 4th vec)
5831 The output for the first stage will be:
5833 E1: 0 2 4 6 8 10 12 14
5834 E2: 1 3 5 7 9 11 13 15
5835 E3: 16 18 20 22 24 26 28 30
5836 E4: 17 19 21 23 25 27 29 31
5838 In order to proceed and create the correct sequence for the next stage (or
5839 for the correct output, if the second stage is the last one, as in our
5840 example), we first put the output of extract_even operation and then the
5841 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5842 The input for the second stage is:
5844 1st vec (E1): 0 2 4 6 8 10 12 14
5845 2nd vec (E3): 16 18 20 22 24 26 28 30
5846 3rd vec (E2): 1 3 5 7 9 11 13 15
5847 4th vec (E4): 17 19 21 23 25 27 29 31
5849 The output of the second stage:
5851 E1: 0 4 8 12 16 20 24 28
5852 E2: 2 6 10 14 18 22 26 30
5853 E3: 1 5 9 13 17 21 25 29
5854 E4: 3 7 11 15 19 23 27 31
5856 And RESULT_CHAIN after reordering:
5858 1st vec (E1): 0 4 8 12 16 20 24 28
5859 2nd vec (E3): 1 5 9 13 17 21 25 29
5860 3rd vec (E2): 2 6 10 14 18 22 26 30
5861 4th vec (E4): 3 7 11 15 19 23 27 31. */
5863 static void
5864 vect_permute_load_chain (vec<tree> dr_chain,
5865 unsigned int length,
5866 gimple *stmt,
5867 gimple_stmt_iterator *gsi,
5868 vec<tree> *result_chain)
5870 tree data_ref, first_vect, second_vect;
5871 tree perm_mask_even, perm_mask_odd;
5872 tree perm3_mask_low, perm3_mask_high;
5873 gimple *perm_stmt;
5874 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5875 unsigned int i, j, log_length = exact_log2 (length);
5877 result_chain->quick_grow (length);
5878 memcpy (result_chain->address (), dr_chain.address (),
5879 length * sizeof (tree));
5881 if (length == 3)
5883 /* vect_grouped_load_supported ensures that this is constant. */
5884 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5885 unsigned int k;
5887 vec_perm_builder sel (nelt, nelt, 1);
5888 sel.quick_grow (nelt);
5889 vec_perm_indices indices;
5890 for (k = 0; k < 3; k++)
5892 for (i = 0; i < nelt; i++)
5893 if (3 * i + k < 2 * nelt)
5894 sel[i] = 3 * i + k;
5895 else
5896 sel[i] = 0;
5897 indices.new_vector (sel, 2, nelt);
5898 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5900 for (i = 0, j = 0; i < nelt; i++)
5901 if (3 * i + k < 2 * nelt)
5902 sel[i] = i;
5903 else
5904 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5905 indices.new_vector (sel, 2, nelt);
5906 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5908 first_vect = dr_chain[0];
5909 second_vect = dr_chain[1];
5911 /* Create interleaving stmt (low part of):
5912 low = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5913 ...}> */
5914 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5915 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5916 second_vect, perm3_mask_low);
5917 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5919 /* Create interleaving stmt (high part of):
5920 high = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5921 ...}> */
5922 first_vect = data_ref;
5923 second_vect = dr_chain[2];
5924 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5925 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5926 second_vect, perm3_mask_high);
5927 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5928 (*result_chain)[k] = data_ref;
5931 else
5933 /* If length is not equal to 3 then only a power of 2 is supported. */
5934 gcc_assert (pow2p_hwi (length));
5936 /* The encoding has a single stepped pattern. */
5937 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5938 vec_perm_builder sel (nelt, 1, 3);
5939 sel.quick_grow (3);
5940 for (i = 0; i < 3; ++i)
5941 sel[i] = i * 2;
5942 vec_perm_indices indices (sel, 2, nelt);
5943 perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);
5945 for (i = 0; i < 3; ++i)
5946 sel[i] = i * 2 + 1;
5947 indices.new_vector (sel, 2, nelt);
5948 perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
5950 for (i = 0; i < log_length; i++)
5952 for (j = 0; j < length; j += 2)
5954 first_vect = dr_chain[j];
5955 second_vect = dr_chain[j+1];
5957 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5958 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5959 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5960 first_vect, second_vect,
5961 perm_mask_even);
5962 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5963 (*result_chain)[j/2] = data_ref;
5965 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5966 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5967 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5968 first_vect, second_vect,
5969 perm_mask_odd);
5970 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5971 (*result_chain)[j/2+length/2] = data_ref;
5973 memcpy (dr_chain.address (), result_chain->address (),
5974 length * sizeof (tree));
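/* Illustration for the extract_even/extract_odd scheme documented above:
   a standalone, host-side sketch (not part of the vectorizer), assuming
   LENGTH == 4 and NELT == 8 as in the example.  Note where each result is
   stored: even parts at index j/2 and odd parts at index j/2 + LENGTH/2,
   which yields the E1 E3 E2 E4 ordering between stages.  */
#if 0
#include <stdio.h>
#include <string.h>

#define NELT 8
#define LENGTH 4

int
main (void)
{
  int chain[LENGTH][NELT], result[LENGTH][NELT];

  for (int i = 0; i < LENGTH; i++)
    for (int j = 0; j < NELT; j++)
      chain[i][j] = i * NELT + j;

  for (int stage = 0; stage < 2; stage++)	/* log2 (LENGTH) stages */
    {
      for (int j = 0; j < LENGTH; j += 2)
	{
	  int *v1 = chain[j], *v2 = chain[j + 1];
	  for (int k = 0; k < NELT / 2; k++)
	    {
	      /* extract_even: selector {0, 2, ..., 2*NELT - 2}.  */
	      result[j / 2][k] = v1[2 * k];
	      result[j / 2][NELT / 2 + k] = v2[2 * k];
	      /* extract_odd: selector {1, 3, ..., 2*NELT - 1}.  */
	      result[j / 2 + LENGTH / 2][k] = v1[2 * k + 1];
	      result[j / 2 + LENGTH / 2][NELT / 2 + k] = v2[2 * k + 1];
	    }
	}
      memcpy (chain, result, sizeof (chain));
    }

  /* Prints 0 4 8 ... 28 / 1 5 ... 29 / 2 6 ... 30 / 3 7 ... 31.  */
  for (int i = 0; i < LENGTH; i++)
    {
      for (int j = 0; j < NELT; j++)
	printf (" %2d", chain[i][j]);
      printf ("\n");
    }
  return 0;
}
#endif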
5979 /* Function vect_shift_permute_load_chain.
5981 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate a
5982 sequence of stmts to reorder the input data accordingly.
5983 Return the final references for loads in RESULT_CHAIN.
5984 Return true if it succeeded, false otherwise.
5986 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5987 The input is 3 vectors each containing 8 elements. We assign a
5988 number to each element, the input sequence is:
5990 1st vec: 0 1 2 3 4 5 6 7
5991 2nd vec: 8 9 10 11 12 13 14 15
5992 3rd vec: 16 17 18 19 20 21 22 23
5994 The output sequence should be:
5996 1st vec: 0 3 6 9 12 15 18 21
5997 2nd vec: 1 4 7 10 13 16 19 22
5998 3rd vec: 2 5 8 11 14 17 20 23
6000 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
6002 First we shuffle all 3 vectors to get correct elements order:
6004 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
6005 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
6006 3rd vec: (16 19 22) (17 20 23) (18 21)
6008 Next we unite and shift vector 3 times:
6010 1st step:
6011 shift right by 6 the concatenation of:
6012 "1st vec" and "2nd vec"
6013 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
6014 "2nd vec" and "3rd vec"
6015 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
6016 "3rd vec" and "1st vec"
6017 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
6018 | New vectors |
6020 So that now new vectors are:
6022 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
6023 2nd vec: (10 13) (16 19 22) (17 20 23)
6024 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
6026 2nd step:
6027 shift right by 5 the concatenation of:
6028 "1st vec" and "3rd vec"
6029 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
6030 "2nd vec" and "1st vec"
6031 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
6032 "3rd vec" and "2nd vec"
6033 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
6034 | New vectors |
6036 So that now new vectors are:
6038 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
6039 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
6040 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
6042 3rd step:
6043 shift right by 5 the concatenation of:
6044 "1st vec" and "1st vec"
6045 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
6046 shift right by 3 the concatenation of:
6047 "2nd vec" and "2nd vec"
6048 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
6049 | New vectors |
6051 So that now all vectors are READY:
6052 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
6053 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
6054 3rd vec: ( 1 4 7) (10 13) (16 19 22)
6056 This algorithm is faster than the one in vect_permute_load_chain if:
6057 1. "shift of a concatenation" is faster than general permutation.
6058 This is usually so.
6059 2. The TARGET machine can't execute vector instructions in parallel.
6060 This is because each step of the algorithm depends on the previous one.
6061 The algorithm in vect_permute_load_chain is much more parallel.
6063 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
6066 static bool
6067 vect_shift_permute_load_chain (vec<tree> dr_chain,
6068 unsigned int length,
6069 gimple *stmt,
6070 gimple_stmt_iterator *gsi,
6071 vec<tree> *result_chain)
6073 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
6074 tree perm2_mask1, perm2_mask2, perm3_mask;
6075 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
6076 gimple *perm_stmt;
6078 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
6079 unsigned int i;
6080 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6081 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6083 unsigned HOST_WIDE_INT nelt, vf;
6084 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt)
6085 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
6086 /* Not supported for variable-length vectors. */
6087 return false;
6089 vec_perm_builder sel (nelt, nelt, 1);
6090 sel.quick_grow (nelt);
6092 result_chain->quick_grow (length);
6093 memcpy (result_chain->address (), dr_chain.address (),
6094 length * sizeof (tree));
6096 if (pow2p_hwi (length) && vf > 4)
6098 unsigned int j, log_length = exact_log2 (length);
6099 for (i = 0; i < nelt / 2; ++i)
6100 sel[i] = i * 2;
6101 for (i = 0; i < nelt / 2; ++i)
6102 sel[nelt / 2 + i] = i * 2 + 1;
6103 vec_perm_indices indices (sel, 2, nelt);
6104 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6106 if (dump_enabled_p ())
6107 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6108 "shuffle of 2 fields structure is not \
6109 supported by target\n");
6110 return false;
6112 perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);
6114 for (i = 0; i < nelt / 2; ++i)
6115 sel[i] = i * 2 + 1;
6116 for (i = 0; i < nelt / 2; ++i)
6117 sel[nelt / 2 + i] = i * 2;
6118 indices.new_vector (sel, 2, nelt);
6119 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6121 if (dump_enabled_p ())
6122 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6123 "shuffle of 2 fields structure is not \
6124 supported by target\n");
6125 return false;
6127 perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);
6129 /* Generating permutation constant to shift all elements.
6130 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
6131 for (i = 0; i < nelt; i++)
6132 sel[i] = nelt / 2 + i;
6133 indices.new_vector (sel, 2, nelt);
6134 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6136 if (dump_enabled_p ())
6137 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6138 "shift permutation is not supported by target\n");
6139 return false;
6141 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6143 /* Generating permutation constant to select vector from 2.
6144 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
6145 for (i = 0; i < nelt / 2; i++)
6146 sel[i] = i;
6147 for (i = nelt / 2; i < nelt; i++)
6148 sel[i] = nelt + i;
6149 indices.new_vector (sel, 2, nelt);
6150 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6152 if (dump_enabled_p ())
6153 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6154 "select is not supported by target\n");
6155 return false;
6157 select_mask = vect_gen_perm_mask_checked (vectype, indices);
6159 for (i = 0; i < log_length; i++)
6161 for (j = 0; j < length; j += 2)
6163 first_vect = dr_chain[j];
6164 second_vect = dr_chain[j + 1];
6166 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6167 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6168 first_vect, first_vect,
6169 perm2_mask1);
6170 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6171 vect[0] = data_ref;
6173 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6174 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6175 second_vect, second_vect,
6176 perm2_mask2);
6177 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6178 vect[1] = data_ref;
6180 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
6181 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6182 vect[0], vect[1], shift1_mask);
6183 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6184 (*result_chain)[j/2 + length/2] = data_ref;
6186 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
6187 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6188 vect[0], vect[1], select_mask);
6189 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6190 (*result_chain)[j/2] = data_ref;
6192 memcpy (dr_chain.address (), result_chain->address (),
6193 length * sizeof (tree));
6195 return true;
6197 if (length == 3 && vf > 2)
6199 unsigned int k = 0, l = 0;
6201 /* Generating permutation constant to get all elements in the right order.
6202 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
6203 for (i = 0; i < nelt; i++)
6205 if (3 * k + (l % 3) >= nelt)
6207 k = 0;
6208 l += (3 - (nelt % 3));
6210 sel[i] = 3 * k + (l % 3);
6211 k++;
6213 vec_perm_indices indices (sel, 2, nelt);
6214 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6216 if (dump_enabled_p ())
6217 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6218 "shuffle of 3 fields structure is not \
6219 supported by target\n");
6220 return false;
6222 perm3_mask = vect_gen_perm_mask_checked (vectype, indices);
6224 /* Generating permutation constant to shift all elements.
6225 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
6226 for (i = 0; i < nelt; i++)
6227 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
6228 indices.new_vector (sel, 2, nelt);
6229 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6231 if (dump_enabled_p ())
6232 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6233 "shift permutation is not supported by target\n");
6234 return false;
6236 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6238 /* Generating permutation constant to shift all elements.
6239 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6240 for (i = 0; i < nelt; i++)
6241 sel[i] = 2 * (nelt / 3) + 1 + i;
6242 indices.new_vector (sel, 2, nelt);
6243 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6245 if (dump_enabled_p ())
6246 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6247 "shift permutation is not supported by target\n");
6248 return false;
6250 shift2_mask = vect_gen_perm_mask_checked (vectype, indices);
6252 /* Generating permutation constant to shift all elements.
6253 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
6254 for (i = 0; i < nelt; i++)
6255 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
6256 indices.new_vector (sel, 2, nelt);
6257 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6259 if (dump_enabled_p ())
6260 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6261 "shift permutation is not supported by target\n");
6262 return false;
6264 shift3_mask = vect_gen_perm_mask_checked (vectype, indices);
6266 /* Generating permutation constant to shift all elements.
6267 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6268 for (i = 0; i < nelt; i++)
6269 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
6270 indices.new_vector (sel, 2, nelt);
6271 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6273 if (dump_enabled_p ())
6274 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6275 "shift permutation is not supported by target\n");
6276 return false;
6278 shift4_mask = vect_gen_perm_mask_checked (vectype, indices);
6280 for (k = 0; k < 3; k++)
6282 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
6283 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6284 dr_chain[k], dr_chain[k],
6285 perm3_mask);
6286 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6287 vect[k] = data_ref;
6290 for (k = 0; k < 3; k++)
6292 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
6293 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6294 vect[k % 3], vect[(k + 1) % 3],
6295 shift1_mask);
6296 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6297 vect_shift[k] = data_ref;
6300 for (k = 0; k < 3; k++)
6302 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
6303 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6304 vect_shift[(4 - k) % 3],
6305 vect_shift[(3 - k) % 3],
6306 shift2_mask);
6307 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6308 vect[k] = data_ref;
6311 (*result_chain)[3 - (nelt % 3)] = vect[2];
6313 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
6314 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
6315 vect[0], shift3_mask);
6316 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6317 (*result_chain)[nelt % 3] = data_ref;
6319 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
6320 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
6321 vect[1], shift4_mask);
6322 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6323 (*result_chain)[0] = data_ref;
6324 return true;
6326 return false;
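/* Illustration for the LENGTH == 3 branch above: a standalone sketch
   (not part of the vectorizer) that prints the selectors built by that
   branch for an assumed NELT of 8, so they can be compared with the
   values quoted in the comments ({0 3 6 1 4 7 2 5}, {6 7 ... 13},
   {5 6 ... 12}, {3 4 ... 10} and {5 6 ... 12}).  */
#if 0
#include <stdio.h>

#define NELT 8

static void
print_sel (const char *name, const unsigned *sel)
{
  printf ("%s:", name);
  for (unsigned i = 0; i < NELT; i++)
    printf (" %u", sel[i]);
  printf ("\n");
}

int
main (void)
{
  unsigned sel[NELT];

  /* perm3_mask: gather the elements as (0 3 6) (1 4 7) (2 5).  */
  for (unsigned i = 0, k = 0, l = 0; i < NELT; i++, k++)
    {
      if (3 * k + (l % 3) >= NELT)
	{
	  k = 0;
	  l += 3 - (NELT % 3);
	}
      sel[i] = 3 * k + (l % 3);
    }
  print_sel ("perm3 ", sel);

  for (unsigned i = 0; i < NELT; i++)	/* shift1_mask */
    sel[i] = 2 * (NELT / 3) + (NELT % 3) + i;
  print_sel ("shift1", sel);

  for (unsigned i = 0; i < NELT; i++)	/* shift2_mask */
    sel[i] = 2 * (NELT / 3) + 1 + i;
  print_sel ("shift2", sel);

  for (unsigned i = 0; i < NELT; i++)	/* shift3_mask */
    sel[i] = (NELT / 3) + (NELT % 3) / 2 + i;
  print_sel ("shift3", sel);

  for (unsigned i = 0; i < NELT; i++)	/* shift4_mask */
    sel[i] = 2 * (NELT / 3) + (NELT % 3) / 2 + i;
  print_sel ("shift4", sel);
  return 0;
}
#endif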
6329 /* Function vect_transform_grouped_load.
6331 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
6332 to perform their permutation and assign the resulting vectorized statements to
6333 the scalar statements.
6336 void
6337 vect_transform_grouped_load (gimple *stmt, vec<tree> dr_chain, int size,
6338 gimple_stmt_iterator *gsi)
6340 machine_mode mode;
6341 vec<tree> result_chain = vNULL;
6343 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
6344 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
6345 vectors, that are ready for vector computation. */
6346 result_chain.create (size);
6348 /* If the reassociation width for the vector type is 2 or greater, the target
6349 machine can execute 2 or more vector instructions in parallel. Otherwise try
6350 to get the chain for the load group using vect_shift_permute_load_chain. */
6351 mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
6352 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
6353 || pow2p_hwi (size)
6354 || !vect_shift_permute_load_chain (dr_chain, size, stmt,
6355 gsi, &result_chain))
6356 vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
6357 vect_record_grouped_load_vectors (stmt, result_chain);
6358 result_chain.release ();
6361 /* RESULT_CHAIN contains the output of a group of grouped loads that were
6362 generated as part of the vectorization of STMT. Assign the statement
6363 for each vector to the associated scalar statement. */
6365 void
6366 vect_record_grouped_load_vectors (gimple *stmt, vec<tree> result_chain)
6368 gimple *first_stmt = DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
6369 gimple *next_stmt, *new_stmt;
6370 unsigned int i, gap_count;
6371 tree tmp_data_ref;
6373 /* Put a permuted data-ref in the VECTORIZED_STMT field.
6374 Since we scan the chain starting from its first node, their order
6375 corresponds to the order of data-refs in RESULT_CHAIN. */
6376 next_stmt = first_stmt;
6377 gap_count = 1;
6378 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
6380 if (!next_stmt)
6381 break;
6383 /* Skip the gaps. Loads created for the gaps will be removed by dead
6384 code elimination pass later. No need to check for the first stmt in
6385 the group, since it always exists.
6386 DR_GROUP_GAP is the number of steps in elements from the previous
6387 access (if there is no gap DR_GROUP_GAP is 1). We skip loads that
6388 correspond to the gaps. */
6389 if (next_stmt != first_stmt
6390 && gap_count < DR_GROUP_GAP (vinfo_for_stmt (next_stmt)))
6392 gap_count++;
6393 continue;
6396 while (next_stmt)
6398 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
6399 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
6400 copies, and we put the new vector statement in the first available
6401 RELATED_STMT. */
6402 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
6403 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
6404 else
6406 if (!DR_GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
6408 gimple *prev_stmt =
6409 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
6410 gimple *rel_stmt =
6411 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
6412 while (rel_stmt)
6414 prev_stmt = rel_stmt;
6415 rel_stmt =
6416 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
6419 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
6420 new_stmt;
6424 next_stmt = DR_GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
6425 gap_count = 1;
6426 /* If NEXT_STMT accesses the same DR as the previous statement,
6427 put the same TMP_DATA_REF as its vectorized statement; otherwise
6428 get the next data-ref from RESULT_CHAIN. */
6429 if (!next_stmt || !DR_GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
6430 break;
6435 /* Function vect_can_force_dr_alignment_p.
6437 Returns whether DECL can be forced to be aligned on an ALIGNMENT-bit
6438 boundary. */
6440 bool
6441 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
6443 if (!VAR_P (decl))
6444 return false;
6446 if (decl_in_symtab_p (decl)
6447 && !symtab_node::get (decl)->can_increase_alignment_p ())
6448 return false;
6450 if (TREE_STATIC (decl))
6451 return (alignment <= MAX_OFILE_ALIGNMENT);
6452 else
6453 return (alignment <= MAX_STACK_ALIGNMENT);
6457 /* Return whether the data reference DR is supported with respect to its
6458 alignment.
6459 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even if
6460 it is aligned, i.e., check if it is possible to vectorize it with different
6461 alignment. */
6463 enum dr_alignment_support
6464 vect_supportable_dr_alignment (struct data_reference *dr,
6465 bool check_aligned_accesses)
6467 gimple *stmt = DR_STMT (dr);
6468 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6469 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6470 machine_mode mode = TYPE_MODE (vectype);
6471 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6472 struct loop *vect_loop = NULL;
6473 bool nested_in_vect_loop = false;
6475 if (aligned_access_p (dr) && !check_aligned_accesses)
6476 return dr_aligned;
6478 /* For now assume all conditional loads/stores support unaligned
6479 access without any special code. */
6480 if (is_gimple_call (stmt)
6481 && gimple_call_internal_p (stmt)
6482 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
6483 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
6484 return dr_unaligned_supported;
6486 if (loop_vinfo)
6488 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
6489 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
6492 /* Possibly unaligned access. */
6494 /* We can choose between using the implicit realignment scheme (generating
6495 a misaligned_move stmt) and the explicit realignment scheme (generating
6496 aligned loads with a REALIGN_LOAD). There are two variants to the
6497 explicit realignment scheme: optimized, and unoptimized.
6498 We can optimize the realignment only if the step between consecutive
6499 vector loads is equal to the vector size. Since the vector memory
6500 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
6501 is guaranteed that the misalignment amount remains the same throughout the
6502 execution of the vectorized loop. Therefore, we can create the
6503 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
6504 at the loop preheader.
6506 However, in the case of outer-loop vectorization, when vectorizing a
6507 memory access in the inner-loop nested within the LOOP that is now being
6508 vectorized, while it is guaranteed that the misalignment of the
6509 vectorized memory access will remain the same in different outer-loop
6510 iterations, it is *not* guaranteed that it will remain the same throughout
6511 the execution of the inner-loop. This is because the inner-loop advances
6512 with the original scalar step (and not in steps of VS). If the inner-loop
6513 step happens to be a multiple of VS, then the misalignment remains fixed
6514 and we can use the optimized realignment scheme. For example:
6516 for (i=0; i<N; i++)
6517 for (j=0; j<M; j++)
6518 s += a[i+j];
6520 When vectorizing the i-loop in the above example, the step between
6521 consecutive vector loads is 1, and so the misalignment does not remain
6522 fixed across the execution of the inner-loop, and the realignment cannot
6523 be optimized (as illustrated in the following pseudo vectorized loop):
6525 for (i=0; i<N; i+=4)
6526 for (j=0; j<M; j++){
6527 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
6528 // when j is {0,1,2,3,4,5,6,7,...} respectively.
6529 // (assuming that we start from an aligned address).
6532 We therefore have to use the unoptimized realignment scheme:
6534 for (i=0; i<N; i+=4)
6535 for (j=k; j<M; j+=4)
6536 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
6537 // that the misalignment of the initial address is
6538 // 0).
6540 The loop can then be vectorized as follows:
6542 for (k=0; k<4; k++){
6543 rt = get_realignment_token (&vp[k]);
6544 for (i=0; i<N; i+=4){
6545 v1 = vp[i+k];
6546 for (j=k; j<M; j+=4){
6547 v2 = vp[i+j+VS-1];
6548 va = REALIGN_LOAD <v1,v2,rt>;
6549 vs += va;
6550 v1 = v2;
6553 } */
6555 if (DR_IS_READ (dr))
6557 bool is_packed = false;
6558 tree type = (TREE_TYPE (DR_REF (dr)));
6560 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
6561 && (!targetm.vectorize.builtin_mask_for_load
6562 || targetm.vectorize.builtin_mask_for_load ()))
6564 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6566 /* If we are doing SLP then the accesses need not have the
6567 same alignment; instead it depends on the SLP group size. */
6568 if (loop_vinfo
6569 && STMT_SLP_TYPE (stmt_info)
6570 && !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6571 * DR_GROUP_SIZE (vinfo_for_stmt
6572 (DR_GROUP_FIRST_ELEMENT (stmt_info))),
6573 TYPE_VECTOR_SUBPARTS (vectype)))
6575 else if (!loop_vinfo
6576 || (nested_in_vect_loop
6577 && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
6578 GET_MODE_SIZE (TYPE_MODE (vectype)))))
6579 return dr_explicit_realign;
6580 else
6581 return dr_explicit_realign_optimized;
6583 if (!known_alignment_for_access_p (dr))
6584 is_packed = not_size_aligned (DR_REF (dr));
6586 if (targetm.vectorize.support_vector_misalignment
6587 (mode, type, DR_MISALIGNMENT (dr), is_packed))
6588 /* Can't software pipeline the loads, but can at least do them. */
6589 return dr_unaligned_supported;
6591 else
6593 bool is_packed = false;
6594 tree type = (TREE_TYPE (DR_REF (dr)));
6596 if (!known_alignment_for_access_p (dr))
6597 is_packed = not_size_aligned (DR_REF (dr));
6599 if (targetm.vectorize.support_vector_misalignment
6600 (mode, type, DR_MISALIGNMENT (dr), is_packed))
6601 return dr_unaligned_supported;
6604 /* Unsupported. */
6605 return dr_unaligned_unsupported;
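/* Illustration for the classification above: a standalone, user-level
   sketch (not part of the vectorizer).  The functions foo and bar are
   hypothetical; in foo the alignment of A and B is unknown, so the
   vectorizer must fall back on one of the unaligned or realignment
   schemes (or peel/version the loop), while in bar the
   __builtin_assume_aligned hints let the accesses be treated as
   aligned.  */
#if 0
void
foo (float *restrict a, const float *restrict b, int n)
{
  for (int i = 0; i < n; i++)
    a[i] = b[i] * 2.0f;
}

void
bar (float *restrict a, const float *restrict b, int n)
{
  a = __builtin_assume_aligned (a, 32);
  b = (const float *) __builtin_assume_aligned (b, 32);
  for (int i = 0; i < n; i++)
    a[i] = b[i] * 2.0f;
}
#endif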