Prefer open-coding vector integer division
[official-gcc.git] / gcc / tree-vect-data-refs.c
blobfe4c4a5a1be24e8e249ed1e59b5b2b8d40ecf1b2
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2018 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "memmodel.h"
32 #include "tm_p.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "cgraph.h"
36 #include "dumpfile.h"
37 #include "alias.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "tree-eh.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop.h"
47 #include "cfgloop.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
50 #include "expr.h"
51 #include "builtins.h"
52 #include "params.h"
53 #include "tree-cfg.h"
54 #include "tree-hash-traits.h"
55 #include "vec-perm-indices.h"
56 #include "internal-fn.h"
58 /* Return true if load- or store-lanes optab OPTAB is implemented for
59 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
61 static bool
62 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
63 tree vectype, unsigned HOST_WIDE_INT count)
65 machine_mode mode, array_mode;
66 bool limit_p;
68 mode = TYPE_MODE (vectype);
69 if (!targetm.array_mode (mode, count).exists (&array_mode))
71 poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
72 limit_p = !targetm.array_mode_supported_p (mode, count);
73 if (!int_mode_for_size (bits, limit_p).exists (&array_mode))
75 if (dump_enabled_p ())
76 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
77 "no array mode for %s["
78 HOST_WIDE_INT_PRINT_DEC "]\n",
79 GET_MODE_NAME (mode), count);
80 return false;
84 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
86 if (dump_enabled_p ())
87 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
88 "cannot use %s<%s><%s>\n", name,
89 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
90 return false;
93 if (dump_enabled_p ())
94 dump_printf_loc (MSG_NOTE, vect_location,
95 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
96 GET_MODE_NAME (mode));
98 return true;
102 /* Return the smallest scalar part of STMT.
103 This is used to determine the vectype of the stmt. We generally set the
104 vectype according to the type of the result (lhs). For stmts whose
105 result-type is different than the type of the arguments (e.g., demotion,
106 promotion), vectype will be reset appropriately (later). Note that we have
107 to visit the smallest datatype in this function, because that determines the
108 VF. If the smallest datatype in the loop is present only as the rhs of a
109 promotion operation - we'd miss it.
110 Such a case, where a variable of this datatype does not appear in the lhs
111 anywhere in the loop, can only occur if it's an invariant: e.g.:
112 'int_x = (int) short_inv', which we'd expect to have been optimized away by
113 invariant motion. However, we cannot rely on invariant motion to always
114 take invariants out of the loop, and so in the case of promotion we also
115 have to check the rhs.
116 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
117 types. */
119 tree
120 vect_get_smallest_scalar_type (gimple *stmt, HOST_WIDE_INT *lhs_size_unit,
121 HOST_WIDE_INT *rhs_size_unit)
123 tree scalar_type = gimple_expr_type (stmt);
124 HOST_WIDE_INT lhs, rhs;
126 /* During the analysis phase, this function is called on arbitrary
127 statements that might not have scalar results. */
128 if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
129 return scalar_type;
131 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
133 if (is_gimple_assign (stmt)
134 && (gimple_assign_cast_p (stmt)
135 || gimple_assign_rhs_code (stmt) == DOT_PROD_EXPR
136 || gimple_assign_rhs_code (stmt) == WIDEN_SUM_EXPR
137 || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
138 || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
139 || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
141 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
143 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
144 if (rhs < lhs)
145 scalar_type = rhs_type;
148 *lhs_size_unit = lhs;
149 *rhs_size_unit = rhs;
150 return scalar_type;
154 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
155 tested at run-time. Return TRUE if DDR was successfully inserted.
156 Return false if versioning is not supported. */
158 static bool
159 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
161 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
163 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
164 return false;
166 if (!runtime_alias_check_p (ddr, loop,
167 optimize_loop_nest_for_speed_p (loop)))
168 return false;
170 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
171 return true;
174 /* Record that loop LOOP_VINFO needs to check that VALUE is nonzero. */
176 static void
177 vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value)
179 vec<tree> checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo);
180 for (unsigned int i = 0; i < checks.length(); ++i)
181 if (checks[i] == value)
182 return;
184 if (dump_enabled_p ())
186 dump_printf_loc (MSG_NOTE, vect_location, "need run-time check that ");
187 dump_generic_expr (MSG_NOTE, TDF_SLIM, value);
188 dump_printf (MSG_NOTE, " is nonzero\n");
190 LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value);
193 /* Return true if we know that the order of vectorized STMT_A and
194 vectorized STMT_B will be the same as the order of STMT_A and STMT_B.
195 At least one of the statements is a write. */
197 static bool
198 vect_preserves_scalar_order_p (gimple *stmt_a, gimple *stmt_b)
200 stmt_vec_info stmtinfo_a = vinfo_for_stmt (stmt_a);
201 stmt_vec_info stmtinfo_b = vinfo_for_stmt (stmt_b);
203 /* Single statements are always kept in their original order. */
204 if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
205 && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
206 return true;
208 /* STMT_A and STMT_B belong to overlapping groups. All loads in a
209 group are emitted at the position of the first scalar load and all
210 stores in a group are emitted at the position of the last scalar store.
211 Thus writes will happen no earlier than their current position
212 (but could happen later) while reads will happen no later than their
213 current position (but could happen earlier). Reordering is therefore
214 only possible if the first access is a write. */
215 gimple *earlier_stmt = get_earlier_stmt (stmt_a, stmt_b);
216 return !DR_IS_WRITE (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt)));
219 /* A subroutine of vect_analyze_data_ref_dependence. Handle
220 DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
221 distances. These distances are conservatively correct but they don't
222 reflect a guaranteed dependence.
224 Return true if this function does all the work necessary to avoid
225 an alias or false if the caller should use the dependence distances
226 to limit the vectorization factor in the usual way. LOOP_DEPTH is
227 the depth of the loop described by LOOP_VINFO and the other arguments
228 are as for vect_analyze_data_ref_dependence. */
230 static bool
231 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
232 loop_vec_info loop_vinfo,
233 int loop_depth, unsigned int *max_vf)
235 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
236 lambda_vector dist_v;
237 unsigned int i;
238 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
240 int dist = dist_v[loop_depth];
241 if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
243 /* If the user asserted safelen >= DIST consecutive iterations
244 can be executed concurrently, assume independence.
246 ??? An alternative would be to add the alias check even
247 in this case, and vectorize the fallback loop with the
248 maximum VF set to safelen. However, if the user has
249 explicitly given a length, it's less likely that that
250 would be a win. */
251 if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
253 if ((unsigned int) loop->safelen < *max_vf)
254 *max_vf = loop->safelen;
255 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
256 continue;
259 /* For dependence distances of 2 or more, we have the option
260 of limiting VF or checking for an alias at runtime.
261 Prefer to check at runtime if we can, to avoid limiting
262 the VF unnecessarily when the bases are in fact independent.
264 Note that the alias checks will be removed if the VF ends up
265 being small enough. */
266 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
269 return true;
273 /* Function vect_analyze_data_ref_dependence.
275 Return TRUE if there (might) exist a dependence between a memory-reference
276 DRA and a memory-reference DRB. When versioning for alias may check a
277 dependence at run-time, return FALSE. Adjust *MAX_VF according to
278 the data dependence. */
280 static bool
281 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
282 loop_vec_info loop_vinfo,
283 unsigned int *max_vf)
285 unsigned int i;
286 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
287 struct data_reference *dra = DDR_A (ddr);
288 struct data_reference *drb = DDR_B (ddr);
289 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
290 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
291 lambda_vector dist_v;
292 unsigned int loop_depth;
294 /* In loop analysis all data references should be vectorizable. */
295 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
296 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
297 gcc_unreachable ();
299 /* Independent data accesses. */
300 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
301 return false;
303 if (dra == drb
304 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
305 return false;
307 /* We do not have to consider dependences between accesses that belong
308 to the same group, unless the stride could be smaller than the
309 group size. */
310 if (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
311 && (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
312 == DR_GROUP_FIRST_ELEMENT (stmtinfo_b))
313 && !STMT_VINFO_STRIDED_P (stmtinfo_a))
314 return false;
316 /* Even if we have an anti-dependence then, as the vectorized loop covers at
317 least two scalar iterations, there is always also a true dependence.
318 As the vectorizer does not re-order loads and stores we can ignore
319 the anti-dependence if TBAA can disambiguate both DRs similar to the
320 case with known negative distance anti-dependences (positive
321 distance anti-dependences would violate TBAA constraints). */
322 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
323 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
324 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
325 get_alias_set (DR_REF (drb))))
326 return false;
328 /* Unknown data dependence. */
329 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
331 /* If user asserted safelen consecutive iterations can be
332 executed concurrently, assume independence. */
333 if (loop->safelen >= 2)
335 if ((unsigned int) loop->safelen < *max_vf)
336 *max_vf = loop->safelen;
337 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
338 return false;
341 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
342 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
344 if (dump_enabled_p ())
346 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
347 "versioning for alias not supported for: "
348 "can't determine dependence between ");
349 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
350 DR_REF (dra));
351 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
352 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
353 DR_REF (drb));
354 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
356 return true;
359 if (dump_enabled_p ())
361 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
362 "versioning for alias required: "
363 "can't determine dependence between ");
364 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
365 DR_REF (dra));
366 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
367 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
368 DR_REF (drb));
369 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
372 /* Add to list of ddrs that need to be tested at run-time. */
373 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
376 /* Known data dependence. */
377 if (DDR_NUM_DIST_VECTS (ddr) == 0)
379 /* If user asserted safelen consecutive iterations can be
380 executed concurrently, assume independence. */
381 if (loop->safelen >= 2)
383 if ((unsigned int) loop->safelen < *max_vf)
384 *max_vf = loop->safelen;
385 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
386 return false;
389 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
390 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
392 if (dump_enabled_p ())
394 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
395 "versioning for alias not supported for: "
396 "bad dist vector for ");
397 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
398 DR_REF (dra));
399 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
400 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
401 DR_REF (drb));
402 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
404 return true;
407 if (dump_enabled_p ())
409 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
410 "versioning for alias required: "
411 "bad dist vector for ");
412 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
413 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
414 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
415 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
417 /* Add to list of ddrs that need to be tested at run-time. */
418 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
421 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
423 if (DDR_COULD_BE_INDEPENDENT_P (ddr)
424 && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
425 loop_depth, max_vf))
426 return false;
428 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
430 int dist = dist_v[loop_depth];
432 if (dump_enabled_p ())
433 dump_printf_loc (MSG_NOTE, vect_location,
434 "dependence distance = %d.\n", dist);
436 if (dist == 0)
438 if (dump_enabled_p ())
440 dump_printf_loc (MSG_NOTE, vect_location,
441 "dependence distance == 0 between ");
442 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
443 dump_printf (MSG_NOTE, " and ");
444 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
445 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
448 /* When we perform grouped accesses and perform implicit CSE
449 by detecting equal accesses and doing disambiguation with
450 runtime alias tests like for
451 .. = a[i];
452 .. = a[i+1];
453 a[i] = ..;
454 a[i+1] = ..;
455 *p = ..;
456 .. = a[i];
457 .. = a[i+1];
458 where we will end up loading { a[i], a[i+1] } once, make
459 sure that inserting group loads before the first load and
460 stores after the last store will do the right thing.
461 Similar for groups like
462 a[i] = ...;
463 ... = a[i];
464 a[i+1] = ...;
465 where loads from the group interleave with the store. */
466 if (!vect_preserves_scalar_order_p (DR_STMT (dra), DR_STMT (drb)))
468 if (dump_enabled_p ())
469 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
470 "READ_WRITE dependence in interleaving.\n");
471 return true;
474 if (loop->safelen < 2)
476 tree indicator = dr_zero_step_indicator (dra);
477 if (TREE_CODE (indicator) != INTEGER_CST)
478 vect_check_nonzero_value (loop_vinfo, indicator);
479 else if (integer_zerop (indicator))
481 if (dump_enabled_p ())
482 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
483 "access also has a zero step\n");
484 return true;
487 continue;
490 if (dist > 0 && DDR_REVERSED_P (ddr))
492 /* If DDR_REVERSED_P the order of the data-refs in DDR was
493 reversed (to make distance vector positive), and the actual
494 distance is negative. */
495 if (dump_enabled_p ())
496 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
497 "dependence distance negative.\n");
498 /* Record a negative dependence distance to later limit the
499 amount of stmt copying / unrolling we can perform.
500 Only need to handle read-after-write dependence. */
501 if (DR_IS_READ (drb)
502 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
503 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
504 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
505 continue;
508 unsigned int abs_dist = abs (dist);
509 if (abs_dist >= 2 && abs_dist < *max_vf)
511 /* The dependence distance requires reduction of the maximal
512 vectorization factor. */
513 *max_vf = abs (dist);
514 if (dump_enabled_p ())
515 dump_printf_loc (MSG_NOTE, vect_location,
516 "adjusting maximal vectorization factor to %i\n",
517 *max_vf);
520 if (abs_dist >= *max_vf)
522 /* Dependence distance does not create dependence, as far as
523 vectorization is concerned, in this case. */
524 if (dump_enabled_p ())
525 dump_printf_loc (MSG_NOTE, vect_location,
526 "dependence distance >= VF.\n");
527 continue;
530 if (dump_enabled_p ())
532 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
533 "not vectorized, possible dependence "
534 "between data-refs ");
535 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
536 dump_printf (MSG_NOTE, " and ");
537 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
538 dump_printf (MSG_NOTE, "\n");
541 return true;
544 return false;
547 /* Function vect_analyze_data_ref_dependences.
549 Examine all the data references in the loop, and make sure there do not
550 exist any data dependences between them. Set *MAX_VF according to
551 the maximum vectorization factor the data dependences allow. */
553 bool
554 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
555 unsigned int *max_vf)
557 unsigned int i;
558 struct data_dependence_relation *ddr;
560 if (dump_enabled_p ())
561 dump_printf_loc (MSG_NOTE, vect_location,
562 "=== vect_analyze_data_ref_dependences ===\n");
564 LOOP_VINFO_DDRS (loop_vinfo)
565 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
566 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
567 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
568 /* We need read-read dependences to compute STMT_VINFO_SAME_ALIGN_REFS. */
569 if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
570 &LOOP_VINFO_DDRS (loop_vinfo),
571 LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
572 return false;
574 /* For epilogues we either have no aliases or alias versioning
575 was applied to original loop. Therefore we may just get max_vf
576 using VF of original loop. */
577 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
578 *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
579 else
580 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
581 if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
582 return false;
584 return true;
588 /* Function vect_slp_analyze_data_ref_dependence.
590 Return TRUE if there (might) exist a dependence between a memory-reference
591 DRA and a memory-reference DRB. When versioning for alias may check a
592 dependence at run-time, return FALSE. Adjust *MAX_VF according to
593 the data dependence. */
595 static bool
596 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
598 struct data_reference *dra = DDR_A (ddr);
599 struct data_reference *drb = DDR_B (ddr);
601 /* We need to check dependences of statements marked as unvectorizable
602 as well, they still can prohibit vectorization. */
604 /* Independent data accesses. */
605 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
606 return false;
608 if (dra == drb)
609 return false;
611 /* Read-read is OK. */
612 if (DR_IS_READ (dra) && DR_IS_READ (drb))
613 return false;
615 /* If dra and drb are part of the same interleaving chain consider
616 them independent. */
617 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
618 && (DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
619 == DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
620 return false;
622 /* Unknown data dependence. */
623 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
625 if (dump_enabled_p ())
627 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
628 "can't determine dependence between ");
629 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
630 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
631 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
632 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
635 else if (dump_enabled_p ())
637 dump_printf_loc (MSG_NOTE, vect_location,
638 "determined dependence between ");
639 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
640 dump_printf (MSG_NOTE, " and ");
641 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
642 dump_printf (MSG_NOTE, "\n");
645 return true;
649 /* Analyze dependences involved in the transform of SLP NODE. STORES
650 contain the vector of scalar stores of this instance if we are
651 disambiguating the loads. */
653 static bool
654 vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
655 vec<gimple *> stores, gimple *last_store)
657 /* This walks over all stmts involved in the SLP load/store done
658 in NODE verifying we can sink them up to the last stmt in the
659 group. */
660 gimple *last_access = vect_find_last_scalar_stmt_in_slp (node);
661 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
663 gimple *access = SLP_TREE_SCALAR_STMTS (node)[k];
664 if (access == last_access)
665 continue;
666 data_reference *dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (access));
667 for (gimple_stmt_iterator gsi = gsi_for_stmt (access);
668 gsi_stmt (gsi) != last_access; gsi_next (&gsi))
670 gimple *stmt = gsi_stmt (gsi);
671 if (! gimple_vuse (stmt)
672 || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
673 continue;
675 /* If we couldn't record a (single) data reference for this
676 stmt we have to give up. */
677 /* ??? Here and below if dependence analysis fails we can resort
678 to the alias oracle which can handle more kinds of stmts. */
679 data_reference *dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
680 if (!dr_b)
681 return false;
683 bool dependent = false;
684 /* If we run into a store of this same instance (we've just
685 marked those) then delay dependence checking until we run
686 into the last store because this is where it will have
687 been sunk to (and we verify if we can do that as well). */
688 if (gimple_visited_p (stmt))
690 if (stmt != last_store)
691 continue;
692 unsigned i;
693 gimple *store;
694 FOR_EACH_VEC_ELT (stores, i, store)
696 data_reference *store_dr
697 = STMT_VINFO_DATA_REF (vinfo_for_stmt (store));
698 ddr_p ddr = initialize_data_dependence_relation
699 (dr_a, store_dr, vNULL);
700 dependent = vect_slp_analyze_data_ref_dependence (ddr);
701 free_dependence_relation (ddr);
702 if (dependent)
703 break;
706 else
708 ddr_p ddr = initialize_data_dependence_relation (dr_a,
709 dr_b, vNULL);
710 dependent = vect_slp_analyze_data_ref_dependence (ddr);
711 free_dependence_relation (ddr);
713 if (dependent)
714 return false;
717 return true;
721 /* Function vect_analyze_data_ref_dependences.
723 Examine all the data references in the basic-block, and make sure there
724 do not exist any data dependences between them. Set *MAX_VF according to
725 the maximum vectorization factor the data dependences allow. */
727 bool
728 vect_slp_analyze_instance_dependence (slp_instance instance)
730 if (dump_enabled_p ())
731 dump_printf_loc (MSG_NOTE, vect_location,
732 "=== vect_slp_analyze_instance_dependence ===\n");
734 /* The stores of this instance are at the root of the SLP tree. */
735 slp_tree store = SLP_INSTANCE_TREE (instance);
736 if (! STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (store)[0])))
737 store = NULL;
739 /* Verify we can sink stores to the vectorized stmt insert location. */
740 gimple *last_store = NULL;
741 if (store)
743 if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
744 return false;
746 /* Mark stores in this instance and remember the last one. */
747 last_store = vect_find_last_scalar_stmt_in_slp (store);
748 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
749 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], true);
752 bool res = true;
754 /* Verify we can sink loads to the vectorized stmt insert location,
755 special-casing stores of this instance. */
756 slp_tree load;
757 unsigned int i;
758 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
759 if (! vect_slp_analyze_node_dependences (instance, load,
760 store
761 ? SLP_TREE_SCALAR_STMTS (store)
762 : vNULL, last_store))
764 res = false;
765 break;
768 /* Unset the visited flag. */
769 if (store)
770 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
771 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], false);
773 return res;
776 /* Record in VINFO the base alignment guarantee given by DRB. STMT is
777 the statement that contains DRB, which is useful for recording in the
778 dump file. */
780 static void
781 vect_record_base_alignment (vec_info *vinfo, gimple *stmt,
782 innermost_loop_behavior *drb)
784 bool existed;
785 innermost_loop_behavior *&entry
786 = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
787 if (!existed || entry->base_alignment < drb->base_alignment)
789 entry = drb;
790 if (dump_enabled_p ())
792 dump_printf_loc (MSG_NOTE, vect_location,
793 "recording new base alignment for ");
794 dump_generic_expr (MSG_NOTE, TDF_SLIM, drb->base_address);
795 dump_printf (MSG_NOTE, "\n");
796 dump_printf_loc (MSG_NOTE, vect_location,
797 " alignment: %d\n", drb->base_alignment);
798 dump_printf_loc (MSG_NOTE, vect_location,
799 " misalignment: %d\n", drb->base_misalignment);
800 dump_printf_loc (MSG_NOTE, vect_location,
801 " based on: ");
802 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
807 /* If the region we're going to vectorize is reached, all unconditional
808 data references occur at least once. We can therefore pool the base
809 alignment guarantees from each unconditional reference. Do this by
810 going through all the data references in VINFO and checking whether
811 the containing statement makes the reference unconditionally. If so,
812 record the alignment of the base address in VINFO so that it can be
813 used for all other references with the same base. */
815 void
816 vect_record_base_alignments (vec_info *vinfo)
818 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
819 struct loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
820 data_reference *dr;
821 unsigned int i;
822 FOR_EACH_VEC_ELT (vinfo->datarefs, i, dr)
823 if (!DR_IS_CONDITIONAL_IN_STMT (dr))
825 gimple *stmt = DR_STMT (dr);
826 vect_record_base_alignment (vinfo, stmt, &DR_INNERMOST (dr));
828 /* If DR is nested in the loop that is being vectorized, we can also
829 record the alignment of the base wrt the outer loop. */
830 if (loop && nested_in_vect_loop_p (loop, stmt))
832 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
833 vect_record_base_alignment
834 (vinfo, stmt, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
839 /* Return the target alignment for the vectorized form of DR. */
841 static unsigned int
842 vect_calculate_target_alignment (struct data_reference *dr)
844 gimple *stmt = DR_STMT (dr);
845 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
846 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
847 return targetm.vectorize.preferred_vector_alignment (vectype);
850 /* Function vect_compute_data_ref_alignment
852 Compute the misalignment of the data reference DR.
854 Output:
855 1. If during the misalignment computation it is found that the data reference
856 cannot be vectorized then false is returned.
857 2. DR_MISALIGNMENT (DR) is defined.
859 FOR NOW: No analysis is actually performed. Misalignment is calculated
860 only for trivial cases. TODO. */
862 bool
863 vect_compute_data_ref_alignment (struct data_reference *dr)
865 gimple *stmt = DR_STMT (dr);
866 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
867 vec_base_alignments *base_alignments = &stmt_info->vinfo->base_alignments;
868 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
869 struct loop *loop = NULL;
870 tree ref = DR_REF (dr);
871 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
873 if (dump_enabled_p ())
874 dump_printf_loc (MSG_NOTE, vect_location,
875 "vect_compute_data_ref_alignment:\n");
877 if (loop_vinfo)
878 loop = LOOP_VINFO_LOOP (loop_vinfo);
880 /* Initialize misalignment to unknown. */
881 SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
883 innermost_loop_behavior *drb = vect_dr_behavior (dr);
884 bool step_preserves_misalignment_p;
886 unsigned HOST_WIDE_INT vector_alignment
887 = vect_calculate_target_alignment (dr) / BITS_PER_UNIT;
888 DR_TARGET_ALIGNMENT (dr) = vector_alignment;
890 /* No step for BB vectorization. */
891 if (!loop)
893 gcc_assert (integer_zerop (drb->step));
894 step_preserves_misalignment_p = true;
897 /* In case the dataref is in an inner-loop of the loop that is being
898 vectorized (LOOP), we use the base and misalignment information
899 relative to the outer-loop (LOOP). This is ok only if the misalignment
900 stays the same throughout the execution of the inner-loop, which is why
901 we have to check that the stride of the dataref in the inner-loop evenly
902 divides by the vector alignment. */
903 else if (nested_in_vect_loop_p (loop, stmt))
905 step_preserves_misalignment_p
906 = (DR_STEP_ALIGNMENT (dr) % vector_alignment) == 0;
908 if (dump_enabled_p ())
910 if (step_preserves_misalignment_p)
911 dump_printf_loc (MSG_NOTE, vect_location,
912 "inner step divides the vector alignment.\n");
913 else
914 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
915 "inner step doesn't divide the vector"
916 " alignment.\n");
920 /* Similarly we can only use base and misalignment information relative to
921 an innermost loop if the misalignment stays the same throughout the
922 execution of the loop. As above, this is the case if the stride of
923 the dataref evenly divides by the alignment. */
924 else
926 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
927 step_preserves_misalignment_p
928 = multiple_p (DR_STEP_ALIGNMENT (dr) * vf, vector_alignment);
930 if (!step_preserves_misalignment_p && dump_enabled_p ())
931 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
932 "step doesn't divide the vector alignment.\n");
935 unsigned int base_alignment = drb->base_alignment;
936 unsigned int base_misalignment = drb->base_misalignment;
938 /* Calculate the maximum of the pooled base address alignment and the
939 alignment that we can compute for DR itself. */
940 innermost_loop_behavior **entry = base_alignments->get (drb->base_address);
941 if (entry && base_alignment < (*entry)->base_alignment)
943 base_alignment = (*entry)->base_alignment;
944 base_misalignment = (*entry)->base_misalignment;
947 if (drb->offset_alignment < vector_alignment
948 || !step_preserves_misalignment_p
949 /* We need to know whether the step wrt the vectorized loop is
950 negative when computing the starting misalignment below. */
951 || TREE_CODE (drb->step) != INTEGER_CST)
953 if (dump_enabled_p ())
955 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
956 "Unknown alignment for access: ");
957 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
958 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
960 return true;
963 if (base_alignment < vector_alignment)
965 unsigned int max_alignment;
966 tree base = get_base_for_alignment (drb->base_address, &max_alignment);
967 if (max_alignment < vector_alignment
968 || !vect_can_force_dr_alignment_p (base,
969 vector_alignment * BITS_PER_UNIT))
971 if (dump_enabled_p ())
973 dump_printf_loc (MSG_NOTE, vect_location,
974 "can't force alignment of ref: ");
975 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
976 dump_printf (MSG_NOTE, "\n");
978 return true;
981 /* Force the alignment of the decl.
982 NOTE: This is the only change to the code we make during
983 the analysis phase, before deciding to vectorize the loop. */
984 if (dump_enabled_p ())
986 dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
987 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
988 dump_printf (MSG_NOTE, "\n");
991 DR_VECT_AUX (dr)->base_decl = base;
992 DR_VECT_AUX (dr)->base_misaligned = true;
993 base_misalignment = 0;
995 poly_int64 misalignment
996 = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();
998 /* If this is a backward running DR then first access in the larger
999 vectype actually is N-1 elements before the address in the DR.
1000 Adjust misalign accordingly. */
1001 if (tree_int_cst_sgn (drb->step) < 0)
1002 /* PLUS because STEP is negative. */
1003 misalignment += ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1004 * TREE_INT_CST_LOW (drb->step));
1006 unsigned int const_misalignment;
1007 if (!known_misalignment (misalignment, vector_alignment,
1008 &const_misalignment))
1010 if (dump_enabled_p ())
1012 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1013 "Non-constant misalignment for access: ");
1014 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
1015 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1017 return true;
1020 SET_DR_MISALIGNMENT (dr, const_misalignment);
1022 if (dump_enabled_p ())
1024 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1025 "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
1026 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
1027 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1030 return true;
1033 /* Function vect_update_misalignment_for_peel.
1034 Sets DR's misalignment
1035 - to 0 if it has the same alignment as DR_PEEL,
1036 - to the misalignment computed using NPEEL if DR's salignment is known,
1037 - to -1 (unknown) otherwise.
1039 DR - the data reference whose misalignment is to be adjusted.
1040 DR_PEEL - the data reference whose misalignment is being made
1041 zero in the vector loop by the peel.
1042 NPEEL - the number of iterations in the peel loop if the misalignment
1043 of DR_PEEL is known at compile time. */
1045 static void
1046 vect_update_misalignment_for_peel (struct data_reference *dr,
1047 struct data_reference *dr_peel, int npeel)
1049 unsigned int i;
1050 vec<dr_p> same_aligned_drs;
1051 struct data_reference *current_dr;
1052 int dr_size = vect_get_scalar_dr_size (dr);
1053 int dr_peel_size = vect_get_scalar_dr_size (dr_peel);
1054 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
1055 stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));
1057 /* For interleaved data accesses the step in the loop must be multiplied by
1058 the size of the interleaving group. */
1059 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1060 dr_size *= DR_GROUP_SIZE (vinfo_for_stmt (DR_GROUP_FIRST_ELEMENT (stmt_info)));
1061 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
1062 dr_peel_size *= DR_GROUP_SIZE (peel_stmt_info);
1064 /* It can be assumed that the data refs with the same alignment as dr_peel
1065 are aligned in the vector loop. */
1066 same_aligned_drs
1067 = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
1068 FOR_EACH_VEC_ELT (same_aligned_drs, i, current_dr)
1070 if (current_dr != dr)
1071 continue;
1072 gcc_assert (!known_alignment_for_access_p (dr)
1073 || !known_alignment_for_access_p (dr_peel)
1074 || (DR_MISALIGNMENT (dr) / dr_size
1075 == DR_MISALIGNMENT (dr_peel) / dr_peel_size));
1076 SET_DR_MISALIGNMENT (dr, 0);
1077 return;
1080 if (known_alignment_for_access_p (dr)
1081 && known_alignment_for_access_p (dr_peel))
1083 bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
1084 int misal = DR_MISALIGNMENT (dr);
1085 misal += negative ? -npeel * dr_size : npeel * dr_size;
1086 misal &= DR_TARGET_ALIGNMENT (dr) - 1;
1087 SET_DR_MISALIGNMENT (dr, misal);
1088 return;
1091 if (dump_enabled_p ())
1092 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1093 "to unknown (-1).\n");
1094 SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
1098 /* Function verify_data_ref_alignment
1100 Return TRUE if DR can be handled with respect to alignment. */
1102 static bool
1103 verify_data_ref_alignment (data_reference_p dr)
1105 enum dr_alignment_support supportable_dr_alignment
1106 = vect_supportable_dr_alignment (dr, false);
1107 if (!supportable_dr_alignment)
1109 if (dump_enabled_p ())
1111 if (DR_IS_READ (dr))
1112 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1113 "not vectorized: unsupported unaligned load.");
1114 else
1115 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1116 "not vectorized: unsupported unaligned "
1117 "store.");
1119 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
1120 DR_REF (dr));
1121 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1123 return false;
1126 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
1127 dump_printf_loc (MSG_NOTE, vect_location,
1128 "Vectorizing an unaligned access.\n");
1130 return true;
1133 /* Function vect_verify_datarefs_alignment
1135 Return TRUE if all data references in the loop can be
1136 handled with respect to alignment. */
1138 bool
1139 vect_verify_datarefs_alignment (loop_vec_info vinfo)
1141 vec<data_reference_p> datarefs = vinfo->datarefs;
1142 struct data_reference *dr;
1143 unsigned int i;
1145 FOR_EACH_VEC_ELT (datarefs, i, dr)
1147 gimple *stmt = DR_STMT (dr);
1148 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1150 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1151 continue;
1153 /* For interleaving, only the alignment of the first access matters. */
1154 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1155 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1156 continue;
1158 /* Strided accesses perform only component accesses, alignment is
1159 irrelevant for them. */
1160 if (STMT_VINFO_STRIDED_P (stmt_info)
1161 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1162 continue;
1164 if (! verify_data_ref_alignment (dr))
1165 return false;
1168 return true;
1171 /* Given an memory reference EXP return whether its alignment is less
1172 than its size. */
1174 static bool
1175 not_size_aligned (tree exp)
1177 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1178 return true;
1180 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1181 > get_object_alignment (exp));
1184 /* Function vector_alignment_reachable_p
1186 Return true if vector alignment for DR is reachable by peeling
1187 a few loop iterations. Return false otherwise. */
1189 static bool
1190 vector_alignment_reachable_p (struct data_reference *dr)
1192 gimple *stmt = DR_STMT (dr);
1193 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1194 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1196 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1198 /* For interleaved access we peel only if number of iterations in
1199 the prolog loop ({VF - misalignment}), is a multiple of the
1200 number of the interleaved accesses. */
1201 int elem_size, mis_in_elements;
1203 /* FORNOW: handle only known alignment. */
1204 if (!known_alignment_for_access_p (dr))
1205 return false;
1207 poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1208 poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
1209 elem_size = vector_element_size (vector_size, nelements);
1210 mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
1212 if (!multiple_p (nelements - mis_in_elements, DR_GROUP_SIZE (stmt_info)))
1213 return false;
1216 /* If misalignment is known at the compile time then allow peeling
1217 only if natural alignment is reachable through peeling. */
1218 if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
1220 HOST_WIDE_INT elmsize =
1221 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1222 if (dump_enabled_p ())
1224 dump_printf_loc (MSG_NOTE, vect_location,
1225 "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1226 dump_printf (MSG_NOTE,
1227 ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
1229 if (DR_MISALIGNMENT (dr) % elmsize)
1231 if (dump_enabled_p ())
1232 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1233 "data size does not divide the misalignment.\n");
1234 return false;
1238 if (!known_alignment_for_access_p (dr))
1240 tree type = TREE_TYPE (DR_REF (dr));
1241 bool is_packed = not_size_aligned (DR_REF (dr));
1242 if (dump_enabled_p ())
1243 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1244 "Unknown misalignment, %snaturally aligned\n",
1245 is_packed ? "not " : "");
1246 return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1249 return true;
1253 /* Calculate the cost of the memory access represented by DR. */
1255 static void
1256 vect_get_data_access_cost (struct data_reference *dr,
1257 unsigned int *inside_cost,
1258 unsigned int *outside_cost,
1259 stmt_vector_for_cost *body_cost_vec,
1260 stmt_vector_for_cost *prologue_cost_vec)
1262 gimple *stmt = DR_STMT (dr);
1263 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1264 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1265 int ncopies;
1267 if (PURE_SLP_STMT (stmt_info))
1268 ncopies = 1;
1269 else
1270 ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1272 if (DR_IS_READ (dr))
1273 vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1274 prologue_cost_vec, body_cost_vec, false);
1275 else
1276 vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
1278 if (dump_enabled_p ())
1279 dump_printf_loc (MSG_NOTE, vect_location,
1280 "vect_get_data_access_cost: inside_cost = %d, "
1281 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1285 typedef struct _vect_peel_info
1287 struct data_reference *dr;
1288 int npeel;
1289 unsigned int count;
1290 } *vect_peel_info;
1292 typedef struct _vect_peel_extended_info
1294 struct _vect_peel_info peel_info;
1295 unsigned int inside_cost;
1296 unsigned int outside_cost;
1297 } *vect_peel_extended_info;
1300 /* Peeling hashtable helpers. */
1302 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1304 static inline hashval_t hash (const _vect_peel_info *);
1305 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1308 inline hashval_t
1309 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1311 return (hashval_t) peel_info->npeel;
1314 inline bool
1315 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1317 return (a->npeel == b->npeel);
1321 /* Insert DR into peeling hash table with NPEEL as key. */
1323 static void
1324 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1325 loop_vec_info loop_vinfo, struct data_reference *dr,
1326 int npeel)
1328 struct _vect_peel_info elem, *slot;
1329 _vect_peel_info **new_slot;
1330 bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1332 elem.npeel = npeel;
1333 slot = peeling_htab->find (&elem);
1334 if (slot)
1335 slot->count++;
1336 else
1338 slot = XNEW (struct _vect_peel_info);
1339 slot->npeel = npeel;
1340 slot->dr = dr;
1341 slot->count = 1;
1342 new_slot = peeling_htab->find_slot (slot, INSERT);
1343 *new_slot = slot;
1346 if (!supportable_dr_alignment
1347 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1348 slot->count += VECT_MAX_COST;
1352 /* Traverse peeling hash table to find peeling option that aligns maximum
1353 number of data accesses. */
1356 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1357 _vect_peel_extended_info *max)
1359 vect_peel_info elem = *slot;
1361 if (elem->count > max->peel_info.count
1362 || (elem->count == max->peel_info.count
1363 && max->peel_info.npeel > elem->npeel))
1365 max->peel_info.npeel = elem->npeel;
1366 max->peel_info.count = elem->count;
1367 max->peel_info.dr = elem->dr;
1370 return 1;
1373 /* Get the costs of peeling NPEEL iterations checking data access costs
1374 for all data refs. If UNKNOWN_MISALIGNMENT is true, we assume DR0's
1375 misalignment will be zero after peeling. */
1377 static void
1378 vect_get_peeling_costs_all_drs (vec<data_reference_p> datarefs,
1379 struct data_reference *dr0,
1380 unsigned int *inside_cost,
1381 unsigned int *outside_cost,
1382 stmt_vector_for_cost *body_cost_vec,
1383 stmt_vector_for_cost *prologue_cost_vec,
1384 unsigned int npeel,
1385 bool unknown_misalignment)
1387 unsigned i;
1388 data_reference *dr;
1390 FOR_EACH_VEC_ELT (datarefs, i, dr)
1392 gimple *stmt = DR_STMT (dr);
1393 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1394 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1395 continue;
1397 /* For interleaving, only the alignment of the first access
1398 matters. */
1399 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1400 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1401 continue;
1403 /* Strided accesses perform only component accesses, alignment is
1404 irrelevant for them. */
1405 if (STMT_VINFO_STRIDED_P (stmt_info)
1406 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1407 continue;
1409 int save_misalignment;
1410 save_misalignment = DR_MISALIGNMENT (dr);
1411 if (npeel == 0)
1413 else if (unknown_misalignment && dr == dr0)
1414 SET_DR_MISALIGNMENT (dr, 0);
1415 else
1416 vect_update_misalignment_for_peel (dr, dr0, npeel);
1417 vect_get_data_access_cost (dr, inside_cost, outside_cost,
1418 body_cost_vec, prologue_cost_vec);
1419 SET_DR_MISALIGNMENT (dr, save_misalignment);
1423 /* Traverse peeling hash table and calculate cost for each peeling option.
1424 Find the one with the lowest cost. */
1427 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1428 _vect_peel_extended_info *min)
1430 vect_peel_info elem = *slot;
1431 int dummy;
1432 unsigned int inside_cost = 0, outside_cost = 0;
1433 gimple *stmt = DR_STMT (elem->dr);
1434 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1435 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1436 stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1437 epilogue_cost_vec;
1439 prologue_cost_vec.create (2);
1440 body_cost_vec.create (2);
1441 epilogue_cost_vec.create (2);
1443 vect_get_peeling_costs_all_drs (LOOP_VINFO_DATAREFS (loop_vinfo),
1444 elem->dr, &inside_cost, &outside_cost,
1445 &body_cost_vec, &prologue_cost_vec,
1446 elem->npeel, false);
1448 body_cost_vec.release ();
1450 outside_cost += vect_get_known_peeling_cost
1451 (loop_vinfo, elem->npeel, &dummy,
1452 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1453 &prologue_cost_vec, &epilogue_cost_vec);
1455 /* Prologue and epilogue costs are added to the target model later.
1456 These costs depend only on the scalar iteration cost, the
1457 number of peeling iterations finally chosen, and the number of
1458 misaligned statements. So discard the information found here. */
1459 prologue_cost_vec.release ();
1460 epilogue_cost_vec.release ();
1462 if (inside_cost < min->inside_cost
1463 || (inside_cost == min->inside_cost
1464 && outside_cost < min->outside_cost))
1466 min->inside_cost = inside_cost;
1467 min->outside_cost = outside_cost;
1468 min->peel_info.dr = elem->dr;
1469 min->peel_info.npeel = elem->npeel;
1470 min->peel_info.count = elem->count;
1473 return 1;
1477 /* Choose best peeling option by traversing peeling hash table and either
1478 choosing an option with the lowest cost (if cost model is enabled) or the
1479 option that aligns as many accesses as possible. */
1481 static struct _vect_peel_extended_info
1482 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1483 loop_vec_info loop_vinfo)
1485 struct _vect_peel_extended_info res;
1487 res.peel_info.dr = NULL;
1489 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1491 res.inside_cost = INT_MAX;
1492 res.outside_cost = INT_MAX;
1493 peeling_htab->traverse <_vect_peel_extended_info *,
1494 vect_peeling_hash_get_lowest_cost> (&res);
1496 else
1498 res.peel_info.count = 0;
1499 peeling_htab->traverse <_vect_peel_extended_info *,
1500 vect_peeling_hash_get_most_frequent> (&res);
1501 res.inside_cost = 0;
1502 res.outside_cost = 0;
1505 return res;
1508 /* Return true if the new peeling NPEEL is supported. */
1510 static bool
1511 vect_peeling_supportable (loop_vec_info loop_vinfo, struct data_reference *dr0,
1512 unsigned npeel)
1514 unsigned i;
1515 struct data_reference *dr = NULL;
1516 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1517 gimple *stmt;
1518 stmt_vec_info stmt_info;
1519 enum dr_alignment_support supportable_dr_alignment;
1521 /* Ensure that all data refs can be vectorized after the peel. */
1522 FOR_EACH_VEC_ELT (datarefs, i, dr)
1524 int save_misalignment;
1526 if (dr == dr0)
1527 continue;
1529 stmt = DR_STMT (dr);
1530 stmt_info = vinfo_for_stmt (stmt);
1531 /* For interleaving, only the alignment of the first access
1532 matters. */
1533 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1534 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1535 continue;
1537 /* Strided accesses perform only component accesses, alignment is
1538 irrelevant for them. */
1539 if (STMT_VINFO_STRIDED_P (stmt_info)
1540 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1541 continue;
1543 save_misalignment = DR_MISALIGNMENT (dr);
1544 vect_update_misalignment_for_peel (dr, dr0, npeel);
1545 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1546 SET_DR_MISALIGNMENT (dr, save_misalignment);
1548 if (!supportable_dr_alignment)
1549 return false;
1552 return true;
1555 /* Function vect_enhance_data_refs_alignment
1557 This pass will use loop versioning and loop peeling in order to enhance
1558 the alignment of data references in the loop.
1560 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1561 original loop is to be vectorized. Any other loops that are created by
1562 the transformations performed in this pass - are not supposed to be
1563 vectorized. This restriction will be relaxed.
1565 This pass will require a cost model to guide it whether to apply peeling
1566 or versioning or a combination of the two. For example, the scheme that
1567 intel uses when given a loop with several memory accesses, is as follows:
1568 choose one memory access ('p') which alignment you want to force by doing
1569 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1570 other accesses are not necessarily aligned, or (2) use loop versioning to
1571 generate one loop in which all accesses are aligned, and another loop in
1572 which only 'p' is necessarily aligned.
1574 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1575 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Ximmin Tian, International
1576 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1578 Devising a cost model is the most critical aspect of this work. It will
1579 guide us on which access to peel for, whether to use loop versioning, how
1580 many versions to create, etc. The cost model will probably consist of
1581 generic considerations as well as target specific considerations (on
1582 powerpc for example, misaligned stores are more painful than misaligned
1583 loads).
1585 Here are the general steps involved in alignment enhancements:
1587 -- original loop, before alignment analysis:
1588 for (i=0; i<N; i++){
1589 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1590 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1593 -- After vect_compute_data_refs_alignment:
1594 for (i=0; i<N; i++){
1595 x = q[i]; # DR_MISALIGNMENT(q) = 3
1596 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1599 -- Possibility 1: we do loop versioning:
1600 if (p is aligned) {
1601 for (i=0; i<N; i++){ # loop 1A
1602 x = q[i]; # DR_MISALIGNMENT(q) = 3
1603 p[i] = y; # DR_MISALIGNMENT(p) = 0
1606 else {
1607 for (i=0; i<N; i++){ # loop 1B
1608 x = q[i]; # DR_MISALIGNMENT(q) = 3
1609 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1613 -- Possibility 2: we do loop peeling:
1614 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1615 x = q[i];
1616 p[i] = y;
1618 for (i = 3; i < N; i++){ # loop 2A
1619 x = q[i]; # DR_MISALIGNMENT(q) = 0
1620 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1623 -- Possibility 3: combination of loop peeling and versioning:
1624 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1625 x = q[i];
1626 p[i] = y;
1628 if (p is aligned) {
1629 for (i = 3; i<N; i++){ # loop 3A
1630 x = q[i]; # DR_MISALIGNMENT(q) = 0
1631 p[i] = y; # DR_MISALIGNMENT(p) = 0
1634 else {
1635 for (i = 3; i<N; i++){ # loop 3B
1636 x = q[i]; # DR_MISALIGNMENT(q) = 0
1637 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1641 These loops are later passed to loop_transform to be vectorized. The
1642 vectorizer will use the alignment information to guide the transformation
1643 (whether to generate regular loads/stores, or with special handling for
1644 misalignment). */
1646 bool
1647 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1649 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1650 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1651 enum dr_alignment_support supportable_dr_alignment;
1652 struct data_reference *dr0 = NULL, *first_store = NULL;
1653 struct data_reference *dr;
1654 unsigned int i, j;
1655 bool do_peeling = false;
1656 bool do_versioning = false;
1657 bool stat;
1658 gimple *stmt;
1659 stmt_vec_info stmt_info;
1660 unsigned int npeel = 0;
1661 bool one_misalignment_known = false;
1662 bool one_misalignment_unknown = false;
1663 bool one_dr_unsupportable = false;
1664 struct data_reference *unsupportable_dr = NULL;
1665 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1666 unsigned possible_npeel_number = 1;
1667 tree vectype;
1668 unsigned int mis, same_align_drs_max = 0;
1669 hash_table<peel_info_hasher> peeling_htab (1);
1671 if (dump_enabled_p ())
1672 dump_printf_loc (MSG_NOTE, vect_location,
1673 "=== vect_enhance_data_refs_alignment ===\n");
1675 /* Reset data so we can safely be called multiple times. */
1676 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1677 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1679 /* While cost model enhancements are expected in the future, the high level
1680 view of the code at this time is as follows:
1682 A) If there is a misaligned access then see if peeling to align
1683 this access can make all data references satisfy
1684 vect_supportable_dr_alignment. If so, update data structures
1685 as needed and return true.
1687 B) If peeling wasn't possible and there is a data reference with an
1688 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1689 then see if loop versioning checks can be used to make all data
1690 references satisfy vect_supportable_dr_alignment. If so, update
1691 data structures as needed and return true.
1693 C) If neither peeling nor versioning were successful then return false if
1694 any data reference does not satisfy vect_supportable_dr_alignment.
1696 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1698 Note, Possibility 3 above (which is peeling and versioning together) is not
1699 being done at this time. */
1701 /* (1) Peeling to force alignment. */
1703 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1704 Considerations:
1705 + How many accesses will become aligned due to the peeling
1706 - How many accesses will become unaligned due to the peeling,
1707 and the cost of misaligned accesses.
1708 - The cost of peeling (the extra runtime checks, the increase
1709 in code size). */
1711 FOR_EACH_VEC_ELT (datarefs, i, dr)
1713 stmt = DR_STMT (dr);
1714 stmt_info = vinfo_for_stmt (stmt);
1716 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1717 continue;
1719 /* For interleaving, only the alignment of the first access
1720 matters. */
1721 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1722 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1723 continue;
1725 /* For invariant accesses there is nothing to enhance. */
1726 if (integer_zerop (DR_STEP (dr)))
1727 continue;
1729 /* Strided accesses perform only component accesses, alignment is
1730 irrelevant for them. */
1731 if (STMT_VINFO_STRIDED_P (stmt_info)
1732 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1733 continue;
1735 supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1736 do_peeling = vector_alignment_reachable_p (dr);
1737 if (do_peeling)
1739 if (known_alignment_for_access_p (dr))
1741 unsigned int npeel_tmp = 0;
1742 bool negative = tree_int_cst_compare (DR_STEP (dr),
1743 size_zero_node) < 0;
1745 vectype = STMT_VINFO_VECTYPE (stmt_info);
1746 unsigned int target_align = DR_TARGET_ALIGNMENT (dr);
1747 unsigned int dr_size = vect_get_scalar_dr_size (dr);
1748 mis = (negative ? DR_MISALIGNMENT (dr) : -DR_MISALIGNMENT (dr));
1749 if (DR_MISALIGNMENT (dr) != 0)
1750 npeel_tmp = (mis & (target_align - 1)) / dr_size;
1752 /* For multiple types, it is possible that the bigger type access
1753 will have more than one peeling option. E.g., a loop with two
1754 types: one of size (vector size / 4), and the other one of
1755 size (vector size / 8). Vectorization factor will 8. If both
1756 accesses are misaligned by 3, the first one needs one scalar
1757 iteration to be aligned, and the second one needs 5. But the
1758 first one will be aligned also by peeling 5 scalar
1759 iterations, and in that case both accesses will be aligned.
1760 Hence, except for the immediate peeling amount, we also want
1761 to try to add full vector size, while we don't exceed
1762 vectorization factor.
1763 We do this automatically for cost model, since we calculate
1764 cost for every peeling option. */
1765 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1767 poly_uint64 nscalars = (STMT_SLP_TYPE (stmt_info)
1768 ? vf * DR_GROUP_SIZE (stmt_info) : vf);
1769 possible_npeel_number
1770 = vect_get_num_vectors (nscalars, vectype);
1772 /* NPEEL_TMP is 0 when there is no misalignment, but also
1773 allow peeling NELEMENTS. */
1774 if (DR_MISALIGNMENT (dr) == 0)
1775 possible_npeel_number++;
1778 /* Save info about DR in the hash table. Also include peeling
1779 amounts according to the explanation above. */
1780 for (j = 0; j < possible_npeel_number; j++)
1782 vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1783 dr, npeel_tmp);
1784 npeel_tmp += target_align / dr_size;
1787 one_misalignment_known = true;
1789 else
1791 /* If we don't know any misalignment values, we prefer
1792 peeling for data-ref that has the maximum number of data-refs
1793 with the same alignment, unless the target prefers to align
1794 stores over load. */
1795 unsigned same_align_drs
1796 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1797 if (!dr0
1798 || same_align_drs_max < same_align_drs)
1800 same_align_drs_max = same_align_drs;
1801 dr0 = dr;
1803 /* For data-refs with the same number of related
1804 accesses prefer the one where the misalign
1805 computation will be invariant in the outermost loop. */
1806 else if (same_align_drs_max == same_align_drs)
1808 struct loop *ivloop0, *ivloop;
1809 ivloop0 = outermost_invariant_loop_for_expr
1810 (loop, DR_BASE_ADDRESS (dr0));
1811 ivloop = outermost_invariant_loop_for_expr
1812 (loop, DR_BASE_ADDRESS (dr));
1813 if ((ivloop && !ivloop0)
1814 || (ivloop && ivloop0
1815 && flow_loop_nested_p (ivloop, ivloop0)))
1816 dr0 = dr;
1819 one_misalignment_unknown = true;
1821 /* Check for data refs with unsupportable alignment that
1822 can be peeled. */
1823 if (!supportable_dr_alignment)
1825 one_dr_unsupportable = true;
1826 unsupportable_dr = dr;
1829 if (!first_store && DR_IS_WRITE (dr))
1830 first_store = dr;
1833 else
1835 if (!aligned_access_p (dr))
1837 if (dump_enabled_p ())
1838 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1839 "vector alignment may not be reachable\n");
1840 break;
1845 /* Check if we can possibly peel the loop. */
1846 if (!vect_can_advance_ivs_p (loop_vinfo)
1847 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1848 || loop->inner)
1849 do_peeling = false;
1851 struct _vect_peel_extended_info peel_for_known_alignment;
1852 struct _vect_peel_extended_info peel_for_unknown_alignment;
1853 struct _vect_peel_extended_info best_peel;
1855 peel_for_unknown_alignment.inside_cost = INT_MAX;
1856 peel_for_unknown_alignment.outside_cost = INT_MAX;
1857 peel_for_unknown_alignment.peel_info.count = 0;
1859 if (do_peeling
1860 && one_misalignment_unknown)
1862 /* Check if the target requires to prefer stores over loads, i.e., if
1863 misaligned stores are more expensive than misaligned loads (taking
1864 drs with same alignment into account). */
1865 unsigned int load_inside_cost = 0;
1866 unsigned int load_outside_cost = 0;
1867 unsigned int store_inside_cost = 0;
1868 unsigned int store_outside_cost = 0;
1869 unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;
1871 stmt_vector_for_cost dummy;
1872 dummy.create (2);
1873 vect_get_peeling_costs_all_drs (datarefs, dr0,
1874 &load_inside_cost,
1875 &load_outside_cost,
1876 &dummy, &dummy, estimated_npeels, true);
1877 dummy.release ();
1879 if (first_store)
1881 dummy.create (2);
1882 vect_get_peeling_costs_all_drs (datarefs, first_store,
1883 &store_inside_cost,
1884 &store_outside_cost,
1885 &dummy, &dummy,
1886 estimated_npeels, true);
1887 dummy.release ();
1889 else
1891 store_inside_cost = INT_MAX;
1892 store_outside_cost = INT_MAX;
1895 if (load_inside_cost > store_inside_cost
1896 || (load_inside_cost == store_inside_cost
1897 && load_outside_cost > store_outside_cost))
1899 dr0 = first_store;
1900 peel_for_unknown_alignment.inside_cost = store_inside_cost;
1901 peel_for_unknown_alignment.outside_cost = store_outside_cost;
1903 else
1905 peel_for_unknown_alignment.inside_cost = load_inside_cost;
1906 peel_for_unknown_alignment.outside_cost = load_outside_cost;
1909 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1910 prologue_cost_vec.create (2);
1911 epilogue_cost_vec.create (2);
1913 int dummy2;
1914 peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
1915 (loop_vinfo, estimated_npeels, &dummy2,
1916 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1917 &prologue_cost_vec, &epilogue_cost_vec);
1919 prologue_cost_vec.release ();
1920 epilogue_cost_vec.release ();
1922 peel_for_unknown_alignment.peel_info.count = 1
1923 + STMT_VINFO_SAME_ALIGN_REFS
1924 (vinfo_for_stmt (DR_STMT (dr0))).length ();
1927 peel_for_unknown_alignment.peel_info.npeel = 0;
1928 peel_for_unknown_alignment.peel_info.dr = dr0;
1930 best_peel = peel_for_unknown_alignment;
1932 peel_for_known_alignment.inside_cost = INT_MAX;
1933 peel_for_known_alignment.outside_cost = INT_MAX;
1934 peel_for_known_alignment.peel_info.count = 0;
1935 peel_for_known_alignment.peel_info.dr = NULL;
1937 if (do_peeling && one_misalignment_known)
1939 /* Peeling is possible, but there is no data access that is not supported
1940 unless aligned. So we try to choose the best possible peeling from
1941 the hash table. */
1942 peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
1943 (&peeling_htab, loop_vinfo);
1946 /* Compare costs of peeling for known and unknown alignment. */
1947 if (peel_for_known_alignment.peel_info.dr != NULL
1948 && peel_for_unknown_alignment.inside_cost
1949 >= peel_for_known_alignment.inside_cost)
1951 best_peel = peel_for_known_alignment;
1953 /* If the best peeling for known alignment has NPEEL == 0, perform no
1954 peeling at all except if there is an unsupportable dr that we can
1955 align. */
1956 if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
1957 do_peeling = false;
1960 /* If there is an unsupportable data ref, prefer this over all choices so far
1961 since we'd have to discard a chosen peeling except when it accidentally
1962 aligned the unsupportable data ref. */
1963 if (one_dr_unsupportable)
1964 dr0 = unsupportable_dr;
1965 else if (do_peeling)
1967 /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
1968 TODO: Use nopeel_outside_cost or get rid of it? */
1969 unsigned nopeel_inside_cost = 0;
1970 unsigned nopeel_outside_cost = 0;
1972 stmt_vector_for_cost dummy;
1973 dummy.create (2);
1974 vect_get_peeling_costs_all_drs (datarefs, NULL, &nopeel_inside_cost,
1975 &nopeel_outside_cost, &dummy, &dummy,
1976 0, false);
1977 dummy.release ();
1979 /* Add epilogue costs. As we do not peel for alignment here, no prologue
1980 costs will be recorded. */
1981 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1982 prologue_cost_vec.create (2);
1983 epilogue_cost_vec.create (2);
1985 int dummy2;
1986 nopeel_outside_cost += vect_get_known_peeling_cost
1987 (loop_vinfo, 0, &dummy2,
1988 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1989 &prologue_cost_vec, &epilogue_cost_vec);
1991 prologue_cost_vec.release ();
1992 epilogue_cost_vec.release ();
1994 npeel = best_peel.peel_info.npeel;
1995 dr0 = best_peel.peel_info.dr;
1997 /* If no peeling is not more expensive than the best peeling we
1998 have so far, don't perform any peeling. */
1999 if (nopeel_inside_cost <= best_peel.inside_cost)
2000 do_peeling = false;
2003 if (do_peeling)
2005 stmt = DR_STMT (dr0);
2006 stmt_info = vinfo_for_stmt (stmt);
2007 vectype = STMT_VINFO_VECTYPE (stmt_info);
2009 if (known_alignment_for_access_p (dr0))
2011 bool negative = tree_int_cst_compare (DR_STEP (dr0),
2012 size_zero_node) < 0;
2013 if (!npeel)
2015 /* Since it's known at compile time, compute the number of
2016 iterations in the peeled loop (the peeling factor) for use in
2017 updating DR_MISALIGNMENT values. The peeling factor is the
2018 vectorization factor minus the misalignment as an element
2019 count. */
2020 mis = negative ? DR_MISALIGNMENT (dr0) : -DR_MISALIGNMENT (dr0);
2021 unsigned int target_align = DR_TARGET_ALIGNMENT (dr0);
2022 npeel = ((mis & (target_align - 1))
2023 / vect_get_scalar_dr_size (dr0));
2026 /* For interleaved data access every iteration accesses all the
2027 members of the group, therefore we divide the number of iterations
2028 by the group size. */
2029 stmt_info = vinfo_for_stmt (DR_STMT (dr0));
2030 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2031 npeel /= DR_GROUP_SIZE (stmt_info);
2033 if (dump_enabled_p ())
2034 dump_printf_loc (MSG_NOTE, vect_location,
2035 "Try peeling by %d\n", npeel);
2038 /* Ensure that all datarefs can be vectorized after the peel. */
2039 if (!vect_peeling_supportable (loop_vinfo, dr0, npeel))
2040 do_peeling = false;
2042 /* Check if all datarefs are supportable and log. */
2043 if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
2045 stat = vect_verify_datarefs_alignment (loop_vinfo);
2046 if (!stat)
2047 do_peeling = false;
2048 else
2049 return stat;
2052 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
2053 if (do_peeling)
2055 unsigned max_allowed_peel
2056 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
2057 if (max_allowed_peel != (unsigned)-1)
2059 unsigned max_peel = npeel;
2060 if (max_peel == 0)
2062 unsigned int target_align = DR_TARGET_ALIGNMENT (dr0);
2063 max_peel = target_align / vect_get_scalar_dr_size (dr0) - 1;
2065 if (max_peel > max_allowed_peel)
2067 do_peeling = false;
2068 if (dump_enabled_p ())
2069 dump_printf_loc (MSG_NOTE, vect_location,
2070 "Disable peeling, max peels reached: %d\n", max_peel);
2075 /* Cost model #2 - if peeling may result in a remaining loop not
2076 iterating enough to be vectorized then do not peel. Since this
2077 is a cost heuristic rather than a correctness decision, use the
2078 most likely runtime value for variable vectorization factors. */
2079 if (do_peeling
2080 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2082 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2083 unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
2084 if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
2085 < assumed_vf + max_peel)
2086 do_peeling = false;
2089 if (do_peeling)
2091 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2092 If the misalignment of DR_i is identical to that of dr0 then set
2093 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
2094 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2095 by the peeling factor times the element size of DR_i (MOD the
2096 vectorization factor times the size). Otherwise, the
2097 misalignment of DR_i must be set to unknown. */
2098 FOR_EACH_VEC_ELT (datarefs, i, dr)
2099 if (dr != dr0)
2101 /* Strided accesses perform only component accesses, alignment
2102 is irrelevant for them. */
2103 stmt_info = vinfo_for_stmt (DR_STMT (dr));
2104 if (STMT_VINFO_STRIDED_P (stmt_info)
2105 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2106 continue;
2108 vect_update_misalignment_for_peel (dr, dr0, npeel);
2111 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
2112 if (npeel)
2113 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2114 else
2115 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2116 = DR_MISALIGNMENT (dr0);
2117 SET_DR_MISALIGNMENT (dr0, 0);
2118 if (dump_enabled_p ())
2120 dump_printf_loc (MSG_NOTE, vect_location,
2121 "Alignment of access forced using peeling.\n");
2122 dump_printf_loc (MSG_NOTE, vect_location,
2123 "Peeling for alignment will be applied.\n");
2126 /* The inside-loop cost will be accounted for in vectorizable_load
2127 and vectorizable_store correctly with adjusted alignments.
2128 Drop the body_cst_vec on the floor here. */
2129 stat = vect_verify_datarefs_alignment (loop_vinfo);
2130 gcc_assert (stat);
2131 return stat;
2135 /* (2) Versioning to force alignment. */
2137 /* Try versioning if:
2138 1) optimize loop for speed
2139 2) there is at least one unsupported misaligned data ref with an unknown
2140 misalignment, and
2141 3) all misaligned data refs with a known misalignment are supported, and
2142 4) the number of runtime alignment checks is within reason. */
2144 do_versioning =
2145 optimize_loop_nest_for_speed_p (loop)
2146 && (!loop->inner); /* FORNOW */
2148 if (do_versioning)
2150 FOR_EACH_VEC_ELT (datarefs, i, dr)
2152 stmt = DR_STMT (dr);
2153 stmt_info = vinfo_for_stmt (stmt);
2155 /* For interleaving, only the alignment of the first access
2156 matters. */
2157 if (aligned_access_p (dr)
2158 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2159 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt))
2160 continue;
2162 if (STMT_VINFO_STRIDED_P (stmt_info))
2164 /* Strided loads perform only component accesses, alignment is
2165 irrelevant for them. */
2166 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
2167 continue;
2168 do_versioning = false;
2169 break;
2172 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
2174 if (!supportable_dr_alignment)
2176 gimple *stmt;
2177 int mask;
2178 tree vectype;
2180 if (known_alignment_for_access_p (dr)
2181 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2182 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
2184 do_versioning = false;
2185 break;
2188 stmt = DR_STMT (dr);
2189 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
2190 gcc_assert (vectype);
2192 /* At present we don't support versioning for alignment
2193 with variable VF, since there's no guarantee that the
2194 VF is a power of two. We could relax this if we added
2195 a way of enforcing a power-of-two size. */
2196 unsigned HOST_WIDE_INT size;
2197 if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
2199 do_versioning = false;
2200 break;
2203 /* The rightmost bits of an aligned address must be zeros.
2204 Construct the mask needed for this test. For example,
2205 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2206 mask must be 15 = 0xf. */
2207 mask = size - 1;
2209 /* FORNOW: use the same mask to test all potentially unaligned
2210 references in the loop. The vectorizer currently supports
2211 a single vector size, see the reference to
2212 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
2213 vectorization factor is computed. */
2214 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
2215 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
2216 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2217 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
2218 DR_STMT (dr));
2222 /* Versioning requires at least one misaligned data reference. */
2223 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2224 do_versioning = false;
2225 else if (!do_versioning)
2226 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2229 if (do_versioning)
2231 vec<gimple *> may_misalign_stmts
2232 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2233 gimple *stmt;
2235 /* It can now be assumed that the data references in the statements
2236 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2237 of the loop being vectorized. */
2238 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
2240 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2241 dr = STMT_VINFO_DATA_REF (stmt_info);
2242 SET_DR_MISALIGNMENT (dr, 0);
2243 if (dump_enabled_p ())
2244 dump_printf_loc (MSG_NOTE, vect_location,
2245 "Alignment of access forced using versioning.\n");
2248 if (dump_enabled_p ())
2249 dump_printf_loc (MSG_NOTE, vect_location,
2250 "Versioning for alignment will be applied.\n");
2252 /* Peeling and versioning can't be done together at this time. */
2253 gcc_assert (! (do_peeling && do_versioning));
2255 stat = vect_verify_datarefs_alignment (loop_vinfo);
2256 gcc_assert (stat);
2257 return stat;
2260 /* This point is reached if neither peeling nor versioning is being done. */
2261 gcc_assert (! (do_peeling || do_versioning));
2263 stat = vect_verify_datarefs_alignment (loop_vinfo);
2264 return stat;
2268 /* Function vect_find_same_alignment_drs.
2270 Update group and alignment relations according to the chosen
2271 vectorization factor. */
2273 static void
2274 vect_find_same_alignment_drs (struct data_dependence_relation *ddr)
2276 struct data_reference *dra = DDR_A (ddr);
2277 struct data_reference *drb = DDR_B (ddr);
2278 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2279 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2281 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
2282 return;
2284 if (dra == drb)
2285 return;
2287 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)
2288 || !operand_equal_p (DR_OFFSET (dra), DR_OFFSET (drb), 0)
2289 || !operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2290 return;
2292 /* Two references with distance zero have the same alignment. */
2293 poly_offset_int diff = (wi::to_poly_offset (DR_INIT (dra))
2294 - wi::to_poly_offset (DR_INIT (drb)));
2295 if (maybe_ne (diff, 0))
2297 /* Get the wider of the two alignments. */
2298 unsigned int align_a = (vect_calculate_target_alignment (dra)
2299 / BITS_PER_UNIT);
2300 unsigned int align_b = (vect_calculate_target_alignment (drb)
2301 / BITS_PER_UNIT);
2302 unsigned int max_align = MAX (align_a, align_b);
2304 /* Require the gap to be a multiple of the larger vector alignment. */
2305 if (!multiple_p (diff, max_align))
2306 return;
2309 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2310 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
2311 if (dump_enabled_p ())
2313 dump_printf_loc (MSG_NOTE, vect_location,
2314 "accesses have the same alignment: ");
2315 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2316 dump_printf (MSG_NOTE, " and ");
2317 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2318 dump_printf (MSG_NOTE, "\n");
2323 /* Function vect_analyze_data_refs_alignment
2325 Analyze the alignment of the data-references in the loop.
2326 Return FALSE if a data reference is found that cannot be vectorized. */
2328 bool
2329 vect_analyze_data_refs_alignment (loop_vec_info vinfo)
2331 if (dump_enabled_p ())
2332 dump_printf_loc (MSG_NOTE, vect_location,
2333 "=== vect_analyze_data_refs_alignment ===\n");
2335 /* Mark groups of data references with same alignment using
2336 data dependence information. */
2337 vec<ddr_p> ddrs = vinfo->ddrs;
2338 struct data_dependence_relation *ddr;
2339 unsigned int i;
2341 FOR_EACH_VEC_ELT (ddrs, i, ddr)
2342 vect_find_same_alignment_drs (ddr);
2344 vec<data_reference_p> datarefs = vinfo->datarefs;
2345 struct data_reference *dr;
2347 vect_record_base_alignments (vinfo);
2348 FOR_EACH_VEC_ELT (datarefs, i, dr)
2350 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
2351 if (STMT_VINFO_VECTORIZABLE (stmt_info)
2352 && !vect_compute_data_ref_alignment (dr))
2354 /* Strided accesses perform only component accesses, misalignment
2355 information is irrelevant for them. */
2356 if (STMT_VINFO_STRIDED_P (stmt_info)
2357 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2358 continue;
2360 if (dump_enabled_p ())
2361 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2362 "not vectorized: can't calculate alignment "
2363 "for data ref.\n");
2365 return false;
2369 return true;
2373 /* Analyze alignment of DRs of stmts in NODE. */
2375 static bool
2376 vect_slp_analyze_and_verify_node_alignment (slp_tree node)
2378 /* We vectorize from the first scalar stmt in the node unless
2379 the node is permuted in which case we start from the first
2380 element in the group. */
2381 gimple *first_stmt = SLP_TREE_SCALAR_STMTS (node)[0];
2382 data_reference_p first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2383 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2384 first_stmt = DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_stmt));
2386 data_reference_p dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2387 if (! vect_compute_data_ref_alignment (dr)
2388 /* For creating the data-ref pointer we need alignment of the
2389 first element anyway. */
2390 || (dr != first_dr
2391 && ! vect_compute_data_ref_alignment (first_dr))
2392 || ! verify_data_ref_alignment (dr))
2394 if (dump_enabled_p ())
2395 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2396 "not vectorized: bad data alignment in basic "
2397 "block.\n");
2398 return false;
2401 return true;
2404 /* Function vect_slp_analyze_instance_alignment
2406 Analyze the alignment of the data-references in the SLP instance.
2407 Return FALSE if a data reference is found that cannot be vectorized. */
2409 bool
2410 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
2412 if (dump_enabled_p ())
2413 dump_printf_loc (MSG_NOTE, vect_location,
2414 "=== vect_slp_analyze_and_verify_instance_alignment ===\n");
2416 slp_tree node;
2417 unsigned i;
2418 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2419 if (! vect_slp_analyze_and_verify_node_alignment (node))
2420 return false;
2422 node = SLP_INSTANCE_TREE (instance);
2423 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]))
2424 && ! vect_slp_analyze_and_verify_node_alignment
2425 (SLP_INSTANCE_TREE (instance)))
2426 return false;
2428 return true;
2432 /* Analyze groups of accesses: check that DR belongs to a group of
2433 accesses of legal size, step, etc. Detect gaps, single element
2434 interleaving, and other special cases. Set grouped access info.
2435 Collect groups of strided stores for further use in SLP analysis.
2436 Worker for vect_analyze_group_access. */
2438 static bool
2439 vect_analyze_group_access_1 (struct data_reference *dr)
2441 tree step = DR_STEP (dr);
2442 tree scalar_type = TREE_TYPE (DR_REF (dr));
2443 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2444 gimple *stmt = DR_STMT (dr);
2445 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2446 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2447 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2448 HOST_WIDE_INT dr_step = -1;
2449 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2450 bool slp_impossible = false;
2452 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2453 size of the interleaving group (including gaps). */
2454 if (tree_fits_shwi_p (step))
2456 dr_step = tree_to_shwi (step);
2457 /* Check that STEP is a multiple of type size. Otherwise there is
2458 a non-element-sized gap at the end of the group which we
2459 cannot represent in DR_GROUP_GAP or DR_GROUP_SIZE.
2460 ??? As we can handle non-constant step fine here we should
2461 simply remove uses of DR_GROUP_GAP between the last and first
2462 element and instead rely on DR_STEP. DR_GROUP_SIZE then would
2463 simply not include that gap. */
2464 if ((dr_step % type_size) != 0)
2466 if (dump_enabled_p ())
2468 dump_printf_loc (MSG_NOTE, vect_location,
2469 "Step ");
2470 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2471 dump_printf (MSG_NOTE,
2472 " is not a multiple of the element size for ");
2473 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2474 dump_printf (MSG_NOTE, "\n");
2476 return false;
2478 groupsize = absu_hwi (dr_step) / type_size;
2480 else
2481 groupsize = 0;
2483 /* Not consecutive access is possible only if it is a part of interleaving. */
2484 if (!DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2486 /* Check if it this DR is a part of interleaving, and is a single
2487 element of the group that is accessed in the loop. */
2489 /* Gaps are supported only for loads. STEP must be a multiple of the type
2490 size. */
2491 if (DR_IS_READ (dr)
2492 && (dr_step % type_size) == 0
2493 && groupsize > 0)
2495 DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2496 DR_GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2497 DR_GROUP_GAP (stmt_info) = groupsize - 1;
2498 if (dump_enabled_p ())
2500 dump_printf_loc (MSG_NOTE, vect_location,
2501 "Detected single element interleaving ");
2502 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2503 dump_printf (MSG_NOTE, " step ");
2504 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2505 dump_printf (MSG_NOTE, "\n");
2508 return true;
2511 if (dump_enabled_p ())
2513 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2514 "not consecutive access ");
2515 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2518 if (bb_vinfo)
2520 /* Mark the statement as unvectorizable. */
2521 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2522 return true;
2525 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2526 STMT_VINFO_STRIDED_P (stmt_info) = true;
2527 return true;
2530 if (DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2532 /* First stmt in the interleaving chain. Check the chain. */
2533 gimple *next = DR_GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2534 struct data_reference *data_ref = dr;
2535 unsigned int count = 1;
2536 tree prev_init = DR_INIT (data_ref);
2537 gimple *prev = stmt;
2538 HOST_WIDE_INT diff, gaps = 0;
2540 /* By construction, all group members have INTEGER_CST DR_INITs. */
2541 while (next)
2543 /* Skip same data-refs. In case that two or more stmts share
2544 data-ref (supported only for loads), we vectorize only the first
2545 stmt, and the rest get their vectorized loads from the first
2546 one. */
2547 if (!tree_int_cst_compare (DR_INIT (data_ref),
2548 DR_INIT (STMT_VINFO_DATA_REF (
2549 vinfo_for_stmt (next)))))
2551 if (DR_IS_WRITE (data_ref))
2553 if (dump_enabled_p ())
2554 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2555 "Two store stmts share the same dr.\n");
2556 return false;
2559 if (dump_enabled_p ())
2560 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2561 "Two or more load stmts share the same dr.\n");
2563 /* For load use the same data-ref load. */
2564 DR_GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2566 prev = next;
2567 next = DR_GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2568 continue;
2571 prev = next;
2572 data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2574 /* All group members have the same STEP by construction. */
2575 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2577 /* Check that the distance between two accesses is equal to the type
2578 size. Otherwise, we have gaps. */
2579 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2580 - TREE_INT_CST_LOW (prev_init)) / type_size;
2581 if (diff != 1)
2583 /* FORNOW: SLP of accesses with gaps is not supported. */
2584 slp_impossible = true;
2585 if (DR_IS_WRITE (data_ref))
2587 if (dump_enabled_p ())
2588 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2589 "interleaved store with gaps\n");
2590 return false;
2593 gaps += diff - 1;
2596 last_accessed_element += diff;
2598 /* Store the gap from the previous member of the group. If there is no
2599 gap in the access, DR_GROUP_GAP is always 1. */
2600 DR_GROUP_GAP (vinfo_for_stmt (next)) = diff;
2602 prev_init = DR_INIT (data_ref);
2603 next = DR_GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2604 /* Count the number of data-refs in the chain. */
2605 count++;
2608 if (groupsize == 0)
2609 groupsize = count + gaps;
2611 /* This could be UINT_MAX but as we are generating code in a very
2612 inefficient way we have to cap earlier. See PR78699 for example. */
2613 if (groupsize > 4096)
2615 if (dump_enabled_p ())
2616 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2617 "group is too large\n");
2618 return false;
2621 /* Check that the size of the interleaving is equal to count for stores,
2622 i.e., that there are no gaps. */
2623 if (groupsize != count
2624 && !DR_IS_READ (dr))
2626 if (dump_enabled_p ())
2627 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2628 "interleaved store with gaps\n");
2629 return false;
2632 /* If there is a gap after the last load in the group it is the
2633 difference between the groupsize and the last accessed
2634 element.
2635 When there is no gap, this difference should be 0. */
2636 DR_GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - last_accessed_element;
2638 DR_GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2639 if (dump_enabled_p ())
2641 dump_printf_loc (MSG_NOTE, vect_location,
2642 "Detected interleaving ");
2643 if (DR_IS_READ (dr))
2644 dump_printf (MSG_NOTE, "load ");
2645 else
2646 dump_printf (MSG_NOTE, "store ");
2647 dump_printf (MSG_NOTE, "of size %u starting with ",
2648 (unsigned)groupsize);
2649 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
2650 if (DR_GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
2651 dump_printf_loc (MSG_NOTE, vect_location,
2652 "There is a gap of %u elements after the group\n",
2653 DR_GROUP_GAP (vinfo_for_stmt (stmt)));
2656 /* SLP: create an SLP data structure for every interleaving group of
2657 stores for further analysis in vect_analyse_slp. */
2658 if (DR_IS_WRITE (dr) && !slp_impossible)
2660 if (loop_vinfo)
2661 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2662 if (bb_vinfo)
2663 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2667 return true;
2670 /* Analyze groups of accesses: check that DR belongs to a group of
2671 accesses of legal size, step, etc. Detect gaps, single element
2672 interleaving, and other special cases. Set grouped access info.
2673 Collect groups of strided stores for further use in SLP analysis. */
2675 static bool
2676 vect_analyze_group_access (struct data_reference *dr)
2678 if (!vect_analyze_group_access_1 (dr))
2680 /* Dissolve the group if present. */
2681 gimple *next;
2682 gimple *stmt = DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dr)));
2683 while (stmt)
2685 stmt_vec_info vinfo = vinfo_for_stmt (stmt);
2686 next = DR_GROUP_NEXT_ELEMENT (vinfo);
2687 DR_GROUP_FIRST_ELEMENT (vinfo) = NULL;
2688 DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2689 stmt = next;
2691 return false;
2693 return true;
2696 /* Analyze the access pattern of the data-reference DR.
2697 In case of non-consecutive accesses call vect_analyze_group_access() to
2698 analyze groups of accesses. */
2700 static bool
2701 vect_analyze_data_ref_access (struct data_reference *dr)
2703 tree step = DR_STEP (dr);
2704 tree scalar_type = TREE_TYPE (DR_REF (dr));
2705 gimple *stmt = DR_STMT (dr);
2706 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2707 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2708 struct loop *loop = NULL;
2710 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
2711 return true;
2713 if (loop_vinfo)
2714 loop = LOOP_VINFO_LOOP (loop_vinfo);
2716 if (loop_vinfo && !step)
2718 if (dump_enabled_p ())
2719 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2720 "bad data-ref access in loop\n");
2721 return false;
2724 /* Allow loads with zero step in inner-loop vectorization. */
2725 if (loop_vinfo && integer_zerop (step))
2727 DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2728 if (!nested_in_vect_loop_p (loop, stmt))
2729 return DR_IS_READ (dr);
2730 /* Allow references with zero step for outer loops marked
2731 with pragma omp simd only - it guarantees absence of
2732 loop-carried dependencies between inner loop iterations. */
2733 if (loop->safelen < 2)
2735 if (dump_enabled_p ())
2736 dump_printf_loc (MSG_NOTE, vect_location,
2737 "zero step in inner loop of nest\n");
2738 return false;
2742 if (loop && nested_in_vect_loop_p (loop, stmt))
2744 /* Interleaved accesses are not yet supported within outer-loop
2745 vectorization for references in the inner-loop. */
2746 DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2748 /* For the rest of the analysis we use the outer-loop step. */
2749 step = STMT_VINFO_DR_STEP (stmt_info);
2750 if (integer_zerop (step))
2752 if (dump_enabled_p ())
2753 dump_printf_loc (MSG_NOTE, vect_location,
2754 "zero step in outer loop.\n");
2755 return DR_IS_READ (dr);
2759 /* Consecutive? */
2760 if (TREE_CODE (step) == INTEGER_CST)
2762 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2763 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2764 || (dr_step < 0
2765 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2767 /* Mark that it is not interleaving. */
2768 DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2769 return true;
2773 if (loop && nested_in_vect_loop_p (loop, stmt))
2775 if (dump_enabled_p ())
2776 dump_printf_loc (MSG_NOTE, vect_location,
2777 "grouped access in outer loop.\n");
2778 return false;
2782 /* Assume this is a DR handled by non-constant strided load case. */
2783 if (TREE_CODE (step) != INTEGER_CST)
2784 return (STMT_VINFO_STRIDED_P (stmt_info)
2785 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2786 || vect_analyze_group_access (dr)));
2788 /* Not consecutive access - check if it's a part of interleaving group. */
2789 return vect_analyze_group_access (dr);
2792 /* Compare two data-references DRA and DRB to group them into chunks
2793 suitable for grouping. */
2795 static int
2796 dr_group_sort_cmp (const void *dra_, const void *drb_)
2798 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2799 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2800 int cmp;
2802 /* Stabilize sort. */
2803 if (dra == drb)
2804 return 0;
2806 /* DRs in different loops never belong to the same group. */
2807 loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father;
2808 loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father;
2809 if (loopa != loopb)
2810 return loopa->num < loopb->num ? -1 : 1;
2812 /* Ordering of DRs according to base. */
2813 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2814 DR_BASE_ADDRESS (drb));
2815 if (cmp != 0)
2816 return cmp;
2818 /* And according to DR_OFFSET. */
2819 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2820 if (cmp != 0)
2821 return cmp;
2823 /* Put reads before writes. */
2824 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2825 return DR_IS_READ (dra) ? -1 : 1;
2827 /* Then sort after access size. */
2828 cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2829 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2830 if (cmp != 0)
2831 return cmp;
2833 /* And after step. */
2834 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2835 if (cmp != 0)
2836 return cmp;
2838 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
2839 cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
2840 if (cmp == 0)
2841 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2842 return cmp;
2845 /* If OP is the result of a conversion, return the unconverted value,
2846 otherwise return null. */
2848 static tree
2849 strip_conversion (tree op)
2851 if (TREE_CODE (op) != SSA_NAME)
2852 return NULL_TREE;
2853 gimple *stmt = SSA_NAME_DEF_STMT (op);
2854 if (!is_gimple_assign (stmt)
2855 || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt)))
2856 return NULL_TREE;
2857 return gimple_assign_rhs1 (stmt);
2860 /* Return true if vectorizable_* routines can handle statements STMT1
2861 and STMT2 being in a single group. */
2863 static bool
2864 can_group_stmts_p (gimple *stmt1, gimple *stmt2)
2866 if (gimple_assign_single_p (stmt1))
2867 return gimple_assign_single_p (stmt2);
2869 if (is_gimple_call (stmt1) && gimple_call_internal_p (stmt1))
2871 /* Check for two masked loads or two masked stores. */
2872 if (!is_gimple_call (stmt2) || !gimple_call_internal_p (stmt2))
2873 return false;
2874 internal_fn ifn = gimple_call_internal_fn (stmt1);
2875 if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE)
2876 return false;
2877 if (ifn != gimple_call_internal_fn (stmt2))
2878 return false;
2880 /* Check that the masks are the same. Cope with casts of masks,
2881 like those created by build_mask_conversion. */
2882 tree mask1 = gimple_call_arg (stmt1, 2);
2883 tree mask2 = gimple_call_arg (stmt2, 2);
2884 if (!operand_equal_p (mask1, mask2, 0))
2886 mask1 = strip_conversion (mask1);
2887 if (!mask1)
2888 return false;
2889 mask2 = strip_conversion (mask2);
2890 if (!mask2)
2891 return false;
2892 if (!operand_equal_p (mask1, mask2, 0))
2893 return false;
2895 return true;
2898 return false;
2901 /* Function vect_analyze_data_ref_accesses.
2903 Analyze the access pattern of all the data references in the loop.
2905 FORNOW: the only access pattern that is considered vectorizable is a
2906 simple step 1 (consecutive) access.
2908 FORNOW: handle only arrays and pointer accesses. */
2910 bool
2911 vect_analyze_data_ref_accesses (vec_info *vinfo)
2913 unsigned int i;
2914 vec<data_reference_p> datarefs = vinfo->datarefs;
2915 struct data_reference *dr;
2917 if (dump_enabled_p ())
2918 dump_printf_loc (MSG_NOTE, vect_location,
2919 "=== vect_analyze_data_ref_accesses ===\n");
2921 if (datarefs.is_empty ())
2922 return true;
2924 /* Sort the array of datarefs to make building the interleaving chains
2925 linear. Don't modify the original vector's order, it is needed for
2926 determining what dependencies are reversed. */
2927 vec<data_reference_p> datarefs_copy = datarefs.copy ();
2928 datarefs_copy.qsort (dr_group_sort_cmp);
2930 /* Build the interleaving chains. */
2931 for (i = 0; i < datarefs_copy.length () - 1;)
2933 data_reference_p dra = datarefs_copy[i];
2934 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2935 stmt_vec_info lastinfo = NULL;
2936 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
2937 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a))
2939 ++i;
2940 continue;
2942 for (i = i + 1; i < datarefs_copy.length (); ++i)
2944 data_reference_p drb = datarefs_copy[i];
2945 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2946 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b)
2947 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
2948 break;
2950 /* ??? Imperfect sorting (non-compatible types, non-modulo
2951 accesses, same accesses) can lead to a group to be artificially
2952 split here as we don't just skip over those. If it really
2953 matters we can push those to a worklist and re-iterate
2954 over them. The we can just skip ahead to the next DR here. */
2956 /* DRs in a different loop should not be put into the same
2957 interleaving group. */
2958 if (gimple_bb (DR_STMT (dra))->loop_father
2959 != gimple_bb (DR_STMT (drb))->loop_father)
2960 break;
2962 /* Check that the data-refs have same first location (except init)
2963 and they are both either store or load (not load and store,
2964 not masked loads or stores). */
2965 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2966 || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2967 DR_BASE_ADDRESS (drb)) != 0
2968 || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
2969 || !can_group_stmts_p (DR_STMT (dra), DR_STMT (drb)))
2970 break;
2972 /* Check that the data-refs have the same constant size. */
2973 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2974 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2975 if (!tree_fits_uhwi_p (sza)
2976 || !tree_fits_uhwi_p (szb)
2977 || !tree_int_cst_equal (sza, szb))
2978 break;
2980 /* Check that the data-refs have the same step. */
2981 if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
2982 break;
2984 /* Check the types are compatible.
2985 ??? We don't distinguish this during sorting. */
2986 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2987 TREE_TYPE (DR_REF (drb))))
2988 break;
2990 /* Check that the DR_INITs are compile-time constants. */
2991 if (TREE_CODE (DR_INIT (dra)) != INTEGER_CST
2992 || TREE_CODE (DR_INIT (drb)) != INTEGER_CST)
2993 break;
2995 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
2996 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2997 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2998 HOST_WIDE_INT init_prev
2999 = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]));
3000 gcc_assert (init_a <= init_b
3001 && init_a <= init_prev
3002 && init_prev <= init_b);
3004 /* Do not place the same access in the interleaving chain twice. */
3005 if (init_b == init_prev)
3007 gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]))
3008 < gimple_uid (DR_STMT (drb)));
3009 /* ??? For now we simply "drop" the later reference which is
3010 otherwise the same rather than finishing off this group.
3011 In the end we'd want to re-process duplicates forming
3012 multiple groups from the refs, likely by just collecting
3013 all candidates (including duplicates and split points
3014 below) in a vector and then process them together. */
3015 continue;
3018 /* If init_b == init_a + the size of the type * k, we have an
3019 interleaving, and DRA is accessed before DRB. */
3020 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
3021 if (type_size_a == 0
3022 || (init_b - init_a) % type_size_a != 0)
3023 break;
3025 /* If we have a store, the accesses are adjacent. This splits
3026 groups into chunks we support (we don't support vectorization
3027 of stores with gaps). */
3028 if (!DR_IS_READ (dra) && init_b - init_prev != type_size_a)
3029 break;
3031 /* If the step (if not zero or non-constant) is greater than the
3032 difference between data-refs' inits this splits groups into
3033 suitable sizes. */
3034 if (tree_fits_shwi_p (DR_STEP (dra)))
3036 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
3037 if (step != 0 && step <= (init_b - init_a))
3038 break;
3041 if (dump_enabled_p ())
3043 dump_printf_loc (MSG_NOTE, vect_location,
3044 "Detected interleaving ");
3045 if (DR_IS_READ (dra))
3046 dump_printf (MSG_NOTE, "load ");
3047 else
3048 dump_printf (MSG_NOTE, "store ");
3049 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
3050 dump_printf (MSG_NOTE, " and ");
3051 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
3052 dump_printf (MSG_NOTE, "\n");
3055 /* Link the found element into the group list. */
3056 if (!DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3058 DR_GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
3059 lastinfo = stmtinfo_a;
3061 DR_GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
3062 DR_GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
3063 lastinfo = stmtinfo_b;
3067 FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
3068 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
3069 && !vect_analyze_data_ref_access (dr))
3071 if (dump_enabled_p ())
3072 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3073 "not vectorized: complicated access pattern.\n");
3075 if (is_a <bb_vec_info> (vinfo))
3077 /* Mark the statement as not vectorizable. */
3078 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
3079 continue;
3081 else
3083 datarefs_copy.release ();
3084 return false;
3088 datarefs_copy.release ();
3089 return true;
3092 /* Function vect_vfa_segment_size.
3094 Input:
3095 DR: The data reference.
3096 LENGTH_FACTOR: segment length to consider.
3098 Return a value suitable for the dr_with_seg_len::seg_len field.
3099 This is the "distance travelled" by the pointer from the first
3100 iteration in the segment to the last. Note that it does not include
3101 the size of the access; in effect it only describes the first byte. */
3103 static tree
3104 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
3106 length_factor = size_binop (MINUS_EXPR,
3107 fold_convert (sizetype, length_factor),
3108 size_one_node);
3109 return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr)),
3110 length_factor);
3113 /* Return a value that, when added to abs (vect_vfa_segment_size (dr)),
3114 gives the worst-case number of bytes covered by the segment. */
3116 static unsigned HOST_WIDE_INT
3117 vect_vfa_access_size (data_reference *dr)
3119 stmt_vec_info stmt_vinfo = vinfo_for_stmt (DR_STMT (dr));
3120 tree ref_type = TREE_TYPE (DR_REF (dr));
3121 unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type));
3122 unsigned HOST_WIDE_INT access_size = ref_size;
3123 if (DR_GROUP_FIRST_ELEMENT (stmt_vinfo))
3125 gcc_assert (DR_GROUP_FIRST_ELEMENT (stmt_vinfo) == DR_STMT (dr));
3126 access_size *= DR_GROUP_SIZE (stmt_vinfo) - DR_GROUP_GAP (stmt_vinfo);
3128 if (STMT_VINFO_VEC_STMT (stmt_vinfo)
3129 && (vect_supportable_dr_alignment (dr, false)
3130 == dr_explicit_realign_optimized))
3132 /* We might access a full vector's worth. */
3133 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
3134 access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size;
3136 return access_size;
3139 /* Get the minimum alignment for all the scalar accesses that DR describes. */
3141 static unsigned int
3142 vect_vfa_align (const data_reference *dr)
3144 return TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr)));
3147 /* Function vect_no_alias_p.
3149 Given data references A and B with equal base and offset, see whether
3150 the alias relation can be decided at compilation time. Return 1 if
3151 it can and the references alias, 0 if it can and the references do
3152 not alias, and -1 if we cannot decide at compile time. SEGMENT_LENGTH_A,
3153 SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
3154 of dr_with_seg_len::{seg_len,access_size} for A and B. */
3156 static int
3157 vect_compile_time_alias (struct data_reference *a, struct data_reference *b,
3158 tree segment_length_a, tree segment_length_b,
3159 unsigned HOST_WIDE_INT access_size_a,
3160 unsigned HOST_WIDE_INT access_size_b)
3162 poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a));
3163 poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b));
3164 poly_uint64 const_length_a;
3165 poly_uint64 const_length_b;
3167 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3168 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3169 [a, a+12) */
3170 if (tree_int_cst_compare (DR_STEP (a), size_zero_node) < 0)
3172 const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
3173 offset_a = (offset_a + access_size_a) - const_length_a;
3175 else
3176 const_length_a = tree_to_poly_uint64 (segment_length_a);
3177 if (tree_int_cst_compare (DR_STEP (b), size_zero_node) < 0)
3179 const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
3180 offset_b = (offset_b + access_size_b) - const_length_b;
3182 else
3183 const_length_b = tree_to_poly_uint64 (segment_length_b);
3185 const_length_a += access_size_a;
3186 const_length_b += access_size_b;
3188 if (ranges_known_overlap_p (offset_a, const_length_a,
3189 offset_b, const_length_b))
3190 return 1;
3192 if (!ranges_maybe_overlap_p (offset_a, const_length_a,
3193 offset_b, const_length_b))
3194 return 0;
3196 return -1;
3199 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3200 in DDR is >= VF. */
3202 static bool
3203 dependence_distance_ge_vf (data_dependence_relation *ddr,
3204 unsigned int loop_depth, poly_uint64 vf)
3206 if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3207 || DDR_NUM_DIST_VECTS (ddr) == 0)
3208 return false;
3210 /* If the dependence is exact, we should have limited the VF instead. */
3211 gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3213 unsigned int i;
3214 lambda_vector dist_v;
3215 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3217 HOST_WIDE_INT dist = dist_v[loop_depth];
3218 if (dist != 0
3219 && !(dist > 0 && DDR_REVERSED_P (ddr))
3220 && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
3221 return false;
3224 if (dump_enabled_p ())
3226 dump_printf_loc (MSG_NOTE, vect_location,
3227 "dependence distance between ");
3228 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
3229 dump_printf (MSG_NOTE, " and ");
3230 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
3231 dump_printf (MSG_NOTE, " is >= VF\n");
3234 return true;
3237 /* Dump LOWER_BOUND using flags DUMP_KIND. Dumps are known to be enabled. */
3239 static void
3240 dump_lower_bound (int dump_kind, const vec_lower_bound &lower_bound)
3242 dump_printf (dump_kind, "%s (", lower_bound.unsigned_p ? "unsigned" : "abs");
3243 dump_generic_expr (dump_kind, TDF_SLIM, lower_bound.expr);
3244 dump_printf (dump_kind, ") >= ");
3245 dump_dec (dump_kind, lower_bound.min_value);
3248 /* Record that the vectorized loop requires the vec_lower_bound described
3249 by EXPR, UNSIGNED_P and MIN_VALUE. */
3251 static void
3252 vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p,
3253 poly_uint64 min_value)
3255 vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3256 for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3257 if (operand_equal_p (lower_bounds[i].expr, expr, 0))
3259 unsigned_p &= lower_bounds[i].unsigned_p;
3260 min_value = upper_bound (lower_bounds[i].min_value, min_value);
3261 if (lower_bounds[i].unsigned_p != unsigned_p
3262 || maybe_lt (lower_bounds[i].min_value, min_value))
3264 lower_bounds[i].unsigned_p = unsigned_p;
3265 lower_bounds[i].min_value = min_value;
3266 if (dump_enabled_p ())
3268 dump_printf_loc (MSG_NOTE, vect_location,
3269 "updating run-time check to ");
3270 dump_lower_bound (MSG_NOTE, lower_bounds[i]);
3271 dump_printf (MSG_NOTE, "\n");
3274 return;
3277 vec_lower_bound lower_bound (expr, unsigned_p, min_value);
3278 if (dump_enabled_p ())
3280 dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that ");
3281 dump_lower_bound (MSG_NOTE, lower_bound);
3282 dump_printf (MSG_NOTE, "\n");
3284 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (lower_bound);
3287 /* Return true if it's unlikely that the step of the vectorized form of DR
3288 will span fewer than GAP bytes. */
3290 static bool
3291 vect_small_gap_p (loop_vec_info loop_vinfo, data_reference *dr, poly_int64 gap)
3293 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
3294 HOST_WIDE_INT count
3295 = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
3296 if (DR_GROUP_FIRST_ELEMENT (stmt_info))
3297 count *= DR_GROUP_SIZE (vinfo_for_stmt (DR_GROUP_FIRST_ELEMENT (stmt_info)));
3298 return estimated_poly_value (gap) <= count * vect_get_scalar_dr_size (dr);
3301 /* Return true if we know that there is no alias between DR_A and DR_B
3302 when abs (DR_STEP (DR_A)) >= N for some N. When returning true, set
3303 *LOWER_BOUND_OUT to this N. */
3305 static bool
3306 vectorizable_with_step_bound_p (data_reference *dr_a, data_reference *dr_b,
3307 poly_uint64 *lower_bound_out)
3309 /* Check that there is a constant gap of known sign between DR_A
3310 and DR_B. */
3311 poly_int64 init_a, init_b;
3312 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0)
3313 || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0)
3314 || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)
3315 || !poly_int_tree_p (DR_INIT (dr_a), &init_a)
3316 || !poly_int_tree_p (DR_INIT (dr_b), &init_b)
3317 || !ordered_p (init_a, init_b))
3318 return false;
3320 /* Sort DR_A and DR_B by the address they access. */
3321 if (maybe_lt (init_b, init_a))
3323 std::swap (init_a, init_b);
3324 std::swap (dr_a, dr_b);
3327 /* If the two accesses could be dependent within a scalar iteration,
3328 make sure that we'd retain their order. */
3329 if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_a), init_b)
3330 && !vect_preserves_scalar_order_p (DR_STMT (dr_a), DR_STMT (dr_b)))
3331 return false;
3333 /* There is no alias if abs (DR_STEP) is greater than or equal to
3334 the bytes spanned by the combination of the two accesses. */
3335 *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_b) - init_a;
3336 return true;
3339 /* Function vect_prune_runtime_alias_test_list.
3341 Prune a list of ddrs to be tested at run-time by versioning for alias.
3342 Merge several alias checks into one if possible.
3343 Return FALSE if resulting list of ddrs is longer then allowed by
3344 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
3346 bool
3347 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3349 typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3350 hash_set <tree_pair_hash> compared_objects;
3352 vec<ddr_p> may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3353 vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3354 = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3355 vec<vec_object_pair> &check_unequal_addrs
3356 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3357 poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3358 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3360 ddr_p ddr;
3361 unsigned int i;
3362 tree length_factor;
3364 if (dump_enabled_p ())
3365 dump_printf_loc (MSG_NOTE, vect_location,
3366 "=== vect_prune_runtime_alias_test_list ===\n");
3368 /* Step values are irrelevant for aliasing if the number of vector
3369 iterations is equal to the number of scalar iterations (which can
3370 happen for fully-SLP loops). */
3371 bool ignore_step_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U);
3373 if (!ignore_step_p)
3375 /* Convert the checks for nonzero steps into bound tests. */
3376 tree value;
3377 FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value)
3378 vect_check_lower_bound (loop_vinfo, value, true, 1);
3381 if (may_alias_ddrs.is_empty ())
3382 return true;
3384 comp_alias_ddrs.create (may_alias_ddrs.length ());
3386 unsigned int loop_depth
3387 = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3388 LOOP_VINFO_LOOP_NEST (loop_vinfo));
3390 /* First, we collect all data ref pairs for aliasing checks. */
3391 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3393 int comp_res;
3394 poly_uint64 lower_bound;
3395 struct data_reference *dr_a, *dr_b;
3396 gimple *dr_group_first_a, *dr_group_first_b;
3397 tree segment_length_a, segment_length_b;
3398 unsigned HOST_WIDE_INT access_size_a, access_size_b;
3399 unsigned int align_a, align_b;
3400 gimple *stmt_a, *stmt_b;
3402 /* Ignore the alias if the VF we chose ended up being no greater
3403 than the dependence distance. */
3404 if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3405 continue;
3407 if (DDR_OBJECT_A (ddr))
3409 vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3410 if (!compared_objects.add (new_pair))
3412 if (dump_enabled_p ())
3414 dump_printf_loc (MSG_NOTE, vect_location, "checking that ");
3415 dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.first);
3416 dump_printf (MSG_NOTE, " and ");
3417 dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.second);
3418 dump_printf (MSG_NOTE, " have different addresses\n");
3420 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3422 continue;
3425 dr_a = DDR_A (ddr);
3426 stmt_a = DR_STMT (DDR_A (ddr));
3428 dr_b = DDR_B (ddr);
3429 stmt_b = DR_STMT (DDR_B (ddr));
3431 /* Skip the pair if inter-iteration dependencies are irrelevant
3432 and intra-iteration dependencies are guaranteed to be honored. */
3433 if (ignore_step_p
3434 && (vect_preserves_scalar_order_p (stmt_a, stmt_b)
3435 || vectorizable_with_step_bound_p (dr_a, dr_b, &lower_bound)))
3437 if (dump_enabled_p ())
3439 dump_printf_loc (MSG_NOTE, vect_location,
3440 "no need for alias check between ");
3441 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
3442 dump_printf (MSG_NOTE, " and ");
3443 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
3444 dump_printf (MSG_NOTE, " when VF is 1\n");
3446 continue;
3449 /* See whether we can handle the alias using a bounds check on
3450 the step, and whether that's likely to be the best approach.
3451 (It might not be, for example, if the minimum step is much larger
3452 than the number of bytes handled by one vector iteration.) */
3453 if (!ignore_step_p
3454 && TREE_CODE (DR_STEP (dr_a)) != INTEGER_CST
3455 && vectorizable_with_step_bound_p (dr_a, dr_b, &lower_bound)
3456 && (vect_small_gap_p (loop_vinfo, dr_a, lower_bound)
3457 || vect_small_gap_p (loop_vinfo, dr_b, lower_bound)))
3459 bool unsigned_p = dr_known_forward_stride_p (dr_a);
3460 if (dump_enabled_p ())
3462 dump_printf_loc (MSG_NOTE, vect_location, "no alias between ");
3463 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
3464 dump_printf (MSG_NOTE, " and ");
3465 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
3466 dump_printf (MSG_NOTE, " when the step ");
3467 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_STEP (dr_a));
3468 dump_printf (MSG_NOTE, " is outside ");
3469 if (unsigned_p)
3470 dump_printf (MSG_NOTE, "[0");
3471 else
3473 dump_printf (MSG_NOTE, "(");
3474 dump_dec (MSG_NOTE, poly_int64 (-lower_bound));
3476 dump_printf (MSG_NOTE, ", ");
3477 dump_dec (MSG_NOTE, lower_bound);
3478 dump_printf (MSG_NOTE, ")\n");
3480 vect_check_lower_bound (loop_vinfo, DR_STEP (dr_a), unsigned_p,
3481 lower_bound);
3482 continue;
3485 dr_group_first_a = DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
3486 if (dr_group_first_a)
3488 stmt_a = dr_group_first_a;
3489 dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
3492 dr_group_first_b = DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
3493 if (dr_group_first_b)
3495 stmt_b = dr_group_first_b;
3496 dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
3499 if (ignore_step_p)
3501 segment_length_a = size_zero_node;
3502 segment_length_b = size_zero_node;
3504 else
3506 if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
3507 length_factor = scalar_loop_iters;
3508 else
3509 length_factor = size_int (vect_factor);
3510 segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
3511 segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
3513 access_size_a = vect_vfa_access_size (dr_a);
3514 access_size_b = vect_vfa_access_size (dr_b);
3515 align_a = vect_vfa_align (dr_a);
3516 align_b = vect_vfa_align (dr_b);
3518 comp_res = data_ref_compare_tree (DR_BASE_ADDRESS (dr_a),
3519 DR_BASE_ADDRESS (dr_b));
3520 if (comp_res == 0)
3521 comp_res = data_ref_compare_tree (DR_OFFSET (dr_a),
3522 DR_OFFSET (dr_b));
3524 /* See whether the alias is known at compilation time. */
3525 if (comp_res == 0
3526 && TREE_CODE (DR_STEP (dr_a)) == INTEGER_CST
3527 && TREE_CODE (DR_STEP (dr_b)) == INTEGER_CST
3528 && poly_int_tree_p (segment_length_a)
3529 && poly_int_tree_p (segment_length_b))
3531 int res = vect_compile_time_alias (dr_a, dr_b,
3532 segment_length_a,
3533 segment_length_b,
3534 access_size_a,
3535 access_size_b);
3536 if (res >= 0 && dump_enabled_p ())
3538 dump_printf_loc (MSG_NOTE, vect_location,
3539 "can tell at compile time that ");
3540 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
3541 dump_printf (MSG_NOTE, " and ");
3542 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
3543 if (res == 0)
3544 dump_printf (MSG_NOTE, " do not alias\n");
3545 else
3546 dump_printf (MSG_NOTE, " alias\n");
3549 if (res == 0)
3550 continue;
3552 if (res == 1)
3554 if (dump_enabled_p ())
3555 dump_printf_loc (MSG_NOTE, vect_location,
3556 "not vectorized: compilation time alias.\n");
3557 return false;
3561 dr_with_seg_len_pair_t dr_with_seg_len_pair
3562 (dr_with_seg_len (dr_a, segment_length_a, access_size_a, align_a),
3563 dr_with_seg_len (dr_b, segment_length_b, access_size_b, align_b));
3565 /* Canonicalize pairs by sorting the two DR members. */
3566 if (comp_res > 0)
3567 std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
3569 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3572 prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);
3574 unsigned int count = (comp_alias_ddrs.length ()
3575 + check_unequal_addrs.length ());
3577 dump_printf_loc (MSG_NOTE, vect_location,
3578 "improved number of alias checks from %d to %d\n",
3579 may_alias_ddrs.length (), count);
3580 if ((int) count > PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
3582 if (dump_enabled_p ())
3583 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3584 "number of versioning for alias "
3585 "run-time tests exceeds %d "
3586 "(--param vect-max-version-for-alias-checks)\n",
3587 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
3588 return false;
3591 return true;
3594 /* Check whether we can use an internal function for a gather load
3595 or scatter store. READ_P is true for loads and false for stores.
3596 MASKED_P is true if the load or store is conditional. MEMORY_TYPE is
3597 the type of the memory elements being loaded or stored. OFFSET_BITS
3598 is the number of bits in each scalar offset and OFFSET_SIGN is the
3599 sign of the offset. SCALE is the amount by which the offset should
3600 be multiplied *after* it has been converted to address width.
3602 Return true if the function is supported, storing the function
3603 id in *IFN_OUT and the type of a vector element in *ELEMENT_TYPE_OUT. */
3605 bool
3606 vect_gather_scatter_fn_p (bool read_p, bool masked_p, tree vectype,
3607 tree memory_type, unsigned int offset_bits,
3608 signop offset_sign, int scale,
3609 internal_fn *ifn_out, tree *element_type_out)
3611 unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
3612 unsigned int element_bits = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (vectype)));
3613 if (offset_bits > element_bits)
3614 /* Internal functions require the offset to be the same width as
3615 the vector elements. We can extend narrower offsets, but it isn't
3616 safe to truncate wider offsets. */
3617 return false;
3619 if (element_bits != memory_bits)
3620 /* For now the vector elements must be the same width as the
3621 memory elements. */
3622 return false;
3624 /* Work out which function we need. */
3625 internal_fn ifn;
3626 if (read_p)
3627 ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
3628 else
3629 ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
3631 /* Test whether the target supports this combination. */
3632 if (!internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
3633 offset_sign, scale))
3634 return false;
3636 *ifn_out = ifn;
3637 *element_type_out = TREE_TYPE (vectype);
3638 return true;
3641 /* CALL is a call to an internal gather load or scatter store function.
3642 Describe the operation in INFO. */
3644 static void
3645 vect_describe_gather_scatter_call (gcall *call, gather_scatter_info *info)
3647 stmt_vec_info stmt_info = vinfo_for_stmt (call);
3648 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3649 data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3651 info->ifn = gimple_call_internal_fn (call);
3652 info->decl = NULL_TREE;
3653 info->base = gimple_call_arg (call, 0);
3654 info->offset = gimple_call_arg (call, 1);
3655 info->offset_dt = vect_unknown_def_type;
3656 info->offset_vectype = NULL_TREE;
3657 info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
3658 info->element_type = TREE_TYPE (vectype);
3659 info->memory_type = TREE_TYPE (DR_REF (dr));
3662 /* Return true if a non-affine read or write in STMT is suitable for a
3663 gather load or scatter store. Describe the operation in *INFO if so. */
3665 bool
3666 vect_check_gather_scatter (gimple *stmt, loop_vec_info loop_vinfo,
3667 gather_scatter_info *info)
3669 HOST_WIDE_INT scale = 1;
3670 poly_int64 pbitpos, pbitsize;
3671 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3672 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3673 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3674 tree offtype = NULL_TREE;
3675 tree decl = NULL_TREE, base, off;
3676 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3677 tree memory_type = TREE_TYPE (DR_REF (dr));
3678 machine_mode pmode;
3679 int punsignedp, reversep, pvolatilep = 0;
3680 internal_fn ifn;
3681 tree element_type;
3682 bool masked_p = false;
3684 /* See whether this is already a call to a gather/scatter internal function.
3685 If not, see whether it's a masked load or store. */
3686 gcall *call = dyn_cast <gcall *> (stmt);
3687 if (call && gimple_call_internal_p (call))
3689 ifn = gimple_call_internal_fn (stmt);
3690 if (internal_gather_scatter_fn_p (ifn))
3692 vect_describe_gather_scatter_call (call, info);
3693 return true;
3695 masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE);
3698 /* True if we should aim to use internal functions rather than
3699 built-in functions. */
3700 bool use_ifn_p = (DR_IS_READ (dr)
3701 ? supports_vec_gather_load_p ()
3702 : supports_vec_scatter_store_p ());
3704 base = DR_REF (dr);
3705 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3706 see if we can use the def stmt of the address. */
3707 if (masked_p
3708 && TREE_CODE (base) == MEM_REF
3709 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3710 && integer_zerop (TREE_OPERAND (base, 1))
3711 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3713 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3714 if (is_gimple_assign (def_stmt)
3715 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3716 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3719 /* The gather and scatter builtins need address of the form
3720 loop_invariant + vector * {1, 2, 4, 8}
3722 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3723 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3724 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3725 multiplications and additions in it. To get a vector, we need
3726 a single SSA_NAME that will be defined in the loop and will
3727 contain everything that is not loop invariant and that can be
3728 vectorized. The following code attempts to find such a preexistng
3729 SSA_NAME OFF and put the loop invariants into a tree BASE
3730 that can be gimplified before the loop. */
3731 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3732 &punsignedp, &reversep, &pvolatilep);
3733 gcc_assert (base && !reversep);
3734 poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
3736 if (TREE_CODE (base) == MEM_REF)
3738 if (!integer_zerop (TREE_OPERAND (base, 1)))
3740 if (off == NULL_TREE)
3741 off = wide_int_to_tree (sizetype, mem_ref_offset (base));
3742 else
3743 off = size_binop (PLUS_EXPR, off,
3744 fold_convert (sizetype, TREE_OPERAND (base, 1)));
3746 base = TREE_OPERAND (base, 0);
3748 else
3749 base = build_fold_addr_expr (base);
3751 if (off == NULL_TREE)
3752 off = size_zero_node;
3754 /* If base is not loop invariant, either off is 0, then we start with just
3755 the constant offset in the loop invariant BASE and continue with base
3756 as OFF, otherwise give up.
3757 We could handle that case by gimplifying the addition of base + off
3758 into some SSA_NAME and use that as off, but for now punt. */
3759 if (!expr_invariant_in_loop_p (loop, base))
3761 if (!integer_zerop (off))
3762 return false;
3763 off = base;
3764 base = size_int (pbytepos);
3766 /* Otherwise put base + constant offset into the loop invariant BASE
3767 and continue with OFF. */
3768 else
3770 base = fold_convert (sizetype, base);
3771 base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
3774 /* OFF at this point may be either a SSA_NAME or some tree expression
3775 from get_inner_reference. Try to peel off loop invariants from it
3776 into BASE as long as possible. */
3777 STRIP_NOPS (off);
3778 while (offtype == NULL_TREE)
3780 enum tree_code code;
3781 tree op0, op1, add = NULL_TREE;
3783 if (TREE_CODE (off) == SSA_NAME)
3785 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3787 if (expr_invariant_in_loop_p (loop, off))
3788 return false;
3790 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3791 break;
3793 op0 = gimple_assign_rhs1 (def_stmt);
3794 code = gimple_assign_rhs_code (def_stmt);
3795 op1 = gimple_assign_rhs2 (def_stmt);
3797 else
3799 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3800 return false;
3801 code = TREE_CODE (off);
3802 extract_ops_from_tree (off, &code, &op0, &op1);
3804 switch (code)
3806 case POINTER_PLUS_EXPR:
3807 case PLUS_EXPR:
3808 if (expr_invariant_in_loop_p (loop, op0))
3810 add = op0;
3811 off = op1;
3812 do_add:
3813 add = fold_convert (sizetype, add);
3814 if (scale != 1)
3815 add = size_binop (MULT_EXPR, add, size_int (scale));
3816 base = size_binop (PLUS_EXPR, base, add);
3817 continue;
3819 if (expr_invariant_in_loop_p (loop, op1))
3821 add = op1;
3822 off = op0;
3823 goto do_add;
3825 break;
3826 case MINUS_EXPR:
3827 if (expr_invariant_in_loop_p (loop, op1))
3829 add = fold_convert (sizetype, op1);
3830 add = size_binop (MINUS_EXPR, size_zero_node, add);
3831 off = op0;
3832 goto do_add;
3834 break;
3835 case MULT_EXPR:
3836 if (scale == 1 && tree_fits_shwi_p (op1))
3838 int new_scale = tree_to_shwi (op1);
3839 /* Only treat this as a scaling operation if the target
3840 supports it. */
3841 if (use_ifn_p
3842 && !vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p,
3843 vectype, memory_type, 1,
3844 TYPE_SIGN (TREE_TYPE (op0)),
3845 new_scale, &ifn,
3846 &element_type))
3847 break;
3848 scale = new_scale;
3849 off = op0;
3850 continue;
3852 break;
3853 case SSA_NAME:
3854 off = op0;
3855 continue;
3856 CASE_CONVERT:
3857 if (!POINTER_TYPE_P (TREE_TYPE (op0))
3858 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3859 break;
3860 if (TYPE_PRECISION (TREE_TYPE (op0))
3861 == TYPE_PRECISION (TREE_TYPE (off)))
3863 off = op0;
3864 continue;
3867 /* The internal functions need the offset to be the same width
3868 as the elements of VECTYPE. Don't include operations that
3869 cast the offset from that width to a different width. */
3870 if (use_ifn_p
3871 && (int_size_in_bytes (TREE_TYPE (vectype))
3872 == int_size_in_bytes (TREE_TYPE (off))))
3873 break;
3875 if (TYPE_PRECISION (TREE_TYPE (op0))
3876 < TYPE_PRECISION (TREE_TYPE (off)))
3878 off = op0;
3879 offtype = TREE_TYPE (off);
3880 STRIP_NOPS (off);
3881 continue;
3883 break;
3884 default:
3885 break;
3887 break;
3890 /* If at the end OFF still isn't a SSA_NAME or isn't
3891 defined in the loop, punt. */
3892 if (TREE_CODE (off) != SSA_NAME
3893 || expr_invariant_in_loop_p (loop, off))
3894 return false;
3896 if (offtype == NULL_TREE)
3897 offtype = TREE_TYPE (off);
3899 if (use_ifn_p)
3901 if (!vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p, vectype,
3902 memory_type, TYPE_PRECISION (offtype),
3903 TYPE_SIGN (offtype), scale, &ifn,
3904 &element_type))
3905 return false;
3907 else
3909 if (DR_IS_READ (dr))
3911 if (targetm.vectorize.builtin_gather)
3912 decl = targetm.vectorize.builtin_gather (vectype, offtype, scale);
3914 else
3916 if (targetm.vectorize.builtin_scatter)
3917 decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale);
3920 if (!decl)
3921 return false;
3923 ifn = IFN_LAST;
3924 element_type = TREE_TYPE (vectype);
3927 info->ifn = ifn;
3928 info->decl = decl;
3929 info->base = base;
3930 info->offset = off;
3931 info->offset_dt = vect_unknown_def_type;
3932 info->offset_vectype = NULL_TREE;
3933 info->scale = scale;
3934 info->element_type = element_type;
3935 info->memory_type = memory_type;
3936 return true;
3939 /* Function vect_analyze_data_refs.
3941 Find all the data references in the loop or basic block.
3943 The general structure of the analysis of data refs in the vectorizer is as
3944 follows:
3945 1- vect_analyze_data_refs(loop/bb): call
3946 compute_data_dependences_for_loop/bb to find and analyze all data-refs
3947 in the loop/bb and their dependences.
3948 2- vect_analyze_dependences(): apply dependence testing using ddrs.
3949 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
3950 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
3954 bool
3955 vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf)
3957 struct loop *loop = NULL;
3958 unsigned int i;
3959 struct data_reference *dr;
3960 tree scalar_type;
3962 if (dump_enabled_p ())
3963 dump_printf_loc (MSG_NOTE, vect_location,
3964 "=== vect_analyze_data_refs ===\n");
3966 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3967 loop = LOOP_VINFO_LOOP (loop_vinfo);
3969 /* Go through the data-refs, check that the analysis succeeded. Update
3970 pointer from stmt_vec_info struct to DR and vectype. */
3972 vec<data_reference_p> datarefs = vinfo->datarefs;
3973 FOR_EACH_VEC_ELT (datarefs, i, dr)
3975 gimple *stmt;
3976 stmt_vec_info stmt_info;
3977 tree base, offset, init;
3978 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
3979 bool simd_lane_access = false;
3980 poly_uint64 vf;
3982 again:
3983 if (!dr || !DR_REF (dr))
3985 if (dump_enabled_p ())
3986 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3987 "not vectorized: unhandled data-ref\n");
3988 return false;
3991 stmt = DR_STMT (dr);
3992 stmt_info = vinfo_for_stmt (stmt);
3994 /* Discard clobbers from the dataref vector. We will remove
3995 clobber stmts during vectorization. */
3996 if (gimple_clobber_p (stmt))
3998 free_data_ref (dr);
3999 if (i == datarefs.length () - 1)
4001 datarefs.pop ();
4002 break;
4004 datarefs.ordered_remove (i);
4005 dr = datarefs[i];
4006 goto again;
4009 /* Check that analysis of the data-ref succeeded. */
4010 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
4011 || !DR_STEP (dr))
4013 bool maybe_gather
4014 = DR_IS_READ (dr)
4015 && !TREE_THIS_VOLATILE (DR_REF (dr))
4016 && (targetm.vectorize.builtin_gather != NULL
4017 || supports_vec_gather_load_p ());
4018 bool maybe_scatter
4019 = DR_IS_WRITE (dr)
4020 && !TREE_THIS_VOLATILE (DR_REF (dr))
4021 && (targetm.vectorize.builtin_scatter != NULL
4022 || supports_vec_scatter_store_p ());
4023 bool maybe_simd_lane_access
4024 = is_a <loop_vec_info> (vinfo) && loop->simduid;
4026 /* If target supports vector gather loads or scatter stores, or if
4027 this might be a SIMD lane access, see if they can't be used. */
4028 if (is_a <loop_vec_info> (vinfo)
4029 && (maybe_gather || maybe_scatter || maybe_simd_lane_access)
4030 && !nested_in_vect_loop_p (loop, stmt))
4032 struct data_reference *newdr
4033 = create_data_ref (NULL, loop_containing_stmt (stmt),
4034 DR_REF (dr), stmt, !maybe_scatter,
4035 DR_IS_CONDITIONAL_IN_STMT (dr));
4036 gcc_assert (newdr != NULL && DR_REF (newdr));
4037 if (DR_BASE_ADDRESS (newdr)
4038 && DR_OFFSET (newdr)
4039 && DR_INIT (newdr)
4040 && DR_STEP (newdr)
4041 && integer_zerop (DR_STEP (newdr)))
4043 if (maybe_simd_lane_access)
4045 tree off = DR_OFFSET (newdr);
4046 STRIP_NOPS (off);
4047 if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
4048 && TREE_CODE (off) == MULT_EXPR
4049 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
4051 tree step = TREE_OPERAND (off, 1);
4052 off = TREE_OPERAND (off, 0);
4053 STRIP_NOPS (off);
4054 if (CONVERT_EXPR_P (off)
4055 && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
4056 0)))
4057 < TYPE_PRECISION (TREE_TYPE (off)))
4058 off = TREE_OPERAND (off, 0);
4059 if (TREE_CODE (off) == SSA_NAME)
4061 gimple *def = SSA_NAME_DEF_STMT (off);
4062 tree reft = TREE_TYPE (DR_REF (newdr));
4063 if (is_gimple_call (def)
4064 && gimple_call_internal_p (def)
4065 && (gimple_call_internal_fn (def)
4066 == IFN_GOMP_SIMD_LANE))
4068 tree arg = gimple_call_arg (def, 0);
4069 gcc_assert (TREE_CODE (arg) == SSA_NAME);
4070 arg = SSA_NAME_VAR (arg);
4071 if (arg == loop->simduid
4072 /* For now. */
4073 && tree_int_cst_equal
4074 (TYPE_SIZE_UNIT (reft),
4075 step))
4077 DR_OFFSET (newdr) = ssize_int (0);
4078 DR_STEP (newdr) = step;
4079 DR_OFFSET_ALIGNMENT (newdr)
4080 = BIGGEST_ALIGNMENT;
4081 DR_STEP_ALIGNMENT (newdr)
4082 = highest_pow2_factor (step);
4083 dr = newdr;
4084 simd_lane_access = true;
4090 if (!simd_lane_access && (maybe_gather || maybe_scatter))
4092 dr = newdr;
4093 if (maybe_gather)
4094 gatherscatter = GATHER;
4095 else
4096 gatherscatter = SCATTER;
4099 if (gatherscatter == SG_NONE && !simd_lane_access)
4100 free_data_ref (newdr);
4103 if (gatherscatter == SG_NONE && !simd_lane_access)
4105 if (dump_enabled_p ())
4107 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4108 "not vectorized: data ref analysis "
4109 "failed ");
4110 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4113 if (is_a <bb_vec_info> (vinfo))
4114 break;
4116 return false;
4120 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
4122 if (dump_enabled_p ())
4123 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4124 "not vectorized: base addr of dr is a "
4125 "constant\n");
4127 if (is_a <bb_vec_info> (vinfo))
4128 break;
4130 if (gatherscatter != SG_NONE || simd_lane_access)
4131 free_data_ref (dr);
4132 return false;
4135 if (TREE_THIS_VOLATILE (DR_REF (dr)))
4137 if (dump_enabled_p ())
4139 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4140 "not vectorized: volatile type ");
4141 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4144 if (is_a <bb_vec_info> (vinfo))
4145 break;
4147 return false;
4150 if (stmt_can_throw_internal (stmt))
4152 if (dump_enabled_p ())
4154 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4155 "not vectorized: statement can throw an "
4156 "exception ");
4157 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4160 if (is_a <bb_vec_info> (vinfo))
4161 break;
4163 if (gatherscatter != SG_NONE || simd_lane_access)
4164 free_data_ref (dr);
4165 return false;
4168 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
4169 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
4171 if (dump_enabled_p ())
4173 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4174 "not vectorized: statement is bitfield "
4175 "access ");
4176 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4179 if (is_a <bb_vec_info> (vinfo))
4180 break;
4182 if (gatherscatter != SG_NONE || simd_lane_access)
4183 free_data_ref (dr);
4184 return false;
4187 base = unshare_expr (DR_BASE_ADDRESS (dr));
4188 offset = unshare_expr (DR_OFFSET (dr));
4189 init = unshare_expr (DR_INIT (dr));
4191 if (is_gimple_call (stmt)
4192 && (!gimple_call_internal_p (stmt)
4193 || (gimple_call_internal_fn (stmt) != IFN_MASK_LOAD
4194 && gimple_call_internal_fn (stmt) != IFN_MASK_STORE)))
4196 if (dump_enabled_p ())
4198 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4199 "not vectorized: dr in a call ");
4200 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4203 if (is_a <bb_vec_info> (vinfo))
4204 break;
4206 if (gatherscatter != SG_NONE || simd_lane_access)
4207 free_data_ref (dr);
4208 return false;
4211 /* Update DR field in stmt_vec_info struct. */
4213 /* If the dataref is in an inner-loop of the loop that is considered for
4214 for vectorization, we also want to analyze the access relative to
4215 the outer-loop (DR contains information only relative to the
4216 inner-most enclosing loop). We do that by building a reference to the
4217 first location accessed by the inner-loop, and analyze it relative to
4218 the outer-loop. */
4219 if (loop && nested_in_vect_loop_p (loop, stmt))
4221 /* Build a reference to the first location accessed by the
4222 inner loop: *(BASE + INIT + OFFSET). By construction,
4223 this address must be invariant in the inner loop, so we
4224 can consider it as being used in the outer loop. */
4225 tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
4226 init, offset);
4227 tree init_addr = fold_build_pointer_plus (base, init_offset);
4228 tree init_ref = build_fold_indirect_ref (init_addr);
4230 if (dump_enabled_p ())
4232 dump_printf_loc (MSG_NOTE, vect_location,
4233 "analyze in outer loop: ");
4234 dump_generic_expr (MSG_NOTE, TDF_SLIM, init_ref);
4235 dump_printf (MSG_NOTE, "\n");
4238 if (!dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
4239 init_ref, loop))
4240 /* dr_analyze_innermost already explained the failure. */
4241 return false;
4243 if (dump_enabled_p ())
4245 dump_printf_loc (MSG_NOTE, vect_location,
4246 "\touter base_address: ");
4247 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4248 STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
4249 dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
4250 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4251 STMT_VINFO_DR_OFFSET (stmt_info));
4252 dump_printf (MSG_NOTE,
4253 "\n\touter constant offset from base address: ");
4254 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4255 STMT_VINFO_DR_INIT (stmt_info));
4256 dump_printf (MSG_NOTE, "\n\touter step: ");
4257 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4258 STMT_VINFO_DR_STEP (stmt_info));
4259 dump_printf (MSG_NOTE, "\n\touter base alignment: %d\n",
4260 STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info));
4261 dump_printf (MSG_NOTE, "\n\touter base misalignment: %d\n",
4262 STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info));
4263 dump_printf (MSG_NOTE, "\n\touter offset alignment: %d\n",
4264 STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info));
4265 dump_printf (MSG_NOTE, "\n\touter step alignment: %d\n",
4266 STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
4270 if (STMT_VINFO_DATA_REF (stmt_info))
4272 if (dump_enabled_p ())
4274 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4275 "not vectorized: more than one data ref "
4276 "in stmt: ");
4277 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4280 if (is_a <bb_vec_info> (vinfo))
4281 break;
4283 if (gatherscatter != SG_NONE || simd_lane_access)
4284 free_data_ref (dr);
4285 return false;
4288 STMT_VINFO_DATA_REF (stmt_info) = dr;
4289 if (simd_lane_access)
4291 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
4292 free_data_ref (datarefs[i]);
4293 datarefs[i] = dr;
4296 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == ADDR_EXPR
4297 && VAR_P (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0))
4298 && DECL_NONALIASED (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0)))
4300 if (dump_enabled_p ())
4302 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4303 "not vectorized: base object not addressable "
4304 "for stmt: ");
4305 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4307 if (is_a <bb_vec_info> (vinfo))
4309 /* In BB vectorization the ref can still participate
4310 in dependence analysis, we just can't vectorize it. */
4311 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4312 continue;
4314 return false;
4317 /* Set vectype for STMT. */
4318 scalar_type = TREE_TYPE (DR_REF (dr));
4319 STMT_VINFO_VECTYPE (stmt_info)
4320 = get_vectype_for_scalar_type (scalar_type);
4321 if (!STMT_VINFO_VECTYPE (stmt_info))
4323 if (dump_enabled_p ())
4325 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4326 "not vectorized: no vectype for stmt: ");
4327 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4328 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
4329 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
4330 scalar_type);
4331 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
4334 if (is_a <bb_vec_info> (vinfo))
4336 /* No vector type is fine, the ref can still participate
4337 in dependence analysis, we just can't vectorize it. */
4338 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4339 continue;
4342 if (gatherscatter != SG_NONE || simd_lane_access)
4344 STMT_VINFO_DATA_REF (stmt_info) = NULL;
4345 if (gatherscatter != SG_NONE)
4346 free_data_ref (dr);
4348 return false;
4350 else
4352 if (dump_enabled_p ())
4354 dump_printf_loc (MSG_NOTE, vect_location,
4355 "got vectype for stmt: ");
4356 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
4357 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4358 STMT_VINFO_VECTYPE (stmt_info));
4359 dump_printf (MSG_NOTE, "\n");
4363 /* Adjust the minimal vectorization factor according to the
4364 vector type. */
4365 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
4366 *min_vf = upper_bound (*min_vf, vf);
4368 if (gatherscatter != SG_NONE)
4370 gather_scatter_info gs_info;
4371 if (!vect_check_gather_scatter (stmt, as_a <loop_vec_info> (vinfo),
4372 &gs_info)
4373 || !get_vectype_for_scalar_type (TREE_TYPE (gs_info.offset)))
4375 STMT_VINFO_DATA_REF (stmt_info) = NULL;
4376 free_data_ref (dr);
4377 if (dump_enabled_p ())
4379 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4380 (gatherscatter == GATHER) ?
4381 "not vectorized: not suitable for gather "
4382 "load " :
4383 "not vectorized: not suitable for scatter "
4384 "store ");
4385 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4387 return false;
4390 free_data_ref (datarefs[i]);
4391 datarefs[i] = dr;
4392 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
4395 else if (is_a <loop_vec_info> (vinfo)
4396 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
4398 if (nested_in_vect_loop_p (loop, stmt))
4400 if (dump_enabled_p ())
4402 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4403 "not vectorized: not suitable for strided "
4404 "load ");
4405 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4407 return false;
4409 STMT_VINFO_STRIDED_P (stmt_info) = true;
4413 /* If we stopped analysis at the first dataref we could not analyze
4414 when trying to vectorize a basic-block mark the rest of the datarefs
4415 as not vectorizable and truncate the vector of datarefs. That
4416 avoids spending useless time in analyzing their dependence. */
4417 if (i != datarefs.length ())
4419 gcc_assert (is_a <bb_vec_info> (vinfo));
4420 for (unsigned j = i; j < datarefs.length (); ++j)
4422 data_reference_p dr = datarefs[j];
4423 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
4424 free_data_ref (dr);
4426 datarefs.truncate (i);
4429 return true;
4433 /* Function vect_get_new_vect_var.
4435 Returns a name for a new variable. The current naming scheme appends the
4436 prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
4437 the name of vectorizer generated variables, and appends that to NAME if
4438 provided. */
4440 tree
4441 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
4443 const char *prefix;
4444 tree new_vect_var;
4446 switch (var_kind)
4448 case vect_simple_var:
4449 prefix = "vect";
4450 break;
4451 case vect_scalar_var:
4452 prefix = "stmp";
4453 break;
4454 case vect_mask_var:
4455 prefix = "mask";
4456 break;
4457 case vect_pointer_var:
4458 prefix = "vectp";
4459 break;
4460 default:
4461 gcc_unreachable ();
4464 if (name)
4466 char* tmp = concat (prefix, "_", name, NULL);
4467 new_vect_var = create_tmp_reg (type, tmp);
4468 free (tmp);
4470 else
4471 new_vect_var = create_tmp_reg (type, prefix);
4473 return new_vect_var;
4476 /* Like vect_get_new_vect_var but return an SSA name. */
4478 tree
4479 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
4481 const char *prefix;
4482 tree new_vect_var;
4484 switch (var_kind)
4486 case vect_simple_var:
4487 prefix = "vect";
4488 break;
4489 case vect_scalar_var:
4490 prefix = "stmp";
4491 break;
4492 case vect_pointer_var:
4493 prefix = "vectp";
4494 break;
4495 default:
4496 gcc_unreachable ();
4499 if (name)
4501 char* tmp = concat (prefix, "_", name, NULL);
4502 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4503 free (tmp);
4505 else
4506 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4508 return new_vect_var;
4511 /* Duplicate ptr info and set alignment/misaligment on NAME from DR. */
4513 static void
4514 vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr)
4516 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr));
4517 int misalign = DR_MISALIGNMENT (dr);
4518 if (misalign == DR_MISALIGNMENT_UNKNOWN)
4519 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4520 else
4521 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name),
4522 DR_TARGET_ALIGNMENT (dr), misalign);
4525 /* Function vect_create_addr_base_for_vector_ref.
4527 Create an expression that computes the address of the first memory location
4528 that will be accessed for a data reference.
4530 Input:
4531 STMT: The statement containing the data reference.
4532 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4533 OFFSET: Optional. If supplied, it is be added to the initial address.
4534 LOOP: Specify relative to which loop-nest should the address be computed.
4535 For example, when the dataref is in an inner-loop nested in an
4536 outer-loop that is now being vectorized, LOOP can be either the
4537 outer-loop, or the inner-loop. The first memory location accessed
4538 by the following dataref ('in' points to short):
4540 for (i=0; i<N; i++)
4541 for (j=0; j<M; j++)
4542 s += in[i+j]
4544 is as follows:
4545 if LOOP=i_loop: &in (relative to i_loop)
4546 if LOOP=j_loop: &in+i*2B (relative to j_loop)
4547 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
4548 initial address. Unlike OFFSET, which is number of elements to
4549 be added, BYTE_OFFSET is measured in bytes.
4551 Output:
4552 1. Return an SSA_NAME whose value is the address of the memory location of
4553 the first vector of the data reference.
4554 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4555 these statement(s) which define the returned SSA_NAME.
4557 FORNOW: We are only handling array accesses with step 1. */
4559 tree
4560 vect_create_addr_base_for_vector_ref (gimple *stmt,
4561 gimple_seq *new_stmt_list,
4562 tree offset,
4563 tree byte_offset)
4565 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4566 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4567 const char *base_name;
4568 tree addr_base;
4569 tree dest;
4570 gimple_seq seq = NULL;
4571 tree vect_ptr_type;
4572 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
4573 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4574 innermost_loop_behavior *drb = vect_dr_behavior (dr);
4576 tree data_ref_base = unshare_expr (drb->base_address);
4577 tree base_offset = unshare_expr (drb->offset);
4578 tree init = unshare_expr (drb->init);
4580 if (loop_vinfo)
4581 base_name = get_name (data_ref_base);
4582 else
4584 base_offset = ssize_int (0);
4585 init = ssize_int (0);
4586 base_name = get_name (DR_REF (dr));
4589 /* Create base_offset */
4590 base_offset = size_binop (PLUS_EXPR,
4591 fold_convert (sizetype, base_offset),
4592 fold_convert (sizetype, init));
4594 if (offset)
4596 offset = fold_build2 (MULT_EXPR, sizetype,
4597 fold_convert (sizetype, offset), step);
4598 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4599 base_offset, offset);
4601 if (byte_offset)
4603 byte_offset = fold_convert (sizetype, byte_offset);
4604 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4605 base_offset, byte_offset);
4608 /* base + base_offset */
4609 if (loop_vinfo)
4610 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4611 else
4613 addr_base = build1 (ADDR_EXPR,
4614 build_pointer_type (TREE_TYPE (DR_REF (dr))),
4615 unshare_expr (DR_REF (dr)));
4618 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4619 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4620 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4621 gimple_seq_add_seq (new_stmt_list, seq);
4623 if (DR_PTR_INFO (dr)
4624 && TREE_CODE (addr_base) == SSA_NAME
4625 && !SSA_NAME_PTR_INFO (addr_base))
4627 vect_duplicate_ssa_name_ptr_info (addr_base, dr);
4628 if (offset || byte_offset)
4629 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4632 if (dump_enabled_p ())
4634 dump_printf_loc (MSG_NOTE, vect_location, "created ");
4635 dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
4636 dump_printf (MSG_NOTE, "\n");
4639 return addr_base;
4643 /* Function vect_create_data_ref_ptr.
4645 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4646 location accessed in the loop by STMT, along with the def-use update
4647 chain to appropriately advance the pointer through the loop iterations.
4648 Also set aliasing information for the pointer. This pointer is used by
4649 the callers to this function to create a memory reference expression for
4650 vector load/store access.
4652 Input:
4653 1. STMT: a stmt that references memory. Expected to be of the form
4654 GIMPLE_ASSIGN <name, data-ref> or
4655 GIMPLE_ASSIGN <data-ref, name>.
4656 2. AGGR_TYPE: the type of the reference, which should be either a vector
4657 or an array.
4658 3. AT_LOOP: the loop where the vector memref is to be created.
4659 4. OFFSET (optional): an offset to be added to the initial address accessed
4660 by the data-ref in STMT.
4661 5. BSI: location where the new stmts are to be placed if there is no loop
4662 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4663 pointing to the initial address.
4664 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4665 to the initial address accessed by the data-ref in STMT. This is
4666 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4667 in bytes.
4668 8. IV_STEP (optional, defaults to NULL): the amount that should be added
4669 to the IV during each iteration of the loop. NULL says to move
4670 by one copy of AGGR_TYPE up or down, depending on the step of the
4671 data reference.
4673 Output:
4674 1. Declare a new ptr to vector_type, and have it point to the base of the
4675 data reference (initial addressed accessed by the data reference).
4676 For example, for vector of type V8HI, the following code is generated:
4678 v8hi *ap;
4679 ap = (v8hi *)initial_address;
4681 if OFFSET is not supplied:
4682 initial_address = &a[init];
4683 if OFFSET is supplied:
4684 initial_address = &a[init + OFFSET];
4685 if BYTE_OFFSET is supplied:
4686 initial_address = &a[init] + BYTE_OFFSET;
4688 Return the initial_address in INITIAL_ADDRESS.
4690 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4691 update the pointer in each iteration of the loop.
4693 Return the increment stmt that updates the pointer in PTR_INCR.
4695 3. Set INV_P to true if the access pattern of the data reference in the
4696 vectorized loop is invariant. Set it to false otherwise.
4698 4. Return the pointer. */
4700 tree
4701 vect_create_data_ref_ptr (gimple *stmt, tree aggr_type, struct loop *at_loop,
4702 tree offset, tree *initial_address,
4703 gimple_stmt_iterator *gsi, gimple **ptr_incr,
4704 bool only_init, bool *inv_p, tree byte_offset,
4705 tree iv_step)
4707 const char *base_name;
4708 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4709 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4710 struct loop *loop = NULL;
4711 bool nested_in_vect_loop = false;
4712 struct loop *containing_loop = NULL;
4713 tree aggr_ptr_type;
4714 tree aggr_ptr;
4715 tree new_temp;
4716 gimple_seq new_stmt_list = NULL;
4717 edge pe = NULL;
4718 basic_block new_bb;
4719 tree aggr_ptr_init;
4720 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4721 tree aptr;
4722 gimple_stmt_iterator incr_gsi;
4723 bool insert_after;
4724 tree indx_before_incr, indx_after_incr;
4725 gimple *incr;
4726 tree step;
4727 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4729 gcc_assert (iv_step != NULL_TREE
4730 || TREE_CODE (aggr_type) == ARRAY_TYPE
4731 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4733 if (loop_vinfo)
4735 loop = LOOP_VINFO_LOOP (loop_vinfo);
4736 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4737 containing_loop = (gimple_bb (stmt))->loop_father;
4738 pe = loop_preheader_edge (loop);
4740 else
4742 gcc_assert (bb_vinfo);
4743 only_init = true;
4744 *ptr_incr = NULL;
4747 /* Check the step (evolution) of the load in LOOP, and record
4748 whether it's invariant. */
4749 step = vect_dr_behavior (dr)->step;
4750 if (integer_zerop (step))
4751 *inv_p = true;
4752 else
4753 *inv_p = false;
4755 /* Create an expression for the first address accessed by this load
4756 in LOOP. */
4757 base_name = get_name (DR_BASE_ADDRESS (dr));
4759 if (dump_enabled_p ())
4761 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4762 dump_printf_loc (MSG_NOTE, vect_location,
4763 "create %s-pointer variable to type: ",
4764 get_tree_code_name (TREE_CODE (aggr_type)));
4765 dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4766 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4767 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4768 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4769 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4770 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4771 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4772 else
4773 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4774 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4775 dump_printf (MSG_NOTE, "\n");
4778 /* (1) Create the new aggregate-pointer variable.
4779 Vector and array types inherit the alias set of their component
4780 type by default so we need to use a ref-all pointer if the data
4781 reference does not conflict with the created aggregated data
4782 reference because it is not addressable. */
4783 bool need_ref_all = false;
4784 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4785 get_alias_set (DR_REF (dr))))
4786 need_ref_all = true;
4787 /* Likewise for any of the data references in the stmt group. */
4788 else if (DR_GROUP_SIZE (stmt_info) > 1)
4790 gimple *orig_stmt = DR_GROUP_FIRST_ELEMENT (stmt_info);
4793 stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
4794 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4795 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4796 get_alias_set (DR_REF (sdr))))
4798 need_ref_all = true;
4799 break;
4801 orig_stmt = DR_GROUP_NEXT_ELEMENT (sinfo);
4803 while (orig_stmt);
4805 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4806 need_ref_all);
4807 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4810 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4811 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4812 def-use update cycles for the pointer: one relative to the outer-loop
4813 (LOOP), which is what steps (3) and (4) below do. The other is relative
4814 to the inner-loop (which is the inner-most loop containing the dataref),
4815 and this is done be step (5) below.
4817 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4818 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4819 redundant. Steps (3),(4) create the following:
4821 vp0 = &base_addr;
4822 LOOP: vp1 = phi(vp0,vp2)
4825 vp2 = vp1 + step
4826 goto LOOP
4828 If there is an inner-loop nested in loop, then step (5) will also be
4829 applied, and an additional update in the inner-loop will be created:
4831 vp0 = &base_addr;
4832 LOOP: vp1 = phi(vp0,vp2)
4834 inner: vp3 = phi(vp1,vp4)
4835 vp4 = vp3 + inner_step
4836 if () goto inner
4838 vp2 = vp1 + step
4839 if () goto LOOP */
4841 /* (2) Calculate the initial address of the aggregate-pointer, and set
4842 the aggregate-pointer to point to it before the loop. */
4844 /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader. */
4846 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
4847 offset, byte_offset);
4848 if (new_stmt_list)
4850 if (pe)
4852 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4853 gcc_assert (!new_bb);
4855 else
4856 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4859 *initial_address = new_temp;
4860 aggr_ptr_init = new_temp;
4862 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4863 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4864 inner-loop nested in LOOP (during outer-loop vectorization). */
4866 /* No update in loop is required. */
4867 if (only_init && (!loop_vinfo || at_loop == loop))
4868 aptr = aggr_ptr_init;
4869 else
4871 if (iv_step == NULL_TREE)
4873 /* The step of the aggregate pointer is the type size. */
4874 iv_step = TYPE_SIZE_UNIT (aggr_type);
4875 /* One exception to the above is when the scalar step of the load in
4876 LOOP is zero. In this case the step here is also zero. */
4877 if (*inv_p)
4878 iv_step = size_zero_node;
4879 else if (tree_int_cst_sgn (step) == -1)
4880 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4883 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4885 create_iv (aggr_ptr_init,
4886 fold_convert (aggr_ptr_type, iv_step),
4887 aggr_ptr, loop, &incr_gsi, insert_after,
4888 &indx_before_incr, &indx_after_incr);
4889 incr = gsi_stmt (incr_gsi);
4890 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4892 /* Copy the points-to information if it exists. */
4893 if (DR_PTR_INFO (dr))
4895 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr);
4896 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr);
4898 if (ptr_incr)
4899 *ptr_incr = incr;
4901 aptr = indx_before_incr;
4904 if (!nested_in_vect_loop || only_init)
4905 return aptr;
4908 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4909 nested in LOOP, if exists. */
4911 gcc_assert (nested_in_vect_loop);
4912 if (!only_init)
4914 standard_iv_increment_position (containing_loop, &incr_gsi,
4915 &insert_after);
4916 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4917 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4918 &indx_after_incr);
4919 incr = gsi_stmt (incr_gsi);
4920 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4922 /* Copy the points-to information if it exists. */
4923 if (DR_PTR_INFO (dr))
4925 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr);
4926 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr);
4928 if (ptr_incr)
4929 *ptr_incr = incr;
4931 return indx_before_incr;
4933 else
4934 gcc_unreachable ();
4938 /* Function bump_vector_ptr
4940 Increment a pointer (to a vector type) by vector-size. If requested,
4941 i.e. if PTR-INCR is given, then also connect the new increment stmt
4942 to the existing def-use update-chain of the pointer, by modifying
4943 the PTR_INCR as illustrated below:
4945 The pointer def-use update-chain before this function:
4946 DATAREF_PTR = phi (p_0, p_2)
4947 ....
4948 PTR_INCR: p_2 = DATAREF_PTR + step
4950 The pointer def-use update-chain after this function:
4951 DATAREF_PTR = phi (p_0, p_2)
4952 ....
4953 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4954 ....
4955 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4957 Input:
4958 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4959 in the loop.
4960 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4961 the loop. The increment amount across iterations is expected
4962 to be vector_size.
4963 BSI - location where the new update stmt is to be placed.
4964 STMT - the original scalar memory-access stmt that is being vectorized.
4965 BUMP - optional. The offset by which to bump the pointer. If not given,
4966 the offset is assumed to be vector_size.
4968 Output: Return NEW_DATAREF_PTR as illustrated above.
4972 tree
4973 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4974 gimple *stmt, tree bump)
4976 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4977 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4978 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4979 tree update = TYPE_SIZE_UNIT (vectype);
4980 gassign *incr_stmt;
4981 ssa_op_iter iter;
4982 use_operand_p use_p;
4983 tree new_dataref_ptr;
4985 if (bump)
4986 update = bump;
4988 if (TREE_CODE (dataref_ptr) == SSA_NAME)
4989 new_dataref_ptr = copy_ssa_name (dataref_ptr);
4990 else
4991 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4992 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4993 dataref_ptr, update);
4994 vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4996 /* Copy the points-to information if it exists. */
4997 if (DR_PTR_INFO (dr))
4999 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
5000 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
5003 if (!ptr_incr)
5004 return new_dataref_ptr;
5006 /* Update the vector-pointer's cross-iteration increment. */
5007 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
5009 tree use = USE_FROM_PTR (use_p);
5011 if (use == dataref_ptr)
5012 SET_USE (use_p, new_dataref_ptr);
5013 else
5014 gcc_assert (operand_equal_p (use, update, 0));
5017 return new_dataref_ptr;
5021 /* Copy memory reference info such as base/clique from the SRC reference
5022 to the DEST MEM_REF. */
5024 void
5025 vect_copy_ref_info (tree dest, tree src)
5027 if (TREE_CODE (dest) != MEM_REF)
5028 return;
5030 tree src_base = src;
5031 while (handled_component_p (src_base))
5032 src_base = TREE_OPERAND (src_base, 0);
5033 if (TREE_CODE (src_base) != MEM_REF
5034 && TREE_CODE (src_base) != TARGET_MEM_REF)
5035 return;
5037 MR_DEPENDENCE_CLIQUE (dest) = MR_DEPENDENCE_CLIQUE (src_base);
5038 MR_DEPENDENCE_BASE (dest) = MR_DEPENDENCE_BASE (src_base);
5042 /* Function vect_create_destination_var.
5044 Create a new temporary of type VECTYPE. */
5046 tree
5047 vect_create_destination_var (tree scalar_dest, tree vectype)
5049 tree vec_dest;
5050 const char *name;
5051 char *new_name;
5052 tree type;
5053 enum vect_var_kind kind;
5055 kind = vectype
5056 ? VECTOR_BOOLEAN_TYPE_P (vectype)
5057 ? vect_mask_var
5058 : vect_simple_var
5059 : vect_scalar_var;
5060 type = vectype ? vectype : TREE_TYPE (scalar_dest);
5062 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
5064 name = get_name (scalar_dest);
5065 if (name)
5066 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
5067 else
5068 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
5069 vec_dest = vect_get_new_vect_var (type, kind, new_name);
5070 free (new_name);
5072 return vec_dest;
5075 /* Function vect_grouped_store_supported.
5077 Returns TRUE if interleave high and interleave low permutations
5078 are supported, and FALSE otherwise. */
5080 bool
5081 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
5083 machine_mode mode = TYPE_MODE (vectype);
5085 /* vect_permute_store_chain requires the group size to be equal to 3 or
5086 be a power of two. */
5087 if (count != 3 && exact_log2 (count) == -1)
5089 if (dump_enabled_p ())
5090 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5091 "the size of the group of accesses"
5092 " is not a power of 2 or not eqaul to 3\n");
5093 return false;
5096 /* Check that the permutation is supported. */
5097 if (VECTOR_MODE_P (mode))
5099 unsigned int i;
5100 if (count == 3)
5102 unsigned int j0 = 0, j1 = 0, j2 = 0;
5103 unsigned int i, j;
5105 unsigned int nelt;
5106 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5108 if (dump_enabled_p ())
5109 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5110 "cannot handle groups of 3 stores for"
5111 " variable-length vectors\n");
5112 return false;
5115 vec_perm_builder sel (nelt, nelt, 1);
5116 sel.quick_grow (nelt);
5117 vec_perm_indices indices;
5118 for (j = 0; j < 3; j++)
5120 int nelt0 = ((3 - j) * nelt) % 3;
5121 int nelt1 = ((3 - j) * nelt + 1) % 3;
5122 int nelt2 = ((3 - j) * nelt + 2) % 3;
5123 for (i = 0; i < nelt; i++)
5125 if (3 * i + nelt0 < nelt)
5126 sel[3 * i + nelt0] = j0++;
5127 if (3 * i + nelt1 < nelt)
5128 sel[3 * i + nelt1] = nelt + j1++;
5129 if (3 * i + nelt2 < nelt)
5130 sel[3 * i + nelt2] = 0;
5132 indices.new_vector (sel, 2, nelt);
5133 if (!can_vec_perm_const_p (mode, indices))
5135 if (dump_enabled_p ())
5136 dump_printf (MSG_MISSED_OPTIMIZATION,
5137 "permutation op not supported by target.\n");
5138 return false;
5141 for (i = 0; i < nelt; i++)
5143 if (3 * i + nelt0 < nelt)
5144 sel[3 * i + nelt0] = 3 * i + nelt0;
5145 if (3 * i + nelt1 < nelt)
5146 sel[3 * i + nelt1] = 3 * i + nelt1;
5147 if (3 * i + nelt2 < nelt)
5148 sel[3 * i + nelt2] = nelt + j2++;
5150 indices.new_vector (sel, 2, nelt);
5151 if (!can_vec_perm_const_p (mode, indices))
5153 if (dump_enabled_p ())
5154 dump_printf (MSG_MISSED_OPTIMIZATION,
5155 "permutation op not supported by target.\n");
5156 return false;
5159 return true;
5161 else
5163 /* If length is not equal to 3 then only power of 2 is supported. */
5164 gcc_assert (pow2p_hwi (count));
5165 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5167 /* The encoding has 2 interleaved stepped patterns. */
5168 vec_perm_builder sel (nelt, 2, 3);
5169 sel.quick_grow (6);
5170 for (i = 0; i < 3; i++)
5172 sel[i * 2] = i;
5173 sel[i * 2 + 1] = i + nelt;
5175 vec_perm_indices indices (sel, 2, nelt);
5176 if (can_vec_perm_const_p (mode, indices))
5178 for (i = 0; i < 6; i++)
5179 sel[i] += exact_div (nelt, 2);
5180 indices.new_vector (sel, 2, nelt);
5181 if (can_vec_perm_const_p (mode, indices))
5182 return true;
5187 if (dump_enabled_p ())
5188 dump_printf (MSG_MISSED_OPTIMIZATION,
5189 "permutaion op not supported by target.\n");
5190 return false;
5194 /* Return TRUE if vec_{mask_}store_lanes is available for COUNT vectors of
5195 type VECTYPE. MASKED_P says whether the masked form is needed. */
5197 bool
5198 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5199 bool masked_p)
5201 if (masked_p)
5202 return vect_lanes_optab_supported_p ("vec_mask_store_lanes",
5203 vec_mask_store_lanes_optab,
5204 vectype, count);
5205 else
5206 return vect_lanes_optab_supported_p ("vec_store_lanes",
5207 vec_store_lanes_optab,
5208 vectype, count);
5212 /* Function vect_permute_store_chain.
5214 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
5215 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
5216 the data correctly for the stores. Return the final references for stores
5217 in RESULT_CHAIN.
5219 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5220 The input is 4 vectors each containing 8 elements. We assign a number to
5221 each element, the input sequence is:
5223 1st vec: 0 1 2 3 4 5 6 7
5224 2nd vec: 8 9 10 11 12 13 14 15
5225 3rd vec: 16 17 18 19 20 21 22 23
5226 4th vec: 24 25 26 27 28 29 30 31
5228 The output sequence should be:
5230 1st vec: 0 8 16 24 1 9 17 25
5231 2nd vec: 2 10 18 26 3 11 19 27
5232 3rd vec: 4 12 20 28 5 13 21 30
5233 4th vec: 6 14 22 30 7 15 23 31
5235 i.e., we interleave the contents of the four vectors in their order.
5237 We use interleave_high/low instructions to create such output. The input of
5238 each interleave_high/low operation is two vectors:
5239 1st vec 2nd vec
5240 0 1 2 3 4 5 6 7
5241 the even elements of the result vector are obtained left-to-right from the
5242 high/low elements of the first vector. The odd elements of the result are
5243 obtained left-to-right from the high/low elements of the second vector.
5244 The output of interleave_high will be: 0 4 1 5
5245 and of interleave_low: 2 6 3 7
5248 The permutation is done in log LENGTH stages. In each stage interleave_high
5249 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5250 where the first argument is taken from the first half of DR_CHAIN and the
5251 second argument from it's second half.
5252 In our example,
5254 I1: interleave_high (1st vec, 3rd vec)
5255 I2: interleave_low (1st vec, 3rd vec)
5256 I3: interleave_high (2nd vec, 4th vec)
5257 I4: interleave_low (2nd vec, 4th vec)
5259 The output for the first stage is:
5261 I1: 0 16 1 17 2 18 3 19
5262 I2: 4 20 5 21 6 22 7 23
5263 I3: 8 24 9 25 10 26 11 27
5264 I4: 12 28 13 29 14 30 15 31
5266 The output of the second stage, i.e. the final result is:
5268 I1: 0 8 16 24 1 9 17 25
5269 I2: 2 10 18 26 3 11 19 27
5270 I3: 4 12 20 28 5 13 21 30
5271 I4: 6 14 22 30 7 15 23 31. */
5273 void
5274 vect_permute_store_chain (vec<tree> dr_chain,
5275 unsigned int length,
5276 gimple *stmt,
5277 gimple_stmt_iterator *gsi,
5278 vec<tree> *result_chain)
5280 tree vect1, vect2, high, low;
5281 gimple *perm_stmt;
5282 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5283 tree perm_mask_low, perm_mask_high;
5284 tree data_ref;
5285 tree perm3_mask_low, perm3_mask_high;
5286 unsigned int i, j, n, log_length = exact_log2 (length);
5288 result_chain->quick_grow (length);
5289 memcpy (result_chain->address (), dr_chain.address (),
5290 length * sizeof (tree));
5292 if (length == 3)
5294 /* vect_grouped_store_supported ensures that this is constant. */
5295 unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5296 unsigned int j0 = 0, j1 = 0, j2 = 0;
5298 vec_perm_builder sel (nelt, nelt, 1);
5299 sel.quick_grow (nelt);
5300 vec_perm_indices indices;
5301 for (j = 0; j < 3; j++)
5303 int nelt0 = ((3 - j) * nelt) % 3;
5304 int nelt1 = ((3 - j) * nelt + 1) % 3;
5305 int nelt2 = ((3 - j) * nelt + 2) % 3;
5307 for (i = 0; i < nelt; i++)
5309 if (3 * i + nelt0 < nelt)
5310 sel[3 * i + nelt0] = j0++;
5311 if (3 * i + nelt1 < nelt)
5312 sel[3 * i + nelt1] = nelt + j1++;
5313 if (3 * i + nelt2 < nelt)
5314 sel[3 * i + nelt2] = 0;
5316 indices.new_vector (sel, 2, nelt);
5317 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5319 for (i = 0; i < nelt; i++)
5321 if (3 * i + nelt0 < nelt)
5322 sel[3 * i + nelt0] = 3 * i + nelt0;
5323 if (3 * i + nelt1 < nelt)
5324 sel[3 * i + nelt1] = 3 * i + nelt1;
5325 if (3 * i + nelt2 < nelt)
5326 sel[3 * i + nelt2] = nelt + j2++;
5328 indices.new_vector (sel, 2, nelt);
5329 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5331 vect1 = dr_chain[0];
5332 vect2 = dr_chain[1];
5334 /* Create interleaving stmt:
5335 low = VEC_PERM_EXPR <vect1, vect2,
5336 {j, nelt, *, j + 1, nelt + j + 1, *,
5337 j + 2, nelt + j + 2, *, ...}> */
5338 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5339 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5340 vect2, perm3_mask_low);
5341 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5343 vect1 = data_ref;
5344 vect2 = dr_chain[2];
5345 /* Create interleaving stmt:
5346 low = VEC_PERM_EXPR <vect1, vect2,
5347 {0, 1, nelt + j, 3, 4, nelt + j + 1,
5348 6, 7, nelt + j + 2, ...}> */
5349 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5350 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5351 vect2, perm3_mask_high);
5352 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5353 (*result_chain)[j] = data_ref;
5356 else
5358 /* If length is not equal to 3 then only power of 2 is supported. */
5359 gcc_assert (pow2p_hwi (length));
5361 /* The encoding has 2 interleaved stepped patterns. */
5362 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5363 vec_perm_builder sel (nelt, 2, 3);
5364 sel.quick_grow (6);
5365 for (i = 0; i < 3; i++)
5367 sel[i * 2] = i;
5368 sel[i * 2 + 1] = i + nelt;
5370 vec_perm_indices indices (sel, 2, nelt);
5371 perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5373 for (i = 0; i < 6; i++)
5374 sel[i] += exact_div (nelt, 2);
5375 indices.new_vector (sel, 2, nelt);
5376 perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5378 for (i = 0, n = log_length; i < n; i++)
5380 for (j = 0; j < length/2; j++)
5382 vect1 = dr_chain[j];
5383 vect2 = dr_chain[j+length/2];
5385 /* Create interleaving stmt:
5386 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
5387 ...}> */
5388 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
5389 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
5390 vect2, perm_mask_high);
5391 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5392 (*result_chain)[2*j] = high;
5394 /* Create interleaving stmt:
5395 low = VEC_PERM_EXPR <vect1, vect2,
5396 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
5397 ...}> */
5398 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
5399 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
5400 vect2, perm_mask_low);
5401 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5402 (*result_chain)[2*j+1] = low;
5404 memcpy (dr_chain.address (), result_chain->address (),
5405 length * sizeof (tree));
5410 /* Function vect_setup_realignment
5412 This function is called when vectorizing an unaligned load using
5413 the dr_explicit_realign[_optimized] scheme.
5414 This function generates the following code at the loop prolog:
5416 p = initial_addr;
5417 x msq_init = *(floor(p)); # prolog load
5418 realignment_token = call target_builtin;
5419 loop:
5420 x msq = phi (msq_init, ---)
5422 The stmts marked with x are generated only for the case of
5423 dr_explicit_realign_optimized.
5425 The code above sets up a new (vector) pointer, pointing to the first
5426 location accessed by STMT, and a "floor-aligned" load using that pointer.
5427 It also generates code to compute the "realignment-token" (if the relevant
5428 target hook was defined), and creates a phi-node at the loop-header bb
5429 whose arguments are the result of the prolog-load (created by this
5430 function) and the result of a load that takes place in the loop (to be
5431 created by the caller to this function).
5433 For the case of dr_explicit_realign_optimized:
5434 The caller to this function uses the phi-result (msq) to create the
5435 realignment code inside the loop, and sets up the missing phi argument,
5436 as follows:
5437 loop:
5438 msq = phi (msq_init, lsq)
5439 lsq = *(floor(p')); # load in loop
5440 result = realign_load (msq, lsq, realignment_token);
5442 For the case of dr_explicit_realign:
5443 loop:
5444 msq = *(floor(p)); # load in loop
5445 p' = p + (VS-1);
5446 lsq = *(floor(p')); # load in loop
5447 result = realign_load (msq, lsq, realignment_token);
5449 Input:
5450 STMT - (scalar) load stmt to be vectorized. This load accesses
5451 a memory location that may be unaligned.
5452 BSI - place where new code is to be inserted.
5453 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5454 is used.
5456 Output:
5457 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5458 target hook, if defined.
5459 Return value - the result of the loop-header phi node. */
5461 tree
5462 vect_setup_realignment (gimple *stmt, gimple_stmt_iterator *gsi,
5463 tree *realignment_token,
5464 enum dr_alignment_support alignment_support_scheme,
5465 tree init_addr,
5466 struct loop **at_loop)
5468 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5469 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5470 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5471 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
5472 struct loop *loop = NULL;
5473 edge pe = NULL;
5474 tree scalar_dest = gimple_assign_lhs (stmt);
5475 tree vec_dest;
5476 gimple *inc;
5477 tree ptr;
5478 tree data_ref;
5479 basic_block new_bb;
5480 tree msq_init = NULL_TREE;
5481 tree new_temp;
5482 gphi *phi_stmt;
5483 tree msq = NULL_TREE;
5484 gimple_seq stmts = NULL;
5485 bool inv_p;
5486 bool compute_in_loop = false;
5487 bool nested_in_vect_loop = false;
5488 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
5489 struct loop *loop_for_initial_load = NULL;
5491 if (loop_vinfo)
5493 loop = LOOP_VINFO_LOOP (loop_vinfo);
5494 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
5497 gcc_assert (alignment_support_scheme == dr_explicit_realign
5498 || alignment_support_scheme == dr_explicit_realign_optimized);
5500 /* We need to generate three things:
5501 1. the misalignment computation
5502 2. the extra vector load (for the optimized realignment scheme).
5503 3. the phi node for the two vectors from which the realignment is
5504 done (for the optimized realignment scheme). */
5506 /* 1. Determine where to generate the misalignment computation.
5508 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5509 calculation will be generated by this function, outside the loop (in the
5510 preheader). Otherwise, INIT_ADDR had already been computed for us by the
5511 caller, inside the loop.
5513 Background: If the misalignment remains fixed throughout the iterations of
5514 the loop, then both realignment schemes are applicable, and also the
5515 misalignment computation can be done outside LOOP. This is because we are
5516 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5517 are a multiple of VS (the Vector Size), and therefore the misalignment in
5518 different vectorized LOOP iterations is always the same.
5519 The problem arises only if the memory access is in an inner-loop nested
5520 inside LOOP, which is now being vectorized using outer-loop vectorization.
5521 This is the only case when the misalignment of the memory access may not
5522 remain fixed throughout the iterations of the inner-loop (as explained in
5523 detail in vect_supportable_dr_alignment). In this case, not only is the
5524 optimized realignment scheme not applicable, but also the misalignment
5525 computation (and generation of the realignment token that is passed to
5526 REALIGN_LOAD) have to be done inside the loop.
5528 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5529 or not, which in turn determines if the misalignment is computed inside
5530 the inner-loop, or outside LOOP. */
5532 if (init_addr != NULL_TREE || !loop_vinfo)
5534 compute_in_loop = true;
5535 gcc_assert (alignment_support_scheme == dr_explicit_realign);
5539 /* 2. Determine where to generate the extra vector load.
5541 For the optimized realignment scheme, instead of generating two vector
5542 loads in each iteration, we generate a single extra vector load in the
5543 preheader of the loop, and in each iteration reuse the result of the
5544 vector load from the previous iteration. In case the memory access is in
5545 an inner-loop nested inside LOOP, which is now being vectorized using
5546 outer-loop vectorization, we need to determine whether this initial vector
5547 load should be generated at the preheader of the inner-loop, or can be
5548 generated at the preheader of LOOP. If the memory access has no evolution
5549 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5550 to be generated inside LOOP (in the preheader of the inner-loop). */
5552 if (nested_in_vect_loop)
5554 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5555 bool invariant_in_outerloop =
5556 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5557 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5559 else
5560 loop_for_initial_load = loop;
5561 if (at_loop)
5562 *at_loop = loop_for_initial_load;
5564 if (loop_for_initial_load)
5565 pe = loop_preheader_edge (loop_for_initial_load);
5567 /* 3. For the case of the optimized realignment, create the first vector
5568 load at the loop preheader. */
5570 if (alignment_support_scheme == dr_explicit_realign_optimized)
5572 /* Create msq_init = *(floor(p1)) in the loop preheader */
5573 gassign *new_stmt;
5575 gcc_assert (!compute_in_loop);
5576 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5577 ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
5578 NULL_TREE, &init_addr, NULL, &inc,
5579 true, &inv_p);
5580 if (TREE_CODE (ptr) == SSA_NAME)
5581 new_temp = copy_ssa_name (ptr);
5582 else
5583 new_temp = make_ssa_name (TREE_TYPE (ptr));
5584 unsigned int align = DR_TARGET_ALIGNMENT (dr);
5585 new_stmt = gimple_build_assign
5586 (new_temp, BIT_AND_EXPR, ptr,
5587 build_int_cst (TREE_TYPE (ptr), -(HOST_WIDE_INT) align));
5588 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5589 gcc_assert (!new_bb);
5590 data_ref
5591 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5592 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5593 vect_copy_ref_info (data_ref, DR_REF (dr));
5594 new_stmt = gimple_build_assign (vec_dest, data_ref);
5595 new_temp = make_ssa_name (vec_dest, new_stmt);
5596 gimple_assign_set_lhs (new_stmt, new_temp);
5597 if (pe)
5599 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5600 gcc_assert (!new_bb);
5602 else
5603 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5605 msq_init = gimple_assign_lhs (new_stmt);
5608 /* 4. Create realignment token using a target builtin, if available.
5609 It is done either inside the containing loop, or before LOOP (as
5610 determined above). */
5612 if (targetm.vectorize.builtin_mask_for_load)
5614 gcall *new_stmt;
5615 tree builtin_decl;
5617 /* Compute INIT_ADDR - the initial addressed accessed by this memref. */
5618 if (!init_addr)
5620 /* Generate the INIT_ADDR computation outside LOOP. */
5621 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
5622 NULL_TREE);
5623 if (loop)
5625 pe = loop_preheader_edge (loop);
5626 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5627 gcc_assert (!new_bb);
5629 else
5630 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5633 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5634 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5635 vec_dest =
5636 vect_create_destination_var (scalar_dest,
5637 gimple_call_return_type (new_stmt));
5638 new_temp = make_ssa_name (vec_dest, new_stmt);
5639 gimple_call_set_lhs (new_stmt, new_temp);
5641 if (compute_in_loop)
5642 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5643 else
5645 /* Generate the misalignment computation outside LOOP. */
5646 pe = loop_preheader_edge (loop);
5647 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5648 gcc_assert (!new_bb);
5651 *realignment_token = gimple_call_lhs (new_stmt);
5653 /* The result of the CALL_EXPR to this builtin is determined from
5654 the value of the parameter and no global variables are touched
5655 which makes the builtin a "const" function. Requiring the
5656 builtin to have the "const" attribute makes it unnecessary
5657 to call mark_call_clobbered. */
5658 gcc_assert (TREE_READONLY (builtin_decl));
5661 if (alignment_support_scheme == dr_explicit_realign)
5662 return msq;
5664 gcc_assert (!compute_in_loop);
5665 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5668 /* 5. Create msq = phi <msq_init, lsq> in loop */
5670 pe = loop_preheader_edge (containing_loop);
5671 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5672 msq = make_ssa_name (vec_dest);
5673 phi_stmt = create_phi_node (msq, containing_loop->header);
5674 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5676 return msq;
5680 /* Function vect_grouped_load_supported.
5682 COUNT is the size of the load group (the number of statements plus the
5683 number of gaps). SINGLE_ELEMENT_P is true if there is actually
5684 only one statement, with a gap of COUNT - 1.
5686 Returns true if a suitable permute exists. */
5688 bool
5689 vect_grouped_load_supported (tree vectype, bool single_element_p,
5690 unsigned HOST_WIDE_INT count)
5692 machine_mode mode = TYPE_MODE (vectype);
5694 /* If this is single-element interleaving with an element distance
5695 that leaves unused vector loads around punt - we at least create
5696 very sub-optimal code in that case (and blow up memory,
5697 see PR65518). */
5698 if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype)))
5700 if (dump_enabled_p ())
5701 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5702 "single-element interleaving not supported "
5703 "for not adjacent vector loads\n");
5704 return false;
5707 /* vect_permute_load_chain requires the group size to be equal to 3 or
5708 be a power of two. */
5709 if (count != 3 && exact_log2 (count) == -1)
5711 if (dump_enabled_p ())
5712 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5713 "the size of the group of accesses"
5714 " is not a power of 2 or not equal to 3\n");
5715 return false;
5718 /* Check that the permutation is supported. */
5719 if (VECTOR_MODE_P (mode))
5721 unsigned int i, j;
5722 if (count == 3)
5724 unsigned int nelt;
5725 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5727 if (dump_enabled_p ())
5728 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5729 "cannot handle groups of 3 loads for"
5730 " variable-length vectors\n");
5731 return false;
5734 vec_perm_builder sel (nelt, nelt, 1);
5735 sel.quick_grow (nelt);
5736 vec_perm_indices indices;
5737 unsigned int k;
5738 for (k = 0; k < 3; k++)
5740 for (i = 0; i < nelt; i++)
5741 if (3 * i + k < 2 * nelt)
5742 sel[i] = 3 * i + k;
5743 else
5744 sel[i] = 0;
5745 indices.new_vector (sel, 2, nelt);
5746 if (!can_vec_perm_const_p (mode, indices))
5748 if (dump_enabled_p ())
5749 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5750 "shuffle of 3 loads is not supported by"
5751 " target\n");
5752 return false;
5754 for (i = 0, j = 0; i < nelt; i++)
5755 if (3 * i + k < 2 * nelt)
5756 sel[i] = i;
5757 else
5758 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5759 indices.new_vector (sel, 2, nelt);
5760 if (!can_vec_perm_const_p (mode, indices))
5762 if (dump_enabled_p ())
5763 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5764 "shuffle of 3 loads is not supported by"
5765 " target\n");
5766 return false;
5769 return true;
5771 else
5773 /* If length is not equal to 3 then only power of 2 is supported. */
5774 gcc_assert (pow2p_hwi (count));
5775 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5777 /* The encoding has a single stepped pattern. */
5778 vec_perm_builder sel (nelt, 1, 3);
5779 sel.quick_grow (3);
5780 for (i = 0; i < 3; i++)
5781 sel[i] = i * 2;
5782 vec_perm_indices indices (sel, 2, nelt);
5783 if (can_vec_perm_const_p (mode, indices))
5785 for (i = 0; i < 3; i++)
5786 sel[i] = i * 2 + 1;
5787 indices.new_vector (sel, 2, nelt);
5788 if (can_vec_perm_const_p (mode, indices))
5789 return true;
5794 if (dump_enabled_p ())
5795 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5796 "extract even/odd not supported by target\n");
5797 return false;
5800 /* Return TRUE if vec_{masked_}load_lanes is available for COUNT vectors of
5801 type VECTYPE. MASKED_P says whether the masked form is needed. */
5803 bool
5804 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5805 bool masked_p)
5807 if (masked_p)
5808 return vect_lanes_optab_supported_p ("vec_mask_load_lanes",
5809 vec_mask_load_lanes_optab,
5810 vectype, count);
5811 else
5812 return vect_lanes_optab_supported_p ("vec_load_lanes",
5813 vec_load_lanes_optab,
5814 vectype, count);
5817 /* Function vect_permute_load_chain.
5819 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5820 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5821 the input data correctly. Return the final references for loads in
5822 RESULT_CHAIN.
5824 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5825 The input is 4 vectors each containing 8 elements. We assign a number to each
5826 element, the input sequence is:
5828 1st vec: 0 1 2 3 4 5 6 7
5829 2nd vec: 8 9 10 11 12 13 14 15
5830 3rd vec: 16 17 18 19 20 21 22 23
5831 4th vec: 24 25 26 27 28 29 30 31
5833 The output sequence should be:
5835 1st vec: 0 4 8 12 16 20 24 28
5836 2nd vec: 1 5 9 13 17 21 25 29
5837 3rd vec: 2 6 10 14 18 22 26 30
5838 4th vec: 3 7 11 15 19 23 27 31
5840 i.e., the first output vector should contain the first elements of each
5841 interleaving group, etc.
5843 We use extract_even/odd instructions to create such output. The input of
5844 each extract_even/odd operation is two vectors
5845 1st vec 2nd vec
5846 0 1 2 3 4 5 6 7
5848 and the output is the vector of extracted even/odd elements. The output of
5849 extract_even will be: 0 2 4 6
5850 and of extract_odd: 1 3 5 7
5853 The permutation is done in log LENGTH stages. In each stage extract_even
5854 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5855 their order. In our example,
5857 E1: extract_even (1st vec, 2nd vec)
5858 E2: extract_odd (1st vec, 2nd vec)
5859 E3: extract_even (3rd vec, 4th vec)
5860 E4: extract_odd (3rd vec, 4th vec)
5862 The output for the first stage will be:
5864 E1: 0 2 4 6 8 10 12 14
5865 E2: 1 3 5 7 9 11 13 15
5866 E3: 16 18 20 22 24 26 28 30
5867 E4: 17 19 21 23 25 27 29 31
5869 In order to proceed and create the correct sequence for the next stage (or
5870 for the correct output, if the second stage is the last one, as in our
5871 example), we first put the output of extract_even operation and then the
5872 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5873 The input for the second stage is:
5875 1st vec (E1): 0 2 4 6 8 10 12 14
5876 2nd vec (E3): 16 18 20 22 24 26 28 30
5877 3rd vec (E2): 1 3 5 7 9 11 13 15
5878 4th vec (E4): 17 19 21 23 25 27 29 31
5880 The output of the second stage:
5882 E1: 0 4 8 12 16 20 24 28
5883 E2: 2 6 10 14 18 22 26 30
5884 E3: 1 5 9 13 17 21 25 29
5885 E4: 3 7 11 15 19 23 27 31
5887 And RESULT_CHAIN after reordering:
5889 1st vec (E1): 0 4 8 12 16 20 24 28
5890 2nd vec (E3): 1 5 9 13 17 21 25 29
5891 3rd vec (E2): 2 6 10 14 18 22 26 30
5892 4th vec (E4): 3 7 11 15 19 23 27 31. */
5894 static void
5895 vect_permute_load_chain (vec<tree> dr_chain,
5896 unsigned int length,
5897 gimple *stmt,
5898 gimple_stmt_iterator *gsi,
5899 vec<tree> *result_chain)
5901 tree data_ref, first_vect, second_vect;
5902 tree perm_mask_even, perm_mask_odd;
5903 tree perm3_mask_low, perm3_mask_high;
5904 gimple *perm_stmt;
5905 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5906 unsigned int i, j, log_length = exact_log2 (length);
5908 result_chain->quick_grow (length);
5909 memcpy (result_chain->address (), dr_chain.address (),
5910 length * sizeof (tree));
5912 if (length == 3)
5914 /* vect_grouped_load_supported ensures that this is constant. */
5915 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5916 unsigned int k;
5918 vec_perm_builder sel (nelt, nelt, 1);
5919 sel.quick_grow (nelt);
5920 vec_perm_indices indices;
5921 for (k = 0; k < 3; k++)
5923 for (i = 0; i < nelt; i++)
5924 if (3 * i + k < 2 * nelt)
5925 sel[i] = 3 * i + k;
5926 else
5927 sel[i] = 0;
5928 indices.new_vector (sel, 2, nelt);
5929 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5931 for (i = 0, j = 0; i < nelt; i++)
5932 if (3 * i + k < 2 * nelt)
5933 sel[i] = i;
5934 else
5935 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5936 indices.new_vector (sel, 2, nelt);
5937 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5939 first_vect = dr_chain[0];
5940 second_vect = dr_chain[1];
5942 /* Create interleaving stmt (low part of):
5943 low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5944 ...}> */
5945 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5946 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5947 second_vect, perm3_mask_low);
5948 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5950 /* Create interleaving stmt (high part of):
5951 high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5952 ...}> */
5953 first_vect = data_ref;
5954 second_vect = dr_chain[2];
5955 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5956 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5957 second_vect, perm3_mask_high);
5958 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5959 (*result_chain)[k] = data_ref;
5962 else
5964 /* If length is not equal to 3 then only power of 2 is supported. */
5965 gcc_assert (pow2p_hwi (length));
5967 /* The encoding has a single stepped pattern. */
5968 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5969 vec_perm_builder sel (nelt, 1, 3);
5970 sel.quick_grow (3);
5971 for (i = 0; i < 3; ++i)
5972 sel[i] = i * 2;
5973 vec_perm_indices indices (sel, 2, nelt);
5974 perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);
5976 for (i = 0; i < 3; ++i)
5977 sel[i] = i * 2 + 1;
5978 indices.new_vector (sel, 2, nelt);
5979 perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
5981 for (i = 0; i < log_length; i++)
5983 for (j = 0; j < length; j += 2)
5985 first_vect = dr_chain[j];
5986 second_vect = dr_chain[j+1];
5988 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5989 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5990 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5991 first_vect, second_vect,
5992 perm_mask_even);
5993 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5994 (*result_chain)[j/2] = data_ref;
5996 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5997 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5998 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5999 first_vect, second_vect,
6000 perm_mask_odd);
6001 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6002 (*result_chain)[j/2+length/2] = data_ref;
6004 memcpy (dr_chain.address (), result_chain->address (),
6005 length * sizeof (tree));
6010 /* Function vect_shift_permute_load_chain.
6012 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
6013 sequence of stmts to reorder the input data accordingly.
6014 Return the final references for loads in RESULT_CHAIN.
6015 Return true if successed, false otherwise.
6017 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
6018 The input is 3 vectors each containing 8 elements. We assign a
6019 number to each element, the input sequence is:
6021 1st vec: 0 1 2 3 4 5 6 7
6022 2nd vec: 8 9 10 11 12 13 14 15
6023 3rd vec: 16 17 18 19 20 21 22 23
6025 The output sequence should be:
6027 1st vec: 0 3 6 9 12 15 18 21
6028 2nd vec: 1 4 7 10 13 16 19 22
6029 3rd vec: 2 5 8 11 14 17 20 23
6031 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
6033 First we shuffle all 3 vectors to get correct elements order:
6035 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
6036 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
6037 3rd vec: (16 19 22) (17 20 23) (18 21)
6039 Next we unite and shift vector 3 times:
6041 1st step:
6042 shift right by 6 the concatenation of:
6043 "1st vec" and "2nd vec"
6044 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
6045 "2nd vec" and "3rd vec"
6046 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
6047 "3rd vec" and "1st vec"
6048 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
6049 | New vectors |
6051 So that now new vectors are:
6053 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
6054 2nd vec: (10 13) (16 19 22) (17 20 23)
6055 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
6057 2nd step:
6058 shift right by 5 the concatenation of:
6059 "1st vec" and "3rd vec"
6060 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
6061 "2nd vec" and "1st vec"
6062 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
6063 "3rd vec" and "2nd vec"
6064 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
6065 | New vectors |
6067 So that now new vectors are:
6069 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
6070 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
6071 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
6073 3rd step:
6074 shift right by 5 the concatenation of:
6075 "1st vec" and "1st vec"
6076 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
6077 shift right by 3 the concatenation of:
6078 "2nd vec" and "2nd vec"
6079 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
6080 | New vectors |
6082 So that now all vectors are READY:
6083 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
6084 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
6085 3rd vec: ( 1 4 7) (10 13) (16 19 22)
6087 This algorithm is faster than one in vect_permute_load_chain if:
6088 1. "shift of a concatination" is faster than general permutation.
6089 This is usually so.
6090 2. The TARGET machine can't execute vector instructions in parallel.
6091 This is because each step of the algorithm depends on previous.
6092 The algorithm in vect_permute_load_chain is much more parallel.
6094 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
6097 static bool
6098 vect_shift_permute_load_chain (vec<tree> dr_chain,
6099 unsigned int length,
6100 gimple *stmt,
6101 gimple_stmt_iterator *gsi,
6102 vec<tree> *result_chain)
6104 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
6105 tree perm2_mask1, perm2_mask2, perm3_mask;
6106 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
6107 gimple *perm_stmt;
6109 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
6110 unsigned int i;
6111 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6112 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6114 unsigned HOST_WIDE_INT nelt, vf;
6115 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt)
6116 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
6117 /* Not supported for variable-length vectors. */
6118 return false;
6120 vec_perm_builder sel (nelt, nelt, 1);
6121 sel.quick_grow (nelt);
6123 result_chain->quick_grow (length);
6124 memcpy (result_chain->address (), dr_chain.address (),
6125 length * sizeof (tree));
6127 if (pow2p_hwi (length) && vf > 4)
6129 unsigned int j, log_length = exact_log2 (length);
6130 for (i = 0; i < nelt / 2; ++i)
6131 sel[i] = i * 2;
6132 for (i = 0; i < nelt / 2; ++i)
6133 sel[nelt / 2 + i] = i * 2 + 1;
6134 vec_perm_indices indices (sel, 2, nelt);
6135 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6137 if (dump_enabled_p ())
6138 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6139 "shuffle of 2 fields structure is not \
6140 supported by target\n");
6141 return false;
6143 perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);
6145 for (i = 0; i < nelt / 2; ++i)
6146 sel[i] = i * 2 + 1;
6147 for (i = 0; i < nelt / 2; ++i)
6148 sel[nelt / 2 + i] = i * 2;
6149 indices.new_vector (sel, 2, nelt);
6150 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6152 if (dump_enabled_p ())
6153 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6154 "shuffle of 2 fields structure is not \
6155 supported by target\n");
6156 return false;
6158 perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);
6160 /* Generating permutation constant to shift all elements.
6161 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
6162 for (i = 0; i < nelt; i++)
6163 sel[i] = nelt / 2 + i;
6164 indices.new_vector (sel, 2, nelt);
6165 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6167 if (dump_enabled_p ())
6168 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6169 "shift permutation is not supported by target\n");
6170 return false;
6172 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6174 /* Generating permutation constant to select vector from 2.
6175 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
6176 for (i = 0; i < nelt / 2; i++)
6177 sel[i] = i;
6178 for (i = nelt / 2; i < nelt; i++)
6179 sel[i] = nelt + i;
6180 indices.new_vector (sel, 2, nelt);
6181 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6183 if (dump_enabled_p ())
6184 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6185 "select is not supported by target\n");
6186 return false;
6188 select_mask = vect_gen_perm_mask_checked (vectype, indices);
6190 for (i = 0; i < log_length; i++)
6192 for (j = 0; j < length; j += 2)
6194 first_vect = dr_chain[j];
6195 second_vect = dr_chain[j + 1];
6197 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6198 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6199 first_vect, first_vect,
6200 perm2_mask1);
6201 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6202 vect[0] = data_ref;
6204 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6205 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6206 second_vect, second_vect,
6207 perm2_mask2);
6208 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6209 vect[1] = data_ref;
6211 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
6212 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6213 vect[0], vect[1], shift1_mask);
6214 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6215 (*result_chain)[j/2 + length/2] = data_ref;
6217 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
6218 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6219 vect[0], vect[1], select_mask);
6220 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6221 (*result_chain)[j/2] = data_ref;
6223 memcpy (dr_chain.address (), result_chain->address (),
6224 length * sizeof (tree));
6226 return true;
6228 if (length == 3 && vf > 2)
6230 unsigned int k = 0, l = 0;
6232 /* Generating permutation constant to get all elements in rigth order.
6233 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
6234 for (i = 0; i < nelt; i++)
6236 if (3 * k + (l % 3) >= nelt)
6238 k = 0;
6239 l += (3 - (nelt % 3));
6241 sel[i] = 3 * k + (l % 3);
6242 k++;
6244 vec_perm_indices indices (sel, 2, nelt);
6245 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6247 if (dump_enabled_p ())
6248 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6249 "shuffle of 3 fields structure is not \
6250 supported by target\n");
6251 return false;
6253 perm3_mask = vect_gen_perm_mask_checked (vectype, indices);
6255 /* Generating permutation constant to shift all elements.
6256 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
6257 for (i = 0; i < nelt; i++)
6258 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
6259 indices.new_vector (sel, 2, nelt);
6260 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6262 if (dump_enabled_p ())
6263 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6264 "shift permutation is not supported by target\n");
6265 return false;
6267 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6269 /* Generating permutation constant to shift all elements.
6270 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6271 for (i = 0; i < nelt; i++)
6272 sel[i] = 2 * (nelt / 3) + 1 + i;
6273 indices.new_vector (sel, 2, nelt);
6274 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6276 if (dump_enabled_p ())
6277 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6278 "shift permutation is not supported by target\n");
6279 return false;
6281 shift2_mask = vect_gen_perm_mask_checked (vectype, indices);
6283 /* Generating permutation constant to shift all elements.
6284 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
6285 for (i = 0; i < nelt; i++)
6286 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
6287 indices.new_vector (sel, 2, nelt);
6288 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6290 if (dump_enabled_p ())
6291 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6292 "shift permutation is not supported by target\n");
6293 return false;
6295 shift3_mask = vect_gen_perm_mask_checked (vectype, indices);
6297 /* Generating permutation constant to shift all elements.
6298 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6299 for (i = 0; i < nelt; i++)
6300 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
6301 indices.new_vector (sel, 2, nelt);
6302 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6304 if (dump_enabled_p ())
6305 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6306 "shift permutation is not supported by target\n");
6307 return false;
6309 shift4_mask = vect_gen_perm_mask_checked (vectype, indices);
6311 for (k = 0; k < 3; k++)
6313 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
6314 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6315 dr_chain[k], dr_chain[k],
6316 perm3_mask);
6317 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6318 vect[k] = data_ref;
6321 for (k = 0; k < 3; k++)
6323 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
6324 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6325 vect[k % 3], vect[(k + 1) % 3],
6326 shift1_mask);
6327 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6328 vect_shift[k] = data_ref;
6331 for (k = 0; k < 3; k++)
6333 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
6334 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6335 vect_shift[(4 - k) % 3],
6336 vect_shift[(3 - k) % 3],
6337 shift2_mask);
6338 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6339 vect[k] = data_ref;
6342 (*result_chain)[3 - (nelt % 3)] = vect[2];
6344 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
6345 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
6346 vect[0], shift3_mask);
6347 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6348 (*result_chain)[nelt % 3] = data_ref;
6350 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
6351 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
6352 vect[1], shift4_mask);
6353 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6354 (*result_chain)[0] = data_ref;
6355 return true;
6357 return false;
6360 /* Function vect_transform_grouped_load.
6362 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
6363 to perform their permutation and ascribe the result vectorized statements to
6364 the scalar statements.
6367 void
6368 vect_transform_grouped_load (gimple *stmt, vec<tree> dr_chain, int size,
6369 gimple_stmt_iterator *gsi)
6371 machine_mode mode;
6372 vec<tree> result_chain = vNULL;
6374 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
6375 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
6376 vectors, that are ready for vector computation. */
6377 result_chain.create (size);
6379 /* If reassociation width for vector type is 2 or greater target machine can
6380 execute 2 or more vector instructions in parallel. Otherwise try to
6381 get chain for loads group using vect_shift_permute_load_chain. */
6382 mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
6383 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
6384 || pow2p_hwi (size)
6385 || !vect_shift_permute_load_chain (dr_chain, size, stmt,
6386 gsi, &result_chain))
6387 vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
6388 vect_record_grouped_load_vectors (stmt, result_chain);
6389 result_chain.release ();
6392 /* RESULT_CHAIN contains the output of a group of grouped loads that were
6393 generated as part of the vectorization of STMT. Assign the statement
6394 for each vector to the associated scalar statement. */
6396 void
6397 vect_record_grouped_load_vectors (gimple *stmt, vec<tree> result_chain)
6399 gimple *first_stmt = DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
6400 gimple *next_stmt, *new_stmt;
6401 unsigned int i, gap_count;
6402 tree tmp_data_ref;
6404 /* Put a permuted data-ref in the VECTORIZED_STMT field.
6405 Since we scan the chain starting from it's first node, their order
6406 corresponds the order of data-refs in RESULT_CHAIN. */
6407 next_stmt = first_stmt;
6408 gap_count = 1;
6409 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
6411 if (!next_stmt)
6412 break;
6414 /* Skip the gaps. Loads created for the gaps will be removed by dead
6415 code elimination pass later. No need to check for the first stmt in
6416 the group, since it always exists.
6417 DR_GROUP_GAP is the number of steps in elements from the previous
6418 access (if there is no gap DR_GROUP_GAP is 1). We skip loads that
6419 correspond to the gaps. */
6420 if (next_stmt != first_stmt
6421 && gap_count < DR_GROUP_GAP (vinfo_for_stmt (next_stmt)))
6423 gap_count++;
6424 continue;
6427 while (next_stmt)
6429 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
6430 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
6431 copies, and we put the new vector statement in the first available
6432 RELATED_STMT. */
6433 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
6434 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
6435 else
6437 if (!DR_GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
6439 gimple *prev_stmt =
6440 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
6441 gimple *rel_stmt =
6442 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
6443 while (rel_stmt)
6445 prev_stmt = rel_stmt;
6446 rel_stmt =
6447 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
6450 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
6451 new_stmt;
6455 next_stmt = DR_GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
6456 gap_count = 1;
6457 /* If NEXT_STMT accesses the same DR as the previous statement,
6458 put the same TMP_DATA_REF as its vectorized statement; otherwise
6459 get the next data-ref from RESULT_CHAIN. */
6460 if (!next_stmt || !DR_GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
6461 break;
6466 /* Function vect_force_dr_alignment_p.
6468 Returns whether the alignment of a DECL can be forced to be aligned
6469 on ALIGNMENT bit boundary. */
6471 bool
6472 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
6474 if (!VAR_P (decl))
6475 return false;
6477 if (decl_in_symtab_p (decl)
6478 && !symtab_node::get (decl)->can_increase_alignment_p ())
6479 return false;
6481 if (TREE_STATIC (decl))
6482 return (alignment <= MAX_OFILE_ALIGNMENT);
6483 else
6484 return (alignment <= MAX_STACK_ALIGNMENT);
6488 /* Return whether the data reference DR is supported with respect to its
6489 alignment.
6490 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
6491 it is aligned, i.e., check if it is possible to vectorize it with different
6492 alignment. */
6494 enum dr_alignment_support
6495 vect_supportable_dr_alignment (struct data_reference *dr,
6496 bool check_aligned_accesses)
6498 gimple *stmt = DR_STMT (dr);
6499 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6500 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6501 machine_mode mode = TYPE_MODE (vectype);
6502 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6503 struct loop *vect_loop = NULL;
6504 bool nested_in_vect_loop = false;
6506 if (aligned_access_p (dr) && !check_aligned_accesses)
6507 return dr_aligned;
6509 /* For now assume all conditional loads/stores support unaligned
6510 access without any special code. */
6511 if (is_gimple_call (stmt)
6512 && gimple_call_internal_p (stmt)
6513 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
6514 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
6515 return dr_unaligned_supported;
6517 if (loop_vinfo)
6519 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
6520 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
6523 /* Possibly unaligned access. */
6525 /* We can choose between using the implicit realignment scheme (generating
6526 a misaligned_move stmt) and the explicit realignment scheme (generating
6527 aligned loads with a REALIGN_LOAD). There are two variants to the
6528 explicit realignment scheme: optimized, and unoptimized.
6529 We can optimize the realignment only if the step between consecutive
6530 vector loads is equal to the vector size. Since the vector memory
6531 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
6532 is guaranteed that the misalignment amount remains the same throughout the
6533 execution of the vectorized loop. Therefore, we can create the
6534 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
6535 at the loop preheader.
6537 However, in the case of outer-loop vectorization, when vectorizing a
6538 memory access in the inner-loop nested within the LOOP that is now being
6539 vectorized, while it is guaranteed that the misalignment of the
6540 vectorized memory access will remain the same in different outer-loop
6541 iterations, it is *not* guaranteed that is will remain the same throughout
6542 the execution of the inner-loop. This is because the inner-loop advances
6543 with the original scalar step (and not in steps of VS). If the inner-loop
6544 step happens to be a multiple of VS, then the misalignment remains fixed
6545 and we can use the optimized realignment scheme. For example:
6547 for (i=0; i<N; i++)
6548 for (j=0; j<M; j++)
6549 s += a[i+j];
6551 When vectorizing the i-loop in the above example, the step between
6552 consecutive vector loads is 1, and so the misalignment does not remain
6553 fixed across the execution of the inner-loop, and the realignment cannot
6554 be optimized (as illustrated in the following pseudo vectorized loop):
6556 for (i=0; i<N; i+=4)
6557 for (j=0; j<M; j++){
6558 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
6559 // when j is {0,1,2,3,4,5,6,7,...} respectively.
6560 // (assuming that we start from an aligned address).
6563 We therefore have to use the unoptimized realignment scheme:
6565 for (i=0; i<N; i+=4)
6566 for (j=k; j<M; j+=4)
6567 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
6568 // that the misalignment of the initial address is
6569 // 0).
6571 The loop can then be vectorized as follows:
6573 for (k=0; k<4; k++){
6574 rt = get_realignment_token (&vp[k]);
6575 for (i=0; i<N; i+=4){
6576 v1 = vp[i+k];
6577 for (j=k; j<M; j+=4){
6578 v2 = vp[i+j+VS-1];
6579 va = REALIGN_LOAD <v1,v2,rt>;
6580 vs += va;
6581 v1 = v2;
6584 } */
6586 if (DR_IS_READ (dr))
6588 bool is_packed = false;
6589 tree type = (TREE_TYPE (DR_REF (dr)));
6591 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
6592 && (!targetm.vectorize.builtin_mask_for_load
6593 || targetm.vectorize.builtin_mask_for_load ()))
6595 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6597 /* If we are doing SLP then the accesses need not have the
6598 same alignment, instead it depends on the SLP group size. */
6599 if (loop_vinfo
6600 && STMT_SLP_TYPE (stmt_info)
6601 && !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6602 * DR_GROUP_SIZE (vinfo_for_stmt
6603 (DR_GROUP_FIRST_ELEMENT (stmt_info))),
6604 TYPE_VECTOR_SUBPARTS (vectype)))
6606 else if (!loop_vinfo
6607 || (nested_in_vect_loop
6608 && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
6609 GET_MODE_SIZE (TYPE_MODE (vectype)))))
6610 return dr_explicit_realign;
6611 else
6612 return dr_explicit_realign_optimized;
6614 if (!known_alignment_for_access_p (dr))
6615 is_packed = not_size_aligned (DR_REF (dr));
6617 if (targetm.vectorize.support_vector_misalignment
6618 (mode, type, DR_MISALIGNMENT (dr), is_packed))
6619 /* Can't software pipeline the loads, but can at least do them. */
6620 return dr_unaligned_supported;
6622 else
6624 bool is_packed = false;
6625 tree type = (TREE_TYPE (DR_REF (dr)));
6627 if (!known_alignment_for_access_p (dr))
6628 is_packed = not_size_aligned (DR_REF (dr));
6630 if (targetm.vectorize.support_vector_misalignment
6631 (mode, type, DR_MISALIGNMENT (dr), is_packed))
6632 return dr_unaligned_supported;
6635 /* Unsupported. */
6636 return dr_unaligned_unsupported;