2018-05-25 Richard Biener <rguenther@suse.de>
[official-gcc.git] / gcc / tree-vect-data-refs.c
blob331423af8215b864498afc44c527d85e3ce62110
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2018 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "memmodel.h"
32 #include "tm_p.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "cgraph.h"
36 #include "dumpfile.h"
37 #include "alias.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "tree-eh.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop.h"
47 #include "cfgloop.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
50 #include "expr.h"
51 #include "builtins.h"
52 #include "params.h"
53 #include "tree-cfg.h"
54 #include "tree-hash-traits.h"
55 #include "vec-perm-indices.h"
56 #include "internal-fn.h"
58 /* Return true if load- or store-lanes optab OPTAB is implemented for
59 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
61 static bool
62 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
63 tree vectype, unsigned HOST_WIDE_INT count)
65 machine_mode mode, array_mode;
66 bool limit_p;
68 mode = TYPE_MODE (vectype);
69 if (!targetm.array_mode (mode, count).exists (&array_mode))
71 poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
72 limit_p = !targetm.array_mode_supported_p (mode, count);
73 if (!int_mode_for_size (bits, limit_p).exists (&array_mode))
75 if (dump_enabled_p ())
76 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
77 "no array mode for %s["
78 HOST_WIDE_INT_PRINT_DEC "]\n",
79 GET_MODE_NAME (mode), count);
80 return false;
84 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
86 if (dump_enabled_p ())
87 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
88 "cannot use %s<%s><%s>\n", name,
89 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
90 return false;
93 if (dump_enabled_p ())
94 dump_printf_loc (MSG_NOTE, vect_location,
95 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
96 GET_MODE_NAME (mode));
98 return true;
102 /* Return the smallest scalar part of STMT.
103 This is used to determine the vectype of the stmt. We generally set the
104 vectype according to the type of the result (lhs). For stmts whose
105 result-type is different than the type of the arguments (e.g., demotion,
106 promotion), vectype will be reset appropriately (later). Note that we have
107 to visit the smallest datatype in this function, because that determines the
108 VF. If the smallest datatype in the loop is present only as the rhs of a
109 promotion operation - we'd miss it.
110 Such a case, where a variable of this datatype does not appear in the lhs
111 anywhere in the loop, can only occur if it's an invariant: e.g.:
112 'int_x = (int) short_inv', which we'd expect to have been optimized away by
113 invariant motion. However, we cannot rely on invariant motion to always
114 take invariants out of the loop, and so in the case of promotion we also
115 have to check the rhs.
116 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
117 types. */
119 tree
120 vect_get_smallest_scalar_type (gimple *stmt, HOST_WIDE_INT *lhs_size_unit,
121 HOST_WIDE_INT *rhs_size_unit)
123 tree scalar_type = gimple_expr_type (stmt);
124 HOST_WIDE_INT lhs, rhs;
126 /* During the analysis phase, this function is called on arbitrary
127 statements that might not have scalar results. */
128 if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
129 return scalar_type;
131 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
133 if (is_gimple_assign (stmt)
134 && (gimple_assign_cast_p (stmt)
135 || gimple_assign_rhs_code (stmt) == DOT_PROD_EXPR
136 || gimple_assign_rhs_code (stmt) == WIDEN_SUM_EXPR
137 || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
138 || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
139 || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
141 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
143 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
144 if (rhs < lhs)
145 scalar_type = rhs_type;
148 *lhs_size_unit = lhs;
149 *rhs_size_unit = rhs;
150 return scalar_type;
154 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
155 tested at run-time. Return TRUE if DDR was successfully inserted.
156 Return false if versioning is not supported. */
158 static bool
159 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
161 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
163 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
164 return false;
166 if (!runtime_alias_check_p (ddr, loop,
167 optimize_loop_nest_for_speed_p (loop)))
168 return false;
170 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
171 return true;
174 /* Record that loop LOOP_VINFO needs to check that VALUE is nonzero. */
176 static void
177 vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value)
179 vec<tree> checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo);
180 for (unsigned int i = 0; i < checks.length(); ++i)
181 if (checks[i] == value)
182 return;
184 if (dump_enabled_p ())
186 dump_printf_loc (MSG_NOTE, vect_location, "need run-time check that ");
187 dump_generic_expr (MSG_NOTE, TDF_SLIM, value);
188 dump_printf (MSG_NOTE, " is nonzero\n");
190 LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value);
193 /* Return true if we know that the order of vectorized STMT_A and
194 vectorized STMT_B will be the same as the order of STMT_A and STMT_B.
195 At least one of the statements is a write. */
197 static bool
198 vect_preserves_scalar_order_p (gimple *stmt_a, gimple *stmt_b)
200 stmt_vec_info stmtinfo_a = vinfo_for_stmt (stmt_a);
201 stmt_vec_info stmtinfo_b = vinfo_for_stmt (stmt_b);
203 /* Single statements are always kept in their original order. */
204 if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
205 && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
206 return true;
208 /* STMT_A and STMT_B belong to overlapping groups. All loads in a
209 group are emitted at the position of the first scalar load and all
210 stores in a group are emitted at the position of the last scalar store.
211 Thus writes will happen no earlier than their current position
212 (but could happen later) while reads will happen no later than their
213 current position (but could happen earlier). Reordering is therefore
214 only possible if the first access is a write. */
215 gimple *earlier_stmt = get_earlier_stmt (stmt_a, stmt_b);
216 return !DR_IS_WRITE (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt)));
219 /* A subroutine of vect_analyze_data_ref_dependence. Handle
220 DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
221 distances. These distances are conservatively correct but they don't
222 reflect a guaranteed dependence.
224 Return true if this function does all the work necessary to avoid
225 an alias or false if the caller should use the dependence distances
226 to limit the vectorization factor in the usual way. LOOP_DEPTH is
227 the depth of the loop described by LOOP_VINFO and the other arguments
228 are as for vect_analyze_data_ref_dependence. */
230 static bool
231 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
232 loop_vec_info loop_vinfo,
233 int loop_depth, unsigned int *max_vf)
235 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
236 lambda_vector dist_v;
237 unsigned int i;
238 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
240 int dist = dist_v[loop_depth];
241 if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
243 /* If the user asserted safelen >= DIST consecutive iterations
244 can be executed concurrently, assume independence.
246 ??? An alternative would be to add the alias check even
247 in this case, and vectorize the fallback loop with the
248 maximum VF set to safelen. However, if the user has
249 explicitly given a length, it's less likely that that
250 would be a win. */
251 if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
253 if ((unsigned int) loop->safelen < *max_vf)
254 *max_vf = loop->safelen;
255 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
256 continue;
259 /* For dependence distances of 2 or more, we have the option
260 of limiting VF or checking for an alias at runtime.
261 Prefer to check at runtime if we can, to avoid limiting
262 the VF unnecessarily when the bases are in fact independent.
264 Note that the alias checks will be removed if the VF ends up
265 being small enough. */
266 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
269 return true;
273 /* Function vect_analyze_data_ref_dependence.
275 Return TRUE if there (might) exist a dependence between a memory-reference
276 DRA and a memory-reference DRB. When versioning for alias may check a
277 dependence at run-time, return FALSE. Adjust *MAX_VF according to
278 the data dependence. */
280 static bool
281 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
282 loop_vec_info loop_vinfo,
283 unsigned int *max_vf)
285 unsigned int i;
286 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
287 struct data_reference *dra = DDR_A (ddr);
288 struct data_reference *drb = DDR_B (ddr);
289 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
290 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
291 lambda_vector dist_v;
292 unsigned int loop_depth;
294 /* In loop analysis all data references should be vectorizable. */
295 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
296 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
297 gcc_unreachable ();
299 /* Independent data accesses. */
300 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
301 return false;
303 if (dra == drb
304 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
305 return false;
307 /* We do not have to consider dependences between accesses that belong
308 to the same group, unless the stride could be smaller than the
309 group size. */
310 if (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
311 && (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
312 == DR_GROUP_FIRST_ELEMENT (stmtinfo_b))
313 && !STMT_VINFO_STRIDED_P (stmtinfo_a))
314 return false;
316 /* Even if we have an anti-dependence then, as the vectorized loop covers at
317 least two scalar iterations, there is always also a true dependence.
318 As the vectorizer does not re-order loads and stores we can ignore
319 the anti-dependence if TBAA can disambiguate both DRs similar to the
320 case with known negative distance anti-dependences (positive
321 distance anti-dependences would violate TBAA constraints). */
322 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
323 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
324 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
325 get_alias_set (DR_REF (drb))))
326 return false;
328 /* Unknown data dependence. */
329 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
331 /* If user asserted safelen consecutive iterations can be
332 executed concurrently, assume independence. */
333 if (loop->safelen >= 2)
335 if ((unsigned int) loop->safelen < *max_vf)
336 *max_vf = loop->safelen;
337 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
338 return false;
341 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
342 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
344 if (dump_enabled_p ())
346 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
347 "versioning for alias not supported for: "
348 "can't determine dependence between ");
349 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
350 DR_REF (dra));
351 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
352 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
353 DR_REF (drb));
354 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
356 return true;
359 if (dump_enabled_p ())
361 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
362 "versioning for alias required: "
363 "can't determine dependence between ");
364 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
365 DR_REF (dra));
366 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
367 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
368 DR_REF (drb));
369 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
372 /* Add to list of ddrs that need to be tested at run-time. */
373 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
376 /* Known data dependence. */
377 if (DDR_NUM_DIST_VECTS (ddr) == 0)
379 /* If user asserted safelen consecutive iterations can be
380 executed concurrently, assume independence. */
381 if (loop->safelen >= 2)
383 if ((unsigned int) loop->safelen < *max_vf)
384 *max_vf = loop->safelen;
385 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
386 return false;
389 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
390 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
392 if (dump_enabled_p ())
394 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
395 "versioning for alias not supported for: "
396 "bad dist vector for ");
397 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
398 DR_REF (dra));
399 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
400 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
401 DR_REF (drb));
402 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
404 return true;
407 if (dump_enabled_p ())
409 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
410 "versioning for alias required: "
411 "bad dist vector for ");
412 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
413 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
414 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
415 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
417 /* Add to list of ddrs that need to be tested at run-time. */
418 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
421 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
423 if (DDR_COULD_BE_INDEPENDENT_P (ddr)
424 && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
425 loop_depth, max_vf))
426 return false;
428 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
430 int dist = dist_v[loop_depth];
432 if (dump_enabled_p ())
433 dump_printf_loc (MSG_NOTE, vect_location,
434 "dependence distance = %d.\n", dist);
436 if (dist == 0)
438 if (dump_enabled_p ())
440 dump_printf_loc (MSG_NOTE, vect_location,
441 "dependence distance == 0 between ");
442 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
443 dump_printf (MSG_NOTE, " and ");
444 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
445 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
448 /* When we perform grouped accesses and perform implicit CSE
449 by detecting equal accesses and doing disambiguation with
450 runtime alias tests like for
451 .. = a[i];
452 .. = a[i+1];
453 a[i] = ..;
454 a[i+1] = ..;
455 *p = ..;
456 .. = a[i];
457 .. = a[i+1];
458 where we will end up loading { a[i], a[i+1] } once, make
459 sure that inserting group loads before the first load and
460 stores after the last store will do the right thing.
461 Similar for groups like
462 a[i] = ...;
463 ... = a[i];
464 a[i+1] = ...;
465 where loads from the group interleave with the store. */
466 if (!vect_preserves_scalar_order_p (DR_STMT (dra), DR_STMT (drb)))
468 if (dump_enabled_p ())
469 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
470 "READ_WRITE dependence in interleaving.\n");
471 return true;
474 if (loop->safelen < 2)
476 tree indicator = dr_zero_step_indicator (dra);
477 if (TREE_CODE (indicator) != INTEGER_CST)
478 vect_check_nonzero_value (loop_vinfo, indicator);
479 else if (integer_zerop (indicator))
481 if (dump_enabled_p ())
482 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
483 "access also has a zero step\n");
484 return true;
487 continue;
490 if (dist > 0 && DDR_REVERSED_P (ddr))
492 /* If DDR_REVERSED_P the order of the data-refs in DDR was
493 reversed (to make distance vector positive), and the actual
494 distance is negative. */
495 if (dump_enabled_p ())
496 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
497 "dependence distance negative.\n");
498 /* Record a negative dependence distance to later limit the
499 amount of stmt copying / unrolling we can perform.
500 Only need to handle read-after-write dependence. */
501 if (DR_IS_READ (drb)
502 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
503 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
504 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
505 continue;
508 unsigned int abs_dist = abs (dist);
509 if (abs_dist >= 2 && abs_dist < *max_vf)
511 /* The dependence distance requires reduction of the maximal
512 vectorization factor. */
513 *max_vf = abs (dist);
514 if (dump_enabled_p ())
515 dump_printf_loc (MSG_NOTE, vect_location,
516 "adjusting maximal vectorization factor to %i\n",
517 *max_vf);
520 if (abs_dist >= *max_vf)
522 /* Dependence distance does not create dependence, as far as
523 vectorization is concerned, in this case. */
524 if (dump_enabled_p ())
525 dump_printf_loc (MSG_NOTE, vect_location,
526 "dependence distance >= VF.\n");
527 continue;
530 if (dump_enabled_p ())
532 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
533 "not vectorized, possible dependence "
534 "between data-refs ");
535 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
536 dump_printf (MSG_NOTE, " and ");
537 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
538 dump_printf (MSG_NOTE, "\n");
541 return true;
544 return false;
547 /* Function vect_analyze_data_ref_dependences.
549 Examine all the data references in the loop, and make sure there do not
550 exist any data dependences between them. Set *MAX_VF according to
551 the maximum vectorization factor the data dependences allow. */
553 bool
554 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
555 unsigned int *max_vf)
557 unsigned int i;
558 struct data_dependence_relation *ddr;
560 if (dump_enabled_p ())
561 dump_printf_loc (MSG_NOTE, vect_location,
562 "=== vect_analyze_data_ref_dependences ===\n");
564 LOOP_VINFO_DDRS (loop_vinfo)
565 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
566 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
567 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
568 /* We need read-read dependences to compute STMT_VINFO_SAME_ALIGN_REFS. */
569 if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
570 &LOOP_VINFO_DDRS (loop_vinfo),
571 LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
572 return false;
574 /* For epilogues we either have no aliases or alias versioning
575 was applied to original loop. Therefore we may just get max_vf
576 using VF of original loop. */
577 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
578 *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
579 else
580 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
581 if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
582 return false;
584 return true;
588 /* Function vect_slp_analyze_data_ref_dependence.
590 Return TRUE if there (might) exist a dependence between a memory-reference
591 DRA and a memory-reference DRB. When versioning for alias may check a
592 dependence at run-time, return FALSE. Adjust *MAX_VF according to
593 the data dependence. */
595 static bool
596 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
598 struct data_reference *dra = DDR_A (ddr);
599 struct data_reference *drb = DDR_B (ddr);
601 /* We need to check dependences of statements marked as unvectorizable
602 as well, they still can prohibit vectorization. */
604 /* Independent data accesses. */
605 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
606 return false;
608 if (dra == drb)
609 return false;
611 /* Read-read is OK. */
612 if (DR_IS_READ (dra) && DR_IS_READ (drb))
613 return false;
615 /* If dra and drb are part of the same interleaving chain consider
616 them independent. */
617 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
618 && (DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
619 == DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
620 return false;
622 /* Unknown data dependence. */
623 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
625 if (dump_enabled_p ())
627 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
628 "can't determine dependence between ");
629 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
630 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
631 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
632 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
635 else if (dump_enabled_p ())
637 dump_printf_loc (MSG_NOTE, vect_location,
638 "determined dependence between ");
639 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
640 dump_printf (MSG_NOTE, " and ");
641 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
642 dump_printf (MSG_NOTE, "\n");
645 return true;
649 /* Analyze dependences involved in the transform of SLP NODE. STORES
650 contain the vector of scalar stores of this instance if we are
651 disambiguating the loads. */
653 static bool
654 vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
655 vec<gimple *> stores, gimple *last_store)
657 /* This walks over all stmts involved in the SLP load/store done
658 in NODE verifying we can sink them up to the last stmt in the
659 group. */
660 gimple *last_access = vect_find_last_scalar_stmt_in_slp (node);
661 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
663 gimple *access = SLP_TREE_SCALAR_STMTS (node)[k];
664 if (access == last_access)
665 continue;
666 data_reference *dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (access));
667 ao_ref ref;
668 bool ref_initialized_p = false;
669 for (gimple_stmt_iterator gsi = gsi_for_stmt (access);
670 gsi_stmt (gsi) != last_access; gsi_next (&gsi))
672 gimple *stmt = gsi_stmt (gsi);
673 if (! gimple_vuse (stmt)
674 || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
675 continue;
677 /* If we couldn't record a (single) data reference for this
678 stmt we have to resort to the alias oracle. */
679 data_reference *dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
680 if (!dr_b)
682 /* We are moving a store or sinking a load - this means
683 we cannot use TBAA for disambiguation. */
684 if (!ref_initialized_p)
685 ao_ref_init (&ref, DR_REF (dr_a));
686 if (stmt_may_clobber_ref_p_1 (stmt, &ref, false)
687 || ref_maybe_used_by_stmt_p (stmt, &ref, false))
688 return false;
689 continue;
692 bool dependent = false;
693 /* If we run into a store of this same instance (we've just
694 marked those) then delay dependence checking until we run
695 into the last store because this is where it will have
696 been sunk to (and we verify if we can do that as well). */
697 if (gimple_visited_p (stmt))
699 if (stmt != last_store)
700 continue;
701 unsigned i;
702 gimple *store;
703 FOR_EACH_VEC_ELT (stores, i, store)
705 data_reference *store_dr
706 = STMT_VINFO_DATA_REF (vinfo_for_stmt (store));
707 ddr_p ddr = initialize_data_dependence_relation
708 (dr_a, store_dr, vNULL);
709 dependent = vect_slp_analyze_data_ref_dependence (ddr);
710 free_dependence_relation (ddr);
711 if (dependent)
712 break;
715 else
717 ddr_p ddr = initialize_data_dependence_relation (dr_a,
718 dr_b, vNULL);
719 dependent = vect_slp_analyze_data_ref_dependence (ddr);
720 free_dependence_relation (ddr);
722 if (dependent)
723 return false;
726 return true;
730 /* Function vect_analyze_data_ref_dependences.
732 Examine all the data references in the basic-block, and make sure there
733 do not exist any data dependences between them. Set *MAX_VF according to
734 the maximum vectorization factor the data dependences allow. */
736 bool
737 vect_slp_analyze_instance_dependence (slp_instance instance)
739 if (dump_enabled_p ())
740 dump_printf_loc (MSG_NOTE, vect_location,
741 "=== vect_slp_analyze_instance_dependence ===\n");
743 /* The stores of this instance are at the root of the SLP tree. */
744 slp_tree store = SLP_INSTANCE_TREE (instance);
745 if (! STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (store)[0])))
746 store = NULL;
748 /* Verify we can sink stores to the vectorized stmt insert location. */
749 gimple *last_store = NULL;
750 if (store)
752 if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
753 return false;
755 /* Mark stores in this instance and remember the last one. */
756 last_store = vect_find_last_scalar_stmt_in_slp (store);
757 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
758 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], true);
761 bool res = true;
763 /* Verify we can sink loads to the vectorized stmt insert location,
764 special-casing stores of this instance. */
765 slp_tree load;
766 unsigned int i;
767 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
768 if (! vect_slp_analyze_node_dependences (instance, load,
769 store
770 ? SLP_TREE_SCALAR_STMTS (store)
771 : vNULL, last_store))
773 res = false;
774 break;
777 /* Unset the visited flag. */
778 if (store)
779 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
780 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], false);
782 return res;
785 /* Record in VINFO the base alignment guarantee given by DRB. STMT is
786 the statement that contains DRB, which is useful for recording in the
787 dump file. */
789 static void
790 vect_record_base_alignment (vec_info *vinfo, gimple *stmt,
791 innermost_loop_behavior *drb)
793 bool existed;
794 innermost_loop_behavior *&entry
795 = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
796 if (!existed || entry->base_alignment < drb->base_alignment)
798 entry = drb;
799 if (dump_enabled_p ())
801 dump_printf_loc (MSG_NOTE, vect_location,
802 "recording new base alignment for ");
803 dump_generic_expr (MSG_NOTE, TDF_SLIM, drb->base_address);
804 dump_printf (MSG_NOTE, "\n");
805 dump_printf_loc (MSG_NOTE, vect_location,
806 " alignment: %d\n", drb->base_alignment);
807 dump_printf_loc (MSG_NOTE, vect_location,
808 " misalignment: %d\n", drb->base_misalignment);
809 dump_printf_loc (MSG_NOTE, vect_location,
810 " based on: ");
811 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
816 /* If the region we're going to vectorize is reached, all unconditional
817 data references occur at least once. We can therefore pool the base
818 alignment guarantees from each unconditional reference. Do this by
819 going through all the data references in VINFO and checking whether
820 the containing statement makes the reference unconditionally. If so,
821 record the alignment of the base address in VINFO so that it can be
822 used for all other references with the same base. */
824 void
825 vect_record_base_alignments (vec_info *vinfo)
827 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
828 struct loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
829 data_reference *dr;
830 unsigned int i;
831 FOR_EACH_VEC_ELT (vinfo->datarefs, i, dr)
832 if (!DR_IS_CONDITIONAL_IN_STMT (dr))
834 gimple *stmt = DR_STMT (dr);
835 vect_record_base_alignment (vinfo, stmt, &DR_INNERMOST (dr));
837 /* If DR is nested in the loop that is being vectorized, we can also
838 record the alignment of the base wrt the outer loop. */
839 if (loop && nested_in_vect_loop_p (loop, stmt))
841 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
842 vect_record_base_alignment
843 (vinfo, stmt, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
848 /* Return the target alignment for the vectorized form of DR. */
850 static unsigned int
851 vect_calculate_target_alignment (struct data_reference *dr)
853 gimple *stmt = DR_STMT (dr);
854 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
855 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
856 return targetm.vectorize.preferred_vector_alignment (vectype);
859 /* Function vect_compute_data_ref_alignment
861 Compute the misalignment of the data reference DR.
863 Output:
864 1. If during the misalignment computation it is found that the data reference
865 cannot be vectorized then false is returned.
866 2. DR_MISALIGNMENT (DR) is defined.
868 FOR NOW: No analysis is actually performed. Misalignment is calculated
869 only for trivial cases. TODO. */
871 bool
872 vect_compute_data_ref_alignment (struct data_reference *dr)
874 gimple *stmt = DR_STMT (dr);
875 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
876 vec_base_alignments *base_alignments = &stmt_info->vinfo->base_alignments;
877 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
878 struct loop *loop = NULL;
879 tree ref = DR_REF (dr);
880 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
882 if (dump_enabled_p ())
883 dump_printf_loc (MSG_NOTE, vect_location,
884 "vect_compute_data_ref_alignment:\n");
886 if (loop_vinfo)
887 loop = LOOP_VINFO_LOOP (loop_vinfo);
889 /* Initialize misalignment to unknown. */
890 SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
892 innermost_loop_behavior *drb = vect_dr_behavior (dr);
893 bool step_preserves_misalignment_p;
895 unsigned HOST_WIDE_INT vector_alignment
896 = vect_calculate_target_alignment (dr) / BITS_PER_UNIT;
897 DR_TARGET_ALIGNMENT (dr) = vector_alignment;
899 /* No step for BB vectorization. */
900 if (!loop)
902 gcc_assert (integer_zerop (drb->step));
903 step_preserves_misalignment_p = true;
906 /* In case the dataref is in an inner-loop of the loop that is being
907 vectorized (LOOP), we use the base and misalignment information
908 relative to the outer-loop (LOOP). This is ok only if the misalignment
909 stays the same throughout the execution of the inner-loop, which is why
910 we have to check that the stride of the dataref in the inner-loop evenly
911 divides by the vector alignment. */
912 else if (nested_in_vect_loop_p (loop, stmt))
914 step_preserves_misalignment_p
915 = (DR_STEP_ALIGNMENT (dr) % vector_alignment) == 0;
917 if (dump_enabled_p ())
919 if (step_preserves_misalignment_p)
920 dump_printf_loc (MSG_NOTE, vect_location,
921 "inner step divides the vector alignment.\n");
922 else
923 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
924 "inner step doesn't divide the vector"
925 " alignment.\n");
929 /* Similarly we can only use base and misalignment information relative to
930 an innermost loop if the misalignment stays the same throughout the
931 execution of the loop. As above, this is the case if the stride of
932 the dataref evenly divides by the alignment. */
933 else
935 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
936 step_preserves_misalignment_p
937 = multiple_p (DR_STEP_ALIGNMENT (dr) * vf, vector_alignment);
939 if (!step_preserves_misalignment_p && dump_enabled_p ())
940 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
941 "step doesn't divide the vector alignment.\n");
944 unsigned int base_alignment = drb->base_alignment;
945 unsigned int base_misalignment = drb->base_misalignment;
947 /* Calculate the maximum of the pooled base address alignment and the
948 alignment that we can compute for DR itself. */
949 innermost_loop_behavior **entry = base_alignments->get (drb->base_address);
950 if (entry && base_alignment < (*entry)->base_alignment)
952 base_alignment = (*entry)->base_alignment;
953 base_misalignment = (*entry)->base_misalignment;
956 if (drb->offset_alignment < vector_alignment
957 || !step_preserves_misalignment_p
958 /* We need to know whether the step wrt the vectorized loop is
959 negative when computing the starting misalignment below. */
960 || TREE_CODE (drb->step) != INTEGER_CST)
962 if (dump_enabled_p ())
964 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
965 "Unknown alignment for access: ");
966 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
967 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
969 return true;
972 if (base_alignment < vector_alignment)
974 unsigned int max_alignment;
975 tree base = get_base_for_alignment (drb->base_address, &max_alignment);
976 if (max_alignment < vector_alignment
977 || !vect_can_force_dr_alignment_p (base,
978 vector_alignment * BITS_PER_UNIT))
980 if (dump_enabled_p ())
982 dump_printf_loc (MSG_NOTE, vect_location,
983 "can't force alignment of ref: ");
984 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
985 dump_printf (MSG_NOTE, "\n");
987 return true;
990 /* Force the alignment of the decl.
991 NOTE: This is the only change to the code we make during
992 the analysis phase, before deciding to vectorize the loop. */
993 if (dump_enabled_p ())
995 dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
996 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
997 dump_printf (MSG_NOTE, "\n");
1000 DR_VECT_AUX (dr)->base_decl = base;
1001 DR_VECT_AUX (dr)->base_misaligned = true;
1002 base_misalignment = 0;
1004 poly_int64 misalignment
1005 = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();
1007 /* If this is a backward running DR then first access in the larger
1008 vectype actually is N-1 elements before the address in the DR.
1009 Adjust misalign accordingly. */
1010 if (tree_int_cst_sgn (drb->step) < 0)
1011 /* PLUS because STEP is negative. */
1012 misalignment += ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1013 * TREE_INT_CST_LOW (drb->step));
1015 unsigned int const_misalignment;
1016 if (!known_misalignment (misalignment, vector_alignment,
1017 &const_misalignment))
1019 if (dump_enabled_p ())
1021 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1022 "Non-constant misalignment for access: ");
1023 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
1024 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1026 return true;
1029 SET_DR_MISALIGNMENT (dr, const_misalignment);
1031 if (dump_enabled_p ())
1033 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1034 "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
1035 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
1036 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1039 return true;
1042 /* Function vect_update_misalignment_for_peel.
1043 Sets DR's misalignment
1044 - to 0 if it has the same alignment as DR_PEEL,
1045 - to the misalignment computed using NPEEL if DR's salignment is known,
1046 - to -1 (unknown) otherwise.
1048 DR - the data reference whose misalignment is to be adjusted.
1049 DR_PEEL - the data reference whose misalignment is being made
1050 zero in the vector loop by the peel.
1051 NPEEL - the number of iterations in the peel loop if the misalignment
1052 of DR_PEEL is known at compile time. */
1054 static void
1055 vect_update_misalignment_for_peel (struct data_reference *dr,
1056 struct data_reference *dr_peel, int npeel)
1058 unsigned int i;
1059 vec<dr_p> same_aligned_drs;
1060 struct data_reference *current_dr;
1061 int dr_size = vect_get_scalar_dr_size (dr);
1062 int dr_peel_size = vect_get_scalar_dr_size (dr_peel);
1063 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
1064 stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));
1066 /* For interleaved data accesses the step in the loop must be multiplied by
1067 the size of the interleaving group. */
1068 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1069 dr_size *= DR_GROUP_SIZE (vinfo_for_stmt (DR_GROUP_FIRST_ELEMENT (stmt_info)));
1070 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
1071 dr_peel_size *= DR_GROUP_SIZE (peel_stmt_info);
1073 /* It can be assumed that the data refs with the same alignment as dr_peel
1074 are aligned in the vector loop. */
1075 same_aligned_drs
1076 = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
1077 FOR_EACH_VEC_ELT (same_aligned_drs, i, current_dr)
1079 if (current_dr != dr)
1080 continue;
1081 gcc_assert (!known_alignment_for_access_p (dr)
1082 || !known_alignment_for_access_p (dr_peel)
1083 || (DR_MISALIGNMENT (dr) / dr_size
1084 == DR_MISALIGNMENT (dr_peel) / dr_peel_size));
1085 SET_DR_MISALIGNMENT (dr, 0);
1086 return;
1089 if (known_alignment_for_access_p (dr)
1090 && known_alignment_for_access_p (dr_peel))
1092 bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
1093 int misal = DR_MISALIGNMENT (dr);
1094 misal += negative ? -npeel * dr_size : npeel * dr_size;
1095 misal &= DR_TARGET_ALIGNMENT (dr) - 1;
1096 SET_DR_MISALIGNMENT (dr, misal);
1097 return;
1100 if (dump_enabled_p ())
1101 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1102 "to unknown (-1).\n");
1103 SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
1107 /* Function verify_data_ref_alignment
1109 Return TRUE if DR can be handled with respect to alignment. */
1111 static bool
1112 verify_data_ref_alignment (data_reference_p dr)
1114 enum dr_alignment_support supportable_dr_alignment
1115 = vect_supportable_dr_alignment (dr, false);
1116 if (!supportable_dr_alignment)
1118 if (dump_enabled_p ())
1120 if (DR_IS_READ (dr))
1121 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1122 "not vectorized: unsupported unaligned load.");
1123 else
1124 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1125 "not vectorized: unsupported unaligned "
1126 "store.");
1128 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
1129 DR_REF (dr));
1130 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1132 return false;
1135 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
1136 dump_printf_loc (MSG_NOTE, vect_location,
1137 "Vectorizing an unaligned access.\n");
1139 return true;
1142 /* Function vect_verify_datarefs_alignment
1144 Return TRUE if all data references in the loop can be
1145 handled with respect to alignment. */
1147 bool
1148 vect_verify_datarefs_alignment (loop_vec_info vinfo)
1150 vec<data_reference_p> datarefs = vinfo->datarefs;
1151 struct data_reference *dr;
1152 unsigned int i;
1154 FOR_EACH_VEC_ELT (datarefs, i, dr)
1156 gimple *stmt = DR_STMT (dr);
1157 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1159 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1160 continue;
1162 /* For interleaving, only the alignment of the first access matters. */
1163 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1164 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1165 continue;
1167 /* Strided accesses perform only component accesses, alignment is
1168 irrelevant for them. */
1169 if (STMT_VINFO_STRIDED_P (stmt_info)
1170 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1171 continue;
1173 if (! verify_data_ref_alignment (dr))
1174 return false;
1177 return true;
1180 /* Given an memory reference EXP return whether its alignment is less
1181 than its size. */
1183 static bool
1184 not_size_aligned (tree exp)
1186 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1187 return true;
1189 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1190 > get_object_alignment (exp));
1193 /* Function vector_alignment_reachable_p
1195 Return true if vector alignment for DR is reachable by peeling
1196 a few loop iterations. Return false otherwise. */
1198 static bool
1199 vector_alignment_reachable_p (struct data_reference *dr)
1201 gimple *stmt = DR_STMT (dr);
1202 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1203 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1205 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1207 /* For interleaved access we peel only if number of iterations in
1208 the prolog loop ({VF - misalignment}), is a multiple of the
1209 number of the interleaved accesses. */
1210 int elem_size, mis_in_elements;
1212 /* FORNOW: handle only known alignment. */
1213 if (!known_alignment_for_access_p (dr))
1214 return false;
1216 poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1217 poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
1218 elem_size = vector_element_size (vector_size, nelements);
1219 mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
1221 if (!multiple_p (nelements - mis_in_elements, DR_GROUP_SIZE (stmt_info)))
1222 return false;
1225 /* If misalignment is known at the compile time then allow peeling
1226 only if natural alignment is reachable through peeling. */
1227 if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
1229 HOST_WIDE_INT elmsize =
1230 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1231 if (dump_enabled_p ())
1233 dump_printf_loc (MSG_NOTE, vect_location,
1234 "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1235 dump_printf (MSG_NOTE,
1236 ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
1238 if (DR_MISALIGNMENT (dr) % elmsize)
1240 if (dump_enabled_p ())
1241 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1242 "data size does not divide the misalignment.\n");
1243 return false;
1247 if (!known_alignment_for_access_p (dr))
1249 tree type = TREE_TYPE (DR_REF (dr));
1250 bool is_packed = not_size_aligned (DR_REF (dr));
1251 if (dump_enabled_p ())
1252 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1253 "Unknown misalignment, %snaturally aligned\n",
1254 is_packed ? "not " : "");
1255 return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1258 return true;
1262 /* Calculate the cost of the memory access represented by DR. */
1264 static void
1265 vect_get_data_access_cost (struct data_reference *dr,
1266 unsigned int *inside_cost,
1267 unsigned int *outside_cost,
1268 stmt_vector_for_cost *body_cost_vec,
1269 stmt_vector_for_cost *prologue_cost_vec)
1271 gimple *stmt = DR_STMT (dr);
1272 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1273 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1274 int ncopies;
1276 if (PURE_SLP_STMT (stmt_info))
1277 ncopies = 1;
1278 else
1279 ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1281 if (DR_IS_READ (dr))
1282 vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1283 prologue_cost_vec, body_cost_vec, false);
1284 else
1285 vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
1287 if (dump_enabled_p ())
1288 dump_printf_loc (MSG_NOTE, vect_location,
1289 "vect_get_data_access_cost: inside_cost = %d, "
1290 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1294 typedef struct _vect_peel_info
1296 struct data_reference *dr;
1297 int npeel;
1298 unsigned int count;
1299 } *vect_peel_info;
1301 typedef struct _vect_peel_extended_info
1303 struct _vect_peel_info peel_info;
1304 unsigned int inside_cost;
1305 unsigned int outside_cost;
1306 } *vect_peel_extended_info;
1309 /* Peeling hashtable helpers. */
1311 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1313 static inline hashval_t hash (const _vect_peel_info *);
1314 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1317 inline hashval_t
1318 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1320 return (hashval_t) peel_info->npeel;
1323 inline bool
1324 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1326 return (a->npeel == b->npeel);
1330 /* Insert DR into peeling hash table with NPEEL as key. */
1332 static void
1333 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1334 loop_vec_info loop_vinfo, struct data_reference *dr,
1335 int npeel)
1337 struct _vect_peel_info elem, *slot;
1338 _vect_peel_info **new_slot;
1339 bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1341 elem.npeel = npeel;
1342 slot = peeling_htab->find (&elem);
1343 if (slot)
1344 slot->count++;
1345 else
1347 slot = XNEW (struct _vect_peel_info);
1348 slot->npeel = npeel;
1349 slot->dr = dr;
1350 slot->count = 1;
1351 new_slot = peeling_htab->find_slot (slot, INSERT);
1352 *new_slot = slot;
1355 if (!supportable_dr_alignment
1356 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1357 slot->count += VECT_MAX_COST;
1361 /* Traverse peeling hash table to find peeling option that aligns maximum
1362 number of data accesses. */
1365 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1366 _vect_peel_extended_info *max)
1368 vect_peel_info elem = *slot;
1370 if (elem->count > max->peel_info.count
1371 || (elem->count == max->peel_info.count
1372 && max->peel_info.npeel > elem->npeel))
1374 max->peel_info.npeel = elem->npeel;
1375 max->peel_info.count = elem->count;
1376 max->peel_info.dr = elem->dr;
1379 return 1;
1382 /* Get the costs of peeling NPEEL iterations checking data access costs
1383 for all data refs. If UNKNOWN_MISALIGNMENT is true, we assume DR0's
1384 misalignment will be zero after peeling. */
1386 static void
1387 vect_get_peeling_costs_all_drs (vec<data_reference_p> datarefs,
1388 struct data_reference *dr0,
1389 unsigned int *inside_cost,
1390 unsigned int *outside_cost,
1391 stmt_vector_for_cost *body_cost_vec,
1392 stmt_vector_for_cost *prologue_cost_vec,
1393 unsigned int npeel,
1394 bool unknown_misalignment)
1396 unsigned i;
1397 data_reference *dr;
1399 FOR_EACH_VEC_ELT (datarefs, i, dr)
1401 gimple *stmt = DR_STMT (dr);
1402 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1403 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1404 continue;
1406 /* For interleaving, only the alignment of the first access
1407 matters. */
1408 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1409 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1410 continue;
1412 /* Strided accesses perform only component accesses, alignment is
1413 irrelevant for them. */
1414 if (STMT_VINFO_STRIDED_P (stmt_info)
1415 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1416 continue;
1418 int save_misalignment;
1419 save_misalignment = DR_MISALIGNMENT (dr);
1420 if (npeel == 0)
1422 else if (unknown_misalignment && dr == dr0)
1423 SET_DR_MISALIGNMENT (dr, 0);
1424 else
1425 vect_update_misalignment_for_peel (dr, dr0, npeel);
1426 vect_get_data_access_cost (dr, inside_cost, outside_cost,
1427 body_cost_vec, prologue_cost_vec);
1428 SET_DR_MISALIGNMENT (dr, save_misalignment);
1432 /* Traverse peeling hash table and calculate cost for each peeling option.
1433 Find the one with the lowest cost. */
1436 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1437 _vect_peel_extended_info *min)
1439 vect_peel_info elem = *slot;
1440 int dummy;
1441 unsigned int inside_cost = 0, outside_cost = 0;
1442 gimple *stmt = DR_STMT (elem->dr);
1443 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1444 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1445 stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1446 epilogue_cost_vec;
1448 prologue_cost_vec.create (2);
1449 body_cost_vec.create (2);
1450 epilogue_cost_vec.create (2);
1452 vect_get_peeling_costs_all_drs (LOOP_VINFO_DATAREFS (loop_vinfo),
1453 elem->dr, &inside_cost, &outside_cost,
1454 &body_cost_vec, &prologue_cost_vec,
1455 elem->npeel, false);
1457 body_cost_vec.release ();
1459 outside_cost += vect_get_known_peeling_cost
1460 (loop_vinfo, elem->npeel, &dummy,
1461 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1462 &prologue_cost_vec, &epilogue_cost_vec);
1464 /* Prologue and epilogue costs are added to the target model later.
1465 These costs depend only on the scalar iteration cost, the
1466 number of peeling iterations finally chosen, and the number of
1467 misaligned statements. So discard the information found here. */
1468 prologue_cost_vec.release ();
1469 epilogue_cost_vec.release ();
1471 if (inside_cost < min->inside_cost
1472 || (inside_cost == min->inside_cost
1473 && outside_cost < min->outside_cost))
1475 min->inside_cost = inside_cost;
1476 min->outside_cost = outside_cost;
1477 min->peel_info.dr = elem->dr;
1478 min->peel_info.npeel = elem->npeel;
1479 min->peel_info.count = elem->count;
1482 return 1;
1486 /* Choose best peeling option by traversing peeling hash table and either
1487 choosing an option with the lowest cost (if cost model is enabled) or the
1488 option that aligns as many accesses as possible. */
1490 static struct _vect_peel_extended_info
1491 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1492 loop_vec_info loop_vinfo)
1494 struct _vect_peel_extended_info res;
1496 res.peel_info.dr = NULL;
1498 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1500 res.inside_cost = INT_MAX;
1501 res.outside_cost = INT_MAX;
1502 peeling_htab->traverse <_vect_peel_extended_info *,
1503 vect_peeling_hash_get_lowest_cost> (&res);
1505 else
1507 res.peel_info.count = 0;
1508 peeling_htab->traverse <_vect_peel_extended_info *,
1509 vect_peeling_hash_get_most_frequent> (&res);
1510 res.inside_cost = 0;
1511 res.outside_cost = 0;
1514 return res;
1517 /* Return true if the new peeling NPEEL is supported. */
1519 static bool
1520 vect_peeling_supportable (loop_vec_info loop_vinfo, struct data_reference *dr0,
1521 unsigned npeel)
1523 unsigned i;
1524 struct data_reference *dr = NULL;
1525 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1526 gimple *stmt;
1527 stmt_vec_info stmt_info;
1528 enum dr_alignment_support supportable_dr_alignment;
1530 /* Ensure that all data refs can be vectorized after the peel. */
1531 FOR_EACH_VEC_ELT (datarefs, i, dr)
1533 int save_misalignment;
1535 if (dr == dr0)
1536 continue;
1538 stmt = DR_STMT (dr);
1539 stmt_info = vinfo_for_stmt (stmt);
1540 /* For interleaving, only the alignment of the first access
1541 matters. */
1542 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1543 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1544 continue;
1546 /* Strided accesses perform only component accesses, alignment is
1547 irrelevant for them. */
1548 if (STMT_VINFO_STRIDED_P (stmt_info)
1549 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1550 continue;
1552 save_misalignment = DR_MISALIGNMENT (dr);
1553 vect_update_misalignment_for_peel (dr, dr0, npeel);
1554 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1555 SET_DR_MISALIGNMENT (dr, save_misalignment);
1557 if (!supportable_dr_alignment)
1558 return false;
1561 return true;
1564 /* Function vect_enhance_data_refs_alignment
1566 This pass will use loop versioning and loop peeling in order to enhance
1567 the alignment of data references in the loop.
1569 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1570 original loop is to be vectorized. Any other loops that are created by
1571 the transformations performed in this pass - are not supposed to be
1572 vectorized. This restriction will be relaxed.
1574 This pass will require a cost model to guide it whether to apply peeling
1575 or versioning or a combination of the two. For example, the scheme that
1576 intel uses when given a loop with several memory accesses, is as follows:
1577 choose one memory access ('p') which alignment you want to force by doing
1578 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1579 other accesses are not necessarily aligned, or (2) use loop versioning to
1580 generate one loop in which all accesses are aligned, and another loop in
1581 which only 'p' is necessarily aligned.
1583 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1584 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Ximmin Tian, International
1585 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1587 Devising a cost model is the most critical aspect of this work. It will
1588 guide us on which access to peel for, whether to use loop versioning, how
1589 many versions to create, etc. The cost model will probably consist of
1590 generic considerations as well as target specific considerations (on
1591 powerpc for example, misaligned stores are more painful than misaligned
1592 loads).
1594 Here are the general steps involved in alignment enhancements:
1596 -- original loop, before alignment analysis:
1597 for (i=0; i<N; i++){
1598 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1599 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1602 -- After vect_compute_data_refs_alignment:
1603 for (i=0; i<N; i++){
1604 x = q[i]; # DR_MISALIGNMENT(q) = 3
1605 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1608 -- Possibility 1: we do loop versioning:
1609 if (p is aligned) {
1610 for (i=0; i<N; i++){ # loop 1A
1611 x = q[i]; # DR_MISALIGNMENT(q) = 3
1612 p[i] = y; # DR_MISALIGNMENT(p) = 0
1615 else {
1616 for (i=0; i<N; i++){ # loop 1B
1617 x = q[i]; # DR_MISALIGNMENT(q) = 3
1618 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1622 -- Possibility 2: we do loop peeling:
1623 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1624 x = q[i];
1625 p[i] = y;
1627 for (i = 3; i < N; i++){ # loop 2A
1628 x = q[i]; # DR_MISALIGNMENT(q) = 0
1629 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1632 -- Possibility 3: combination of loop peeling and versioning:
1633 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1634 x = q[i];
1635 p[i] = y;
1637 if (p is aligned) {
1638 for (i = 3; i<N; i++){ # loop 3A
1639 x = q[i]; # DR_MISALIGNMENT(q) = 0
1640 p[i] = y; # DR_MISALIGNMENT(p) = 0
1643 else {
1644 for (i = 3; i<N; i++){ # loop 3B
1645 x = q[i]; # DR_MISALIGNMENT(q) = 0
1646 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1650 These loops are later passed to loop_transform to be vectorized. The
1651 vectorizer will use the alignment information to guide the transformation
1652 (whether to generate regular loads/stores, or with special handling for
1653 misalignment). */
1655 bool
1656 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1658 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1659 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1660 enum dr_alignment_support supportable_dr_alignment;
1661 struct data_reference *dr0 = NULL, *first_store = NULL;
1662 struct data_reference *dr;
1663 unsigned int i, j;
1664 bool do_peeling = false;
1665 bool do_versioning = false;
1666 bool stat;
1667 gimple *stmt;
1668 stmt_vec_info stmt_info;
1669 unsigned int npeel = 0;
1670 bool one_misalignment_known = false;
1671 bool one_misalignment_unknown = false;
1672 bool one_dr_unsupportable = false;
1673 struct data_reference *unsupportable_dr = NULL;
1674 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1675 unsigned possible_npeel_number = 1;
1676 tree vectype;
1677 unsigned int mis, same_align_drs_max = 0;
1678 hash_table<peel_info_hasher> peeling_htab (1);
1680 if (dump_enabled_p ())
1681 dump_printf_loc (MSG_NOTE, vect_location,
1682 "=== vect_enhance_data_refs_alignment ===\n");
1684 /* Reset data so we can safely be called multiple times. */
1685 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1686 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1688 /* While cost model enhancements are expected in the future, the high level
1689 view of the code at this time is as follows:
1691 A) If there is a misaligned access then see if peeling to align
1692 this access can make all data references satisfy
1693 vect_supportable_dr_alignment. If so, update data structures
1694 as needed and return true.
1696 B) If peeling wasn't possible and there is a data reference with an
1697 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1698 then see if loop versioning checks can be used to make all data
1699 references satisfy vect_supportable_dr_alignment. If so, update
1700 data structures as needed and return true.
1702 C) If neither peeling nor versioning were successful then return false if
1703 any data reference does not satisfy vect_supportable_dr_alignment.
1705 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1707 Note, Possibility 3 above (which is peeling and versioning together) is not
1708 being done at this time. */
1710 /* (1) Peeling to force alignment. */
1712 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1713 Considerations:
1714 + How many accesses will become aligned due to the peeling
1715 - How many accesses will become unaligned due to the peeling,
1716 and the cost of misaligned accesses.
1717 - The cost of peeling (the extra runtime checks, the increase
1718 in code size). */
1720 FOR_EACH_VEC_ELT (datarefs, i, dr)
1722 stmt = DR_STMT (dr);
1723 stmt_info = vinfo_for_stmt (stmt);
1725 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1726 continue;
1728 /* For interleaving, only the alignment of the first access
1729 matters. */
1730 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1731 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1732 continue;
1734 /* For invariant accesses there is nothing to enhance. */
1735 if (integer_zerop (DR_STEP (dr)))
1736 continue;
1738 /* Strided accesses perform only component accesses, alignment is
1739 irrelevant for them. */
1740 if (STMT_VINFO_STRIDED_P (stmt_info)
1741 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1742 continue;
1744 supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1745 do_peeling = vector_alignment_reachable_p (dr);
1746 if (do_peeling)
1748 if (known_alignment_for_access_p (dr))
1750 unsigned int npeel_tmp = 0;
1751 bool negative = tree_int_cst_compare (DR_STEP (dr),
1752 size_zero_node) < 0;
1754 vectype = STMT_VINFO_VECTYPE (stmt_info);
1755 unsigned int target_align = DR_TARGET_ALIGNMENT (dr);
1756 unsigned int dr_size = vect_get_scalar_dr_size (dr);
1757 mis = (negative ? DR_MISALIGNMENT (dr) : -DR_MISALIGNMENT (dr));
1758 if (DR_MISALIGNMENT (dr) != 0)
1759 npeel_tmp = (mis & (target_align - 1)) / dr_size;
1761 /* For multiple types, it is possible that the bigger type access
1762 will have more than one peeling option. E.g., a loop with two
1763 types: one of size (vector size / 4), and the other one of
1764 size (vector size / 8). Vectorization factor will 8. If both
1765 accesses are misaligned by 3, the first one needs one scalar
1766 iteration to be aligned, and the second one needs 5. But the
1767 first one will be aligned also by peeling 5 scalar
1768 iterations, and in that case both accesses will be aligned.
1769 Hence, except for the immediate peeling amount, we also want
1770 to try to add full vector size, while we don't exceed
1771 vectorization factor.
1772 We do this automatically for cost model, since we calculate
1773 cost for every peeling option. */
1774 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1776 poly_uint64 nscalars = (STMT_SLP_TYPE (stmt_info)
1777 ? vf * DR_GROUP_SIZE (stmt_info) : vf);
1778 possible_npeel_number
1779 = vect_get_num_vectors (nscalars, vectype);
1781 /* NPEEL_TMP is 0 when there is no misalignment, but also
1782 allow peeling NELEMENTS. */
1783 if (DR_MISALIGNMENT (dr) == 0)
1784 possible_npeel_number++;
1787 /* Save info about DR in the hash table. Also include peeling
1788 amounts according to the explanation above. */
1789 for (j = 0; j < possible_npeel_number; j++)
1791 vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1792 dr, npeel_tmp);
1793 npeel_tmp += target_align / dr_size;
1796 one_misalignment_known = true;
1798 else
1800 /* If we don't know any misalignment values, we prefer
1801 peeling for data-ref that has the maximum number of data-refs
1802 with the same alignment, unless the target prefers to align
1803 stores over load. */
1804 unsigned same_align_drs
1805 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1806 if (!dr0
1807 || same_align_drs_max < same_align_drs)
1809 same_align_drs_max = same_align_drs;
1810 dr0 = dr;
1812 /* For data-refs with the same number of related
1813 accesses prefer the one where the misalign
1814 computation will be invariant in the outermost loop. */
1815 else if (same_align_drs_max == same_align_drs)
1817 struct loop *ivloop0, *ivloop;
1818 ivloop0 = outermost_invariant_loop_for_expr
1819 (loop, DR_BASE_ADDRESS (dr0));
1820 ivloop = outermost_invariant_loop_for_expr
1821 (loop, DR_BASE_ADDRESS (dr));
1822 if ((ivloop && !ivloop0)
1823 || (ivloop && ivloop0
1824 && flow_loop_nested_p (ivloop, ivloop0)))
1825 dr0 = dr;
1828 one_misalignment_unknown = true;
1830 /* Check for data refs with unsupportable alignment that
1831 can be peeled. */
1832 if (!supportable_dr_alignment)
1834 one_dr_unsupportable = true;
1835 unsupportable_dr = dr;
1838 if (!first_store && DR_IS_WRITE (dr))
1839 first_store = dr;
1842 else
1844 if (!aligned_access_p (dr))
1846 if (dump_enabled_p ())
1847 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1848 "vector alignment may not be reachable\n");
1849 break;
1854 /* Check if we can possibly peel the loop. */
1855 if (!vect_can_advance_ivs_p (loop_vinfo)
1856 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1857 || loop->inner)
1858 do_peeling = false;
1860 struct _vect_peel_extended_info peel_for_known_alignment;
1861 struct _vect_peel_extended_info peel_for_unknown_alignment;
1862 struct _vect_peel_extended_info best_peel;
1864 peel_for_unknown_alignment.inside_cost = INT_MAX;
1865 peel_for_unknown_alignment.outside_cost = INT_MAX;
1866 peel_for_unknown_alignment.peel_info.count = 0;
1868 if (do_peeling
1869 && one_misalignment_unknown)
1871 /* Check if the target requires to prefer stores over loads, i.e., if
1872 misaligned stores are more expensive than misaligned loads (taking
1873 drs with same alignment into account). */
1874 unsigned int load_inside_cost = 0;
1875 unsigned int load_outside_cost = 0;
1876 unsigned int store_inside_cost = 0;
1877 unsigned int store_outside_cost = 0;
1878 unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;
1880 stmt_vector_for_cost dummy;
1881 dummy.create (2);
1882 vect_get_peeling_costs_all_drs (datarefs, dr0,
1883 &load_inside_cost,
1884 &load_outside_cost,
1885 &dummy, &dummy, estimated_npeels, true);
1886 dummy.release ();
1888 if (first_store)
1890 dummy.create (2);
1891 vect_get_peeling_costs_all_drs (datarefs, first_store,
1892 &store_inside_cost,
1893 &store_outside_cost,
1894 &dummy, &dummy,
1895 estimated_npeels, true);
1896 dummy.release ();
1898 else
1900 store_inside_cost = INT_MAX;
1901 store_outside_cost = INT_MAX;
1904 if (load_inside_cost > store_inside_cost
1905 || (load_inside_cost == store_inside_cost
1906 && load_outside_cost > store_outside_cost))
1908 dr0 = first_store;
1909 peel_for_unknown_alignment.inside_cost = store_inside_cost;
1910 peel_for_unknown_alignment.outside_cost = store_outside_cost;
1912 else
1914 peel_for_unknown_alignment.inside_cost = load_inside_cost;
1915 peel_for_unknown_alignment.outside_cost = load_outside_cost;
1918 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1919 prologue_cost_vec.create (2);
1920 epilogue_cost_vec.create (2);
1922 int dummy2;
1923 peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
1924 (loop_vinfo, estimated_npeels, &dummy2,
1925 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1926 &prologue_cost_vec, &epilogue_cost_vec);
1928 prologue_cost_vec.release ();
1929 epilogue_cost_vec.release ();
1931 peel_for_unknown_alignment.peel_info.count = 1
1932 + STMT_VINFO_SAME_ALIGN_REFS
1933 (vinfo_for_stmt (DR_STMT (dr0))).length ();
1936 peel_for_unknown_alignment.peel_info.npeel = 0;
1937 peel_for_unknown_alignment.peel_info.dr = dr0;
1939 best_peel = peel_for_unknown_alignment;
1941 peel_for_known_alignment.inside_cost = INT_MAX;
1942 peel_for_known_alignment.outside_cost = INT_MAX;
1943 peel_for_known_alignment.peel_info.count = 0;
1944 peel_for_known_alignment.peel_info.dr = NULL;
1946 if (do_peeling && one_misalignment_known)
1948 /* Peeling is possible, but there is no data access that is not supported
1949 unless aligned. So we try to choose the best possible peeling from
1950 the hash table. */
1951 peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
1952 (&peeling_htab, loop_vinfo);
1955 /* Compare costs of peeling for known and unknown alignment. */
1956 if (peel_for_known_alignment.peel_info.dr != NULL
1957 && peel_for_unknown_alignment.inside_cost
1958 >= peel_for_known_alignment.inside_cost)
1960 best_peel = peel_for_known_alignment;
1962 /* If the best peeling for known alignment has NPEEL == 0, perform no
1963 peeling at all except if there is an unsupportable dr that we can
1964 align. */
1965 if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
1966 do_peeling = false;
1969 /* If there is an unsupportable data ref, prefer this over all choices so far
1970 since we'd have to discard a chosen peeling except when it accidentally
1971 aligned the unsupportable data ref. */
1972 if (one_dr_unsupportable)
1973 dr0 = unsupportable_dr;
1974 else if (do_peeling)
1976 /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
1977 TODO: Use nopeel_outside_cost or get rid of it? */
1978 unsigned nopeel_inside_cost = 0;
1979 unsigned nopeel_outside_cost = 0;
1981 stmt_vector_for_cost dummy;
1982 dummy.create (2);
1983 vect_get_peeling_costs_all_drs (datarefs, NULL, &nopeel_inside_cost,
1984 &nopeel_outside_cost, &dummy, &dummy,
1985 0, false);
1986 dummy.release ();
1988 /* Add epilogue costs. As we do not peel for alignment here, no prologue
1989 costs will be recorded. */
1990 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1991 prologue_cost_vec.create (2);
1992 epilogue_cost_vec.create (2);
1994 int dummy2;
1995 nopeel_outside_cost += vect_get_known_peeling_cost
1996 (loop_vinfo, 0, &dummy2,
1997 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1998 &prologue_cost_vec, &epilogue_cost_vec);
2000 prologue_cost_vec.release ();
2001 epilogue_cost_vec.release ();
2003 npeel = best_peel.peel_info.npeel;
2004 dr0 = best_peel.peel_info.dr;
2006 /* If no peeling is not more expensive than the best peeling we
2007 have so far, don't perform any peeling. */
2008 if (nopeel_inside_cost <= best_peel.inside_cost)
2009 do_peeling = false;
2012 if (do_peeling)
2014 stmt = DR_STMT (dr0);
2015 stmt_info = vinfo_for_stmt (stmt);
2016 vectype = STMT_VINFO_VECTYPE (stmt_info);
2018 if (known_alignment_for_access_p (dr0))
2020 bool negative = tree_int_cst_compare (DR_STEP (dr0),
2021 size_zero_node) < 0;
2022 if (!npeel)
2024 /* Since it's known at compile time, compute the number of
2025 iterations in the peeled loop (the peeling factor) for use in
2026 updating DR_MISALIGNMENT values. The peeling factor is the
2027 vectorization factor minus the misalignment as an element
2028 count. */
2029 mis = negative ? DR_MISALIGNMENT (dr0) : -DR_MISALIGNMENT (dr0);
2030 unsigned int target_align = DR_TARGET_ALIGNMENT (dr0);
2031 npeel = ((mis & (target_align - 1))
2032 / vect_get_scalar_dr_size (dr0));
2035 /* For interleaved data access every iteration accesses all the
2036 members of the group, therefore we divide the number of iterations
2037 by the group size. */
2038 stmt_info = vinfo_for_stmt (DR_STMT (dr0));
2039 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2040 npeel /= DR_GROUP_SIZE (stmt_info);
2042 if (dump_enabled_p ())
2043 dump_printf_loc (MSG_NOTE, vect_location,
2044 "Try peeling by %d\n", npeel);
2047 /* Ensure that all datarefs can be vectorized after the peel. */
2048 if (!vect_peeling_supportable (loop_vinfo, dr0, npeel))
2049 do_peeling = false;
2051 /* Check if all datarefs are supportable and log. */
2052 if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
2054 stat = vect_verify_datarefs_alignment (loop_vinfo);
2055 if (!stat)
2056 do_peeling = false;
2057 else
2058 return stat;
2061 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
2062 if (do_peeling)
2064 unsigned max_allowed_peel
2065 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
2066 if (max_allowed_peel != (unsigned)-1)
2068 unsigned max_peel = npeel;
2069 if (max_peel == 0)
2071 unsigned int target_align = DR_TARGET_ALIGNMENT (dr0);
2072 max_peel = target_align / vect_get_scalar_dr_size (dr0) - 1;
2074 if (max_peel > max_allowed_peel)
2076 do_peeling = false;
2077 if (dump_enabled_p ())
2078 dump_printf_loc (MSG_NOTE, vect_location,
2079 "Disable peeling, max peels reached: %d\n", max_peel);
2084 /* Cost model #2 - if peeling may result in a remaining loop not
2085 iterating enough to be vectorized then do not peel. Since this
2086 is a cost heuristic rather than a correctness decision, use the
2087 most likely runtime value for variable vectorization factors. */
2088 if (do_peeling
2089 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2091 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2092 unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
2093 if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
2094 < assumed_vf + max_peel)
2095 do_peeling = false;
2098 if (do_peeling)
2100 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2101 If the misalignment of DR_i is identical to that of dr0 then set
2102 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
2103 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2104 by the peeling factor times the element size of DR_i (MOD the
2105 vectorization factor times the size). Otherwise, the
2106 misalignment of DR_i must be set to unknown. */
2107 FOR_EACH_VEC_ELT (datarefs, i, dr)
2108 if (dr != dr0)
2110 /* Strided accesses perform only component accesses, alignment
2111 is irrelevant for them. */
2112 stmt_info = vinfo_for_stmt (DR_STMT (dr));
2113 if (STMT_VINFO_STRIDED_P (stmt_info)
2114 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2115 continue;
2117 vect_update_misalignment_for_peel (dr, dr0, npeel);
2120 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
2121 if (npeel)
2122 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2123 else
2124 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2125 = DR_MISALIGNMENT (dr0);
2126 SET_DR_MISALIGNMENT (dr0, 0);
2127 if (dump_enabled_p ())
2129 dump_printf_loc (MSG_NOTE, vect_location,
2130 "Alignment of access forced using peeling.\n");
2131 dump_printf_loc (MSG_NOTE, vect_location,
2132 "Peeling for alignment will be applied.\n");
2135 /* The inside-loop cost will be accounted for in vectorizable_load
2136 and vectorizable_store correctly with adjusted alignments.
2137 Drop the body_cst_vec on the floor here. */
2138 stat = vect_verify_datarefs_alignment (loop_vinfo);
2139 gcc_assert (stat);
2140 return stat;
2144 /* (2) Versioning to force alignment. */
2146 /* Try versioning if:
2147 1) optimize loop for speed
2148 2) there is at least one unsupported misaligned data ref with an unknown
2149 misalignment, and
2150 3) all misaligned data refs with a known misalignment are supported, and
2151 4) the number of runtime alignment checks is within reason. */
2153 do_versioning =
2154 optimize_loop_nest_for_speed_p (loop)
2155 && (!loop->inner); /* FORNOW */
2157 if (do_versioning)
2159 FOR_EACH_VEC_ELT (datarefs, i, dr)
2161 stmt = DR_STMT (dr);
2162 stmt_info = vinfo_for_stmt (stmt);
2164 /* For interleaving, only the alignment of the first access
2165 matters. */
2166 if (aligned_access_p (dr)
2167 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2168 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt))
2169 continue;
2171 if (STMT_VINFO_STRIDED_P (stmt_info))
2173 /* Strided loads perform only component accesses, alignment is
2174 irrelevant for them. */
2175 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
2176 continue;
2177 do_versioning = false;
2178 break;
2181 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
2183 if (!supportable_dr_alignment)
2185 gimple *stmt;
2186 int mask;
2187 tree vectype;
2189 if (known_alignment_for_access_p (dr)
2190 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2191 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
2193 do_versioning = false;
2194 break;
2197 stmt = DR_STMT (dr);
2198 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
2199 gcc_assert (vectype);
2201 /* At present we don't support versioning for alignment
2202 with variable VF, since there's no guarantee that the
2203 VF is a power of two. We could relax this if we added
2204 a way of enforcing a power-of-two size. */
2205 unsigned HOST_WIDE_INT size;
2206 if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
2208 do_versioning = false;
2209 break;
2212 /* The rightmost bits of an aligned address must be zeros.
2213 Construct the mask needed for this test. For example,
2214 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2215 mask must be 15 = 0xf. */
2216 mask = size - 1;
2218 /* FORNOW: use the same mask to test all potentially unaligned
2219 references in the loop. The vectorizer currently supports
2220 a single vector size, see the reference to
2221 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
2222 vectorization factor is computed. */
2223 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
2224 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
2225 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2226 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
2227 DR_STMT (dr));
2231 /* Versioning requires at least one misaligned data reference. */
2232 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2233 do_versioning = false;
2234 else if (!do_versioning)
2235 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2238 if (do_versioning)
2240 vec<gimple *> may_misalign_stmts
2241 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2242 gimple *stmt;
2244 /* It can now be assumed that the data references in the statements
2245 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2246 of the loop being vectorized. */
2247 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
2249 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2250 dr = STMT_VINFO_DATA_REF (stmt_info);
2251 SET_DR_MISALIGNMENT (dr, 0);
2252 if (dump_enabled_p ())
2253 dump_printf_loc (MSG_NOTE, vect_location,
2254 "Alignment of access forced using versioning.\n");
2257 if (dump_enabled_p ())
2258 dump_printf_loc (MSG_NOTE, vect_location,
2259 "Versioning for alignment will be applied.\n");
2261 /* Peeling and versioning can't be done together at this time. */
2262 gcc_assert (! (do_peeling && do_versioning));
2264 stat = vect_verify_datarefs_alignment (loop_vinfo);
2265 gcc_assert (stat);
2266 return stat;
2269 /* This point is reached if neither peeling nor versioning is being done. */
2270 gcc_assert (! (do_peeling || do_versioning));
2272 stat = vect_verify_datarefs_alignment (loop_vinfo);
2273 return stat;
2277 /* Function vect_find_same_alignment_drs.
2279 Update group and alignment relations according to the chosen
2280 vectorization factor. */
2282 static void
2283 vect_find_same_alignment_drs (struct data_dependence_relation *ddr)
2285 struct data_reference *dra = DDR_A (ddr);
2286 struct data_reference *drb = DDR_B (ddr);
2287 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2288 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2290 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
2291 return;
2293 if (dra == drb)
2294 return;
2296 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)
2297 || !operand_equal_p (DR_OFFSET (dra), DR_OFFSET (drb), 0)
2298 || !operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2299 return;
2301 /* Two references with distance zero have the same alignment. */
2302 poly_offset_int diff = (wi::to_poly_offset (DR_INIT (dra))
2303 - wi::to_poly_offset (DR_INIT (drb)));
2304 if (maybe_ne (diff, 0))
2306 /* Get the wider of the two alignments. */
2307 unsigned int align_a = (vect_calculate_target_alignment (dra)
2308 / BITS_PER_UNIT);
2309 unsigned int align_b = (vect_calculate_target_alignment (drb)
2310 / BITS_PER_UNIT);
2311 unsigned int max_align = MAX (align_a, align_b);
2313 /* Require the gap to be a multiple of the larger vector alignment. */
2314 if (!multiple_p (diff, max_align))
2315 return;
2318 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2319 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
2320 if (dump_enabled_p ())
2322 dump_printf_loc (MSG_NOTE, vect_location,
2323 "accesses have the same alignment: ");
2324 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2325 dump_printf (MSG_NOTE, " and ");
2326 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2327 dump_printf (MSG_NOTE, "\n");
2332 /* Function vect_analyze_data_refs_alignment
2334 Analyze the alignment of the data-references in the loop.
2335 Return FALSE if a data reference is found that cannot be vectorized. */
2337 bool
2338 vect_analyze_data_refs_alignment (loop_vec_info vinfo)
2340 if (dump_enabled_p ())
2341 dump_printf_loc (MSG_NOTE, vect_location,
2342 "=== vect_analyze_data_refs_alignment ===\n");
2344 /* Mark groups of data references with same alignment using
2345 data dependence information. */
2346 vec<ddr_p> ddrs = vinfo->ddrs;
2347 struct data_dependence_relation *ddr;
2348 unsigned int i;
2350 FOR_EACH_VEC_ELT (ddrs, i, ddr)
2351 vect_find_same_alignment_drs (ddr);
2353 vec<data_reference_p> datarefs = vinfo->datarefs;
2354 struct data_reference *dr;
2356 vect_record_base_alignments (vinfo);
2357 FOR_EACH_VEC_ELT (datarefs, i, dr)
2359 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
2360 if (STMT_VINFO_VECTORIZABLE (stmt_info)
2361 && !vect_compute_data_ref_alignment (dr))
2363 /* Strided accesses perform only component accesses, misalignment
2364 information is irrelevant for them. */
2365 if (STMT_VINFO_STRIDED_P (stmt_info)
2366 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2367 continue;
2369 if (dump_enabled_p ())
2370 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2371 "not vectorized: can't calculate alignment "
2372 "for data ref.\n");
2374 return false;
2378 return true;
2382 /* Analyze alignment of DRs of stmts in NODE. */
2384 static bool
2385 vect_slp_analyze_and_verify_node_alignment (slp_tree node)
2387 /* We vectorize from the first scalar stmt in the node unless
2388 the node is permuted in which case we start from the first
2389 element in the group. */
2390 gimple *first_stmt = SLP_TREE_SCALAR_STMTS (node)[0];
2391 data_reference_p first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2392 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2393 first_stmt = DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_stmt));
2395 data_reference_p dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2396 if (! vect_compute_data_ref_alignment (dr)
2397 /* For creating the data-ref pointer we need alignment of the
2398 first element anyway. */
2399 || (dr != first_dr
2400 && ! vect_compute_data_ref_alignment (first_dr))
2401 || ! verify_data_ref_alignment (dr))
2403 if (dump_enabled_p ())
2404 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2405 "not vectorized: bad data alignment in basic "
2406 "block.\n");
2407 return false;
2410 return true;
2413 /* Function vect_slp_analyze_instance_alignment
2415 Analyze the alignment of the data-references in the SLP instance.
2416 Return FALSE if a data reference is found that cannot be vectorized. */
2418 bool
2419 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
2421 if (dump_enabled_p ())
2422 dump_printf_loc (MSG_NOTE, vect_location,
2423 "=== vect_slp_analyze_and_verify_instance_alignment ===\n");
2425 slp_tree node;
2426 unsigned i;
2427 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2428 if (! vect_slp_analyze_and_verify_node_alignment (node))
2429 return false;
2431 node = SLP_INSTANCE_TREE (instance);
2432 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]))
2433 && ! vect_slp_analyze_and_verify_node_alignment
2434 (SLP_INSTANCE_TREE (instance)))
2435 return false;
2437 return true;
2441 /* Analyze groups of accesses: check that DR belongs to a group of
2442 accesses of legal size, step, etc. Detect gaps, single element
2443 interleaving, and other special cases. Set grouped access info.
2444 Collect groups of strided stores for further use in SLP analysis.
2445 Worker for vect_analyze_group_access. */
2447 static bool
2448 vect_analyze_group_access_1 (struct data_reference *dr)
2450 tree step = DR_STEP (dr);
2451 tree scalar_type = TREE_TYPE (DR_REF (dr));
2452 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2453 gimple *stmt = DR_STMT (dr);
2454 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2455 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2456 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2457 HOST_WIDE_INT dr_step = -1;
2458 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2459 bool slp_impossible = false;
2461 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2462 size of the interleaving group (including gaps). */
2463 if (tree_fits_shwi_p (step))
2465 dr_step = tree_to_shwi (step);
2466 /* Check that STEP is a multiple of type size. Otherwise there is
2467 a non-element-sized gap at the end of the group which we
2468 cannot represent in DR_GROUP_GAP or DR_GROUP_SIZE.
2469 ??? As we can handle non-constant step fine here we should
2470 simply remove uses of DR_GROUP_GAP between the last and first
2471 element and instead rely on DR_STEP. DR_GROUP_SIZE then would
2472 simply not include that gap. */
2473 if ((dr_step % type_size) != 0)
2475 if (dump_enabled_p ())
2477 dump_printf_loc (MSG_NOTE, vect_location,
2478 "Step ");
2479 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2480 dump_printf (MSG_NOTE,
2481 " is not a multiple of the element size for ");
2482 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2483 dump_printf (MSG_NOTE, "\n");
2485 return false;
2487 groupsize = absu_hwi (dr_step) / type_size;
2489 else
2490 groupsize = 0;
2492 /* Not consecutive access is possible only if it is a part of interleaving. */
2493 if (!DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2495 /* Check if it this DR is a part of interleaving, and is a single
2496 element of the group that is accessed in the loop. */
2498 /* Gaps are supported only for loads. STEP must be a multiple of the type
2499 size. */
2500 if (DR_IS_READ (dr)
2501 && (dr_step % type_size) == 0
2502 && groupsize > 0)
2504 DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2505 DR_GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2506 DR_GROUP_GAP (stmt_info) = groupsize - 1;
2507 if (dump_enabled_p ())
2509 dump_printf_loc (MSG_NOTE, vect_location,
2510 "Detected single element interleaving ");
2511 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2512 dump_printf (MSG_NOTE, " step ");
2513 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2514 dump_printf (MSG_NOTE, "\n");
2517 return true;
2520 if (dump_enabled_p ())
2522 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2523 "not consecutive access ");
2524 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2527 if (bb_vinfo)
2529 /* Mark the statement as unvectorizable. */
2530 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2531 return true;
2534 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2535 STMT_VINFO_STRIDED_P (stmt_info) = true;
2536 return true;
2539 if (DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2541 /* First stmt in the interleaving chain. Check the chain. */
2542 gimple *next = DR_GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2543 struct data_reference *data_ref = dr;
2544 unsigned int count = 1;
2545 tree prev_init = DR_INIT (data_ref);
2546 gimple *prev = stmt;
2547 HOST_WIDE_INT diff, gaps = 0;
2549 /* By construction, all group members have INTEGER_CST DR_INITs. */
2550 while (next)
2552 /* Skip same data-refs. In case that two or more stmts share
2553 data-ref (supported only for loads), we vectorize only the first
2554 stmt, and the rest get their vectorized loads from the first
2555 one. */
2556 if (!tree_int_cst_compare (DR_INIT (data_ref),
2557 DR_INIT (STMT_VINFO_DATA_REF (
2558 vinfo_for_stmt (next)))))
2560 if (DR_IS_WRITE (data_ref))
2562 if (dump_enabled_p ())
2563 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2564 "Two store stmts share the same dr.\n");
2565 return false;
2568 if (dump_enabled_p ())
2569 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2570 "Two or more load stmts share the same dr.\n");
2572 /* For load use the same data-ref load. */
2573 DR_GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2575 prev = next;
2576 next = DR_GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2577 continue;
2580 prev = next;
2581 data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2583 /* All group members have the same STEP by construction. */
2584 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2586 /* Check that the distance between two accesses is equal to the type
2587 size. Otherwise, we have gaps. */
2588 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2589 - TREE_INT_CST_LOW (prev_init)) / type_size;
2590 if (diff != 1)
2592 /* FORNOW: SLP of accesses with gaps is not supported. */
2593 slp_impossible = true;
2594 if (DR_IS_WRITE (data_ref))
2596 if (dump_enabled_p ())
2597 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2598 "interleaved store with gaps\n");
2599 return false;
2602 gaps += diff - 1;
2605 last_accessed_element += diff;
2607 /* Store the gap from the previous member of the group. If there is no
2608 gap in the access, DR_GROUP_GAP is always 1. */
2609 DR_GROUP_GAP (vinfo_for_stmt (next)) = diff;
2611 prev_init = DR_INIT (data_ref);
2612 next = DR_GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2613 /* Count the number of data-refs in the chain. */
2614 count++;
2617 if (groupsize == 0)
2618 groupsize = count + gaps;
2620 /* This could be UINT_MAX but as we are generating code in a very
2621 inefficient way we have to cap earlier. See PR78699 for example. */
2622 if (groupsize > 4096)
2624 if (dump_enabled_p ())
2625 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2626 "group is too large\n");
2627 return false;
2630 /* Check that the size of the interleaving is equal to count for stores,
2631 i.e., that there are no gaps. */
2632 if (groupsize != count
2633 && !DR_IS_READ (dr))
2635 if (dump_enabled_p ())
2636 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2637 "interleaved store with gaps\n");
2638 return false;
2641 /* If there is a gap after the last load in the group it is the
2642 difference between the groupsize and the last accessed
2643 element.
2644 When there is no gap, this difference should be 0. */
2645 DR_GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - last_accessed_element;
2647 DR_GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2648 if (dump_enabled_p ())
2650 dump_printf_loc (MSG_NOTE, vect_location,
2651 "Detected interleaving ");
2652 if (DR_IS_READ (dr))
2653 dump_printf (MSG_NOTE, "load ");
2654 else
2655 dump_printf (MSG_NOTE, "store ");
2656 dump_printf (MSG_NOTE, "of size %u starting with ",
2657 (unsigned)groupsize);
2658 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
2659 if (DR_GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
2660 dump_printf_loc (MSG_NOTE, vect_location,
2661 "There is a gap of %u elements after the group\n",
2662 DR_GROUP_GAP (vinfo_for_stmt (stmt)));
2665 /* SLP: create an SLP data structure for every interleaving group of
2666 stores for further analysis in vect_analyse_slp. */
2667 if (DR_IS_WRITE (dr) && !slp_impossible)
2669 if (loop_vinfo)
2670 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2671 if (bb_vinfo)
2672 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2676 return true;
2679 /* Analyze groups of accesses: check that DR belongs to a group of
2680 accesses of legal size, step, etc. Detect gaps, single element
2681 interleaving, and other special cases. Set grouped access info.
2682 Collect groups of strided stores for further use in SLP analysis. */
2684 static bool
2685 vect_analyze_group_access (struct data_reference *dr)
2687 if (!vect_analyze_group_access_1 (dr))
2689 /* Dissolve the group if present. */
2690 gimple *next;
2691 gimple *stmt = DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dr)));
2692 while (stmt)
2694 stmt_vec_info vinfo = vinfo_for_stmt (stmt);
2695 next = DR_GROUP_NEXT_ELEMENT (vinfo);
2696 DR_GROUP_FIRST_ELEMENT (vinfo) = NULL;
2697 DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2698 stmt = next;
2700 return false;
2702 return true;
2705 /* Analyze the access pattern of the data-reference DR.
2706 In case of non-consecutive accesses call vect_analyze_group_access() to
2707 analyze groups of accesses. */
2709 static bool
2710 vect_analyze_data_ref_access (struct data_reference *dr)
2712 tree step = DR_STEP (dr);
2713 tree scalar_type = TREE_TYPE (DR_REF (dr));
2714 gimple *stmt = DR_STMT (dr);
2715 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2716 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2717 struct loop *loop = NULL;
2719 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
2720 return true;
2722 if (loop_vinfo)
2723 loop = LOOP_VINFO_LOOP (loop_vinfo);
2725 if (loop_vinfo && !step)
2727 if (dump_enabled_p ())
2728 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2729 "bad data-ref access in loop\n");
2730 return false;
2733 /* Allow loads with zero step in inner-loop vectorization. */
2734 if (loop_vinfo && integer_zerop (step))
2736 DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2737 if (!nested_in_vect_loop_p (loop, stmt))
2738 return DR_IS_READ (dr);
2739 /* Allow references with zero step for outer loops marked
2740 with pragma omp simd only - it guarantees absence of
2741 loop-carried dependencies between inner loop iterations. */
2742 if (loop->safelen < 2)
2744 if (dump_enabled_p ())
2745 dump_printf_loc (MSG_NOTE, vect_location,
2746 "zero step in inner loop of nest\n");
2747 return false;
2751 if (loop && nested_in_vect_loop_p (loop, stmt))
2753 /* Interleaved accesses are not yet supported within outer-loop
2754 vectorization for references in the inner-loop. */
2755 DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2757 /* For the rest of the analysis we use the outer-loop step. */
2758 step = STMT_VINFO_DR_STEP (stmt_info);
2759 if (integer_zerop (step))
2761 if (dump_enabled_p ())
2762 dump_printf_loc (MSG_NOTE, vect_location,
2763 "zero step in outer loop.\n");
2764 return DR_IS_READ (dr);
2768 /* Consecutive? */
2769 if (TREE_CODE (step) == INTEGER_CST)
2771 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2772 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2773 || (dr_step < 0
2774 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2776 /* Mark that it is not interleaving. */
2777 DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2778 return true;
2782 if (loop && nested_in_vect_loop_p (loop, stmt))
2784 if (dump_enabled_p ())
2785 dump_printf_loc (MSG_NOTE, vect_location,
2786 "grouped access in outer loop.\n");
2787 return false;
2791 /* Assume this is a DR handled by non-constant strided load case. */
2792 if (TREE_CODE (step) != INTEGER_CST)
2793 return (STMT_VINFO_STRIDED_P (stmt_info)
2794 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2795 || vect_analyze_group_access (dr)));
2797 /* Not consecutive access - check if it's a part of interleaving group. */
2798 return vect_analyze_group_access (dr);
2801 /* Compare two data-references DRA and DRB to group them into chunks
2802 suitable for grouping. */
2804 static int
2805 dr_group_sort_cmp (const void *dra_, const void *drb_)
2807 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2808 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2809 int cmp;
2811 /* Stabilize sort. */
2812 if (dra == drb)
2813 return 0;
2815 /* DRs in different loops never belong to the same group. */
2816 loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father;
2817 loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father;
2818 if (loopa != loopb)
2819 return loopa->num < loopb->num ? -1 : 1;
2821 /* Ordering of DRs according to base. */
2822 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2823 DR_BASE_ADDRESS (drb));
2824 if (cmp != 0)
2825 return cmp;
2827 /* And according to DR_OFFSET. */
2828 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2829 if (cmp != 0)
2830 return cmp;
2832 /* Put reads before writes. */
2833 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2834 return DR_IS_READ (dra) ? -1 : 1;
2836 /* Then sort after access size. */
2837 cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2838 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2839 if (cmp != 0)
2840 return cmp;
2842 /* And after step. */
2843 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2844 if (cmp != 0)
2845 return cmp;
2847 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
2848 cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
2849 if (cmp == 0)
2850 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2851 return cmp;
2854 /* If OP is the result of a conversion, return the unconverted value,
2855 otherwise return null. */
2857 static tree
2858 strip_conversion (tree op)
2860 if (TREE_CODE (op) != SSA_NAME)
2861 return NULL_TREE;
2862 gimple *stmt = SSA_NAME_DEF_STMT (op);
2863 if (!is_gimple_assign (stmt)
2864 || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt)))
2865 return NULL_TREE;
2866 return gimple_assign_rhs1 (stmt);
2869 /* Return true if vectorizable_* routines can handle statements STMT1
2870 and STMT2 being in a single group. */
2872 static bool
2873 can_group_stmts_p (gimple *stmt1, gimple *stmt2)
2875 if (gimple_assign_single_p (stmt1))
2876 return gimple_assign_single_p (stmt2);
2878 if (is_gimple_call (stmt1) && gimple_call_internal_p (stmt1))
2880 /* Check for two masked loads or two masked stores. */
2881 if (!is_gimple_call (stmt2) || !gimple_call_internal_p (stmt2))
2882 return false;
2883 internal_fn ifn = gimple_call_internal_fn (stmt1);
2884 if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE)
2885 return false;
2886 if (ifn != gimple_call_internal_fn (stmt2))
2887 return false;
2889 /* Check that the masks are the same. Cope with casts of masks,
2890 like those created by build_mask_conversion. */
2891 tree mask1 = gimple_call_arg (stmt1, 2);
2892 tree mask2 = gimple_call_arg (stmt2, 2);
2893 if (!operand_equal_p (mask1, mask2, 0))
2895 mask1 = strip_conversion (mask1);
2896 if (!mask1)
2897 return false;
2898 mask2 = strip_conversion (mask2);
2899 if (!mask2)
2900 return false;
2901 if (!operand_equal_p (mask1, mask2, 0))
2902 return false;
2904 return true;
2907 return false;
2910 /* Function vect_analyze_data_ref_accesses.
2912 Analyze the access pattern of all the data references in the loop.
2914 FORNOW: the only access pattern that is considered vectorizable is a
2915 simple step 1 (consecutive) access.
2917 FORNOW: handle only arrays and pointer accesses. */
2919 bool
2920 vect_analyze_data_ref_accesses (vec_info *vinfo)
2922 unsigned int i;
2923 vec<data_reference_p> datarefs = vinfo->datarefs;
2924 struct data_reference *dr;
2926 if (dump_enabled_p ())
2927 dump_printf_loc (MSG_NOTE, vect_location,
2928 "=== vect_analyze_data_ref_accesses ===\n");
2930 if (datarefs.is_empty ())
2931 return true;
2933 /* Sort the array of datarefs to make building the interleaving chains
2934 linear. Don't modify the original vector's order, it is needed for
2935 determining what dependencies are reversed. */
2936 vec<data_reference_p> datarefs_copy = datarefs.copy ();
2937 datarefs_copy.qsort (dr_group_sort_cmp);
2939 /* Build the interleaving chains. */
2940 for (i = 0; i < datarefs_copy.length () - 1;)
2942 data_reference_p dra = datarefs_copy[i];
2943 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2944 stmt_vec_info lastinfo = NULL;
2945 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
2946 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a))
2948 ++i;
2949 continue;
2951 for (i = i + 1; i < datarefs_copy.length (); ++i)
2953 data_reference_p drb = datarefs_copy[i];
2954 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2955 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b)
2956 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
2957 break;
2959 /* ??? Imperfect sorting (non-compatible types, non-modulo
2960 accesses, same accesses) can lead to a group to be artificially
2961 split here as we don't just skip over those. If it really
2962 matters we can push those to a worklist and re-iterate
2963 over them. The we can just skip ahead to the next DR here. */
2965 /* DRs in a different loop should not be put into the same
2966 interleaving group. */
2967 if (gimple_bb (DR_STMT (dra))->loop_father
2968 != gimple_bb (DR_STMT (drb))->loop_father)
2969 break;
2971 /* Check that the data-refs have same first location (except init)
2972 and they are both either store or load (not load and store,
2973 not masked loads or stores). */
2974 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2975 || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2976 DR_BASE_ADDRESS (drb)) != 0
2977 || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
2978 || !can_group_stmts_p (DR_STMT (dra), DR_STMT (drb)))
2979 break;
2981 /* Check that the data-refs have the same constant size. */
2982 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2983 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2984 if (!tree_fits_uhwi_p (sza)
2985 || !tree_fits_uhwi_p (szb)
2986 || !tree_int_cst_equal (sza, szb))
2987 break;
2989 /* Check that the data-refs have the same step. */
2990 if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
2991 break;
2993 /* Check the types are compatible.
2994 ??? We don't distinguish this during sorting. */
2995 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2996 TREE_TYPE (DR_REF (drb))))
2997 break;
2999 /* Check that the DR_INITs are compile-time constants. */
3000 if (TREE_CODE (DR_INIT (dra)) != INTEGER_CST
3001 || TREE_CODE (DR_INIT (drb)) != INTEGER_CST)
3002 break;
3004 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
3005 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
3006 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
3007 HOST_WIDE_INT init_prev
3008 = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]));
3009 gcc_assert (init_a <= init_b
3010 && init_a <= init_prev
3011 && init_prev <= init_b);
3013 /* Do not place the same access in the interleaving chain twice. */
3014 if (init_b == init_prev)
3016 gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]))
3017 < gimple_uid (DR_STMT (drb)));
3018 /* ??? For now we simply "drop" the later reference which is
3019 otherwise the same rather than finishing off this group.
3020 In the end we'd want to re-process duplicates forming
3021 multiple groups from the refs, likely by just collecting
3022 all candidates (including duplicates and split points
3023 below) in a vector and then process them together. */
3024 continue;
3027 /* If init_b == init_a + the size of the type * k, we have an
3028 interleaving, and DRA is accessed before DRB. */
3029 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
3030 if (type_size_a == 0
3031 || (init_b - init_a) % type_size_a != 0)
3032 break;
3034 /* If we have a store, the accesses are adjacent. This splits
3035 groups into chunks we support (we don't support vectorization
3036 of stores with gaps). */
3037 if (!DR_IS_READ (dra) && init_b - init_prev != type_size_a)
3038 break;
3040 /* If the step (if not zero or non-constant) is greater than the
3041 difference between data-refs' inits this splits groups into
3042 suitable sizes. */
3043 if (tree_fits_shwi_p (DR_STEP (dra)))
3045 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
3046 if (step != 0 && step <= (init_b - init_a))
3047 break;
3050 if (dump_enabled_p ())
3052 dump_printf_loc (MSG_NOTE, vect_location,
3053 "Detected interleaving ");
3054 if (DR_IS_READ (dra))
3055 dump_printf (MSG_NOTE, "load ");
3056 else
3057 dump_printf (MSG_NOTE, "store ");
3058 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
3059 dump_printf (MSG_NOTE, " and ");
3060 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
3061 dump_printf (MSG_NOTE, "\n");
3064 /* Link the found element into the group list. */
3065 if (!DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3067 DR_GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
3068 lastinfo = stmtinfo_a;
3070 DR_GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
3071 DR_GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
3072 lastinfo = stmtinfo_b;
3076 FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
3077 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
3078 && !vect_analyze_data_ref_access (dr))
3080 if (dump_enabled_p ())
3081 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3082 "not vectorized: complicated access pattern.\n");
3084 if (is_a <bb_vec_info> (vinfo))
3086 /* Mark the statement as not vectorizable. */
3087 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
3088 continue;
3090 else
3092 datarefs_copy.release ();
3093 return false;
3097 datarefs_copy.release ();
3098 return true;
3101 /* Function vect_vfa_segment_size.
3103 Input:
3104 DR: The data reference.
3105 LENGTH_FACTOR: segment length to consider.
3107 Return a value suitable for the dr_with_seg_len::seg_len field.
3108 This is the "distance travelled" by the pointer from the first
3109 iteration in the segment to the last. Note that it does not include
3110 the size of the access; in effect it only describes the first byte. */
3112 static tree
3113 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
3115 length_factor = size_binop (MINUS_EXPR,
3116 fold_convert (sizetype, length_factor),
3117 size_one_node);
3118 return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr)),
3119 length_factor);
3122 /* Return a value that, when added to abs (vect_vfa_segment_size (dr)),
3123 gives the worst-case number of bytes covered by the segment. */
3125 static unsigned HOST_WIDE_INT
3126 vect_vfa_access_size (data_reference *dr)
3128 stmt_vec_info stmt_vinfo = vinfo_for_stmt (DR_STMT (dr));
3129 tree ref_type = TREE_TYPE (DR_REF (dr));
3130 unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type));
3131 unsigned HOST_WIDE_INT access_size = ref_size;
3132 if (DR_GROUP_FIRST_ELEMENT (stmt_vinfo))
3134 gcc_assert (DR_GROUP_FIRST_ELEMENT (stmt_vinfo) == DR_STMT (dr));
3135 access_size *= DR_GROUP_SIZE (stmt_vinfo) - DR_GROUP_GAP (stmt_vinfo);
3137 if (STMT_VINFO_VEC_STMT (stmt_vinfo)
3138 && (vect_supportable_dr_alignment (dr, false)
3139 == dr_explicit_realign_optimized))
3141 /* We might access a full vector's worth. */
3142 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
3143 access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size;
3145 return access_size;
3148 /* Get the minimum alignment for all the scalar accesses that DR describes. */
3150 static unsigned int
3151 vect_vfa_align (const data_reference *dr)
3153 return TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr)));
3156 /* Function vect_no_alias_p.
3158 Given data references A and B with equal base and offset, see whether
3159 the alias relation can be decided at compilation time. Return 1 if
3160 it can and the references alias, 0 if it can and the references do
3161 not alias, and -1 if we cannot decide at compile time. SEGMENT_LENGTH_A,
3162 SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
3163 of dr_with_seg_len::{seg_len,access_size} for A and B. */
3165 static int
3166 vect_compile_time_alias (struct data_reference *a, struct data_reference *b,
3167 tree segment_length_a, tree segment_length_b,
3168 unsigned HOST_WIDE_INT access_size_a,
3169 unsigned HOST_WIDE_INT access_size_b)
3171 poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a));
3172 poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b));
3173 poly_uint64 const_length_a;
3174 poly_uint64 const_length_b;
3176 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3177 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3178 [a, a+12) */
3179 if (tree_int_cst_compare (DR_STEP (a), size_zero_node) < 0)
3181 const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
3182 offset_a = (offset_a + access_size_a) - const_length_a;
3184 else
3185 const_length_a = tree_to_poly_uint64 (segment_length_a);
3186 if (tree_int_cst_compare (DR_STEP (b), size_zero_node) < 0)
3188 const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
3189 offset_b = (offset_b + access_size_b) - const_length_b;
3191 else
3192 const_length_b = tree_to_poly_uint64 (segment_length_b);
3194 const_length_a += access_size_a;
3195 const_length_b += access_size_b;
3197 if (ranges_known_overlap_p (offset_a, const_length_a,
3198 offset_b, const_length_b))
3199 return 1;
3201 if (!ranges_maybe_overlap_p (offset_a, const_length_a,
3202 offset_b, const_length_b))
3203 return 0;
3205 return -1;
3208 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3209 in DDR is >= VF. */
3211 static bool
3212 dependence_distance_ge_vf (data_dependence_relation *ddr,
3213 unsigned int loop_depth, poly_uint64 vf)
3215 if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3216 || DDR_NUM_DIST_VECTS (ddr) == 0)
3217 return false;
3219 /* If the dependence is exact, we should have limited the VF instead. */
3220 gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3222 unsigned int i;
3223 lambda_vector dist_v;
3224 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3226 HOST_WIDE_INT dist = dist_v[loop_depth];
3227 if (dist != 0
3228 && !(dist > 0 && DDR_REVERSED_P (ddr))
3229 && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
3230 return false;
3233 if (dump_enabled_p ())
3235 dump_printf_loc (MSG_NOTE, vect_location,
3236 "dependence distance between ");
3237 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
3238 dump_printf (MSG_NOTE, " and ");
3239 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
3240 dump_printf (MSG_NOTE, " is >= VF\n");
3243 return true;
3246 /* Dump LOWER_BOUND using flags DUMP_KIND. Dumps are known to be enabled. */
3248 static void
3249 dump_lower_bound (int dump_kind, const vec_lower_bound &lower_bound)
3251 dump_printf (dump_kind, "%s (", lower_bound.unsigned_p ? "unsigned" : "abs");
3252 dump_generic_expr (dump_kind, TDF_SLIM, lower_bound.expr);
3253 dump_printf (dump_kind, ") >= ");
3254 dump_dec (dump_kind, lower_bound.min_value);
3257 /* Record that the vectorized loop requires the vec_lower_bound described
3258 by EXPR, UNSIGNED_P and MIN_VALUE. */
3260 static void
3261 vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p,
3262 poly_uint64 min_value)
3264 vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3265 for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3266 if (operand_equal_p (lower_bounds[i].expr, expr, 0))
3268 unsigned_p &= lower_bounds[i].unsigned_p;
3269 min_value = upper_bound (lower_bounds[i].min_value, min_value);
3270 if (lower_bounds[i].unsigned_p != unsigned_p
3271 || maybe_lt (lower_bounds[i].min_value, min_value))
3273 lower_bounds[i].unsigned_p = unsigned_p;
3274 lower_bounds[i].min_value = min_value;
3275 if (dump_enabled_p ())
3277 dump_printf_loc (MSG_NOTE, vect_location,
3278 "updating run-time check to ");
3279 dump_lower_bound (MSG_NOTE, lower_bounds[i]);
3280 dump_printf (MSG_NOTE, "\n");
3283 return;
3286 vec_lower_bound lower_bound (expr, unsigned_p, min_value);
3287 if (dump_enabled_p ())
3289 dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that ");
3290 dump_lower_bound (MSG_NOTE, lower_bound);
3291 dump_printf (MSG_NOTE, "\n");
3293 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (lower_bound);
3296 /* Return true if it's unlikely that the step of the vectorized form of DR
3297 will span fewer than GAP bytes. */
3299 static bool
3300 vect_small_gap_p (loop_vec_info loop_vinfo, data_reference *dr, poly_int64 gap)
3302 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
3303 HOST_WIDE_INT count
3304 = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
3305 if (DR_GROUP_FIRST_ELEMENT (stmt_info))
3306 count *= DR_GROUP_SIZE (vinfo_for_stmt (DR_GROUP_FIRST_ELEMENT (stmt_info)));
3307 return estimated_poly_value (gap) <= count * vect_get_scalar_dr_size (dr);
3310 /* Return true if we know that there is no alias between DR_A and DR_B
3311 when abs (DR_STEP (DR_A)) >= N for some N. When returning true, set
3312 *LOWER_BOUND_OUT to this N. */
3314 static bool
3315 vectorizable_with_step_bound_p (data_reference *dr_a, data_reference *dr_b,
3316 poly_uint64 *lower_bound_out)
3318 /* Check that there is a constant gap of known sign between DR_A
3319 and DR_B. */
3320 poly_int64 init_a, init_b;
3321 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0)
3322 || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0)
3323 || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)
3324 || !poly_int_tree_p (DR_INIT (dr_a), &init_a)
3325 || !poly_int_tree_p (DR_INIT (dr_b), &init_b)
3326 || !ordered_p (init_a, init_b))
3327 return false;
3329 /* Sort DR_A and DR_B by the address they access. */
3330 if (maybe_lt (init_b, init_a))
3332 std::swap (init_a, init_b);
3333 std::swap (dr_a, dr_b);
3336 /* If the two accesses could be dependent within a scalar iteration,
3337 make sure that we'd retain their order. */
3338 if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_a), init_b)
3339 && !vect_preserves_scalar_order_p (DR_STMT (dr_a), DR_STMT (dr_b)))
3340 return false;
3342 /* There is no alias if abs (DR_STEP) is greater than or equal to
3343 the bytes spanned by the combination of the two accesses. */
3344 *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_b) - init_a;
3345 return true;
3348 /* Function vect_prune_runtime_alias_test_list.
3350 Prune a list of ddrs to be tested at run-time by versioning for alias.
3351 Merge several alias checks into one if possible.
3352 Return FALSE if resulting list of ddrs is longer then allowed by
3353 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
3355 bool
3356 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3358 typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3359 hash_set <tree_pair_hash> compared_objects;
3361 vec<ddr_p> may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3362 vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3363 = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3364 vec<vec_object_pair> &check_unequal_addrs
3365 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3366 poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3367 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3369 ddr_p ddr;
3370 unsigned int i;
3371 tree length_factor;
3373 if (dump_enabled_p ())
3374 dump_printf_loc (MSG_NOTE, vect_location,
3375 "=== vect_prune_runtime_alias_test_list ===\n");
3377 /* Step values are irrelevant for aliasing if the number of vector
3378 iterations is equal to the number of scalar iterations (which can
3379 happen for fully-SLP loops). */
3380 bool ignore_step_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U);
3382 if (!ignore_step_p)
3384 /* Convert the checks for nonzero steps into bound tests. */
3385 tree value;
3386 FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value)
3387 vect_check_lower_bound (loop_vinfo, value, true, 1);
3390 if (may_alias_ddrs.is_empty ())
3391 return true;
3393 comp_alias_ddrs.create (may_alias_ddrs.length ());
3395 unsigned int loop_depth
3396 = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3397 LOOP_VINFO_LOOP_NEST (loop_vinfo));
3399 /* First, we collect all data ref pairs for aliasing checks. */
3400 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3402 int comp_res;
3403 poly_uint64 lower_bound;
3404 struct data_reference *dr_a, *dr_b;
3405 gimple *dr_group_first_a, *dr_group_first_b;
3406 tree segment_length_a, segment_length_b;
3407 unsigned HOST_WIDE_INT access_size_a, access_size_b;
3408 unsigned int align_a, align_b;
3409 gimple *stmt_a, *stmt_b;
3411 /* Ignore the alias if the VF we chose ended up being no greater
3412 than the dependence distance. */
3413 if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3414 continue;
3416 if (DDR_OBJECT_A (ddr))
3418 vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3419 if (!compared_objects.add (new_pair))
3421 if (dump_enabled_p ())
3423 dump_printf_loc (MSG_NOTE, vect_location, "checking that ");
3424 dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.first);
3425 dump_printf (MSG_NOTE, " and ");
3426 dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.second);
3427 dump_printf (MSG_NOTE, " have different addresses\n");
3429 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3431 continue;
3434 dr_a = DDR_A (ddr);
3435 stmt_a = DR_STMT (DDR_A (ddr));
3437 dr_b = DDR_B (ddr);
3438 stmt_b = DR_STMT (DDR_B (ddr));
3440 /* Skip the pair if inter-iteration dependencies are irrelevant
3441 and intra-iteration dependencies are guaranteed to be honored. */
3442 if (ignore_step_p
3443 && (vect_preserves_scalar_order_p (stmt_a, stmt_b)
3444 || vectorizable_with_step_bound_p (dr_a, dr_b, &lower_bound)))
3446 if (dump_enabled_p ())
3448 dump_printf_loc (MSG_NOTE, vect_location,
3449 "no need for alias check between ");
3450 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
3451 dump_printf (MSG_NOTE, " and ");
3452 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
3453 dump_printf (MSG_NOTE, " when VF is 1\n");
3455 continue;
3458 /* See whether we can handle the alias using a bounds check on
3459 the step, and whether that's likely to be the best approach.
3460 (It might not be, for example, if the minimum step is much larger
3461 than the number of bytes handled by one vector iteration.) */
3462 if (!ignore_step_p
3463 && TREE_CODE (DR_STEP (dr_a)) != INTEGER_CST
3464 && vectorizable_with_step_bound_p (dr_a, dr_b, &lower_bound)
3465 && (vect_small_gap_p (loop_vinfo, dr_a, lower_bound)
3466 || vect_small_gap_p (loop_vinfo, dr_b, lower_bound)))
3468 bool unsigned_p = dr_known_forward_stride_p (dr_a);
3469 if (dump_enabled_p ())
3471 dump_printf_loc (MSG_NOTE, vect_location, "no alias between ");
3472 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
3473 dump_printf (MSG_NOTE, " and ");
3474 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
3475 dump_printf (MSG_NOTE, " when the step ");
3476 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_STEP (dr_a));
3477 dump_printf (MSG_NOTE, " is outside ");
3478 if (unsigned_p)
3479 dump_printf (MSG_NOTE, "[0");
3480 else
3482 dump_printf (MSG_NOTE, "(");
3483 dump_dec (MSG_NOTE, poly_int64 (-lower_bound));
3485 dump_printf (MSG_NOTE, ", ");
3486 dump_dec (MSG_NOTE, lower_bound);
3487 dump_printf (MSG_NOTE, ")\n");
3489 vect_check_lower_bound (loop_vinfo, DR_STEP (dr_a), unsigned_p,
3490 lower_bound);
3491 continue;
3494 dr_group_first_a = DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
3495 if (dr_group_first_a)
3497 stmt_a = dr_group_first_a;
3498 dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
3501 dr_group_first_b = DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
3502 if (dr_group_first_b)
3504 stmt_b = dr_group_first_b;
3505 dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
3508 if (ignore_step_p)
3510 segment_length_a = size_zero_node;
3511 segment_length_b = size_zero_node;
3513 else
3515 if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
3516 length_factor = scalar_loop_iters;
3517 else
3518 length_factor = size_int (vect_factor);
3519 segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
3520 segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
3522 access_size_a = vect_vfa_access_size (dr_a);
3523 access_size_b = vect_vfa_access_size (dr_b);
3524 align_a = vect_vfa_align (dr_a);
3525 align_b = vect_vfa_align (dr_b);
3527 comp_res = data_ref_compare_tree (DR_BASE_ADDRESS (dr_a),
3528 DR_BASE_ADDRESS (dr_b));
3529 if (comp_res == 0)
3530 comp_res = data_ref_compare_tree (DR_OFFSET (dr_a),
3531 DR_OFFSET (dr_b));
3533 /* See whether the alias is known at compilation time. */
3534 if (comp_res == 0
3535 && TREE_CODE (DR_STEP (dr_a)) == INTEGER_CST
3536 && TREE_CODE (DR_STEP (dr_b)) == INTEGER_CST
3537 && poly_int_tree_p (segment_length_a)
3538 && poly_int_tree_p (segment_length_b))
3540 int res = vect_compile_time_alias (dr_a, dr_b,
3541 segment_length_a,
3542 segment_length_b,
3543 access_size_a,
3544 access_size_b);
3545 if (res >= 0 && dump_enabled_p ())
3547 dump_printf_loc (MSG_NOTE, vect_location,
3548 "can tell at compile time that ");
3549 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
3550 dump_printf (MSG_NOTE, " and ");
3551 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
3552 if (res == 0)
3553 dump_printf (MSG_NOTE, " do not alias\n");
3554 else
3555 dump_printf (MSG_NOTE, " alias\n");
3558 if (res == 0)
3559 continue;
3561 if (res == 1)
3563 if (dump_enabled_p ())
3564 dump_printf_loc (MSG_NOTE, vect_location,
3565 "not vectorized: compilation time alias.\n");
3566 return false;
3570 dr_with_seg_len_pair_t dr_with_seg_len_pair
3571 (dr_with_seg_len (dr_a, segment_length_a, access_size_a, align_a),
3572 dr_with_seg_len (dr_b, segment_length_b, access_size_b, align_b));
3574 /* Canonicalize pairs by sorting the two DR members. */
3575 if (comp_res > 0)
3576 std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
3578 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3581 prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);
3583 unsigned int count = (comp_alias_ddrs.length ()
3584 + check_unequal_addrs.length ());
3586 dump_printf_loc (MSG_NOTE, vect_location,
3587 "improved number of alias checks from %d to %d\n",
3588 may_alias_ddrs.length (), count);
3589 if ((int) count > PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
3591 if (dump_enabled_p ())
3592 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3593 "number of versioning for alias "
3594 "run-time tests exceeds %d "
3595 "(--param vect-max-version-for-alias-checks)\n",
3596 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
3597 return false;
3600 return true;
3603 /* Check whether we can use an internal function for a gather load
3604 or scatter store. READ_P is true for loads and false for stores.
3605 MASKED_P is true if the load or store is conditional. MEMORY_TYPE is
3606 the type of the memory elements being loaded or stored. OFFSET_BITS
3607 is the number of bits in each scalar offset and OFFSET_SIGN is the
3608 sign of the offset. SCALE is the amount by which the offset should
3609 be multiplied *after* it has been converted to address width.
3611 Return true if the function is supported, storing the function
3612 id in *IFN_OUT and the type of a vector element in *ELEMENT_TYPE_OUT. */
3614 bool
3615 vect_gather_scatter_fn_p (bool read_p, bool masked_p, tree vectype,
3616 tree memory_type, unsigned int offset_bits,
3617 signop offset_sign, int scale,
3618 internal_fn *ifn_out, tree *element_type_out)
3620 unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
3621 unsigned int element_bits = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (vectype)));
3622 if (offset_bits > element_bits)
3623 /* Internal functions require the offset to be the same width as
3624 the vector elements. We can extend narrower offsets, but it isn't
3625 safe to truncate wider offsets. */
3626 return false;
3628 if (element_bits != memory_bits)
3629 /* For now the vector elements must be the same width as the
3630 memory elements. */
3631 return false;
3633 /* Work out which function we need. */
3634 internal_fn ifn;
3635 if (read_p)
3636 ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
3637 else
3638 ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
3640 /* Test whether the target supports this combination. */
3641 if (!internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
3642 offset_sign, scale))
3643 return false;
3645 *ifn_out = ifn;
3646 *element_type_out = TREE_TYPE (vectype);
3647 return true;
3650 /* CALL is a call to an internal gather load or scatter store function.
3651 Describe the operation in INFO. */
3653 static void
3654 vect_describe_gather_scatter_call (gcall *call, gather_scatter_info *info)
3656 stmt_vec_info stmt_info = vinfo_for_stmt (call);
3657 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3658 data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3660 info->ifn = gimple_call_internal_fn (call);
3661 info->decl = NULL_TREE;
3662 info->base = gimple_call_arg (call, 0);
3663 info->offset = gimple_call_arg (call, 1);
3664 info->offset_dt = vect_unknown_def_type;
3665 info->offset_vectype = NULL_TREE;
3666 info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
3667 info->element_type = TREE_TYPE (vectype);
3668 info->memory_type = TREE_TYPE (DR_REF (dr));
3671 /* Return true if a non-affine read or write in STMT is suitable for a
3672 gather load or scatter store. Describe the operation in *INFO if so. */
3674 bool
3675 vect_check_gather_scatter (gimple *stmt, loop_vec_info loop_vinfo,
3676 gather_scatter_info *info)
3678 HOST_WIDE_INT scale = 1;
3679 poly_int64 pbitpos, pbitsize;
3680 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3681 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3682 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3683 tree offtype = NULL_TREE;
3684 tree decl = NULL_TREE, base, off;
3685 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3686 tree memory_type = TREE_TYPE (DR_REF (dr));
3687 machine_mode pmode;
3688 int punsignedp, reversep, pvolatilep = 0;
3689 internal_fn ifn;
3690 tree element_type;
3691 bool masked_p = false;
3693 /* See whether this is already a call to a gather/scatter internal function.
3694 If not, see whether it's a masked load or store. */
3695 gcall *call = dyn_cast <gcall *> (stmt);
3696 if (call && gimple_call_internal_p (call))
3698 ifn = gimple_call_internal_fn (stmt);
3699 if (internal_gather_scatter_fn_p (ifn))
3701 vect_describe_gather_scatter_call (call, info);
3702 return true;
3704 masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE);
3707 /* True if we should aim to use internal functions rather than
3708 built-in functions. */
3709 bool use_ifn_p = (DR_IS_READ (dr)
3710 ? supports_vec_gather_load_p ()
3711 : supports_vec_scatter_store_p ());
3713 base = DR_REF (dr);
3714 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3715 see if we can use the def stmt of the address. */
3716 if (masked_p
3717 && TREE_CODE (base) == MEM_REF
3718 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3719 && integer_zerop (TREE_OPERAND (base, 1))
3720 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3722 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3723 if (is_gimple_assign (def_stmt)
3724 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3725 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3728 /* The gather and scatter builtins need address of the form
3729 loop_invariant + vector * {1, 2, 4, 8}
3731 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3732 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3733 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3734 multiplications and additions in it. To get a vector, we need
3735 a single SSA_NAME that will be defined in the loop and will
3736 contain everything that is not loop invariant and that can be
3737 vectorized. The following code attempts to find such a preexistng
3738 SSA_NAME OFF and put the loop invariants into a tree BASE
3739 that can be gimplified before the loop. */
3740 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3741 &punsignedp, &reversep, &pvolatilep);
3742 gcc_assert (base && !reversep);
3743 poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
3745 if (TREE_CODE (base) == MEM_REF)
3747 if (!integer_zerop (TREE_OPERAND (base, 1)))
3749 if (off == NULL_TREE)
3750 off = wide_int_to_tree (sizetype, mem_ref_offset (base));
3751 else
3752 off = size_binop (PLUS_EXPR, off,
3753 fold_convert (sizetype, TREE_OPERAND (base, 1)));
3755 base = TREE_OPERAND (base, 0);
3757 else
3758 base = build_fold_addr_expr (base);
3760 if (off == NULL_TREE)
3761 off = size_zero_node;
3763 /* If base is not loop invariant, either off is 0, then we start with just
3764 the constant offset in the loop invariant BASE and continue with base
3765 as OFF, otherwise give up.
3766 We could handle that case by gimplifying the addition of base + off
3767 into some SSA_NAME and use that as off, but for now punt. */
3768 if (!expr_invariant_in_loop_p (loop, base))
3770 if (!integer_zerop (off))
3771 return false;
3772 off = base;
3773 base = size_int (pbytepos);
3775 /* Otherwise put base + constant offset into the loop invariant BASE
3776 and continue with OFF. */
3777 else
3779 base = fold_convert (sizetype, base);
3780 base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
3783 /* OFF at this point may be either a SSA_NAME or some tree expression
3784 from get_inner_reference. Try to peel off loop invariants from it
3785 into BASE as long as possible. */
3786 STRIP_NOPS (off);
3787 while (offtype == NULL_TREE)
3789 enum tree_code code;
3790 tree op0, op1, add = NULL_TREE;
3792 if (TREE_CODE (off) == SSA_NAME)
3794 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3796 if (expr_invariant_in_loop_p (loop, off))
3797 return false;
3799 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3800 break;
3802 op0 = gimple_assign_rhs1 (def_stmt);
3803 code = gimple_assign_rhs_code (def_stmt);
3804 op1 = gimple_assign_rhs2 (def_stmt);
3806 else
3808 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3809 return false;
3810 code = TREE_CODE (off);
3811 extract_ops_from_tree (off, &code, &op0, &op1);
3813 switch (code)
3815 case POINTER_PLUS_EXPR:
3816 case PLUS_EXPR:
3817 if (expr_invariant_in_loop_p (loop, op0))
3819 add = op0;
3820 off = op1;
3821 do_add:
3822 add = fold_convert (sizetype, add);
3823 if (scale != 1)
3824 add = size_binop (MULT_EXPR, add, size_int (scale));
3825 base = size_binop (PLUS_EXPR, base, add);
3826 continue;
3828 if (expr_invariant_in_loop_p (loop, op1))
3830 add = op1;
3831 off = op0;
3832 goto do_add;
3834 break;
3835 case MINUS_EXPR:
3836 if (expr_invariant_in_loop_p (loop, op1))
3838 add = fold_convert (sizetype, op1);
3839 add = size_binop (MINUS_EXPR, size_zero_node, add);
3840 off = op0;
3841 goto do_add;
3843 break;
3844 case MULT_EXPR:
3845 if (scale == 1 && tree_fits_shwi_p (op1))
3847 int new_scale = tree_to_shwi (op1);
3848 /* Only treat this as a scaling operation if the target
3849 supports it. */
3850 if (use_ifn_p
3851 && !vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p,
3852 vectype, memory_type, 1,
3853 TYPE_SIGN (TREE_TYPE (op0)),
3854 new_scale, &ifn,
3855 &element_type))
3856 break;
3857 scale = new_scale;
3858 off = op0;
3859 continue;
3861 break;
3862 case SSA_NAME:
3863 off = op0;
3864 continue;
3865 CASE_CONVERT:
3866 if (!POINTER_TYPE_P (TREE_TYPE (op0))
3867 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3868 break;
3869 if (TYPE_PRECISION (TREE_TYPE (op0))
3870 == TYPE_PRECISION (TREE_TYPE (off)))
3872 off = op0;
3873 continue;
3876 /* The internal functions need the offset to be the same width
3877 as the elements of VECTYPE. Don't include operations that
3878 cast the offset from that width to a different width. */
3879 if (use_ifn_p
3880 && (int_size_in_bytes (TREE_TYPE (vectype))
3881 == int_size_in_bytes (TREE_TYPE (off))))
3882 break;
3884 if (TYPE_PRECISION (TREE_TYPE (op0))
3885 < TYPE_PRECISION (TREE_TYPE (off)))
3887 off = op0;
3888 offtype = TREE_TYPE (off);
3889 STRIP_NOPS (off);
3890 continue;
3892 break;
3893 default:
3894 break;
3896 break;
3899 /* If at the end OFF still isn't a SSA_NAME or isn't
3900 defined in the loop, punt. */
3901 if (TREE_CODE (off) != SSA_NAME
3902 || expr_invariant_in_loop_p (loop, off))
3903 return false;
3905 if (offtype == NULL_TREE)
3906 offtype = TREE_TYPE (off);
3908 if (use_ifn_p)
3910 if (!vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p, vectype,
3911 memory_type, TYPE_PRECISION (offtype),
3912 TYPE_SIGN (offtype), scale, &ifn,
3913 &element_type))
3914 return false;
3916 else
3918 if (DR_IS_READ (dr))
3920 if (targetm.vectorize.builtin_gather)
3921 decl = targetm.vectorize.builtin_gather (vectype, offtype, scale);
3923 else
3925 if (targetm.vectorize.builtin_scatter)
3926 decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale);
3929 if (!decl)
3930 return false;
3932 ifn = IFN_LAST;
3933 element_type = TREE_TYPE (vectype);
3936 info->ifn = ifn;
3937 info->decl = decl;
3938 info->base = base;
3939 info->offset = off;
3940 info->offset_dt = vect_unknown_def_type;
3941 info->offset_vectype = NULL_TREE;
3942 info->scale = scale;
3943 info->element_type = element_type;
3944 info->memory_type = memory_type;
3945 return true;
3948 /* Find the data references in STMT, analyze them with respect to LOOP and
3949 append them to DATAREFS. Return false if datarefs in this stmt cannot
3950 be handled. */
3952 bool
3953 vect_find_stmt_data_reference (loop_p loop, gimple *stmt,
3954 vec<data_reference_p> *datarefs)
3956 /* We can ignore clobbers for dataref analysis - they are removed during
3957 loop vectorization and BB vectorization checks dependences with a
3958 stmt walk. */
3959 if (gimple_clobber_p (stmt))
3960 return true;
3962 if (gimple_has_volatile_ops (stmt))
3964 if (dump_enabled_p ())
3966 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3967 "not vectorized: volatile type ");
3968 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3970 return false;
3973 if (stmt_can_throw_internal (stmt))
3975 if (dump_enabled_p ())
3977 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3978 "not vectorized: statement can throw an "
3979 "exception ");
3980 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3982 return false;
3985 auto_vec<data_reference_p, 2> refs;
3986 if (!find_data_references_in_stmt (loop, stmt, &refs))
3987 return false;
3989 if (refs.is_empty ())
3990 return true;
3992 if (refs.length () > 1)
3994 if (dump_enabled_p ())
3996 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3997 "not vectorized: more than one data ref "
3998 "in stmt: ");
3999 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4001 return false;
4004 if (gcall *call = dyn_cast <gcall *> (stmt))
4005 if (!gimple_call_internal_p (call)
4006 || (gimple_call_internal_fn (call) != IFN_MASK_LOAD
4007 && gimple_call_internal_fn (call) != IFN_MASK_STORE))
4009 if (dump_enabled_p ())
4011 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4012 "not vectorized: dr in a call ");
4013 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4015 return false;
4018 data_reference_p dr = refs.pop ();
4019 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
4020 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
4022 if (dump_enabled_p ())
4024 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4025 "not vectorized: statement is bitfield "
4026 "access ");
4027 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4029 return false;
4032 if (DR_BASE_ADDRESS (dr)
4033 && TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
4035 if (dump_enabled_p ())
4036 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4037 "not vectorized: base addr of dr is a "
4038 "constant\n");
4039 return false;
4042 datarefs->safe_push (dr);
4043 return true;
4046 /* Function vect_analyze_data_refs.
4048 Find all the data references in the loop or basic block.
4050 The general structure of the analysis of data refs in the vectorizer is as
4051 follows:
4052 1- vect_analyze_data_refs(loop/bb): call
4053 compute_data_dependences_for_loop/bb to find and analyze all data-refs
4054 in the loop/bb and their dependences.
4055 2- vect_analyze_dependences(): apply dependence testing using ddrs.
4056 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
4057 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
4061 bool
4062 vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf)
4064 struct loop *loop = NULL;
4065 unsigned int i;
4066 struct data_reference *dr;
4067 tree scalar_type;
4069 if (dump_enabled_p ())
4070 dump_printf_loc (MSG_NOTE, vect_location,
4071 "=== vect_analyze_data_refs ===\n");
4073 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4074 loop = LOOP_VINFO_LOOP (loop_vinfo);
4076 /* Go through the data-refs, check that the analysis succeeded. Update
4077 pointer from stmt_vec_info struct to DR and vectype. */
4079 vec<data_reference_p> datarefs = vinfo->datarefs;
4080 FOR_EACH_VEC_ELT (datarefs, i, dr)
4082 gimple *stmt;
4083 stmt_vec_info stmt_info;
4084 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
4085 bool simd_lane_access = false;
4086 poly_uint64 vf;
4088 gcc_assert (DR_REF (dr));
4089 stmt = DR_STMT (dr);
4090 stmt_info = vinfo_for_stmt (stmt);
4092 /* Check that analysis of the data-ref succeeded. */
4093 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
4094 || !DR_STEP (dr))
4096 bool maybe_gather
4097 = DR_IS_READ (dr)
4098 && !TREE_THIS_VOLATILE (DR_REF (dr))
4099 && (targetm.vectorize.builtin_gather != NULL
4100 || supports_vec_gather_load_p ());
4101 bool maybe_scatter
4102 = DR_IS_WRITE (dr)
4103 && !TREE_THIS_VOLATILE (DR_REF (dr))
4104 && (targetm.vectorize.builtin_scatter != NULL
4105 || supports_vec_scatter_store_p ());
4106 bool maybe_simd_lane_access
4107 = is_a <loop_vec_info> (vinfo) && loop->simduid;
4109 /* If target supports vector gather loads or scatter stores, or if
4110 this might be a SIMD lane access, see if they can't be used. */
4111 if (is_a <loop_vec_info> (vinfo)
4112 && (maybe_gather || maybe_scatter || maybe_simd_lane_access)
4113 && !nested_in_vect_loop_p (loop, stmt))
4115 struct data_reference *newdr
4116 = create_data_ref (NULL, loop_containing_stmt (stmt),
4117 DR_REF (dr), stmt, !maybe_scatter,
4118 DR_IS_CONDITIONAL_IN_STMT (dr));
4119 gcc_assert (newdr != NULL && DR_REF (newdr));
4120 if (DR_BASE_ADDRESS (newdr)
4121 && DR_OFFSET (newdr)
4122 && DR_INIT (newdr)
4123 && DR_STEP (newdr)
4124 && integer_zerop (DR_STEP (newdr)))
4126 if (maybe_simd_lane_access)
4128 tree off = DR_OFFSET (newdr);
4129 STRIP_NOPS (off);
4130 if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
4131 && TREE_CODE (off) == MULT_EXPR
4132 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
4134 tree step = TREE_OPERAND (off, 1);
4135 off = TREE_OPERAND (off, 0);
4136 STRIP_NOPS (off);
4137 if (CONVERT_EXPR_P (off)
4138 && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
4139 0)))
4140 < TYPE_PRECISION (TREE_TYPE (off)))
4141 off = TREE_OPERAND (off, 0);
4142 if (TREE_CODE (off) == SSA_NAME)
4144 gimple *def = SSA_NAME_DEF_STMT (off);
4145 tree reft = TREE_TYPE (DR_REF (newdr));
4146 if (is_gimple_call (def)
4147 && gimple_call_internal_p (def)
4148 && (gimple_call_internal_fn (def)
4149 == IFN_GOMP_SIMD_LANE))
4151 tree arg = gimple_call_arg (def, 0);
4152 gcc_assert (TREE_CODE (arg) == SSA_NAME);
4153 arg = SSA_NAME_VAR (arg);
4154 if (arg == loop->simduid
4155 /* For now. */
4156 && tree_int_cst_equal
4157 (TYPE_SIZE_UNIT (reft),
4158 step))
4160 DR_OFFSET (newdr) = ssize_int (0);
4161 DR_STEP (newdr) = step;
4162 DR_OFFSET_ALIGNMENT (newdr)
4163 = BIGGEST_ALIGNMENT;
4164 DR_STEP_ALIGNMENT (newdr)
4165 = highest_pow2_factor (step);
4166 dr = newdr;
4167 simd_lane_access = true;
4173 if (!simd_lane_access && (maybe_gather || maybe_scatter))
4175 dr = newdr;
4176 if (maybe_gather)
4177 gatherscatter = GATHER;
4178 else
4179 gatherscatter = SCATTER;
4182 if (gatherscatter == SG_NONE && !simd_lane_access)
4183 free_data_ref (newdr);
4186 if (gatherscatter == SG_NONE && !simd_lane_access)
4188 if (dump_enabled_p ())
4190 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4191 "not vectorized: data ref analysis "
4192 "failed ");
4193 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4195 if (is_a <bb_vec_info> (vinfo))
4197 /* In BB vectorization the ref can still participate
4198 in dependence analysis, we just can't vectorize it. */
4199 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4200 continue;
4202 return false;
4206 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == ADDR_EXPR
4207 && VAR_P (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0))
4208 && DECL_NONALIASED (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0)))
4210 if (dump_enabled_p ())
4212 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4213 "not vectorized: base object not addressable "
4214 "for stmt: ");
4215 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4217 if (is_a <bb_vec_info> (vinfo))
4219 /* In BB vectorization the ref can still participate
4220 in dependence analysis, we just can't vectorize it. */
4221 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4222 continue;
4224 return false;
4227 if (is_a <loop_vec_info> (vinfo)
4228 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
4230 if (nested_in_vect_loop_p (loop, stmt))
4232 if (dump_enabled_p ())
4234 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4235 "not vectorized: not suitable for strided "
4236 "load ");
4237 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4239 return false;
4241 STMT_VINFO_STRIDED_P (stmt_info) = true;
4244 /* Update DR field in stmt_vec_info struct. */
4246 /* If the dataref is in an inner-loop of the loop that is considered for
4247 for vectorization, we also want to analyze the access relative to
4248 the outer-loop (DR contains information only relative to the
4249 inner-most enclosing loop). We do that by building a reference to the
4250 first location accessed by the inner-loop, and analyze it relative to
4251 the outer-loop. */
4252 if (loop && nested_in_vect_loop_p (loop, stmt))
4254 /* Build a reference to the first location accessed by the
4255 inner loop: *(BASE + INIT + OFFSET). By construction,
4256 this address must be invariant in the inner loop, so we
4257 can consider it as being used in the outer loop. */
4258 tree base = unshare_expr (DR_BASE_ADDRESS (dr));
4259 tree offset = unshare_expr (DR_OFFSET (dr));
4260 tree init = unshare_expr (DR_INIT (dr));
4261 tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
4262 init, offset);
4263 tree init_addr = fold_build_pointer_plus (base, init_offset);
4264 tree init_ref = build_fold_indirect_ref (init_addr);
4266 if (dump_enabled_p ())
4268 dump_printf_loc (MSG_NOTE, vect_location,
4269 "analyze in outer loop: ");
4270 dump_generic_expr (MSG_NOTE, TDF_SLIM, init_ref);
4271 dump_printf (MSG_NOTE, "\n");
4274 if (!dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
4275 init_ref, loop))
4276 /* dr_analyze_innermost already explained the failure. */
4277 return false;
4279 if (dump_enabled_p ())
4281 dump_printf_loc (MSG_NOTE, vect_location,
4282 "\touter base_address: ");
4283 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4284 STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
4285 dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
4286 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4287 STMT_VINFO_DR_OFFSET (stmt_info));
4288 dump_printf (MSG_NOTE,
4289 "\n\touter constant offset from base address: ");
4290 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4291 STMT_VINFO_DR_INIT (stmt_info));
4292 dump_printf (MSG_NOTE, "\n\touter step: ");
4293 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4294 STMT_VINFO_DR_STEP (stmt_info));
4295 dump_printf (MSG_NOTE, "\n\touter base alignment: %d\n",
4296 STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info));
4297 dump_printf (MSG_NOTE, "\n\touter base misalignment: %d\n",
4298 STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info));
4299 dump_printf (MSG_NOTE, "\n\touter offset alignment: %d\n",
4300 STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info));
4301 dump_printf (MSG_NOTE, "\n\touter step alignment: %d\n",
4302 STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
4306 gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
4307 STMT_VINFO_DATA_REF (stmt_info) = dr;
4308 if (simd_lane_access)
4310 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
4311 free_data_ref (datarefs[i]);
4312 datarefs[i] = dr;
4315 /* Set vectype for STMT. */
4316 scalar_type = TREE_TYPE (DR_REF (dr));
4317 STMT_VINFO_VECTYPE (stmt_info)
4318 = get_vectype_for_scalar_type (scalar_type);
4319 if (!STMT_VINFO_VECTYPE (stmt_info))
4321 if (dump_enabled_p ())
4323 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4324 "not vectorized: no vectype for stmt: ");
4325 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4326 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
4327 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
4328 scalar_type);
4329 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
4332 if (is_a <bb_vec_info> (vinfo))
4334 /* No vector type is fine, the ref can still participate
4335 in dependence analysis, we just can't vectorize it. */
4336 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4337 continue;
4340 if (gatherscatter != SG_NONE || simd_lane_access)
4342 STMT_VINFO_DATA_REF (stmt_info) = NULL;
4343 if (gatherscatter != SG_NONE)
4344 free_data_ref (dr);
4346 return false;
4348 else
4350 if (dump_enabled_p ())
4352 dump_printf_loc (MSG_NOTE, vect_location,
4353 "got vectype for stmt: ");
4354 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
4355 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4356 STMT_VINFO_VECTYPE (stmt_info));
4357 dump_printf (MSG_NOTE, "\n");
4361 /* Adjust the minimal vectorization factor according to the
4362 vector type. */
4363 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
4364 *min_vf = upper_bound (*min_vf, vf);
4366 if (gatherscatter != SG_NONE)
4368 gather_scatter_info gs_info;
4369 if (!vect_check_gather_scatter (stmt, as_a <loop_vec_info> (vinfo),
4370 &gs_info)
4371 || !get_vectype_for_scalar_type (TREE_TYPE (gs_info.offset)))
4373 STMT_VINFO_DATA_REF (stmt_info) = NULL;
4374 free_data_ref (dr);
4375 if (dump_enabled_p ())
4377 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4378 (gatherscatter == GATHER) ?
4379 "not vectorized: not suitable for gather "
4380 "load " :
4381 "not vectorized: not suitable for scatter "
4382 "store ");
4383 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4385 return false;
4388 free_data_ref (datarefs[i]);
4389 datarefs[i] = dr;
4390 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
4394 /* We used to stop processing and prune the list here. Verify we no
4395 longer need to. */
4396 gcc_assert (i == datarefs.length ());
4398 return true;
4402 /* Function vect_get_new_vect_var.
4404 Returns a name for a new variable. The current naming scheme appends the
4405 prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
4406 the name of vectorizer generated variables, and appends that to NAME if
4407 provided. */
4409 tree
4410 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
4412 const char *prefix;
4413 tree new_vect_var;
4415 switch (var_kind)
4417 case vect_simple_var:
4418 prefix = "vect";
4419 break;
4420 case vect_scalar_var:
4421 prefix = "stmp";
4422 break;
4423 case vect_mask_var:
4424 prefix = "mask";
4425 break;
4426 case vect_pointer_var:
4427 prefix = "vectp";
4428 break;
4429 default:
4430 gcc_unreachable ();
4433 if (name)
4435 char* tmp = concat (prefix, "_", name, NULL);
4436 new_vect_var = create_tmp_reg (type, tmp);
4437 free (tmp);
4439 else
4440 new_vect_var = create_tmp_reg (type, prefix);
4442 return new_vect_var;
4445 /* Like vect_get_new_vect_var but return an SSA name. */
4447 tree
4448 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
4450 const char *prefix;
4451 tree new_vect_var;
4453 switch (var_kind)
4455 case vect_simple_var:
4456 prefix = "vect";
4457 break;
4458 case vect_scalar_var:
4459 prefix = "stmp";
4460 break;
4461 case vect_pointer_var:
4462 prefix = "vectp";
4463 break;
4464 default:
4465 gcc_unreachable ();
4468 if (name)
4470 char* tmp = concat (prefix, "_", name, NULL);
4471 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4472 free (tmp);
4474 else
4475 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4477 return new_vect_var;
4480 /* Duplicate ptr info and set alignment/misaligment on NAME from DR. */
4482 static void
4483 vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr)
4485 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr));
4486 int misalign = DR_MISALIGNMENT (dr);
4487 if (misalign == DR_MISALIGNMENT_UNKNOWN)
4488 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4489 else
4490 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name),
4491 DR_TARGET_ALIGNMENT (dr), misalign);
4494 /* Function vect_create_addr_base_for_vector_ref.
4496 Create an expression that computes the address of the first memory location
4497 that will be accessed for a data reference.
4499 Input:
4500 STMT: The statement containing the data reference.
4501 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4502 OFFSET: Optional. If supplied, it is be added to the initial address.
4503 LOOP: Specify relative to which loop-nest should the address be computed.
4504 For example, when the dataref is in an inner-loop nested in an
4505 outer-loop that is now being vectorized, LOOP can be either the
4506 outer-loop, or the inner-loop. The first memory location accessed
4507 by the following dataref ('in' points to short):
4509 for (i=0; i<N; i++)
4510 for (j=0; j<M; j++)
4511 s += in[i+j]
4513 is as follows:
4514 if LOOP=i_loop: &in (relative to i_loop)
4515 if LOOP=j_loop: &in+i*2B (relative to j_loop)
4516 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
4517 initial address. Unlike OFFSET, which is number of elements to
4518 be added, BYTE_OFFSET is measured in bytes.
4520 Output:
4521 1. Return an SSA_NAME whose value is the address of the memory location of
4522 the first vector of the data reference.
4523 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4524 these statement(s) which define the returned SSA_NAME.
4526 FORNOW: We are only handling array accesses with step 1. */
4528 tree
4529 vect_create_addr_base_for_vector_ref (gimple *stmt,
4530 gimple_seq *new_stmt_list,
4531 tree offset,
4532 tree byte_offset)
4534 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4535 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4536 const char *base_name;
4537 tree addr_base;
4538 tree dest;
4539 gimple_seq seq = NULL;
4540 tree vect_ptr_type;
4541 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
4542 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4543 innermost_loop_behavior *drb = vect_dr_behavior (dr);
4545 tree data_ref_base = unshare_expr (drb->base_address);
4546 tree base_offset = unshare_expr (drb->offset);
4547 tree init = unshare_expr (drb->init);
4549 if (loop_vinfo)
4550 base_name = get_name (data_ref_base);
4551 else
4553 base_offset = ssize_int (0);
4554 init = ssize_int (0);
4555 base_name = get_name (DR_REF (dr));
4558 /* Create base_offset */
4559 base_offset = size_binop (PLUS_EXPR,
4560 fold_convert (sizetype, base_offset),
4561 fold_convert (sizetype, init));
4563 if (offset)
4565 offset = fold_build2 (MULT_EXPR, sizetype,
4566 fold_convert (sizetype, offset), step);
4567 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4568 base_offset, offset);
4570 if (byte_offset)
4572 byte_offset = fold_convert (sizetype, byte_offset);
4573 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4574 base_offset, byte_offset);
4577 /* base + base_offset */
4578 if (loop_vinfo)
4579 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4580 else
4582 addr_base = build1 (ADDR_EXPR,
4583 build_pointer_type (TREE_TYPE (DR_REF (dr))),
4584 unshare_expr (DR_REF (dr)));
4587 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4588 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4589 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4590 gimple_seq_add_seq (new_stmt_list, seq);
4592 if (DR_PTR_INFO (dr)
4593 && TREE_CODE (addr_base) == SSA_NAME
4594 && !SSA_NAME_PTR_INFO (addr_base))
4596 vect_duplicate_ssa_name_ptr_info (addr_base, dr);
4597 if (offset || byte_offset)
4598 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4601 if (dump_enabled_p ())
4603 dump_printf_loc (MSG_NOTE, vect_location, "created ");
4604 dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
4605 dump_printf (MSG_NOTE, "\n");
4608 return addr_base;
4612 /* Function vect_create_data_ref_ptr.
4614 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4615 location accessed in the loop by STMT, along with the def-use update
4616 chain to appropriately advance the pointer through the loop iterations.
4617 Also set aliasing information for the pointer. This pointer is used by
4618 the callers to this function to create a memory reference expression for
4619 vector load/store access.
4621 Input:
4622 1. STMT: a stmt that references memory. Expected to be of the form
4623 GIMPLE_ASSIGN <name, data-ref> or
4624 GIMPLE_ASSIGN <data-ref, name>.
4625 2. AGGR_TYPE: the type of the reference, which should be either a vector
4626 or an array.
4627 3. AT_LOOP: the loop where the vector memref is to be created.
4628 4. OFFSET (optional): an offset to be added to the initial address accessed
4629 by the data-ref in STMT.
4630 5. BSI: location where the new stmts are to be placed if there is no loop
4631 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4632 pointing to the initial address.
4633 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4634 to the initial address accessed by the data-ref in STMT. This is
4635 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4636 in bytes.
4637 8. IV_STEP (optional, defaults to NULL): the amount that should be added
4638 to the IV during each iteration of the loop. NULL says to move
4639 by one copy of AGGR_TYPE up or down, depending on the step of the
4640 data reference.
4642 Output:
4643 1. Declare a new ptr to vector_type, and have it point to the base of the
4644 data reference (initial addressed accessed by the data reference).
4645 For example, for vector of type V8HI, the following code is generated:
4647 v8hi *ap;
4648 ap = (v8hi *)initial_address;
4650 if OFFSET is not supplied:
4651 initial_address = &a[init];
4652 if OFFSET is supplied:
4653 initial_address = &a[init + OFFSET];
4654 if BYTE_OFFSET is supplied:
4655 initial_address = &a[init] + BYTE_OFFSET;
4657 Return the initial_address in INITIAL_ADDRESS.
4659 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4660 update the pointer in each iteration of the loop.
4662 Return the increment stmt that updates the pointer in PTR_INCR.
4664 3. Set INV_P to true if the access pattern of the data reference in the
4665 vectorized loop is invariant. Set it to false otherwise.
4667 4. Return the pointer. */
4669 tree
4670 vect_create_data_ref_ptr (gimple *stmt, tree aggr_type, struct loop *at_loop,
4671 tree offset, tree *initial_address,
4672 gimple_stmt_iterator *gsi, gimple **ptr_incr,
4673 bool only_init, bool *inv_p, tree byte_offset,
4674 tree iv_step)
4676 const char *base_name;
4677 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4678 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4679 struct loop *loop = NULL;
4680 bool nested_in_vect_loop = false;
4681 struct loop *containing_loop = NULL;
4682 tree aggr_ptr_type;
4683 tree aggr_ptr;
4684 tree new_temp;
4685 gimple_seq new_stmt_list = NULL;
4686 edge pe = NULL;
4687 basic_block new_bb;
4688 tree aggr_ptr_init;
4689 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4690 tree aptr;
4691 gimple_stmt_iterator incr_gsi;
4692 bool insert_after;
4693 tree indx_before_incr, indx_after_incr;
4694 gimple *incr;
4695 tree step;
4696 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4698 gcc_assert (iv_step != NULL_TREE
4699 || TREE_CODE (aggr_type) == ARRAY_TYPE
4700 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4702 if (loop_vinfo)
4704 loop = LOOP_VINFO_LOOP (loop_vinfo);
4705 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4706 containing_loop = (gimple_bb (stmt))->loop_father;
4707 pe = loop_preheader_edge (loop);
4709 else
4711 gcc_assert (bb_vinfo);
4712 only_init = true;
4713 *ptr_incr = NULL;
4716 /* Check the step (evolution) of the load in LOOP, and record
4717 whether it's invariant. */
4718 step = vect_dr_behavior (dr)->step;
4719 if (integer_zerop (step))
4720 *inv_p = true;
4721 else
4722 *inv_p = false;
4724 /* Create an expression for the first address accessed by this load
4725 in LOOP. */
4726 base_name = get_name (DR_BASE_ADDRESS (dr));
4728 if (dump_enabled_p ())
4730 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4731 dump_printf_loc (MSG_NOTE, vect_location,
4732 "create %s-pointer variable to type: ",
4733 get_tree_code_name (TREE_CODE (aggr_type)));
4734 dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4735 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4736 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4737 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4738 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4739 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4740 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4741 else
4742 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4743 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4744 dump_printf (MSG_NOTE, "\n");
4747 /* (1) Create the new aggregate-pointer variable.
4748 Vector and array types inherit the alias set of their component
4749 type by default so we need to use a ref-all pointer if the data
4750 reference does not conflict with the created aggregated data
4751 reference because it is not addressable. */
4752 bool need_ref_all = false;
4753 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4754 get_alias_set (DR_REF (dr))))
4755 need_ref_all = true;
4756 /* Likewise for any of the data references in the stmt group. */
4757 else if (DR_GROUP_SIZE (stmt_info) > 1)
4759 gimple *orig_stmt = DR_GROUP_FIRST_ELEMENT (stmt_info);
4762 stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
4763 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4764 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4765 get_alias_set (DR_REF (sdr))))
4767 need_ref_all = true;
4768 break;
4770 orig_stmt = DR_GROUP_NEXT_ELEMENT (sinfo);
4772 while (orig_stmt);
4774 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4775 need_ref_all);
4776 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4779 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4780 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4781 def-use update cycles for the pointer: one relative to the outer-loop
4782 (LOOP), which is what steps (3) and (4) below do. The other is relative
4783 to the inner-loop (which is the inner-most loop containing the dataref),
4784 and this is done be step (5) below.
4786 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4787 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4788 redundant. Steps (3),(4) create the following:
4790 vp0 = &base_addr;
4791 LOOP: vp1 = phi(vp0,vp2)
4794 vp2 = vp1 + step
4795 goto LOOP
4797 If there is an inner-loop nested in loop, then step (5) will also be
4798 applied, and an additional update in the inner-loop will be created:
4800 vp0 = &base_addr;
4801 LOOP: vp1 = phi(vp0,vp2)
4803 inner: vp3 = phi(vp1,vp4)
4804 vp4 = vp3 + inner_step
4805 if () goto inner
4807 vp2 = vp1 + step
4808 if () goto LOOP */
4810 /* (2) Calculate the initial address of the aggregate-pointer, and set
4811 the aggregate-pointer to point to it before the loop. */
4813 /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader. */
4815 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
4816 offset, byte_offset);
4817 if (new_stmt_list)
4819 if (pe)
4821 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4822 gcc_assert (!new_bb);
4824 else
4825 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4828 *initial_address = new_temp;
4829 aggr_ptr_init = new_temp;
4831 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4832 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4833 inner-loop nested in LOOP (during outer-loop vectorization). */
4835 /* No update in loop is required. */
4836 if (only_init && (!loop_vinfo || at_loop == loop))
4837 aptr = aggr_ptr_init;
4838 else
4840 if (iv_step == NULL_TREE)
4842 /* The step of the aggregate pointer is the type size. */
4843 iv_step = TYPE_SIZE_UNIT (aggr_type);
4844 /* One exception to the above is when the scalar step of the load in
4845 LOOP is zero. In this case the step here is also zero. */
4846 if (*inv_p)
4847 iv_step = size_zero_node;
4848 else if (tree_int_cst_sgn (step) == -1)
4849 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4852 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4854 create_iv (aggr_ptr_init,
4855 fold_convert (aggr_ptr_type, iv_step),
4856 aggr_ptr, loop, &incr_gsi, insert_after,
4857 &indx_before_incr, &indx_after_incr);
4858 incr = gsi_stmt (incr_gsi);
4859 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4861 /* Copy the points-to information if it exists. */
4862 if (DR_PTR_INFO (dr))
4864 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr);
4865 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr);
4867 if (ptr_incr)
4868 *ptr_incr = incr;
4870 aptr = indx_before_incr;
4873 if (!nested_in_vect_loop || only_init)
4874 return aptr;
4877 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4878 nested in LOOP, if exists. */
4880 gcc_assert (nested_in_vect_loop);
4881 if (!only_init)
4883 standard_iv_increment_position (containing_loop, &incr_gsi,
4884 &insert_after);
4885 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4886 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4887 &indx_after_incr);
4888 incr = gsi_stmt (incr_gsi);
4889 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4891 /* Copy the points-to information if it exists. */
4892 if (DR_PTR_INFO (dr))
4894 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr);
4895 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr);
4897 if (ptr_incr)
4898 *ptr_incr = incr;
4900 return indx_before_incr;
4902 else
4903 gcc_unreachable ();
4907 /* Function bump_vector_ptr
4909 Increment a pointer (to a vector type) by vector-size. If requested,
4910 i.e. if PTR-INCR is given, then also connect the new increment stmt
4911 to the existing def-use update-chain of the pointer, by modifying
4912 the PTR_INCR as illustrated below:
4914 The pointer def-use update-chain before this function:
4915 DATAREF_PTR = phi (p_0, p_2)
4916 ....
4917 PTR_INCR: p_2 = DATAREF_PTR + step
4919 The pointer def-use update-chain after this function:
4920 DATAREF_PTR = phi (p_0, p_2)
4921 ....
4922 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4923 ....
4924 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4926 Input:
4927 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4928 in the loop.
4929 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4930 the loop. The increment amount across iterations is expected
4931 to be vector_size.
4932 BSI - location where the new update stmt is to be placed.
4933 STMT - the original scalar memory-access stmt that is being vectorized.
4934 BUMP - optional. The offset by which to bump the pointer. If not given,
4935 the offset is assumed to be vector_size.
4937 Output: Return NEW_DATAREF_PTR as illustrated above.
4941 tree
4942 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4943 gimple *stmt, tree bump)
4945 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4946 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4947 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4948 tree update = TYPE_SIZE_UNIT (vectype);
4949 gassign *incr_stmt;
4950 ssa_op_iter iter;
4951 use_operand_p use_p;
4952 tree new_dataref_ptr;
4954 if (bump)
4955 update = bump;
4957 if (TREE_CODE (dataref_ptr) == SSA_NAME)
4958 new_dataref_ptr = copy_ssa_name (dataref_ptr);
4959 else
4960 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4961 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4962 dataref_ptr, update);
4963 vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4965 /* Copy the points-to information if it exists. */
4966 if (DR_PTR_INFO (dr))
4968 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4969 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4972 if (!ptr_incr)
4973 return new_dataref_ptr;
4975 /* Update the vector-pointer's cross-iteration increment. */
4976 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4978 tree use = USE_FROM_PTR (use_p);
4980 if (use == dataref_ptr)
4981 SET_USE (use_p, new_dataref_ptr);
4982 else
4983 gcc_assert (operand_equal_p (use, update, 0));
4986 return new_dataref_ptr;
4990 /* Copy memory reference info such as base/clique from the SRC reference
4991 to the DEST MEM_REF. */
4993 void
4994 vect_copy_ref_info (tree dest, tree src)
4996 if (TREE_CODE (dest) != MEM_REF)
4997 return;
4999 tree src_base = src;
5000 while (handled_component_p (src_base))
5001 src_base = TREE_OPERAND (src_base, 0);
5002 if (TREE_CODE (src_base) != MEM_REF
5003 && TREE_CODE (src_base) != TARGET_MEM_REF)
5004 return;
5006 MR_DEPENDENCE_CLIQUE (dest) = MR_DEPENDENCE_CLIQUE (src_base);
5007 MR_DEPENDENCE_BASE (dest) = MR_DEPENDENCE_BASE (src_base);
5011 /* Function vect_create_destination_var.
5013 Create a new temporary of type VECTYPE. */
5015 tree
5016 vect_create_destination_var (tree scalar_dest, tree vectype)
5018 tree vec_dest;
5019 const char *name;
5020 char *new_name;
5021 tree type;
5022 enum vect_var_kind kind;
5024 kind = vectype
5025 ? VECTOR_BOOLEAN_TYPE_P (vectype)
5026 ? vect_mask_var
5027 : vect_simple_var
5028 : vect_scalar_var;
5029 type = vectype ? vectype : TREE_TYPE (scalar_dest);
5031 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
5033 name = get_name (scalar_dest);
5034 if (name)
5035 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
5036 else
5037 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
5038 vec_dest = vect_get_new_vect_var (type, kind, new_name);
5039 free (new_name);
5041 return vec_dest;
5044 /* Function vect_grouped_store_supported.
5046 Returns TRUE if interleave high and interleave low permutations
5047 are supported, and FALSE otherwise. */
5049 bool
5050 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
5052 machine_mode mode = TYPE_MODE (vectype);
5054 /* vect_permute_store_chain requires the group size to be equal to 3 or
5055 be a power of two. */
5056 if (count != 3 && exact_log2 (count) == -1)
5058 if (dump_enabled_p ())
5059 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5060 "the size of the group of accesses"
5061 " is not a power of 2 or not eqaul to 3\n");
5062 return false;
5065 /* Check that the permutation is supported. */
5066 if (VECTOR_MODE_P (mode))
5068 unsigned int i;
5069 if (count == 3)
5071 unsigned int j0 = 0, j1 = 0, j2 = 0;
5072 unsigned int i, j;
5074 unsigned int nelt;
5075 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5077 if (dump_enabled_p ())
5078 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5079 "cannot handle groups of 3 stores for"
5080 " variable-length vectors\n");
5081 return false;
5084 vec_perm_builder sel (nelt, nelt, 1);
5085 sel.quick_grow (nelt);
5086 vec_perm_indices indices;
5087 for (j = 0; j < 3; j++)
5089 int nelt0 = ((3 - j) * nelt) % 3;
5090 int nelt1 = ((3 - j) * nelt + 1) % 3;
5091 int nelt2 = ((3 - j) * nelt + 2) % 3;
5092 for (i = 0; i < nelt; i++)
5094 if (3 * i + nelt0 < nelt)
5095 sel[3 * i + nelt0] = j0++;
5096 if (3 * i + nelt1 < nelt)
5097 sel[3 * i + nelt1] = nelt + j1++;
5098 if (3 * i + nelt2 < nelt)
5099 sel[3 * i + nelt2] = 0;
5101 indices.new_vector (sel, 2, nelt);
5102 if (!can_vec_perm_const_p (mode, indices))
5104 if (dump_enabled_p ())
5105 dump_printf (MSG_MISSED_OPTIMIZATION,
5106 "permutation op not supported by target.\n");
5107 return false;
5110 for (i = 0; i < nelt; i++)
5112 if (3 * i + nelt0 < nelt)
5113 sel[3 * i + nelt0] = 3 * i + nelt0;
5114 if (3 * i + nelt1 < nelt)
5115 sel[3 * i + nelt1] = 3 * i + nelt1;
5116 if (3 * i + nelt2 < nelt)
5117 sel[3 * i + nelt2] = nelt + j2++;
5119 indices.new_vector (sel, 2, nelt);
5120 if (!can_vec_perm_const_p (mode, indices))
5122 if (dump_enabled_p ())
5123 dump_printf (MSG_MISSED_OPTIMIZATION,
5124 "permutation op not supported by target.\n");
5125 return false;
5128 return true;
5130 else
5132 /* If length is not equal to 3 then only power of 2 is supported. */
5133 gcc_assert (pow2p_hwi (count));
5134 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5136 /* The encoding has 2 interleaved stepped patterns. */
5137 vec_perm_builder sel (nelt, 2, 3);
5138 sel.quick_grow (6);
5139 for (i = 0; i < 3; i++)
5141 sel[i * 2] = i;
5142 sel[i * 2 + 1] = i + nelt;
5144 vec_perm_indices indices (sel, 2, nelt);
5145 if (can_vec_perm_const_p (mode, indices))
5147 for (i = 0; i < 6; i++)
5148 sel[i] += exact_div (nelt, 2);
5149 indices.new_vector (sel, 2, nelt);
5150 if (can_vec_perm_const_p (mode, indices))
5151 return true;
5156 if (dump_enabled_p ())
5157 dump_printf (MSG_MISSED_OPTIMIZATION,
5158 "permutaion op not supported by target.\n");
5159 return false;
5163 /* Return TRUE if vec_{mask_}store_lanes is available for COUNT vectors of
5164 type VECTYPE. MASKED_P says whether the masked form is needed. */
5166 bool
5167 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5168 bool masked_p)
5170 if (masked_p)
5171 return vect_lanes_optab_supported_p ("vec_mask_store_lanes",
5172 vec_mask_store_lanes_optab,
5173 vectype, count);
5174 else
5175 return vect_lanes_optab_supported_p ("vec_store_lanes",
5176 vec_store_lanes_optab,
5177 vectype, count);
5181 /* Function vect_permute_store_chain.
5183 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
5184 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
5185 the data correctly for the stores. Return the final references for stores
5186 in RESULT_CHAIN.
5188 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5189 The input is 4 vectors each containing 8 elements. We assign a number to
5190 each element, the input sequence is:
5192 1st vec: 0 1 2 3 4 5 6 7
5193 2nd vec: 8 9 10 11 12 13 14 15
5194 3rd vec: 16 17 18 19 20 21 22 23
5195 4th vec: 24 25 26 27 28 29 30 31
5197 The output sequence should be:
5199 1st vec: 0 8 16 24 1 9 17 25
5200 2nd vec: 2 10 18 26 3 11 19 27
5201 3rd vec: 4 12 20 28 5 13 21 30
5202 4th vec: 6 14 22 30 7 15 23 31
5204 i.e., we interleave the contents of the four vectors in their order.
5206 We use interleave_high/low instructions to create such output. The input of
5207 each interleave_high/low operation is two vectors:
5208 1st vec 2nd vec
5209 0 1 2 3 4 5 6 7
5210 the even elements of the result vector are obtained left-to-right from the
5211 high/low elements of the first vector. The odd elements of the result are
5212 obtained left-to-right from the high/low elements of the second vector.
5213 The output of interleave_high will be: 0 4 1 5
5214 and of interleave_low: 2 6 3 7
5217 The permutation is done in log LENGTH stages. In each stage interleave_high
5218 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5219 where the first argument is taken from the first half of DR_CHAIN and the
5220 second argument from it's second half.
5221 In our example,
5223 I1: interleave_high (1st vec, 3rd vec)
5224 I2: interleave_low (1st vec, 3rd vec)
5225 I3: interleave_high (2nd vec, 4th vec)
5226 I4: interleave_low (2nd vec, 4th vec)
5228 The output for the first stage is:
5230 I1: 0 16 1 17 2 18 3 19
5231 I2: 4 20 5 21 6 22 7 23
5232 I3: 8 24 9 25 10 26 11 27
5233 I4: 12 28 13 29 14 30 15 31
5235 The output of the second stage, i.e. the final result is:
5237 I1: 0 8 16 24 1 9 17 25
5238 I2: 2 10 18 26 3 11 19 27
5239 I3: 4 12 20 28 5 13 21 30
5240 I4: 6 14 22 30 7 15 23 31. */
5242 void
5243 vect_permute_store_chain (vec<tree> dr_chain,
5244 unsigned int length,
5245 gimple *stmt,
5246 gimple_stmt_iterator *gsi,
5247 vec<tree> *result_chain)
5249 tree vect1, vect2, high, low;
5250 gimple *perm_stmt;
5251 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5252 tree perm_mask_low, perm_mask_high;
5253 tree data_ref;
5254 tree perm3_mask_low, perm3_mask_high;
5255 unsigned int i, j, n, log_length = exact_log2 (length);
5257 result_chain->quick_grow (length);
5258 memcpy (result_chain->address (), dr_chain.address (),
5259 length * sizeof (tree));
5261 if (length == 3)
5263 /* vect_grouped_store_supported ensures that this is constant. */
5264 unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5265 unsigned int j0 = 0, j1 = 0, j2 = 0;
5267 vec_perm_builder sel (nelt, nelt, 1);
5268 sel.quick_grow (nelt);
5269 vec_perm_indices indices;
5270 for (j = 0; j < 3; j++)
5272 int nelt0 = ((3 - j) * nelt) % 3;
5273 int nelt1 = ((3 - j) * nelt + 1) % 3;
5274 int nelt2 = ((3 - j) * nelt + 2) % 3;
5276 for (i = 0; i < nelt; i++)
5278 if (3 * i + nelt0 < nelt)
5279 sel[3 * i + nelt0] = j0++;
5280 if (3 * i + nelt1 < nelt)
5281 sel[3 * i + nelt1] = nelt + j1++;
5282 if (3 * i + nelt2 < nelt)
5283 sel[3 * i + nelt2] = 0;
5285 indices.new_vector (sel, 2, nelt);
5286 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5288 for (i = 0; i < nelt; i++)
5290 if (3 * i + nelt0 < nelt)
5291 sel[3 * i + nelt0] = 3 * i + nelt0;
5292 if (3 * i + nelt1 < nelt)
5293 sel[3 * i + nelt1] = 3 * i + nelt1;
5294 if (3 * i + nelt2 < nelt)
5295 sel[3 * i + nelt2] = nelt + j2++;
5297 indices.new_vector (sel, 2, nelt);
5298 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5300 vect1 = dr_chain[0];
5301 vect2 = dr_chain[1];
5303 /* Create interleaving stmt:
5304 low = VEC_PERM_EXPR <vect1, vect2,
5305 {j, nelt, *, j + 1, nelt + j + 1, *,
5306 j + 2, nelt + j + 2, *, ...}> */
5307 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5308 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5309 vect2, perm3_mask_low);
5310 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5312 vect1 = data_ref;
5313 vect2 = dr_chain[2];
5314 /* Create interleaving stmt:
5315 low = VEC_PERM_EXPR <vect1, vect2,
5316 {0, 1, nelt + j, 3, 4, nelt + j + 1,
5317 6, 7, nelt + j + 2, ...}> */
5318 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5319 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5320 vect2, perm3_mask_high);
5321 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5322 (*result_chain)[j] = data_ref;
5325 else
5327 /* If length is not equal to 3 then only power of 2 is supported. */
5328 gcc_assert (pow2p_hwi (length));
5330 /* The encoding has 2 interleaved stepped patterns. */
5331 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5332 vec_perm_builder sel (nelt, 2, 3);
5333 sel.quick_grow (6);
5334 for (i = 0; i < 3; i++)
5336 sel[i * 2] = i;
5337 sel[i * 2 + 1] = i + nelt;
5339 vec_perm_indices indices (sel, 2, nelt);
5340 perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5342 for (i = 0; i < 6; i++)
5343 sel[i] += exact_div (nelt, 2);
5344 indices.new_vector (sel, 2, nelt);
5345 perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5347 for (i = 0, n = log_length; i < n; i++)
5349 for (j = 0; j < length/2; j++)
5351 vect1 = dr_chain[j];
5352 vect2 = dr_chain[j+length/2];
5354 /* Create interleaving stmt:
5355 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
5356 ...}> */
5357 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
5358 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
5359 vect2, perm_mask_high);
5360 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5361 (*result_chain)[2*j] = high;
5363 /* Create interleaving stmt:
5364 low = VEC_PERM_EXPR <vect1, vect2,
5365 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
5366 ...}> */
5367 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
5368 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
5369 vect2, perm_mask_low);
5370 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5371 (*result_chain)[2*j+1] = low;
5373 memcpy (dr_chain.address (), result_chain->address (),
5374 length * sizeof (tree));
5379 /* Function vect_setup_realignment
5381 This function is called when vectorizing an unaligned load using
5382 the dr_explicit_realign[_optimized] scheme.
5383 This function generates the following code at the loop prolog:
5385 p = initial_addr;
5386 x msq_init = *(floor(p)); # prolog load
5387 realignment_token = call target_builtin;
5388 loop:
5389 x msq = phi (msq_init, ---)
5391 The stmts marked with x are generated only for the case of
5392 dr_explicit_realign_optimized.
5394 The code above sets up a new (vector) pointer, pointing to the first
5395 location accessed by STMT, and a "floor-aligned" load using that pointer.
5396 It also generates code to compute the "realignment-token" (if the relevant
5397 target hook was defined), and creates a phi-node at the loop-header bb
5398 whose arguments are the result of the prolog-load (created by this
5399 function) and the result of a load that takes place in the loop (to be
5400 created by the caller to this function).
5402 For the case of dr_explicit_realign_optimized:
5403 The caller to this function uses the phi-result (msq) to create the
5404 realignment code inside the loop, and sets up the missing phi argument,
5405 as follows:
5406 loop:
5407 msq = phi (msq_init, lsq)
5408 lsq = *(floor(p')); # load in loop
5409 result = realign_load (msq, lsq, realignment_token);
5411 For the case of dr_explicit_realign:
5412 loop:
5413 msq = *(floor(p)); # load in loop
5414 p' = p + (VS-1);
5415 lsq = *(floor(p')); # load in loop
5416 result = realign_load (msq, lsq, realignment_token);
5418 Input:
5419 STMT - (scalar) load stmt to be vectorized. This load accesses
5420 a memory location that may be unaligned.
5421 BSI - place where new code is to be inserted.
5422 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5423 is used.
5425 Output:
5426 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5427 target hook, if defined.
5428 Return value - the result of the loop-header phi node. */
5430 tree
5431 vect_setup_realignment (gimple *stmt, gimple_stmt_iterator *gsi,
5432 tree *realignment_token,
5433 enum dr_alignment_support alignment_support_scheme,
5434 tree init_addr,
5435 struct loop **at_loop)
5437 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5438 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5439 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5440 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
5441 struct loop *loop = NULL;
5442 edge pe = NULL;
5443 tree scalar_dest = gimple_assign_lhs (stmt);
5444 tree vec_dest;
5445 gimple *inc;
5446 tree ptr;
5447 tree data_ref;
5448 basic_block new_bb;
5449 tree msq_init = NULL_TREE;
5450 tree new_temp;
5451 gphi *phi_stmt;
5452 tree msq = NULL_TREE;
5453 gimple_seq stmts = NULL;
5454 bool inv_p;
5455 bool compute_in_loop = false;
5456 bool nested_in_vect_loop = false;
5457 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
5458 struct loop *loop_for_initial_load = NULL;
5460 if (loop_vinfo)
5462 loop = LOOP_VINFO_LOOP (loop_vinfo);
5463 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
5466 gcc_assert (alignment_support_scheme == dr_explicit_realign
5467 || alignment_support_scheme == dr_explicit_realign_optimized);
5469 /* We need to generate three things:
5470 1. the misalignment computation
5471 2. the extra vector load (for the optimized realignment scheme).
5472 3. the phi node for the two vectors from which the realignment is
5473 done (for the optimized realignment scheme). */
5475 /* 1. Determine where to generate the misalignment computation.
5477 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5478 calculation will be generated by this function, outside the loop (in the
5479 preheader). Otherwise, INIT_ADDR had already been computed for us by the
5480 caller, inside the loop.
5482 Background: If the misalignment remains fixed throughout the iterations of
5483 the loop, then both realignment schemes are applicable, and also the
5484 misalignment computation can be done outside LOOP. This is because we are
5485 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5486 are a multiple of VS (the Vector Size), and therefore the misalignment in
5487 different vectorized LOOP iterations is always the same.
5488 The problem arises only if the memory access is in an inner-loop nested
5489 inside LOOP, which is now being vectorized using outer-loop vectorization.
5490 This is the only case when the misalignment of the memory access may not
5491 remain fixed throughout the iterations of the inner-loop (as explained in
5492 detail in vect_supportable_dr_alignment). In this case, not only is the
5493 optimized realignment scheme not applicable, but also the misalignment
5494 computation (and generation of the realignment token that is passed to
5495 REALIGN_LOAD) have to be done inside the loop.
5497 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5498 or not, which in turn determines if the misalignment is computed inside
5499 the inner-loop, or outside LOOP. */
5501 if (init_addr != NULL_TREE || !loop_vinfo)
5503 compute_in_loop = true;
5504 gcc_assert (alignment_support_scheme == dr_explicit_realign);
5508 /* 2. Determine where to generate the extra vector load.
5510 For the optimized realignment scheme, instead of generating two vector
5511 loads in each iteration, we generate a single extra vector load in the
5512 preheader of the loop, and in each iteration reuse the result of the
5513 vector load from the previous iteration. In case the memory access is in
5514 an inner-loop nested inside LOOP, which is now being vectorized using
5515 outer-loop vectorization, we need to determine whether this initial vector
5516 load should be generated at the preheader of the inner-loop, or can be
5517 generated at the preheader of LOOP. If the memory access has no evolution
5518 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5519 to be generated inside LOOP (in the preheader of the inner-loop). */
5521 if (nested_in_vect_loop)
5523 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5524 bool invariant_in_outerloop =
5525 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5526 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5528 else
5529 loop_for_initial_load = loop;
5530 if (at_loop)
5531 *at_loop = loop_for_initial_load;
5533 if (loop_for_initial_load)
5534 pe = loop_preheader_edge (loop_for_initial_load);
5536 /* 3. For the case of the optimized realignment, create the first vector
5537 load at the loop preheader. */
5539 if (alignment_support_scheme == dr_explicit_realign_optimized)
5541 /* Create msq_init = *(floor(p1)) in the loop preheader */
5542 gassign *new_stmt;
5544 gcc_assert (!compute_in_loop);
5545 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5546 ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
5547 NULL_TREE, &init_addr, NULL, &inc,
5548 true, &inv_p);
5549 if (TREE_CODE (ptr) == SSA_NAME)
5550 new_temp = copy_ssa_name (ptr);
5551 else
5552 new_temp = make_ssa_name (TREE_TYPE (ptr));
5553 unsigned int align = DR_TARGET_ALIGNMENT (dr);
5554 new_stmt = gimple_build_assign
5555 (new_temp, BIT_AND_EXPR, ptr,
5556 build_int_cst (TREE_TYPE (ptr), -(HOST_WIDE_INT) align));
5557 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5558 gcc_assert (!new_bb);
5559 data_ref
5560 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5561 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5562 vect_copy_ref_info (data_ref, DR_REF (dr));
5563 new_stmt = gimple_build_assign (vec_dest, data_ref);
5564 new_temp = make_ssa_name (vec_dest, new_stmt);
5565 gimple_assign_set_lhs (new_stmt, new_temp);
5566 if (pe)
5568 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5569 gcc_assert (!new_bb);
5571 else
5572 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5574 msq_init = gimple_assign_lhs (new_stmt);
5577 /* 4. Create realignment token using a target builtin, if available.
5578 It is done either inside the containing loop, or before LOOP (as
5579 determined above). */
5581 if (targetm.vectorize.builtin_mask_for_load)
5583 gcall *new_stmt;
5584 tree builtin_decl;
5586 /* Compute INIT_ADDR - the initial addressed accessed by this memref. */
5587 if (!init_addr)
5589 /* Generate the INIT_ADDR computation outside LOOP. */
5590 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
5591 NULL_TREE);
5592 if (loop)
5594 pe = loop_preheader_edge (loop);
5595 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5596 gcc_assert (!new_bb);
5598 else
5599 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5602 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5603 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5604 vec_dest =
5605 vect_create_destination_var (scalar_dest,
5606 gimple_call_return_type (new_stmt));
5607 new_temp = make_ssa_name (vec_dest, new_stmt);
5608 gimple_call_set_lhs (new_stmt, new_temp);
5610 if (compute_in_loop)
5611 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5612 else
5614 /* Generate the misalignment computation outside LOOP. */
5615 pe = loop_preheader_edge (loop);
5616 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5617 gcc_assert (!new_bb);
5620 *realignment_token = gimple_call_lhs (new_stmt);
5622 /* The result of the CALL_EXPR to this builtin is determined from
5623 the value of the parameter and no global variables are touched
5624 which makes the builtin a "const" function. Requiring the
5625 builtin to have the "const" attribute makes it unnecessary
5626 to call mark_call_clobbered. */
5627 gcc_assert (TREE_READONLY (builtin_decl));
5630 if (alignment_support_scheme == dr_explicit_realign)
5631 return msq;
5633 gcc_assert (!compute_in_loop);
5634 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5637 /* 5. Create msq = phi <msq_init, lsq> in loop */
5639 pe = loop_preheader_edge (containing_loop);
5640 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5641 msq = make_ssa_name (vec_dest);
5642 phi_stmt = create_phi_node (msq, containing_loop->header);
5643 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5645 return msq;
5649 /* Function vect_grouped_load_supported.
5651 COUNT is the size of the load group (the number of statements plus the
5652 number of gaps). SINGLE_ELEMENT_P is true if there is actually
5653 only one statement, with a gap of COUNT - 1.
5655 Returns true if a suitable permute exists. */
5657 bool
5658 vect_grouped_load_supported (tree vectype, bool single_element_p,
5659 unsigned HOST_WIDE_INT count)
5661 machine_mode mode = TYPE_MODE (vectype);
5663 /* If this is single-element interleaving with an element distance
5664 that leaves unused vector loads around punt - we at least create
5665 very sub-optimal code in that case (and blow up memory,
5666 see PR65518). */
5667 if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype)))
5669 if (dump_enabled_p ())
5670 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5671 "single-element interleaving not supported "
5672 "for not adjacent vector loads\n");
5673 return false;
5676 /* vect_permute_load_chain requires the group size to be equal to 3 or
5677 be a power of two. */
5678 if (count != 3 && exact_log2 (count) == -1)
5680 if (dump_enabled_p ())
5681 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5682 "the size of the group of accesses"
5683 " is not a power of 2 or not equal to 3\n");
5684 return false;
5687 /* Check that the permutation is supported. */
5688 if (VECTOR_MODE_P (mode))
5690 unsigned int i, j;
5691 if (count == 3)
5693 unsigned int nelt;
5694 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5696 if (dump_enabled_p ())
5697 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5698 "cannot handle groups of 3 loads for"
5699 " variable-length vectors\n");
5700 return false;
5703 vec_perm_builder sel (nelt, nelt, 1);
5704 sel.quick_grow (nelt);
5705 vec_perm_indices indices;
5706 unsigned int k;
5707 for (k = 0; k < 3; k++)
5709 for (i = 0; i < nelt; i++)
5710 if (3 * i + k < 2 * nelt)
5711 sel[i] = 3 * i + k;
5712 else
5713 sel[i] = 0;
5714 indices.new_vector (sel, 2, nelt);
5715 if (!can_vec_perm_const_p (mode, indices))
5717 if (dump_enabled_p ())
5718 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5719 "shuffle of 3 loads is not supported by"
5720 " target\n");
5721 return false;
5723 for (i = 0, j = 0; i < nelt; i++)
5724 if (3 * i + k < 2 * nelt)
5725 sel[i] = i;
5726 else
5727 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5728 indices.new_vector (sel, 2, nelt);
5729 if (!can_vec_perm_const_p (mode, indices))
5731 if (dump_enabled_p ())
5732 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5733 "shuffle of 3 loads is not supported by"
5734 " target\n");
5735 return false;
5738 return true;
5740 else
5742 /* If length is not equal to 3 then only power of 2 is supported. */
5743 gcc_assert (pow2p_hwi (count));
5744 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5746 /* The encoding has a single stepped pattern. */
5747 vec_perm_builder sel (nelt, 1, 3);
5748 sel.quick_grow (3);
5749 for (i = 0; i < 3; i++)
5750 sel[i] = i * 2;
5751 vec_perm_indices indices (sel, 2, nelt);
5752 if (can_vec_perm_const_p (mode, indices))
5754 for (i = 0; i < 3; i++)
5755 sel[i] = i * 2 + 1;
5756 indices.new_vector (sel, 2, nelt);
5757 if (can_vec_perm_const_p (mode, indices))
5758 return true;
5763 if (dump_enabled_p ())
5764 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5765 "extract even/odd not supported by target\n");
5766 return false;
5769 /* Return TRUE if vec_{masked_}load_lanes is available for COUNT vectors of
5770 type VECTYPE. MASKED_P says whether the masked form is needed. */
5772 bool
5773 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5774 bool masked_p)
5776 if (masked_p)
5777 return vect_lanes_optab_supported_p ("vec_mask_load_lanes",
5778 vec_mask_load_lanes_optab,
5779 vectype, count);
5780 else
5781 return vect_lanes_optab_supported_p ("vec_load_lanes",
5782 vec_load_lanes_optab,
5783 vectype, count);
5786 /* Function vect_permute_load_chain.
5788 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5789 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5790 the input data correctly. Return the final references for loads in
5791 RESULT_CHAIN.
5793 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5794 The input is 4 vectors each containing 8 elements. We assign a number to each
5795 element, the input sequence is:
5797 1st vec: 0 1 2 3 4 5 6 7
5798 2nd vec: 8 9 10 11 12 13 14 15
5799 3rd vec: 16 17 18 19 20 21 22 23
5800 4th vec: 24 25 26 27 28 29 30 31
5802 The output sequence should be:
5804 1st vec: 0 4 8 12 16 20 24 28
5805 2nd vec: 1 5 9 13 17 21 25 29
5806 3rd vec: 2 6 10 14 18 22 26 30
5807 4th vec: 3 7 11 15 19 23 27 31
5809 i.e., the first output vector should contain the first elements of each
5810 interleaving group, etc.
5812 We use extract_even/odd instructions to create such output. The input of
5813 each extract_even/odd operation is two vectors
5814 1st vec 2nd vec
5815 0 1 2 3 4 5 6 7
5817 and the output is the vector of extracted even/odd elements. The output of
5818 extract_even will be: 0 2 4 6
5819 and of extract_odd: 1 3 5 7
5822 The permutation is done in log LENGTH stages. In each stage extract_even
5823 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5824 their order. In our example,
5826 E1: extract_even (1st vec, 2nd vec)
5827 E2: extract_odd (1st vec, 2nd vec)
5828 E3: extract_even (3rd vec, 4th vec)
5829 E4: extract_odd (3rd vec, 4th vec)
5831 The output for the first stage will be:
5833 E1: 0 2 4 6 8 10 12 14
5834 E2: 1 3 5 7 9 11 13 15
5835 E3: 16 18 20 22 24 26 28 30
5836 E4: 17 19 21 23 25 27 29 31
5838 In order to proceed and create the correct sequence for the next stage (or
5839 for the correct output, if the second stage is the last one, as in our
5840 example), we first put the output of extract_even operation and then the
5841 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5842 The input for the second stage is:
5844 1st vec (E1): 0 2 4 6 8 10 12 14
5845 2nd vec (E3): 16 18 20 22 24 26 28 30
5846 3rd vec (E2): 1 3 5 7 9 11 13 15
5847 4th vec (E4): 17 19 21 23 25 27 29 31
5849 The output of the second stage:
5851 E1: 0 4 8 12 16 20 24 28
5852 E2: 2 6 10 14 18 22 26 30
5853 E3: 1 5 9 13 17 21 25 29
5854 E4: 3 7 11 15 19 23 27 31
5856 And RESULT_CHAIN after reordering:
5858 1st vec (E1): 0 4 8 12 16 20 24 28
5859 2nd vec (E3): 1 5 9 13 17 21 25 29
5860 3rd vec (E2): 2 6 10 14 18 22 26 30
5861 4th vec (E4): 3 7 11 15 19 23 27 31. */
5863 static void
5864 vect_permute_load_chain (vec<tree> dr_chain,
5865 unsigned int length,
5866 gimple *stmt,
5867 gimple_stmt_iterator *gsi,
5868 vec<tree> *result_chain)
5870 tree data_ref, first_vect, second_vect;
5871 tree perm_mask_even, perm_mask_odd;
5872 tree perm3_mask_low, perm3_mask_high;
5873 gimple *perm_stmt;
5874 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5875 unsigned int i, j, log_length = exact_log2 (length);
5877 result_chain->quick_grow (length);
5878 memcpy (result_chain->address (), dr_chain.address (),
5879 length * sizeof (tree));
5881 if (length == 3)
5883 /* vect_grouped_load_supported ensures that this is constant. */
5884 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5885 unsigned int k;
5887 vec_perm_builder sel (nelt, nelt, 1);
5888 sel.quick_grow (nelt);
5889 vec_perm_indices indices;
5890 for (k = 0; k < 3; k++)
5892 for (i = 0; i < nelt; i++)
5893 if (3 * i + k < 2 * nelt)
5894 sel[i] = 3 * i + k;
5895 else
5896 sel[i] = 0;
5897 indices.new_vector (sel, 2, nelt);
5898 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5900 for (i = 0, j = 0; i < nelt; i++)
5901 if (3 * i + k < 2 * nelt)
5902 sel[i] = i;
5903 else
5904 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5905 indices.new_vector (sel, 2, nelt);
5906 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5908 first_vect = dr_chain[0];
5909 second_vect = dr_chain[1];
5911 /* Create interleaving stmt (low part of):
5912 low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5913 ...}> */
5914 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5915 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5916 second_vect, perm3_mask_low);
5917 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5919 /* Create interleaving stmt (high part of):
5920 high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5921 ...}> */
5922 first_vect = data_ref;
5923 second_vect = dr_chain[2];
5924 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5925 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5926 second_vect, perm3_mask_high);
5927 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5928 (*result_chain)[k] = data_ref;
5931 else
5933 /* If length is not equal to 3 then only power of 2 is supported. */
5934 gcc_assert (pow2p_hwi (length));
5936 /* The encoding has a single stepped pattern. */
5937 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5938 vec_perm_builder sel (nelt, 1, 3);
5939 sel.quick_grow (3);
5940 for (i = 0; i < 3; ++i)
5941 sel[i] = i * 2;
5942 vec_perm_indices indices (sel, 2, nelt);
5943 perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);
5945 for (i = 0; i < 3; ++i)
5946 sel[i] = i * 2 + 1;
5947 indices.new_vector (sel, 2, nelt);
5948 perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
5950 for (i = 0; i < log_length; i++)
5952 for (j = 0; j < length; j += 2)
5954 first_vect = dr_chain[j];
5955 second_vect = dr_chain[j+1];
5957 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5958 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5959 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5960 first_vect, second_vect,
5961 perm_mask_even);
5962 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5963 (*result_chain)[j/2] = data_ref;
5965 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5966 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5967 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5968 first_vect, second_vect,
5969 perm_mask_odd);
5970 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5971 (*result_chain)[j/2+length/2] = data_ref;
5973 memcpy (dr_chain.address (), result_chain->address (),
5974 length * sizeof (tree));
5979 /* Function vect_shift_permute_load_chain.
5981 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
5982 sequence of stmts to reorder the input data accordingly.
5983 Return the final references for loads in RESULT_CHAIN.
5984 Return true if successed, false otherwise.
5986 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5987 The input is 3 vectors each containing 8 elements. We assign a
5988 number to each element, the input sequence is:
5990 1st vec: 0 1 2 3 4 5 6 7
5991 2nd vec: 8 9 10 11 12 13 14 15
5992 3rd vec: 16 17 18 19 20 21 22 23
5994 The output sequence should be:
5996 1st vec: 0 3 6 9 12 15 18 21
5997 2nd vec: 1 4 7 10 13 16 19 22
5998 3rd vec: 2 5 8 11 14 17 20 23
6000 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
6002 First we shuffle all 3 vectors to get correct elements order:
6004 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
6005 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
6006 3rd vec: (16 19 22) (17 20 23) (18 21)
6008 Next we unite and shift vector 3 times:
6010 1st step:
6011 shift right by 6 the concatenation of:
6012 "1st vec" and "2nd vec"
6013 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
6014 "2nd vec" and "3rd vec"
6015 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
6016 "3rd vec" and "1st vec"
6017 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
6018 | New vectors |
6020 So that now new vectors are:
6022 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
6023 2nd vec: (10 13) (16 19 22) (17 20 23)
6024 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
6026 2nd step:
6027 shift right by 5 the concatenation of:
6028 "1st vec" and "3rd vec"
6029 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
6030 "2nd vec" and "1st vec"
6031 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
6032 "3rd vec" and "2nd vec"
6033 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
6034 | New vectors |
6036 So that now new vectors are:
6038 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
6039 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
6040 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
6042 3rd step:
6043 shift right by 5 the concatenation of:
6044 "1st vec" and "1st vec"
6045 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
6046 shift right by 3 the concatenation of:
6047 "2nd vec" and "2nd vec"
6048 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
6049 | New vectors |
6051 So that now all vectors are READY:
6052 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
6053 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
6054 3rd vec: ( 1 4 7) (10 13) (16 19 22)
6056 This algorithm is faster than one in vect_permute_load_chain if:
6057 1. "shift of a concatination" is faster than general permutation.
6058 This is usually so.
6059 2. The TARGET machine can't execute vector instructions in parallel.
6060 This is because each step of the algorithm depends on previous.
6061 The algorithm in vect_permute_load_chain is much more parallel.
6063 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
6066 static bool
6067 vect_shift_permute_load_chain (vec<tree> dr_chain,
6068 unsigned int length,
6069 gimple *stmt,
6070 gimple_stmt_iterator *gsi,
6071 vec<tree> *result_chain)
6073 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
6074 tree perm2_mask1, perm2_mask2, perm3_mask;
6075 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
6076 gimple *perm_stmt;
6078 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
6079 unsigned int i;
6080 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6081 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6083 unsigned HOST_WIDE_INT nelt, vf;
6084 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt)
6085 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
6086 /* Not supported for variable-length vectors. */
6087 return false;
6089 vec_perm_builder sel (nelt, nelt, 1);
6090 sel.quick_grow (nelt);
6092 result_chain->quick_grow (length);
6093 memcpy (result_chain->address (), dr_chain.address (),
6094 length * sizeof (tree));
6096 if (pow2p_hwi (length) && vf > 4)
6098 unsigned int j, log_length = exact_log2 (length);
6099 for (i = 0; i < nelt / 2; ++i)
6100 sel[i] = i * 2;
6101 for (i = 0; i < nelt / 2; ++i)
6102 sel[nelt / 2 + i] = i * 2 + 1;
6103 vec_perm_indices indices (sel, 2, nelt);
6104 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6106 if (dump_enabled_p ())
6107 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6108 "shuffle of 2 fields structure is not \
6109 supported by target\n");
6110 return false;
6112 perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);
6114 for (i = 0; i < nelt / 2; ++i)
6115 sel[i] = i * 2 + 1;
6116 for (i = 0; i < nelt / 2; ++i)
6117 sel[nelt / 2 + i] = i * 2;
6118 indices.new_vector (sel, 2, nelt);
6119 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6121 if (dump_enabled_p ())
6122 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6123 "shuffle of 2 fields structure is not \
6124 supported by target\n");
6125 return false;
6127 perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);
6129 /* Generating permutation constant to shift all elements.
6130 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
6131 for (i = 0; i < nelt; i++)
6132 sel[i] = nelt / 2 + i;
6133 indices.new_vector (sel, 2, nelt);
6134 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6136 if (dump_enabled_p ())
6137 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6138 "shift permutation is not supported by target\n");
6139 return false;
6141 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6143 /* Generating permutation constant to select vector from 2.
6144 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
6145 for (i = 0; i < nelt / 2; i++)
6146 sel[i] = i;
6147 for (i = nelt / 2; i < nelt; i++)
6148 sel[i] = nelt + i;
6149 indices.new_vector (sel, 2, nelt);
6150 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6152 if (dump_enabled_p ())
6153 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6154 "select is not supported by target\n");
6155 return false;
6157 select_mask = vect_gen_perm_mask_checked (vectype, indices);
6159 for (i = 0; i < log_length; i++)
6161 for (j = 0; j < length; j += 2)
6163 first_vect = dr_chain[j];
6164 second_vect = dr_chain[j + 1];
6166 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6167 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6168 first_vect, first_vect,
6169 perm2_mask1);
6170 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6171 vect[0] = data_ref;
6173 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6174 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6175 second_vect, second_vect,
6176 perm2_mask2);
6177 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6178 vect[1] = data_ref;
6180 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
6181 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6182 vect[0], vect[1], shift1_mask);
6183 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6184 (*result_chain)[j/2 + length/2] = data_ref;
6186 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
6187 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6188 vect[0], vect[1], select_mask);
6189 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6190 (*result_chain)[j/2] = data_ref;
6192 memcpy (dr_chain.address (), result_chain->address (),
6193 length * sizeof (tree));
6195 return true;
6197 if (length == 3 && vf > 2)
6199 unsigned int k = 0, l = 0;
6201 /* Generating permutation constant to get all elements in rigth order.
6202 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
6203 for (i = 0; i < nelt; i++)
6205 if (3 * k + (l % 3) >= nelt)
6207 k = 0;
6208 l += (3 - (nelt % 3));
6210 sel[i] = 3 * k + (l % 3);
6211 k++;
6213 vec_perm_indices indices (sel, 2, nelt);
6214 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6216 if (dump_enabled_p ())
6217 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6218 "shuffle of 3 fields structure is not \
6219 supported by target\n");
6220 return false;
6222 perm3_mask = vect_gen_perm_mask_checked (vectype, indices);
6224 /* Generating permutation constant to shift all elements.
6225 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
6226 for (i = 0; i < nelt; i++)
6227 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
6228 indices.new_vector (sel, 2, nelt);
6229 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6231 if (dump_enabled_p ())
6232 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6233 "shift permutation is not supported by target\n");
6234 return false;
6236 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6238 /* Generating permutation constant to shift all elements.
6239 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6240 for (i = 0; i < nelt; i++)
6241 sel[i] = 2 * (nelt / 3) + 1 + i;
6242 indices.new_vector (sel, 2, nelt);
6243 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6245 if (dump_enabled_p ())
6246 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6247 "shift permutation is not supported by target\n");
6248 return false;
6250 shift2_mask = vect_gen_perm_mask_checked (vectype, indices);
6252 /* Generating permutation constant to shift all elements.
6253 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
6254 for (i = 0; i < nelt; i++)
6255 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
6256 indices.new_vector (sel, 2, nelt);
6257 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6259 if (dump_enabled_p ())
6260 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6261 "shift permutation is not supported by target\n");
6262 return false;
6264 shift3_mask = vect_gen_perm_mask_checked (vectype, indices);
6266 /* Generating permutation constant to shift all elements.
6267 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6268 for (i = 0; i < nelt; i++)
6269 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
6270 indices.new_vector (sel, 2, nelt);
6271 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6273 if (dump_enabled_p ())
6274 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6275 "shift permutation is not supported by target\n");
6276 return false;
6278 shift4_mask = vect_gen_perm_mask_checked (vectype, indices);
6280 for (k = 0; k < 3; k++)
6282 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
6283 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6284 dr_chain[k], dr_chain[k],
6285 perm3_mask);
6286 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6287 vect[k] = data_ref;
6290 for (k = 0; k < 3; k++)
6292 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
6293 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6294 vect[k % 3], vect[(k + 1) % 3],
6295 shift1_mask);
6296 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6297 vect_shift[k] = data_ref;
6300 for (k = 0; k < 3; k++)
6302 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
6303 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6304 vect_shift[(4 - k) % 3],
6305 vect_shift[(3 - k) % 3],
6306 shift2_mask);
6307 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6308 vect[k] = data_ref;
6311 (*result_chain)[3 - (nelt % 3)] = vect[2];
6313 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
6314 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
6315 vect[0], shift3_mask);
6316 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6317 (*result_chain)[nelt % 3] = data_ref;
6319 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
6320 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
6321 vect[1], shift4_mask);
6322 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6323 (*result_chain)[0] = data_ref;
6324 return true;
6326 return false;
6329 /* Function vect_transform_grouped_load.
6331 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
6332 to perform their permutation and ascribe the result vectorized statements to
6333 the scalar statements.
6336 void
6337 vect_transform_grouped_load (gimple *stmt, vec<tree> dr_chain, int size,
6338 gimple_stmt_iterator *gsi)
6340 machine_mode mode;
6341 vec<tree> result_chain = vNULL;
6343 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
6344 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
6345 vectors, that are ready for vector computation. */
6346 result_chain.create (size);
6348 /* If reassociation width for vector type is 2 or greater target machine can
6349 execute 2 or more vector instructions in parallel. Otherwise try to
6350 get chain for loads group using vect_shift_permute_load_chain. */
6351 mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
6352 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
6353 || pow2p_hwi (size)
6354 || !vect_shift_permute_load_chain (dr_chain, size, stmt,
6355 gsi, &result_chain))
6356 vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
6357 vect_record_grouped_load_vectors (stmt, result_chain);
6358 result_chain.release ();
6361 /* RESULT_CHAIN contains the output of a group of grouped loads that were
6362 generated as part of the vectorization of STMT. Assign the statement
6363 for each vector to the associated scalar statement. */
6365 void
6366 vect_record_grouped_load_vectors (gimple *stmt, vec<tree> result_chain)
6368 gimple *first_stmt = DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
6369 gimple *next_stmt, *new_stmt;
6370 unsigned int i, gap_count;
6371 tree tmp_data_ref;
6373 /* Put a permuted data-ref in the VECTORIZED_STMT field.
6374 Since we scan the chain starting from it's first node, their order
6375 corresponds the order of data-refs in RESULT_CHAIN. */
6376 next_stmt = first_stmt;
6377 gap_count = 1;
6378 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
6380 if (!next_stmt)
6381 break;
6383 /* Skip the gaps. Loads created for the gaps will be removed by dead
6384 code elimination pass later. No need to check for the first stmt in
6385 the group, since it always exists.
6386 DR_GROUP_GAP is the number of steps in elements from the previous
6387 access (if there is no gap DR_GROUP_GAP is 1). We skip loads that
6388 correspond to the gaps. */
6389 if (next_stmt != first_stmt
6390 && gap_count < DR_GROUP_GAP (vinfo_for_stmt (next_stmt)))
6392 gap_count++;
6393 continue;
6396 while (next_stmt)
6398 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
6399 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
6400 copies, and we put the new vector statement in the first available
6401 RELATED_STMT. */
6402 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
6403 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
6404 else
6406 if (!DR_GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
6408 gimple *prev_stmt =
6409 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
6410 gimple *rel_stmt =
6411 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
6412 while (rel_stmt)
6414 prev_stmt = rel_stmt;
6415 rel_stmt =
6416 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
6419 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
6420 new_stmt;
6424 next_stmt = DR_GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
6425 gap_count = 1;
6426 /* If NEXT_STMT accesses the same DR as the previous statement,
6427 put the same TMP_DATA_REF as its vectorized statement; otherwise
6428 get the next data-ref from RESULT_CHAIN. */
6429 if (!next_stmt || !DR_GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
6430 break;
6435 /* Function vect_force_dr_alignment_p.
6437 Returns whether the alignment of a DECL can be forced to be aligned
6438 on ALIGNMENT bit boundary. */
6440 bool
6441 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
6443 if (!VAR_P (decl))
6444 return false;
6446 if (decl_in_symtab_p (decl)
6447 && !symtab_node::get (decl)->can_increase_alignment_p ())
6448 return false;
6450 if (TREE_STATIC (decl))
6451 return (alignment <= MAX_OFILE_ALIGNMENT);
6452 else
6453 return (alignment <= MAX_STACK_ALIGNMENT);
6457 /* Return whether the data reference DR is supported with respect to its
6458 alignment.
6459 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
6460 it is aligned, i.e., check if it is possible to vectorize it with different
6461 alignment. */
6463 enum dr_alignment_support
6464 vect_supportable_dr_alignment (struct data_reference *dr,
6465 bool check_aligned_accesses)
6467 gimple *stmt = DR_STMT (dr);
6468 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6469 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6470 machine_mode mode = TYPE_MODE (vectype);
6471 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6472 struct loop *vect_loop = NULL;
6473 bool nested_in_vect_loop = false;
6475 if (aligned_access_p (dr) && !check_aligned_accesses)
6476 return dr_aligned;
6478 /* For now assume all conditional loads/stores support unaligned
6479 access without any special code. */
6480 if (is_gimple_call (stmt)
6481 && gimple_call_internal_p (stmt)
6482 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
6483 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
6484 return dr_unaligned_supported;
6486 if (loop_vinfo)
6488 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
6489 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
6492 /* Possibly unaligned access. */
6494 /* We can choose between using the implicit realignment scheme (generating
6495 a misaligned_move stmt) and the explicit realignment scheme (generating
6496 aligned loads with a REALIGN_LOAD). There are two variants to the
6497 explicit realignment scheme: optimized, and unoptimized.
6498 We can optimize the realignment only if the step between consecutive
6499 vector loads is equal to the vector size. Since the vector memory
6500 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
6501 is guaranteed that the misalignment amount remains the same throughout the
6502 execution of the vectorized loop. Therefore, we can create the
6503 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
6504 at the loop preheader.
6506 However, in the case of outer-loop vectorization, when vectorizing a
6507 memory access in the inner-loop nested within the LOOP that is now being
6508 vectorized, while it is guaranteed that the misalignment of the
6509 vectorized memory access will remain the same in different outer-loop
6510 iterations, it is *not* guaranteed that is will remain the same throughout
6511 the execution of the inner-loop. This is because the inner-loop advances
6512 with the original scalar step (and not in steps of VS). If the inner-loop
6513 step happens to be a multiple of VS, then the misalignment remains fixed
6514 and we can use the optimized realignment scheme. For example:
6516 for (i=0; i<N; i++)
6517 for (j=0; j<M; j++)
6518 s += a[i+j];
6520 When vectorizing the i-loop in the above example, the step between
6521 consecutive vector loads is 1, and so the misalignment does not remain
6522 fixed across the execution of the inner-loop, and the realignment cannot
6523 be optimized (as illustrated in the following pseudo vectorized loop):
6525 for (i=0; i<N; i+=4)
6526 for (j=0; j<M; j++){
6527 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
6528 // when j is {0,1,2,3,4,5,6,7,...} respectively.
6529 // (assuming that we start from an aligned address).
6532 We therefore have to use the unoptimized realignment scheme:
6534 for (i=0; i<N; i+=4)
6535 for (j=k; j<M; j+=4)
6536 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
6537 // that the misalignment of the initial address is
6538 // 0).
6540 The loop can then be vectorized as follows:
6542 for (k=0; k<4; k++){
6543 rt = get_realignment_token (&vp[k]);
6544 for (i=0; i<N; i+=4){
6545 v1 = vp[i+k];
6546 for (j=k; j<M; j+=4){
6547 v2 = vp[i+j+VS-1];
6548 va = REALIGN_LOAD <v1,v2,rt>;
6549 vs += va;
6550 v1 = v2;
6553 } */
6555 if (DR_IS_READ (dr))
6557 bool is_packed = false;
6558 tree type = (TREE_TYPE (DR_REF (dr)));
6560 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
6561 && (!targetm.vectorize.builtin_mask_for_load
6562 || targetm.vectorize.builtin_mask_for_load ()))
6564 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6566 /* If we are doing SLP then the accesses need not have the
6567 same alignment, instead it depends on the SLP group size. */
6568 if (loop_vinfo
6569 && STMT_SLP_TYPE (stmt_info)
6570 && !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6571 * DR_GROUP_SIZE (vinfo_for_stmt
6572 (DR_GROUP_FIRST_ELEMENT (stmt_info))),
6573 TYPE_VECTOR_SUBPARTS (vectype)))
6575 else if (!loop_vinfo
6576 || (nested_in_vect_loop
6577 && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
6578 GET_MODE_SIZE (TYPE_MODE (vectype)))))
6579 return dr_explicit_realign;
6580 else
6581 return dr_explicit_realign_optimized;
6583 if (!known_alignment_for_access_p (dr))
6584 is_packed = not_size_aligned (DR_REF (dr));
6586 if (targetm.vectorize.support_vector_misalignment
6587 (mode, type, DR_MISALIGNMENT (dr), is_packed))
6588 /* Can't software pipeline the loads, but can at least do them. */
6589 return dr_unaligned_supported;
6591 else
6593 bool is_packed = false;
6594 tree type = (TREE_TYPE (DR_REF (dr)));
6596 if (!known_alignment_for_access_p (dr))
6597 is_packed = not_size_aligned (DR_REF (dr));
6599 if (targetm.vectorize.support_vector_misalignment
6600 (mode, type, DR_MISALIGNMENT (dr), is_packed))
6601 return dr_unaligned_supported;
6604 /* Unsupported. */
6605 return dr_unaligned_unsupported;