gcc/tree-vect-data-refs.c
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2018 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "memmodel.h"
32 #include "tm_p.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "cgraph.h"
36 #include "dumpfile.h"
37 #include "alias.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "tree-eh.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop.h"
47 #include "cfgloop.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
50 #include "expr.h"
51 #include "builtins.h"
52 #include "params.h"
53 #include "tree-cfg.h"
54 #include "tree-hash-traits.h"
55 #include "vec-perm-indices.h"
56 #include "internal-fn.h"
58 /* Return true if load- or store-lanes optab OPTAB is implemented for
59 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
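/* Illustrative note (not part of the original sources): on targets with
   structure load/store instructions such as AArch64 LD2/LD3/LD4 and
   ST2/ST3/ST4, a group of COUNT interleaved V4SI accesses corresponds to
   an array mode of COUNT V4SI vectors, and this predicate asks whether a
   vec_load_lanes / vec_store_lanes handler exists for that array mode.  */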
61 static bool
62 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
63 tree vectype, unsigned HOST_WIDE_INT count)
65 machine_mode mode, array_mode;
66 bool limit_p;
68 mode = TYPE_MODE (vectype);
69 if (!targetm.array_mode (mode, count).exists (&array_mode))
71 poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
72 limit_p = !targetm.array_mode_supported_p (mode, count);
73 if (!int_mode_for_size (bits, limit_p).exists (&array_mode))
75 if (dump_enabled_p ())
76 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
77 "no array mode for %s["
78 HOST_WIDE_INT_PRINT_DEC "]\n",
79 GET_MODE_NAME (mode), count);
80 return false;
84 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
86 if (dump_enabled_p ())
87 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
88 "cannot use %s<%s><%s>\n", name,
89 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
90 return false;
93 if (dump_enabled_p ())
94 dump_printf_loc (MSG_NOTE, vect_location,
95 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
96 GET_MODE_NAME (mode));
98 return true;
102 /* Return the smallest scalar part of STMT_INFO.
103 This is used to determine the vectype of the stmt. We generally set the
104 vectype according to the type of the result (lhs). For stmts whose
105 result-type is different than the type of the arguments (e.g., demotion,
106 promotion), vectype will be reset appropriately (later). Note that we have
107 to visit the smallest datatype in this function, because that determines the
108 VF. If the smallest datatype in the loop is present only as the rhs of a
109 promotion operation - we'd miss it.
110 Such a case, where a variable of this datatype does not appear in the lhs
111 anywhere in the loop, can only occur if it's an invariant: e.g.:
112 'int_x = (int) short_inv', which we'd expect to have been optimized away by
113 invariant motion. However, we cannot rely on invariant motion to always
114 take invariants out of the loop, and so in the case of promotion we also
115 have to check the rhs.
116 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
117 types. */
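/* Illustrative example (an assumption for exposition, not taken from the
   original sources):

     short *s; int *d;
     for (i = 0; i < n; i++)
       d[i] = (int) s[i];

   The result type is 'int', but the smallest scalar type in the loop is
   'short', so the vectorization factor must be derived from the 2-byte
   type.  */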
119 tree
120 vect_get_smallest_scalar_type (stmt_vec_info stmt_info,
121 HOST_WIDE_INT *lhs_size_unit,
122 HOST_WIDE_INT *rhs_size_unit)
124 tree scalar_type = gimple_expr_type (stmt_info->stmt);
125 HOST_WIDE_INT lhs, rhs;
127 /* During the analysis phase, this function is called on arbitrary
128 statements that might not have scalar results. */
129 if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
130 return scalar_type;
132 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
134 gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
135 if (assign
136 && (gimple_assign_cast_p (assign)
137 || gimple_assign_rhs_code (assign) == DOT_PROD_EXPR
138 || gimple_assign_rhs_code (assign) == WIDEN_SUM_EXPR
139 || gimple_assign_rhs_code (assign) == WIDEN_MULT_EXPR
140 || gimple_assign_rhs_code (assign) == WIDEN_LSHIFT_EXPR
141 || gimple_assign_rhs_code (assign) == FLOAT_EXPR))
143 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (assign));
145 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
146 if (rhs < lhs)
147 scalar_type = rhs_type;
150 *lhs_size_unit = lhs;
151 *rhs_size_unit = rhs;
152 return scalar_type;
156 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
157 tested at run-time. Return TRUE if DDR was successfully inserted.
158 Return false if versioning is not supported. */
160 static bool
161 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
163 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
165 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
166 return false;
168 if (!runtime_alias_check_p (ddr, loop,
169 optimize_loop_nest_for_speed_p (loop)))
170 return false;
172 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
173 return true;
176 /* Record that loop LOOP_VINFO needs to check that VALUE is nonzero. */
178 static void
179 vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value)
181 vec<tree> checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo);
182 for (unsigned int i = 0; i < checks.length(); ++i)
183 if (checks[i] == value)
184 return;
186 if (dump_enabled_p ())
188 dump_printf_loc (MSG_NOTE, vect_location, "need run-time check that ");
189 dump_generic_expr (MSG_NOTE, TDF_SLIM, value);
190 dump_printf (MSG_NOTE, " is nonzero\n");
192 LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value);
195 /* Return true if we know that the order of vectorized DR_INFO_A and
196 vectorized DR_INFO_B will be the same as the order of DR_INFO_A and
197 DR_INFO_B. At least one of the accesses is a write. */
199 static bool
200 vect_preserves_scalar_order_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b)
202 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
203 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
205 /* Single statements are always kept in their original order. */
206 if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
207 && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
208 return true;
210 /* STMT_A and STMT_B belong to overlapping groups. All loads in a
211 group are emitted at the position of the first scalar load and all
212 stores in a group are emitted at the position of the last scalar store.
213 Thus writes will happen no earlier than their current position
214 (but could happen later) while reads will happen no later than their
215 current position (but could happen earlier). Reordering is therefore
216 only possible if the first access is a write. */
217 if (is_pattern_stmt_p (stmtinfo_a))
218 stmtinfo_a = STMT_VINFO_RELATED_STMT (stmtinfo_a);
219 if (is_pattern_stmt_p (stmtinfo_b))
220 stmtinfo_b = STMT_VINFO_RELATED_STMT (stmtinfo_b);
221 stmt_vec_info earlier_stmt_info = get_earlier_stmt (stmtinfo_a, stmtinfo_b);
222 return !DR_IS_WRITE (STMT_VINFO_DATA_REF (earlier_stmt_info));
225 /* A subroutine of vect_analyze_data_ref_dependence. Handle
226 DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
227 distances. These distances are conservatively correct but they don't
228 reflect a guaranteed dependence.
230 Return true if this function does all the work necessary to avoid
231 an alias or false if the caller should use the dependence distances
232 to limit the vectorization factor in the usual way. LOOP_DEPTH is
233 the depth of the loop described by LOOP_VINFO and the other arguments
234 are as for vect_analyze_data_ref_dependence. */
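/* Illustrative example (an assumption for exposition): in

     void f (int *a, int *b, int n)
     {
       for (int i = 0; i < n; i++)
         a[i] = b[i] + 1;
     }

   the dependence distances computed under the assumption that A and B
   overlap are conservatively correct, but the two bases may well be
   independent; in that case a runtime alias check is usually preferable
   to limiting the vectorization factor.  */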
236 static bool
237 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
238 loop_vec_info loop_vinfo,
239 int loop_depth, unsigned int *max_vf)
241 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
242 lambda_vector dist_v;
243 unsigned int i;
244 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
246 int dist = dist_v[loop_depth];
247 if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
249 /* If the user asserted safelen >= DIST consecutive iterations
250 can be executed concurrently, assume independence.
252 ??? An alternative would be to add the alias check even
253 in this case, and vectorize the fallback loop with the
254 maximum VF set to safelen. However, if the user has
255 explicitly given a length, it's less likely that that
256 would be a win. */
257 if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
259 if ((unsigned int) loop->safelen < *max_vf)
260 *max_vf = loop->safelen;
261 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
262 continue;
265 /* For dependence distances of 2 or more, we have the option
266 of limiting VF or checking for an alias at runtime.
267 Prefer to check at runtime if we can, to avoid limiting
268 the VF unnecessarily when the bases are in fact independent.
270 Note that the alias checks will be removed if the VF ends up
271 being small enough. */
272 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
273 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
274 return (!STMT_VINFO_GATHER_SCATTER_P (dr_info_a->stmt)
275 && !STMT_VINFO_GATHER_SCATTER_P (dr_info_b->stmt)
276 && vect_mark_for_runtime_alias_test (ddr, loop_vinfo));
279 return true;
283 /* Function vect_analyze_data_ref_dependence.
285 Return TRUE if there (might) exist a dependence between a memory-reference
286 DRA and a memory-reference DRB. When versioning for alias may check a
287 dependence at run-time, return FALSE. Adjust *MAX_VF according to
288 the data dependence. */
290 static bool
291 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
292 loop_vec_info loop_vinfo,
293 unsigned int *max_vf)
295 unsigned int i;
296 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
297 struct data_reference *dra = DDR_A (ddr);
298 struct data_reference *drb = DDR_B (ddr);
299 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (dra);
300 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (drb);
301 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
302 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
303 lambda_vector dist_v;
304 unsigned int loop_depth;
306 /* In loop analysis all data references should be vectorizable. */
307 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
308 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
309 gcc_unreachable ();
311 /* Independent data accesses. */
312 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
313 return false;
315 if (dra == drb
316 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
317 return false;
319 /* We do not have to consider dependences between accesses that belong
320 to the same group, unless the stride could be smaller than the
321 group size. */
322 if (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
323 && (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
324 == DR_GROUP_FIRST_ELEMENT (stmtinfo_b))
325 && !STMT_VINFO_STRIDED_P (stmtinfo_a))
326 return false;
328 /* Even if we have an anti-dependence then, as the vectorized loop covers at
329 least two scalar iterations, there is always also a true dependence.
330 As the vectorizer does not re-order loads and stores we can ignore
331 the anti-dependence if TBAA can disambiguate both DRs similar to the
332 case with known negative distance anti-dependences (positive
333 distance anti-dependences would violate TBAA constraints). */
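	/* Illustrative case (an assumption for exposition): with

	     int *p; float *q;
	     ... = *p;
	     *q = ...;

	   the alias sets of 'int' and 'float' do not conflict, so the
	   anti-dependence between the read and the write can be ignored.  */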
334 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
335 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
336 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
337 get_alias_set (DR_REF (drb))))
338 return false;
340 /* Unknown data dependence. */
341 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
343 /* If user asserted safelen consecutive iterations can be
344 executed concurrently, assume independence. */
345 if (loop->safelen >= 2)
347 if ((unsigned int) loop->safelen < *max_vf)
348 *max_vf = loop->safelen;
349 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
350 return false;
353 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
354 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
356 if (dump_enabled_p ())
358 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
359 "versioning for alias not supported for: "
360 "can't determine dependence between ");
361 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
362 DR_REF (dra));
363 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
364 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
365 DR_REF (drb));
366 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
368 return true;
371 if (dump_enabled_p ())
373 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
374 "versioning for alias required: "
375 "can't determine dependence between ");
376 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
377 DR_REF (dra));
378 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
379 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
380 DR_REF (drb));
381 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
384 /* Add to list of ddrs that need to be tested at run-time. */
385 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
388 /* Known data dependence. */
389 if (DDR_NUM_DIST_VECTS (ddr) == 0)
391 /* If user asserted safelen consecutive iterations can be
392 executed concurrently, assume independence. */
393 if (loop->safelen >= 2)
395 if ((unsigned int) loop->safelen < *max_vf)
396 *max_vf = loop->safelen;
397 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
398 return false;
401 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
402 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
404 if (dump_enabled_p ())
406 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
407 "versioning for alias not supported for: "
408 "bad dist vector for ");
409 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
410 DR_REF (dra));
411 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
412 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
413 DR_REF (drb));
414 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
416 return true;
419 if (dump_enabled_p ())
421 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
422 "versioning for alias required: "
423 "bad dist vector for ");
424 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
425 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
426 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
427 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
429 /* Add to list of ddrs that need to be tested at run-time. */
430 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
433 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
435 if (DDR_COULD_BE_INDEPENDENT_P (ddr)
436 && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
437 loop_depth, max_vf))
438 return false;
440 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
442 int dist = dist_v[loop_depth];
444 if (dump_enabled_p ())
445 dump_printf_loc (MSG_NOTE, vect_location,
446 "dependence distance = %d.\n", dist);
448 if (dist == 0)
450 if (dump_enabled_p ())
452 dump_printf_loc (MSG_NOTE, vect_location,
453 "dependence distance == 0 between ");
454 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
455 dump_printf (MSG_NOTE, " and ");
456 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
457 dump_printf (MSG_NOTE, "\n");
460 /* When we perform grouped accesses and perform implicit CSE
461 by detecting equal accesses and doing disambiguation with
462 runtime alias tests like for
463 .. = a[i];
464 .. = a[i+1];
465 a[i] = ..;
466 a[i+1] = ..;
467 *p = ..;
468 .. = a[i];
469 .. = a[i+1];
470 where we will end up loading { a[i], a[i+1] } once, make
471 sure that inserting group loads before the first load and
472 stores after the last store will do the right thing.
473 Similar for groups like
474 a[i] = ...;
475 ... = a[i];
476 a[i+1] = ...;
477 where loads from the group interleave with the store. */
478 if (!vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
480 if (dump_enabled_p ())
481 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
482 "READ_WRITE dependence in interleaving.\n");
483 return true;
486 if (loop->safelen < 2)
488 tree indicator = dr_zero_step_indicator (dra);
489 if (!indicator || integer_zerop (indicator))
491 if (dump_enabled_p ())
492 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
493 "access also has a zero step\n");
494 return true;
496 else if (TREE_CODE (indicator) != INTEGER_CST)
497 vect_check_nonzero_value (loop_vinfo, indicator);
499 continue;
502 if (dist > 0 && DDR_REVERSED_P (ddr))
504 /* If DDR_REVERSED_P the order of the data-refs in DDR was
505 reversed (to make distance vector positive), and the actual
506 distance is negative. */
507 if (dump_enabled_p ())
508 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
509 "dependence distance negative.\n");
510 /* Record a negative dependence distance to later limit the
511 amount of stmt copying / unrolling we can perform.
512 Only need to handle read-after-write dependence. */
513 if (DR_IS_READ (drb)
514 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
515 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
516 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
517 continue;
520 unsigned int abs_dist = abs (dist);
521 if (abs_dist >= 2 && abs_dist < *max_vf)
523 /* The dependence distance requires reduction of the maximal
524 vectorization factor. */
525 *max_vf = abs (dist);
526 if (dump_enabled_p ())
527 dump_printf_loc (MSG_NOTE, vect_location,
528 "adjusting maximal vectorization factor to %i\n",
529 *max_vf);
532 if (abs_dist >= *max_vf)
534 /* Dependence distance does not create dependence, as far as
535 vectorization is concerned, in this case. */
536 if (dump_enabled_p ())
537 dump_printf_loc (MSG_NOTE, vect_location,
538 "dependence distance >= VF.\n");
539 continue;
542 if (dump_enabled_p ())
544 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
545 "not vectorized, possible dependence "
546 "between data-refs ");
547 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
548 dump_printf (MSG_NOTE, " and ");
549 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
550 dump_printf (MSG_NOTE, "\n");
553 return true;
556 return false;
559 /* Function vect_analyze_data_ref_dependences.
561 Examine all the data references in the loop, and make sure there do not
562 exist any data dependences between them. Set *MAX_VF according to
563 the maximum vectorization factor the data dependences allow. */
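/* Illustrative example (not from the original sources): in

     for (i = 0; i < n; i++)
       a[i + 2] = a[i] + 1;

   the read-after-write dependence has distance 2, so *MAX_VF is limited
   to 2 and the loop may still be vectorized with at most two elements
   per vector iteration.  */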
565 bool
566 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
567 unsigned int *max_vf)
569 unsigned int i;
570 struct data_dependence_relation *ddr;
572 DUMP_VECT_SCOPE ("vect_analyze_data_ref_dependences");
574 if (!LOOP_VINFO_DDRS (loop_vinfo).exists ())
576 LOOP_VINFO_DDRS (loop_vinfo)
577 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
578 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
579 /* We need read-read dependences to compute
580 STMT_VINFO_SAME_ALIGN_REFS. */
581 bool res = compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
582 &LOOP_VINFO_DDRS (loop_vinfo),
583 LOOP_VINFO_LOOP_NEST (loop_vinfo),
584 true);
585 gcc_assert (res);
588 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
590 /* For epilogues we either have no aliases or alias versioning
591 was applied to original loop. Therefore we may just get max_vf
592 using VF of original loop. */
593 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
594 *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
595 else
596 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
597 if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
598 return false;
600 return true;
604 /* Function vect_slp_analyze_data_ref_dependence.
606 Return TRUE if there (might) exist a dependence between a memory-reference
607 DRA and a memory-reference DRB for VINFO. */
611 static bool
612 vect_slp_analyze_data_ref_dependence (vec_info *vinfo,
613 struct data_dependence_relation *ddr)
615 struct data_reference *dra = DDR_A (ddr);
616 struct data_reference *drb = DDR_B (ddr);
617 dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
618 dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
620 /* We need to check dependences of statements marked as unvectorizable
621 as well, they still can prohibit vectorization. */
623 /* Independent data accesses. */
624 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
625 return false;
627 if (dra == drb)
628 return false;
630 /* Read-read is OK. */
631 if (DR_IS_READ (dra) && DR_IS_READ (drb))
632 return false;
634 /* If dra and drb are part of the same interleaving chain consider
635 them independent. */
636 if (STMT_VINFO_GROUPED_ACCESS (dr_info_a->stmt)
637 && (DR_GROUP_FIRST_ELEMENT (dr_info_a->stmt)
638 == DR_GROUP_FIRST_ELEMENT (dr_info_b->stmt)))
639 return false;
641 /* Unknown data dependence. */
642 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
644 if (dump_enabled_p ())
646 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
647 "can't determine dependence between ");
648 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
649 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
650 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
651 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
654 else if (dump_enabled_p ())
656 dump_printf_loc (MSG_NOTE, vect_location,
657 "determined dependence between ");
658 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
659 dump_printf (MSG_NOTE, " and ");
660 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
661 dump_printf (MSG_NOTE, "\n");
664 return true;
668 /* Analyze dependences involved in the transform of SLP NODE. STORES
669 contain the vector of scalar stores of this instance if we are
670 disambiguating the loads. */
672 static bool
673 vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
674 vec<stmt_vec_info> stores,
675 stmt_vec_info last_store_info)
677 /* This walks over all stmts involved in the SLP load/store done
678 in NODE verifying we can sink them up to the last stmt in the
679 group. */
680 stmt_vec_info last_access_info = vect_find_last_scalar_stmt_in_slp (node);
681 vec_info *vinfo = last_access_info->vinfo;
682 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
684 stmt_vec_info access_info = SLP_TREE_SCALAR_STMTS (node)[k];
685 if (access_info == last_access_info)
686 continue;
687 data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
688 ao_ref ref;
689 bool ref_initialized_p = false;
690 for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
691 gsi_stmt (gsi) != last_access_info->stmt; gsi_next (&gsi))
693 gimple *stmt = gsi_stmt (gsi);
694 if (! gimple_vuse (stmt)
695 || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
696 continue;
698 /* If we couldn't record a (single) data reference for this
699 stmt we have to resort to the alias oracle. */
700 stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
701 data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
702 if (!dr_b)
704 /* We are moving a store or sinking a load - this means
705 we cannot use TBAA for disambiguation. */
706 if (!ref_initialized_p)
707 ao_ref_init (&ref, DR_REF (dr_a));
708 if (stmt_may_clobber_ref_p_1 (stmt, &ref, false)
709 || ref_maybe_used_by_stmt_p (stmt, &ref, false))
710 return false;
711 continue;
714 bool dependent = false;
715 /* If we run into a store of this same instance (we've just
716 marked those) then delay dependence checking until we run
717 into the last store because this is where it will have
718 been sunk to (and we verify if we can do that as well). */
719 if (gimple_visited_p (stmt))
721 if (stmt_info != last_store_info)
722 continue;
723 unsigned i;
724 stmt_vec_info store_info;
725 FOR_EACH_VEC_ELT (stores, i, store_info)
727 data_reference *store_dr = STMT_VINFO_DATA_REF (store_info);
728 ddr_p ddr = initialize_data_dependence_relation
729 (dr_a, store_dr, vNULL);
730 dependent
731 = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
732 free_dependence_relation (ddr);
733 if (dependent)
734 break;
737 else
739 ddr_p ddr = initialize_data_dependence_relation (dr_a,
740 dr_b, vNULL);
741 dependent = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
742 free_dependence_relation (ddr);
744 if (dependent)
745 return false;
748 return true;
752 /* Function vect_slp_analyze_instance_dependence.
754 Examine all the data references in the SLP instance, and make sure there
755 do not exist any data dependences between them. */
758 bool
759 vect_slp_analyze_instance_dependence (slp_instance instance)
761 DUMP_VECT_SCOPE ("vect_slp_analyze_instance_dependence");
763 /* The stores of this instance are at the root of the SLP tree. */
764 slp_tree store = SLP_INSTANCE_TREE (instance);
765 if (! STMT_VINFO_DATA_REF (SLP_TREE_SCALAR_STMTS (store)[0]))
766 store = NULL;
768 /* Verify we can sink stores to the vectorized stmt insert location. */
769 stmt_vec_info last_store_info = NULL;
770 if (store)
772 if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
773 return false;
775 /* Mark stores in this instance and remember the last one. */
776 last_store_info = vect_find_last_scalar_stmt_in_slp (store);
777 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
778 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, true);
781 bool res = true;
783 /* Verify we can sink loads to the vectorized stmt insert location,
784 special-casing stores of this instance. */
785 slp_tree load;
786 unsigned int i;
787 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
788 if (! vect_slp_analyze_node_dependences (instance, load,
789 store
790 ? SLP_TREE_SCALAR_STMTS (store)
791 : vNULL, last_store_info))
793 res = false;
794 break;
797 /* Unset the visited flag. */
798 if (store)
799 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
800 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, false);
802 return res;
805 /* Record the base alignment guarantee given by DRB, which occurs
806 in STMT_INFO. */
808 static void
809 vect_record_base_alignment (stmt_vec_info stmt_info,
810 innermost_loop_behavior *drb)
812 vec_info *vinfo = stmt_info->vinfo;
813 bool existed;
814 innermost_loop_behavior *&entry
815 = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
816 if (!existed || entry->base_alignment < drb->base_alignment)
818 entry = drb;
819 if (dump_enabled_p ())
821 dump_printf_loc (MSG_NOTE, vect_location,
822 "recording new base alignment for ");
823 dump_generic_expr (MSG_NOTE, TDF_SLIM, drb->base_address);
824 dump_printf (MSG_NOTE, "\n");
825 dump_printf_loc (MSG_NOTE, vect_location,
826 " alignment: %d\n", drb->base_alignment);
827 dump_printf_loc (MSG_NOTE, vect_location,
828 " misalignment: %d\n", drb->base_misalignment);
829 dump_printf_loc (MSG_NOTE, vect_location,
830 " based on: ");
831 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt_info->stmt, 0);
836 /* If the region we're going to vectorize is reached, all unconditional
837 data references occur at least once. We can therefore pool the base
838 alignment guarantees from each unconditional reference. Do this by
839 going through all the data references in VINFO and checking whether
840 the containing statement makes the reference unconditionally. If so,
841 record the alignment of the base address in VINFO so that it can be
842 used for all other references with the same base. */
844 void
845 vect_record_base_alignments (vec_info *vinfo)
847 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
848 struct loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
849 data_reference *dr;
850 unsigned int i;
851 FOR_EACH_VEC_ELT (vinfo->shared->datarefs, i, dr)
853 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
854 stmt_vec_info stmt_info = dr_info->stmt;
855 if (!DR_IS_CONDITIONAL_IN_STMT (dr)
856 && STMT_VINFO_VECTORIZABLE (stmt_info)
857 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
859 vect_record_base_alignment (stmt_info, &DR_INNERMOST (dr));
861 /* If DR is nested in the loop that is being vectorized, we can also
862 record the alignment of the base wrt the outer loop. */
863 if (loop && nested_in_vect_loop_p (loop, stmt_info))
864 vect_record_base_alignment
865 (stmt_info, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
870 /* Return the target alignment for the vectorized form of DR_INFO. */
872 static unsigned int
873 vect_calculate_target_alignment (dr_vec_info *dr_info)
875 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
876 return targetm.vectorize.preferred_vector_alignment (vectype);
879 /* Function vect_compute_data_ref_alignment
881 Compute the misalignment of the data reference DR_INFO.
883 Output:
884 1. DR_MISALIGNMENT (DR_INFO) is defined.
886 FOR NOW: No analysis is actually performed. Misalignment is calculated
887 only for trivial cases. TODO. */
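/* Worked example (illustrative, not from the original sources): with a
   preferred vector alignment of 16 bytes, a 16-byte aligned base and an
   access 'a[i + 1]' on 4-byte elements, the base misalignment is 0 and
   DR_INIT is 4, so the recorded misalignment is (0 + 4) % 16 = 4 bytes.  */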
889 static void
890 vect_compute_data_ref_alignment (dr_vec_info *dr_info)
892 stmt_vec_info stmt_info = dr_info->stmt;
893 vec_base_alignments *base_alignments = &stmt_info->vinfo->base_alignments;
894 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
895 struct loop *loop = NULL;
896 tree ref = DR_REF (dr_info->dr);
897 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
899 if (dump_enabled_p ())
900 dump_printf_loc (MSG_NOTE, vect_location,
901 "vect_compute_data_ref_alignment:\n");
903 if (loop_vinfo)
904 loop = LOOP_VINFO_LOOP (loop_vinfo);
906 /* Initialize misalignment to unknown. */
907 SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
909 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
910 return;
912 innermost_loop_behavior *drb = vect_dr_behavior (dr_info);
913 bool step_preserves_misalignment_p;
915 unsigned HOST_WIDE_INT vector_alignment
916 = vect_calculate_target_alignment (dr_info) / BITS_PER_UNIT;
917 DR_TARGET_ALIGNMENT (dr_info) = vector_alignment;
919 /* No step for BB vectorization. */
920 if (!loop)
922 gcc_assert (integer_zerop (drb->step));
923 step_preserves_misalignment_p = true;
926 /* In case the dataref is in an inner-loop of the loop that is being
927 vectorized (LOOP), we use the base and misalignment information
928 relative to the outer-loop (LOOP). This is ok only if the misalignment
929 stays the same throughout the execution of the inner-loop, which is why
930 we have to check that the stride of the dataref in the inner-loop is a
931 multiple of the vector alignment. */
932 else if (nested_in_vect_loop_p (loop, stmt_info))
934 step_preserves_misalignment_p
935 = (DR_STEP_ALIGNMENT (dr_info->dr) % vector_alignment) == 0;
937 if (dump_enabled_p ())
939 if (step_preserves_misalignment_p)
940 dump_printf_loc (MSG_NOTE, vect_location,
941 "inner step divides the vector alignment.\n");
942 else
943 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
944 "inner step doesn't divide the vector"
945 " alignment.\n");
949 /* Similarly we can only use base and misalignment information relative to
950 an innermost loop if the misalignment stays the same throughout the
951 execution of the loop. As above, this is the case if the stride of
952 the dataref is a multiple of the alignment. */
953 else
955 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
956 step_preserves_misalignment_p
957 = multiple_p (DR_STEP_ALIGNMENT (dr_info->dr) * vf, vector_alignment);
959 if (!step_preserves_misalignment_p && dump_enabled_p ())
960 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
961 "step doesn't divide the vector alignment.\n");
964 unsigned int base_alignment = drb->base_alignment;
965 unsigned int base_misalignment = drb->base_misalignment;
967 /* Calculate the maximum of the pooled base address alignment and the
968 alignment that we can compute for DR itself. */
969 innermost_loop_behavior **entry = base_alignments->get (drb->base_address);
970 if (entry && base_alignment < (*entry)->base_alignment)
972 base_alignment = (*entry)->base_alignment;
973 base_misalignment = (*entry)->base_misalignment;
976 if (drb->offset_alignment < vector_alignment
977 || !step_preserves_misalignment_p
978 /* We need to know whether the step wrt the vectorized loop is
979 negative when computing the starting misalignment below. */
980 || TREE_CODE (drb->step) != INTEGER_CST)
982 if (dump_enabled_p ())
984 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
985 "Unknown alignment for access: ");
986 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
987 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
989 return;
992 if (base_alignment < vector_alignment)
994 unsigned int max_alignment;
995 tree base = get_base_for_alignment (drb->base_address, &max_alignment);
996 if (max_alignment < vector_alignment
997 || !vect_can_force_dr_alignment_p (base,
998 vector_alignment * BITS_PER_UNIT))
1000 if (dump_enabled_p ())
1002 dump_printf_loc (MSG_NOTE, vect_location,
1003 "can't force alignment of ref: ");
1004 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
1005 dump_printf (MSG_NOTE, "\n");
1007 return;
1010 /* Force the alignment of the decl.
1011 NOTE: This is the only change to the code we make during
1012 the analysis phase, before deciding to vectorize the loop. */
1013 if (dump_enabled_p ())
1015 dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
1016 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
1017 dump_printf (MSG_NOTE, "\n");
1020 dr_info->base_decl = base;
1021 dr_info->base_misaligned = true;
1022 base_misalignment = 0;
1024 poly_int64 misalignment
1025 = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();
1027 /* If this is a backward running DR then first access in the larger
1028 vectype actually is N-1 elements before the address in the DR.
1029 Adjust misalign accordingly. */
1030 if (tree_int_cst_sgn (drb->step) < 0)
1031 /* PLUS because STEP is negative. */
1032 misalignment += ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1033 * TREE_INT_CST_LOW (drb->step));
1035 unsigned int const_misalignment;
1036 if (!known_misalignment (misalignment, vector_alignment,
1037 &const_misalignment))
1039 if (dump_enabled_p ())
1041 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1042 "Non-constant misalignment for access: ");
1043 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
1044 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1046 return;
1049 SET_DR_MISALIGNMENT (dr_info, const_misalignment);
1051 if (dump_enabled_p ())
1053 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1054 "misalign = %d bytes of ref ",
1055 DR_MISALIGNMENT (dr_info));
1056 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
1057 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1060 return;
1063 /* Function vect_update_misalignment_for_peel.
1064 Sets DR_INFO's misalignment
1065 - to 0 if it has the same alignment as DR_PEEL_INFO,
1066 - to the misalignment computed using NPEEL if DR_INFO's misalignment is known,
1067 - to -1 (unknown) otherwise.
1069 DR_INFO - the data reference whose misalignment is to be adjusted.
1070 DR_PEEL_INFO - the data reference whose misalignment is being made
1071 zero in the vector loop by the peel.
1072 NPEEL - the number of iterations in the peel loop if the misalignment
1073 of DR_PEEL_INFO is known at compile time. */
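/* Worked example (illustrative): with a 16-byte target alignment, 4-byte
   elements, DR_MISALIGNMENT (DR_INFO) == 12 and NPEEL == 3 for a forward
   (positive step) access, the updated misalignment is
   (12 + 3 * 4) & 15 == 8.  */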
1075 static void
1076 vect_update_misalignment_for_peel (dr_vec_info *dr_info,
1077 dr_vec_info *dr_peel_info, int npeel)
1079 unsigned int i;
1080 vec<dr_p> same_aligned_drs;
1081 struct data_reference *current_dr;
1082 int dr_size = vect_get_scalar_dr_size (dr_info);
1083 int dr_peel_size = vect_get_scalar_dr_size (dr_peel_info);
1084 stmt_vec_info stmt_info = dr_info->stmt;
1085 stmt_vec_info peel_stmt_info = dr_peel_info->stmt;
1087 /* For interleaved data accesses the step in the loop must be multiplied by
1088 the size of the interleaving group. */
1089 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1090 dr_size *= DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info));
1091 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
1092 dr_peel_size *= DR_GROUP_SIZE (peel_stmt_info);
1094 /* It can be assumed that the data refs with the same alignment as dr_peel
1095 are aligned in the vector loop. */
1096 same_aligned_drs = STMT_VINFO_SAME_ALIGN_REFS (peel_stmt_info);
1097 FOR_EACH_VEC_ELT (same_aligned_drs, i, current_dr)
1099 if (current_dr != dr_info->dr)
1100 continue;
1101 gcc_assert (!known_alignment_for_access_p (dr_info)
1102 || !known_alignment_for_access_p (dr_peel_info)
1103 || (DR_MISALIGNMENT (dr_info) / dr_size
1104 == DR_MISALIGNMENT (dr_peel_info) / dr_peel_size));
1105 SET_DR_MISALIGNMENT (dr_info, 0);
1106 return;
1109 if (known_alignment_for_access_p (dr_info)
1110 && known_alignment_for_access_p (dr_peel_info))
1112 bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
1113 size_zero_node) < 0;
1114 int misal = DR_MISALIGNMENT (dr_info);
1115 misal += negative ? -npeel * dr_size : npeel * dr_size;
1116 misal &= DR_TARGET_ALIGNMENT (dr_info) - 1;
1117 SET_DR_MISALIGNMENT (dr_info, misal);
1118 return;
1121 if (dump_enabled_p ())
1122 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1123 "to unknown (-1).\n");
1124 SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
1128 /* Function verify_data_ref_alignment
1130 Return TRUE if DR_INFO can be handled with respect to alignment. */
1132 static bool
1133 verify_data_ref_alignment (dr_vec_info *dr_info)
1135 enum dr_alignment_support supportable_dr_alignment
1136 = vect_supportable_dr_alignment (dr_info, false);
1137 if (!supportable_dr_alignment)
1139 if (dump_enabled_p ())
1141 if (DR_IS_READ (dr_info->dr))
1142 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1143 "not vectorized: unsupported unaligned load.");
1144 else
1145 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1146 "not vectorized: unsupported unaligned "
1147 "store.");
1149 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
1150 DR_REF (dr_info->dr));
1151 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1153 return false;
1156 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
1157 dump_printf_loc (MSG_NOTE, vect_location,
1158 "Vectorizing an unaligned access.\n");
1160 return true;
1163 /* Function vect_verify_datarefs_alignment
1165 Return TRUE if all data references in the loop can be
1166 handled with respect to alignment. */
1168 bool
1169 vect_verify_datarefs_alignment (loop_vec_info vinfo)
1171 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
1172 struct data_reference *dr;
1173 unsigned int i;
1175 FOR_EACH_VEC_ELT (datarefs, i, dr)
1177 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
1178 stmt_vec_info stmt_info = dr_info->stmt;
1180 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1181 continue;
1183 /* For interleaving, only the alignment of the first access matters. */
1184 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1185 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1186 continue;
1188 /* Strided accesses perform only component accesses, alignment is
1189 irrelevant for them. */
1190 if (STMT_VINFO_STRIDED_P (stmt_info)
1191 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1192 continue;
1194 if (! verify_data_ref_alignment (dr_info))
1195 return false;
1198 return true;
1201 /* Given a memory reference EXP return whether its alignment is less
1202 than its size. */
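/* E.g. (illustrative): a 4-byte 'int' member of a struct declared with
   '__attribute__((packed))' may only be guaranteed 1-byte alignment, so
   its alignment is smaller than its size and the access is treated as
   packed.  */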
1204 static bool
1205 not_size_aligned (tree exp)
1207 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1208 return true;
1210 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1211 > get_object_alignment (exp));
1214 /* Function vector_alignment_reachable_p
1216 Return true if vector alignment for DR_INFO is reachable by peeling
1217 a few loop iterations. Return false otherwise. */
1219 static bool
1220 vector_alignment_reachable_p (dr_vec_info *dr_info)
1222 stmt_vec_info stmt_info = dr_info->stmt;
1223 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1225 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1227 /* For interleaved access we peel only if number of iterations in
1228 the prolog loop ({VF - misalignment}), is a multiple of the
1229 number of the interleaved accesses. */
1230 int elem_size, mis_in_elements;
1232 /* FORNOW: handle only known alignment. */
1233 if (!known_alignment_for_access_p (dr_info))
1234 return false;
1236 poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1237 poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
1238 elem_size = vector_element_size (vector_size, nelements);
1239 mis_in_elements = DR_MISALIGNMENT (dr_info) / elem_size;
1241 if (!multiple_p (nelements - mis_in_elements, DR_GROUP_SIZE (stmt_info)))
1242 return false;
1245 /* If misalignment is known at the compile time then allow peeling
1246 only if natural alignment is reachable through peeling. */
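  /* E.g. (illustrative): a 4-byte element that is misaligned by 3 bytes can
     never become naturally aligned by peeling, because each peeled scalar
     iteration changes the address by a multiple of the element size.  */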
1247 if (known_alignment_for_access_p (dr_info) && !aligned_access_p (dr_info))
1249 HOST_WIDE_INT elmsize =
1250 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1251 if (dump_enabled_p ())
1253 dump_printf_loc (MSG_NOTE, vect_location,
1254 "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1255 dump_printf (MSG_NOTE,
1256 ". misalignment = %d.\n", DR_MISALIGNMENT (dr_info));
1258 if (DR_MISALIGNMENT (dr_info) % elmsize)
1260 if (dump_enabled_p ())
1261 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1262 "data size does not divide the misalignment.\n");
1263 return false;
1267 if (!known_alignment_for_access_p (dr_info))
1269 tree type = TREE_TYPE (DR_REF (dr_info->dr));
1270 bool is_packed = not_size_aligned (DR_REF (dr_info->dr));
1271 if (dump_enabled_p ())
1272 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1273 "Unknown misalignment, %snaturally aligned\n",
1274 is_packed ? "not " : "");
1275 return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1278 return true;
1282 /* Calculate the cost of the memory access represented by DR_INFO. */
1284 static void
1285 vect_get_data_access_cost (dr_vec_info *dr_info,
1286 unsigned int *inside_cost,
1287 unsigned int *outside_cost,
1288 stmt_vector_for_cost *body_cost_vec,
1289 stmt_vector_for_cost *prologue_cost_vec)
1291 stmt_vec_info stmt_info = dr_info->stmt;
1292 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1293 int ncopies;
1295 if (PURE_SLP_STMT (stmt_info))
1296 ncopies = 1;
1297 else
1298 ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1300 if (DR_IS_READ (dr_info->dr))
1301 vect_get_load_cost (stmt_info, ncopies, true, inside_cost, outside_cost,
1302 prologue_cost_vec, body_cost_vec, false);
1303 else
1304 vect_get_store_cost (stmt_info, ncopies, inside_cost, body_cost_vec);
1306 if (dump_enabled_p ())
1307 dump_printf_loc (MSG_NOTE, vect_location,
1308 "vect_get_data_access_cost: inside_cost = %d, "
1309 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1313 typedef struct _vect_peel_info
1315 dr_vec_info *dr_info;
1316 int npeel;
1317 unsigned int count;
1318 } *vect_peel_info;
1320 typedef struct _vect_peel_extended_info
1322 struct _vect_peel_info peel_info;
1323 unsigned int inside_cost;
1324 unsigned int outside_cost;
1325 } *vect_peel_extended_info;
1328 /* Peeling hashtable helpers. */
1330 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1332 static inline hashval_t hash (const _vect_peel_info *);
1333 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1336 inline hashval_t
1337 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1339 return (hashval_t) peel_info->npeel;
1342 inline bool
1343 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1345 return (a->npeel == b->npeel);
1349 /* Insert DR_INFO into peeling hash table with NPEEL as key. */
1351 static void
1352 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1353 loop_vec_info loop_vinfo, dr_vec_info *dr_info,
1354 int npeel)
1356 struct _vect_peel_info elem, *slot;
1357 _vect_peel_info **new_slot;
1358 bool supportable_dr_alignment
1359 = vect_supportable_dr_alignment (dr_info, true);
1361 elem.npeel = npeel;
1362 slot = peeling_htab->find (&elem);
1363 if (slot)
1364 slot->count++;
1365 else
1367 slot = XNEW (struct _vect_peel_info);
1368 slot->npeel = npeel;
1369 slot->dr_info = dr_info;
1370 slot->count = 1;
1371 new_slot = peeling_htab->find_slot (slot, INSERT);
1372 *new_slot = slot;
1375 if (!supportable_dr_alignment
1376 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1377 slot->count += VECT_MAX_COST;
1381 /* Traverse peeling hash table to find peeling option that aligns maximum
1382 number of data accesses. */
1384 int
1385 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1386 _vect_peel_extended_info *max)
1388 vect_peel_info elem = *slot;
1390 if (elem->count > max->peel_info.count
1391 || (elem->count == max->peel_info.count
1392 && max->peel_info.npeel > elem->npeel))
1394 max->peel_info.npeel = elem->npeel;
1395 max->peel_info.count = elem->count;
1396 max->peel_info.dr_info = elem->dr_info;
1399 return 1;
1402 /* Get the costs of peeling NPEEL iterations for LOOP_VINFO, checking
1403 data access costs for all data refs. If UNKNOWN_MISALIGNMENT is true,
1404 we assume DR0_INFO's misalignment will be zero after peeling. */
1406 static void
1407 vect_get_peeling_costs_all_drs (loop_vec_info loop_vinfo,
1408 dr_vec_info *dr0_info,
1409 unsigned int *inside_cost,
1410 unsigned int *outside_cost,
1411 stmt_vector_for_cost *body_cost_vec,
1412 stmt_vector_for_cost *prologue_cost_vec,
1413 unsigned int npeel,
1414 bool unknown_misalignment)
1416 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1417 unsigned i;
1418 data_reference *dr;
1420 FOR_EACH_VEC_ELT (datarefs, i, dr)
1422 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1423 stmt_vec_info stmt_info = dr_info->stmt;
1424 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1425 continue;
1427 /* For interleaving, only the alignment of the first access
1428 matters. */
1429 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1430 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1431 continue;
1433 /* Strided accesses perform only component accesses, alignment is
1434 irrelevant for them. */
1435 if (STMT_VINFO_STRIDED_P (stmt_info)
1436 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1437 continue;
1439 int save_misalignment;
1440 save_misalignment = DR_MISALIGNMENT (dr_info);
1441 if (npeel == 0)
1443 else if (unknown_misalignment && dr_info == dr0_info)
1444 SET_DR_MISALIGNMENT (dr_info, 0);
1445 else
1446 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
1447 vect_get_data_access_cost (dr_info, inside_cost, outside_cost,
1448 body_cost_vec, prologue_cost_vec);
1449 SET_DR_MISALIGNMENT (dr_info, save_misalignment);
1453 /* Traverse peeling hash table and calculate cost for each peeling option.
1454 Find the one with the lowest cost. */
1456 int
1457 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1458 _vect_peel_extended_info *min)
1460 vect_peel_info elem = *slot;
1461 int dummy;
1462 unsigned int inside_cost = 0, outside_cost = 0;
1463 stmt_vec_info stmt_info = elem->dr_info->stmt;
1464 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1465 stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1466 epilogue_cost_vec;
1468 prologue_cost_vec.create (2);
1469 body_cost_vec.create (2);
1470 epilogue_cost_vec.create (2);
1472 vect_get_peeling_costs_all_drs (loop_vinfo, elem->dr_info, &inside_cost,
1473 &outside_cost, &body_cost_vec,
1474 &prologue_cost_vec, elem->npeel, false);
1476 body_cost_vec.release ();
1478 outside_cost += vect_get_known_peeling_cost
1479 (loop_vinfo, elem->npeel, &dummy,
1480 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1481 &prologue_cost_vec, &epilogue_cost_vec);
1483 /* Prologue and epilogue costs are added to the target model later.
1484 These costs depend only on the scalar iteration cost, the
1485 number of peeling iterations finally chosen, and the number of
1486 misaligned statements. So discard the information found here. */
1487 prologue_cost_vec.release ();
1488 epilogue_cost_vec.release ();
1490 if (inside_cost < min->inside_cost
1491 || (inside_cost == min->inside_cost
1492 && outside_cost < min->outside_cost))
1494 min->inside_cost = inside_cost;
1495 min->outside_cost = outside_cost;
1496 min->peel_info.dr_info = elem->dr_info;
1497 min->peel_info.npeel = elem->npeel;
1498 min->peel_info.count = elem->count;
1501 return 1;
1505 /* Choose best peeling option by traversing peeling hash table and either
1506 choosing an option with the lowest cost (if cost model is enabled) or the
1507 option that aligns as many accesses as possible. */
1509 static struct _vect_peel_extended_info
1510 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1511 loop_vec_info loop_vinfo)
1513 struct _vect_peel_extended_info res;
1515 res.peel_info.dr_info = NULL;
1517 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1519 res.inside_cost = INT_MAX;
1520 res.outside_cost = INT_MAX;
1521 peeling_htab->traverse <_vect_peel_extended_info *,
1522 vect_peeling_hash_get_lowest_cost> (&res);
1524 else
1526 res.peel_info.count = 0;
1527 peeling_htab->traverse <_vect_peel_extended_info *,
1528 vect_peeling_hash_get_most_frequent> (&res);
1529 res.inside_cost = 0;
1530 res.outside_cost = 0;
1533 return res;
1536 /* Return true if the new peeling NPEEL is supported. */
1538 static bool
1539 vect_peeling_supportable (loop_vec_info loop_vinfo, dr_vec_info *dr0_info,
1540 unsigned npeel)
1542 unsigned i;
1543 struct data_reference *dr = NULL;
1544 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1545 enum dr_alignment_support supportable_dr_alignment;
1547 /* Ensure that all data refs can be vectorized after the peel. */
1548 FOR_EACH_VEC_ELT (datarefs, i, dr)
1550 int save_misalignment;
1552 if (dr == dr0_info->dr)
1553 continue;
1555 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1556 stmt_vec_info stmt_info = dr_info->stmt;
1557 /* For interleaving, only the alignment of the first access
1558 matters. */
1559 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1560 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1561 continue;
1563 /* Strided accesses perform only component accesses, alignment is
1564 irrelevant for them. */
1565 if (STMT_VINFO_STRIDED_P (stmt_info)
1566 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1567 continue;
1569 save_misalignment = DR_MISALIGNMENT (dr_info);
1570 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
1571 supportable_dr_alignment
1572 = vect_supportable_dr_alignment (dr_info, false);
1573 SET_DR_MISALIGNMENT (dr_info, save_misalignment);
1575 if (!supportable_dr_alignment)
1576 return false;
1579 return true;
1582 /* Function vect_enhance_data_refs_alignment
1584 This pass will use loop versioning and loop peeling in order to enhance
1585 the alignment of data references in the loop.
1587 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1588 original loop is to be vectorized. Any other loops that are created by
1589 the transformations performed in this pass - are not supposed to be
1590 vectorized. This restriction will be relaxed.
1592 This pass will require a cost model to guide it whether to apply peeling
1593 or versioning or a combination of the two. For example, the scheme that
1594 intel uses when given a loop with several memory accesses, is as follows:
1595 choose one memory access ('p') whose alignment you want to force by doing
1596 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1597 other accesses are not necessarily aligned, or (2) use loop versioning to
1598 generate one loop in which all accesses are aligned, and another loop in
1599 which only 'p' is necessarily aligned.
1601 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1602 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1603 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1605 Devising a cost model is the most critical aspect of this work. It will
1606 guide us on which access to peel for, whether to use loop versioning, how
1607 many versions to create, etc. The cost model will probably consist of
1608 generic considerations as well as target specific considerations (on
1609 powerpc for example, misaligned stores are more painful than misaligned
1610 loads).
1612 Here are the general steps involved in alignment enhancements:
1614 -- original loop, before alignment analysis:
1615 for (i=0; i<N; i++){
1616 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1617 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1620 -- After vect_compute_data_refs_alignment:
1621 for (i=0; i<N; i++){
1622 x = q[i]; # DR_MISALIGNMENT(q) = 3
1623 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1626 -- Possibility 1: we do loop versioning:
1627 if (p is aligned) {
1628 for (i=0; i<N; i++){ # loop 1A
1629 x = q[i]; # DR_MISALIGNMENT(q) = 3
1630 p[i] = y; # DR_MISALIGNMENT(p) = 0
1633 else {
1634 for (i=0; i<N; i++){ # loop 1B
1635 x = q[i]; # DR_MISALIGNMENT(q) = 3
1636 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1640 -- Possibility 2: we do loop peeling:
1641 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1642 x = q[i];
1643 p[i] = y;
1645 for (i = 3; i < N; i++){ # loop 2A
1646 x = q[i]; # DR_MISALIGNMENT(q) = 0
1647 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1650 -- Possibility 3: combination of loop peeling and versioning:
1651 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1652 x = q[i];
1653 p[i] = y;
1655 if (p is aligned) {
1656 for (i = 3; i<N; i++){ # loop 3A
1657 x = q[i]; # DR_MISALIGNMENT(q) = 0
1658 p[i] = y; # DR_MISALIGNMENT(p) = 0
1661 else {
1662 for (i = 3; i<N; i++){ # loop 3B
1663 x = q[i]; # DR_MISALIGNMENT(q) = 0
1664 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1668 These loops are later passed to loop_transform to be vectorized. The
1669 vectorizer will use the alignment information to guide the transformation
1670 (whether to generate regular loads/stores, or with special handling for
1671 misalignment). */
1673 bool
1674 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1676 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1677 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1678 enum dr_alignment_support supportable_dr_alignment;
1679 dr_vec_info *first_store = NULL;
1680 dr_vec_info *dr0_info = NULL;
1681 struct data_reference *dr;
1682 unsigned int i, j;
1683 bool do_peeling = false;
1684 bool do_versioning = false;
1685 bool stat;
1686 unsigned int npeel = 0;
1687 bool one_misalignment_known = false;
1688 bool one_misalignment_unknown = false;
1689 bool one_dr_unsupportable = false;
1690 dr_vec_info *unsupportable_dr_info = NULL;
1691 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1692 unsigned possible_npeel_number = 1;
1693 tree vectype;
1694 unsigned int mis, same_align_drs_max = 0;
1695 hash_table<peel_info_hasher> peeling_htab (1);
1697 DUMP_VECT_SCOPE ("vect_enhance_data_refs_alignment");
1699 /* Reset data so we can safely be called multiple times. */
1700 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1701 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1703 /* While cost model enhancements are expected in the future, the high level
1704 view of the code at this time is as follows:
1706 A) If there is a misaligned access then see if peeling to align
1707 this access can make all data references satisfy
1708 vect_supportable_dr_alignment. If so, update data structures
1709 as needed and return true.
1711 B) If peeling wasn't possible and there is a data reference with an
1712 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1713 then see if loop versioning checks can be used to make all data
1714 references satisfy vect_supportable_dr_alignment. If so, update
1715 data structures as needed and return true.
1717 C) If neither peeling nor versioning were successful then return false if
1718 any data reference does not satisfy vect_supportable_dr_alignment.
1720 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1722 Note, Possibility 3 above (which is peeling and versioning together) is not
1723 being done at this time. */
1725 /* (1) Peeling to force alignment. */
1727 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1728 Considerations:
1729 + How many accesses will become aligned due to the peeling
1730 - How many accesses will become unaligned due to the peeling,
1731 and the cost of misaligned accesses.
1732 - The cost of peeling (the extra runtime checks, the increase
1733 in code size). */
1735 FOR_EACH_VEC_ELT (datarefs, i, dr)
1737 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1738 stmt_vec_info stmt_info = dr_info->stmt;
1740 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1741 continue;
1743 /* For interleaving, only the alignment of the first access
1744 matters. */
1745 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1746 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1747 continue;
1749 /* For scatter-gather or invariant accesses there is nothing
1750 to enhance. */
1751 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1752 || integer_zerop (DR_STEP (dr)))
1753 continue;
1755 /* Strided accesses perform only component accesses, alignment is
1756 irrelevant for them. */
1757 if (STMT_VINFO_STRIDED_P (stmt_info)
1758 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1759 continue;
1761 supportable_dr_alignment = vect_supportable_dr_alignment (dr_info, true);
1762 do_peeling = vector_alignment_reachable_p (dr_info);
1763 if (do_peeling)
1765 if (known_alignment_for_access_p (dr_info))
1767 unsigned int npeel_tmp = 0;
1768 bool negative = tree_int_cst_compare (DR_STEP (dr),
1769 size_zero_node) < 0;
1771 vectype = STMT_VINFO_VECTYPE (stmt_info);
1772 unsigned int target_align = DR_TARGET_ALIGNMENT (dr_info);
1773 unsigned int dr_size = vect_get_scalar_dr_size (dr_info);
1774 mis = (negative
1775 ? DR_MISALIGNMENT (dr_info)
1776 : -DR_MISALIGNMENT (dr_info));
1777 if (DR_MISALIGNMENT (dr_info) != 0)
1778 npeel_tmp = (mis & (target_align - 1)) / dr_size;
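/* Illustration with assumed numbers (not tied to any particular target):
   for a forward (positive-step) access with a 16-byte TARGET_ALIGN,
   4-byte scalar elements and DR_MISALIGNMENT of 4 bytes, MIS is -4,
   so (mis & 15) == 12 and NPEEL_TMP == 12 / 4 == 3: peeling three
   scalar iterations advances the address by 12 bytes and makes the
   access 16-byte aligned.  */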
1780 /* For multiple types, it is possible that the bigger type access
1781 will have more than one peeling option. E.g., a loop with two
1782 types: one of size (vector size / 4), and the other one of
1783 size (vector size / 8). The vectorization factor will be 8. If both
1784 accesses are misaligned by 3, the first one needs one scalar
1785 iteration to be aligned, and the second one needs 5. But the
1786 first one will be aligned also by peeling 5 scalar
1787 iterations, and in that case both accesses will be aligned.
1788 Hence, in addition to the immediate peeling amount, we also want
1789 to try adding a full vector's worth of elements, as long as
1790 we don't exceed the vectorization factor.
1791 We do this automatically for the cost model, since we calculate
1792 the cost for every peeling option.  */
1793 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1795 poly_uint64 nscalars = (STMT_SLP_TYPE (stmt_info)
1796 ? vf * DR_GROUP_SIZE (stmt_info) : vf);
1797 possible_npeel_number
1798 = vect_get_num_vectors (nscalars, vectype);
1800 /* NPEEL_TMP is 0 when there is no misalignment, but also
1801 allow peeling NELEMENTS. */
1802 if (DR_MISALIGNMENT (dr_info) == 0)
1803 possible_npeel_number++;
1806 /* Save info about DR in the hash table. Also include peeling
1807 amounts according to the explanation above. */
1808 for (j = 0; j < possible_npeel_number; j++)
1810 vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1811 dr_info, npeel_tmp);
1812 npeel_tmp += target_align / dr_size;
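/* Sketch with assumed values: if NPEEL_TMP starts at 3, TARGET_ALIGN
   / DR_SIZE is 4 elements and POSSIBLE_NPEEL_NUMBER is 2, the loop
   above records the candidate peel amounts 3 and 7.  */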
1815 one_misalignment_known = true;
1817 else
1819 /* If we don't know any misalignment values, we prefer
1820 peeling for the data-ref that has the maximum number of data-refs
1821 with the same alignment, unless the target prefers to align
1822 stores over loads.  */
1823 unsigned same_align_drs
1824 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1825 if (!dr0_info
1826 || same_align_drs_max < same_align_drs)
1828 same_align_drs_max = same_align_drs;
1829 dr0_info = dr_info;
1831 /* For data-refs with the same number of related
1832 accesses prefer the one where the misalignment
1833 computation will be invariant in the outermost loop. */
1834 else if (same_align_drs_max == same_align_drs)
1836 struct loop *ivloop0, *ivloop;
1837 ivloop0 = outermost_invariant_loop_for_expr
1838 (loop, DR_BASE_ADDRESS (dr0_info->dr));
1839 ivloop = outermost_invariant_loop_for_expr
1840 (loop, DR_BASE_ADDRESS (dr));
1841 if ((ivloop && !ivloop0)
1842 || (ivloop && ivloop0
1843 && flow_loop_nested_p (ivloop, ivloop0)))
1844 dr0_info = dr_info;
1847 one_misalignment_unknown = true;
1849 /* Check for data refs with unsupportable alignment that
1850 can be peeled. */
1851 if (!supportable_dr_alignment)
1853 one_dr_unsupportable = true;
1854 unsupportable_dr_info = dr_info;
1857 if (!first_store && DR_IS_WRITE (dr))
1858 first_store = dr_info;
1861 else
1863 if (!aligned_access_p (dr_info))
1865 if (dump_enabled_p ())
1866 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1867 "vector alignment may not be reachable\n");
1868 break;
1873 /* Check if we can possibly peel the loop. */
1874 if (!vect_can_advance_ivs_p (loop_vinfo)
1875 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1876 || loop->inner)
1877 do_peeling = false;
1879 struct _vect_peel_extended_info peel_for_known_alignment;
1880 struct _vect_peel_extended_info peel_for_unknown_alignment;
1881 struct _vect_peel_extended_info best_peel;
1883 peel_for_unknown_alignment.inside_cost = INT_MAX;
1884 peel_for_unknown_alignment.outside_cost = INT_MAX;
1885 peel_for_unknown_alignment.peel_info.count = 0;
1887 if (do_peeling
1888 && one_misalignment_unknown)
1890 /* Check if the target requires us to prefer stores over loads, i.e., if
1891 misaligned stores are more expensive than misaligned loads (taking
1892 drs with the same alignment into account).  */
1893 unsigned int load_inside_cost = 0;
1894 unsigned int load_outside_cost = 0;
1895 unsigned int store_inside_cost = 0;
1896 unsigned int store_outside_cost = 0;
1897 unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;
1899 stmt_vector_for_cost dummy;
1900 dummy.create (2);
1901 vect_get_peeling_costs_all_drs (loop_vinfo, dr0_info,
1902 &load_inside_cost,
1903 &load_outside_cost,
1904 &dummy, &dummy, estimated_npeels, true);
1905 dummy.release ();
1907 if (first_store)
1909 dummy.create (2);
1910 vect_get_peeling_costs_all_drs (loop_vinfo, first_store,
1911 &store_inside_cost,
1912 &store_outside_cost,
1913 &dummy, &dummy,
1914 estimated_npeels, true);
1915 dummy.release ();
1917 else
1919 store_inside_cost = INT_MAX;
1920 store_outside_cost = INT_MAX;
1923 if (load_inside_cost > store_inside_cost
1924 || (load_inside_cost == store_inside_cost
1925 && load_outside_cost > store_outside_cost))
1927 dr0_info = first_store;
1928 peel_for_unknown_alignment.inside_cost = store_inside_cost;
1929 peel_for_unknown_alignment.outside_cost = store_outside_cost;
1931 else
1933 peel_for_unknown_alignment.inside_cost = load_inside_cost;
1934 peel_for_unknown_alignment.outside_cost = load_outside_cost;
1937 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1938 prologue_cost_vec.create (2);
1939 epilogue_cost_vec.create (2);
1941 int dummy2;
1942 peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
1943 (loop_vinfo, estimated_npeels, &dummy2,
1944 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1945 &prologue_cost_vec, &epilogue_cost_vec);
1947 prologue_cost_vec.release ();
1948 epilogue_cost_vec.release ();
1950 peel_for_unknown_alignment.peel_info.count = 1
1951 + STMT_VINFO_SAME_ALIGN_REFS (dr0_info->stmt).length ();
1954 peel_for_unknown_alignment.peel_info.npeel = 0;
1955 peel_for_unknown_alignment.peel_info.dr_info = dr0_info;
1957 best_peel = peel_for_unknown_alignment;
1959 peel_for_known_alignment.inside_cost = INT_MAX;
1960 peel_for_known_alignment.outside_cost = INT_MAX;
1961 peel_for_known_alignment.peel_info.count = 0;
1962 peel_for_known_alignment.peel_info.dr_info = NULL;
1964 if (do_peeling && one_misalignment_known)
1966 /* Peeling is possible, but no data access requires alignment in order
1967 to be supported. So we try to choose the best possible peeling from
1968 the hash table. */
1969 peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
1970 (&peeling_htab, loop_vinfo);
1973 /* Compare costs of peeling for known and unknown alignment. */
1974 if (peel_for_known_alignment.peel_info.dr_info != NULL
1975 && peel_for_unknown_alignment.inside_cost
1976 >= peel_for_known_alignment.inside_cost)
1978 best_peel = peel_for_known_alignment;
1980 /* If the best peeling for known alignment has NPEEL == 0, perform no
1981 peeling at all except if there is an unsupportable dr that we can
1982 align. */
1983 if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
1984 do_peeling = false;
1987 /* If there is an unsupportable data ref, prefer this over all choices so far
1988 since we'd have to discard a chosen peeling except when it accidentally
1989 aligned the unsupportable data ref. */
1990 if (one_dr_unsupportable)
1991 dr0_info = unsupportable_dr_info;
1992 else if (do_peeling)
1994 /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
1995 TODO: Use nopeel_outside_cost or get rid of it? */
1996 unsigned nopeel_inside_cost = 0;
1997 unsigned nopeel_outside_cost = 0;
1999 stmt_vector_for_cost dummy;
2000 dummy.create (2);
2001 vect_get_peeling_costs_all_drs (loop_vinfo, NULL, &nopeel_inside_cost,
2002 &nopeel_outside_cost, &dummy, &dummy,
2003 0, false);
2004 dummy.release ();
2006 /* Add epilogue costs. As we do not peel for alignment here, no prologue
2007 costs will be recorded. */
2008 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2009 prologue_cost_vec.create (2);
2010 epilogue_cost_vec.create (2);
2012 int dummy2;
2013 nopeel_outside_cost += vect_get_known_peeling_cost
2014 (loop_vinfo, 0, &dummy2,
2015 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2016 &prologue_cost_vec, &epilogue_cost_vec);
2018 prologue_cost_vec.release ();
2019 epilogue_cost_vec.release ();
2021 npeel = best_peel.peel_info.npeel;
2022 dr0_info = best_peel.peel_info.dr_info;
2024 /* If not peeling at all is no more expensive than the best peeling we
2025 have found so far, don't perform any peeling.  */
2026 if (nopeel_inside_cost <= best_peel.inside_cost)
2027 do_peeling = false;
2030 if (do_peeling)
2032 stmt_vec_info stmt_info = dr0_info->stmt;
2033 vectype = STMT_VINFO_VECTYPE (stmt_info);
2035 if (known_alignment_for_access_p (dr0_info))
2037 bool negative = tree_int_cst_compare (DR_STEP (dr0_info->dr),
2038 size_zero_node) < 0;
2039 if (!npeel)
2041 /* Since it's known at compile time, compute the number of
2042 iterations in the peeled loop (the peeling factor) for use in
2043 updating DR_MISALIGNMENT values. The peeling factor is the
2044 vectorization factor minus the misalignment as an element
2045 count. */
2046 mis = (negative
2047 ? DR_MISALIGNMENT (dr0_info)
2048 : -DR_MISALIGNMENT (dr0_info));
2049 unsigned int target_align = DR_TARGET_ALIGNMENT (dr0_info);
2050 npeel = ((mis & (target_align - 1))
2051 / vect_get_scalar_dr_size (dr0_info));
2054 /* For interleaved data access every iteration accesses all the
2055 members of the group, therefore we divide the number of iterations
2056 by the group size. */
2057 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2058 npeel /= DR_GROUP_SIZE (stmt_info);
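/* For instance (values assumed): if NPEEL was computed as 6 scalar
   elements and the group has 2 members, each loop iteration accesses
   both members, so only 6 / 2 == 3 peeled iterations are needed.  */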
2060 if (dump_enabled_p ())
2061 dump_printf_loc (MSG_NOTE, vect_location,
2062 "Try peeling by %d\n", npeel);
2065 /* Ensure that all datarefs can be vectorized after the peel. */
2066 if (!vect_peeling_supportable (loop_vinfo, dr0_info, npeel))
2067 do_peeling = false;
2069 /* Check if all datarefs are supportable and log. */
2070 if (do_peeling && known_alignment_for_access_p (dr0_info) && npeel == 0)
2072 stat = vect_verify_datarefs_alignment (loop_vinfo);
2073 if (!stat)
2074 do_peeling = false;
2075 else
2076 return stat;
2079 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
2080 if (do_peeling)
2082 unsigned max_allowed_peel
2083 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
2084 if (max_allowed_peel != (unsigned)-1)
2086 unsigned max_peel = npeel;
2087 if (max_peel == 0)
2089 unsigned int target_align = DR_TARGET_ALIGNMENT (dr0_info);
2090 max_peel = (target_align
2091 / vect_get_scalar_dr_size (dr0_info) - 1);
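/* As an assumed example: with a 16-byte TARGET_ALIGN and 4-byte
   scalar elements, an unknown NPEEL means a worst case of
   16 / 4 - 1 == 3 peeled iterations, which is what gets compared
   against the --param vect-max-peeling-for-alignment limit below.  */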
2093 if (max_peel > max_allowed_peel)
2095 do_peeling = false;
2096 if (dump_enabled_p ())
2097 dump_printf_loc (MSG_NOTE, vect_location,
2098 "Disable peeling, max peels reached: %d\n", max_peel);
2103 /* Cost model #2 - if peeling may result in a remaining loop not
2104 iterating enough to be vectorized then do not peel. Since this
2105 is a cost heuristic rather than a correctness decision, use the
2106 most likely runtime value for variable vectorization factors. */
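/* As an assumed example: with a cost-model VF of 4 and NPEEL still
   unknown (worst case 3 peeled iterations), a loop known to run
   fewer than 4 + 3 == 7 iterations is not worth peeling.  */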
2107 if (do_peeling
2108 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2110 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2111 unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
2112 if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
2113 < assumed_vf + max_peel)
2114 do_peeling = false;
2117 if (do_peeling)
2119 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2120 If the misalignment of DR_i is identical to that of dr0 then set
2121 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
2122 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2123 by the peeling factor times the element size of DR_i (MOD the
2124 vectorization factor times the size). Otherwise, the
2125 misalignment of DR_i must be set to unknown. */
2126 FOR_EACH_VEC_ELT (datarefs, i, dr)
2127 if (dr != dr0_info->dr)
2129 /* Strided accesses perform only component accesses, alignment
2130 is irrelevant for them. */
2131 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2132 stmt_info = dr_info->stmt;
2133 if (STMT_VINFO_STRIDED_P (stmt_info)
2134 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2135 continue;
2137 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
2140 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0_info;
2141 if (npeel)
2142 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2143 else
2144 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2145 = DR_MISALIGNMENT (dr0_info);
2146 SET_DR_MISALIGNMENT (dr0_info, 0);
2147 if (dump_enabled_p ())
2149 dump_printf_loc (MSG_NOTE, vect_location,
2150 "Alignment of access forced using peeling.\n");
2151 dump_printf_loc (MSG_NOTE, vect_location,
2152 "Peeling for alignment will be applied.\n");
2155 /* The inside-loop cost will be accounted for in vectorizable_load
2156 and vectorizable_store correctly with adjusted alignments.
2157 Drop the body_cst_vec on the floor here. */
2158 stat = vect_verify_datarefs_alignment (loop_vinfo);
2159 gcc_assert (stat);
2160 return stat;
2164 /* (2) Versioning to force alignment. */
2166 /* Try versioning if:
2167 1) optimize loop for speed
2168 2) there is at least one unsupported misaligned data ref with an unknown
2169 misalignment, and
2170 3) all misaligned data refs with a known misalignment are supported, and
2171 4) the number of runtime alignment checks is within reason. */
2173 do_versioning =
2174 optimize_loop_nest_for_speed_p (loop)
2175 && (!loop->inner); /* FORNOW */
2177 if (do_versioning)
2179 FOR_EACH_VEC_ELT (datarefs, i, dr)
2181 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2182 stmt_vec_info stmt_info = dr_info->stmt;
2184 /* For interleaving, only the alignment of the first access
2185 matters. */
2186 if (aligned_access_p (dr_info)
2187 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2188 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info))
2189 continue;
2191 if (STMT_VINFO_STRIDED_P (stmt_info))
2193 /* Strided loads perform only component accesses, alignment is
2194 irrelevant for them. */
2195 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
2196 continue;
2197 do_versioning = false;
2198 break;
2201 supportable_dr_alignment
2202 = vect_supportable_dr_alignment (dr_info, false);
2204 if (!supportable_dr_alignment)
2206 int mask;
2207 tree vectype;
2209 if (known_alignment_for_access_p (dr_info)
2210 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2211 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
2213 do_versioning = false;
2214 break;
2217 vectype = STMT_VINFO_VECTYPE (stmt_info);
2218 gcc_assert (vectype);
2220 /* At present we don't support versioning for alignment
2221 with variable VF, since there's no guarantee that the
2222 VF is a power of two. We could relax this if we added
2223 a way of enforcing a power-of-two size. */
2224 unsigned HOST_WIDE_INT size;
2225 if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
2227 do_versioning = false;
2228 break;
2231 /* The rightmost bits of an aligned address must be zeros.
2232 Construct the mask needed for this test. For example,
2233 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2234 mask must be 15 = 0xf. */
2235 mask = size - 1;
2237 /* FORNOW: use the same mask to test all potentially unaligned
2238 references in the loop. The vectorizer currently supports
2239 a single vector size, see the reference to
2240 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
2241 vectorization factor is computed. */
2242 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
2243 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
2244 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2245 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (stmt_info);
2249 /* Versioning requires at least one misaligned data reference. */
2250 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2251 do_versioning = false;
2252 else if (!do_versioning)
2253 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2256 if (do_versioning)
2258 vec<stmt_vec_info> may_misalign_stmts
2259 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2260 stmt_vec_info stmt_info;
2262 /* It can now be assumed that the data references in the statements
2263 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2264 of the loop being vectorized. */
2265 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
2267 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
2268 SET_DR_MISALIGNMENT (dr_info, 0);
2269 if (dump_enabled_p ())
2270 dump_printf_loc (MSG_NOTE, vect_location,
2271 "Alignment of access forced using versioning.\n");
2274 if (dump_enabled_p ())
2275 dump_printf_loc (MSG_NOTE, vect_location,
2276 "Versioning for alignment will be applied.\n");
2278 /* Peeling and versioning can't be done together at this time. */
2279 gcc_assert (! (do_peeling && do_versioning));
2281 stat = vect_verify_datarefs_alignment (loop_vinfo);
2282 gcc_assert (stat);
2283 return stat;
2286 /* This point is reached if neither peeling nor versioning is being done. */
2287 gcc_assert (! (do_peeling || do_versioning));
2289 stat = vect_verify_datarefs_alignment (loop_vinfo);
2290 return stat;
2294 /* Function vect_find_same_alignment_drs.
2296 Update group and alignment relations in VINFO according to the chosen
2297 vectorization factor. */
2299 static void
2300 vect_find_same_alignment_drs (vec_info *vinfo, data_dependence_relation *ddr)
2302 struct data_reference *dra = DDR_A (ddr);
2303 struct data_reference *drb = DDR_B (ddr);
2304 dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
2305 dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
2306 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
2307 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
2309 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
2310 return;
2312 if (dra == drb)
2313 return;
2315 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
2316 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
2317 return;
2319 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)
2320 || !operand_equal_p (DR_OFFSET (dra), DR_OFFSET (drb), 0)
2321 || !operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2322 return;
2324 /* Two references with distance zero have the same alignment. */
2325 poly_offset_int diff = (wi::to_poly_offset (DR_INIT (dra))
2326 - wi::to_poly_offset (DR_INIT (drb)));
2327 if (maybe_ne (diff, 0))
2329 /* Get the wider of the two alignments. */
2330 unsigned int align_a = (vect_calculate_target_alignment (dr_info_a)
2331 / BITS_PER_UNIT);
2332 unsigned int align_b = (vect_calculate_target_alignment (dr_info_b)
2333 / BITS_PER_UNIT);
2334 unsigned int max_align = MAX (align_a, align_b);
2336 /* Require the gap to be a multiple of the larger vector alignment. */
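/* For example (values assumed): two accesses whose DR_INITs differ
   by 32 bytes, each with a 16-byte target alignment, satisfy this
   test; a 20-byte gap would not, since 20 is not a multiple of 16.  */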
2337 if (!multiple_p (diff, max_align))
2338 return;
2341 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2342 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
2343 if (dump_enabled_p ())
2345 dump_printf_loc (MSG_NOTE, vect_location,
2346 "accesses have the same alignment: ");
2347 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2348 dump_printf (MSG_NOTE, " and ");
2349 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2350 dump_printf (MSG_NOTE, "\n");
2355 /* Function vect_analyze_data_refs_alignment
2357 Analyze the alignment of the data-references in the loop.
2358 Return FALSE if a data reference is found that cannot be vectorized. */
2360 bool
2361 vect_analyze_data_refs_alignment (loop_vec_info vinfo)
2363 DUMP_VECT_SCOPE ("vect_analyze_data_refs_alignment");
2365 /* Mark groups of data references with same alignment using
2366 data dependence information. */
2367 vec<ddr_p> ddrs = vinfo->shared->ddrs;
2368 struct data_dependence_relation *ddr;
2369 unsigned int i;
2371 FOR_EACH_VEC_ELT (ddrs, i, ddr)
2372 vect_find_same_alignment_drs (vinfo, ddr);
2374 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
2375 struct data_reference *dr;
2377 vect_record_base_alignments (vinfo);
2378 FOR_EACH_VEC_ELT (datarefs, i, dr)
2380 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
2381 if (STMT_VINFO_VECTORIZABLE (dr_info->stmt))
2382 vect_compute_data_ref_alignment (dr_info);
2385 return true;
2389 /* Analyze alignment of DRs of stmts in NODE. */
2391 static bool
2392 vect_slp_analyze_and_verify_node_alignment (slp_tree node)
2394 /* We vectorize from the first scalar stmt in the node unless
2395 the node is permuted, in which case we start from the first
2396 element in the group. */
2397 stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2398 dr_vec_info *first_dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2399 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2400 first_stmt_info = DR_GROUP_FIRST_ELEMENT (first_stmt_info);
2402 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2403 vect_compute_data_ref_alignment (dr_info);
2404 /* For creating the data-ref pointer we need alignment of the
2405 first element anyway. */
2406 if (dr_info != first_dr_info)
2407 vect_compute_data_ref_alignment (first_dr_info);
2408 if (! verify_data_ref_alignment (dr_info))
2410 if (dump_enabled_p ())
2411 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2412 "not vectorized: bad data alignment in basic "
2413 "block.\n");
2414 return false;
2417 return true;
2420 /* Function vect_slp_analyze_and_verify_instance_alignment
2422 Analyze the alignment of the data-references in the SLP instance.
2423 Return FALSE if a data reference is found that cannot be vectorized. */
2425 bool
2426 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
2428 DUMP_VECT_SCOPE ("vect_slp_analyze_and_verify_instance_alignment");
2430 slp_tree node;
2431 unsigned i;
2432 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2433 if (! vect_slp_analyze_and_verify_node_alignment (node))
2434 return false;
2436 node = SLP_INSTANCE_TREE (instance);
2437 if (STMT_VINFO_DATA_REF (SLP_TREE_SCALAR_STMTS (node)[0])
2438 && ! vect_slp_analyze_and_verify_node_alignment
2439 (SLP_INSTANCE_TREE (instance)))
2440 return false;
2442 return true;
2446 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2447 accesses of legal size, step, etc. Detect gaps, single element
2448 interleaving, and other special cases. Set grouped access info.
2449 Collect groups of strided stores for further use in SLP analysis.
2450 Worker for vect_analyze_group_access. */
2452 static bool
2453 vect_analyze_group_access_1 (dr_vec_info *dr_info)
2455 data_reference *dr = dr_info->dr;
2456 tree step = DR_STEP (dr);
2457 tree scalar_type = TREE_TYPE (DR_REF (dr));
2458 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2459 stmt_vec_info stmt_info = dr_info->stmt;
2460 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2461 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2462 HOST_WIDE_INT dr_step = -1;
2463 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2464 bool slp_impossible = false;
2466 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2467 size of the interleaving group (including gaps). */
2468 if (tree_fits_shwi_p (step))
2470 dr_step = tree_to_shwi (step);
2471 /* Check that STEP is a multiple of type size. Otherwise there is
2472 a non-element-sized gap at the end of the group which we
2473 cannot represent in DR_GROUP_GAP or DR_GROUP_SIZE.
2474 ??? As we can handle non-constant step fine here we should
2475 simply remove uses of DR_GROUP_GAP between the last and first
2476 element and instead rely on DR_STEP. DR_GROUP_SIZE then would
2477 simply not include that gap. */
2478 if ((dr_step % type_size) != 0)
2480 if (dump_enabled_p ())
2482 dump_printf_loc (MSG_NOTE, vect_location,
2483 "Step ");
2484 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2485 dump_printf (MSG_NOTE,
2486 " is not a multiple of the element size for ");
2487 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2488 dump_printf (MSG_NOTE, "\n");
2490 return false;
2492 groupsize = absu_hwi (dr_step) / type_size;
2494 else
2495 groupsize = 0;
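/* E.g. (assumed values): a constant DR_STEP of 32 bytes over 4-byte
   elements gives GROUPSIZE == 8, i.e. each scalar iteration strides
   over eight element slots, gaps included.  */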
2497 /* A non-consecutive access is possible only if it is part of an interleaving group.  */
2498 if (!DR_GROUP_FIRST_ELEMENT (stmt_info))
2500 /* Check if this DR is part of an interleaving group, and is a single
2501 element of the group that is accessed in the loop. */
2503 /* Gaps are supported only for loads. STEP must be a multiple of the type
2504 size. */
2505 if (DR_IS_READ (dr)
2506 && (dr_step % type_size) == 0
2507 && groupsize > 0)
2509 DR_GROUP_FIRST_ELEMENT (stmt_info) = stmt_info;
2510 DR_GROUP_SIZE (stmt_info) = groupsize;
2511 DR_GROUP_GAP (stmt_info) = groupsize - 1;
2512 if (dump_enabled_p ())
2514 dump_printf_loc (MSG_NOTE, vect_location,
2515 "Detected single element interleaving ");
2516 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2517 dump_printf (MSG_NOTE, " step ");
2518 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2519 dump_printf (MSG_NOTE, "\n");
2522 return true;
2525 if (dump_enabled_p ())
2527 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2528 "not consecutive access ");
2529 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
2530 stmt_info->stmt, 0);
2533 if (bb_vinfo)
2535 /* Mark the statement as unvectorizable. */
2536 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
2537 return true;
2540 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2541 STMT_VINFO_STRIDED_P (stmt_info) = true;
2542 return true;
2545 if (DR_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info)
2547 /* First stmt in the interleaving chain. Check the chain. */
2548 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2549 struct data_reference *data_ref = dr;
2550 unsigned int count = 1;
2551 tree prev_init = DR_INIT (data_ref);
2552 stmt_vec_info prev = stmt_info;
2553 HOST_WIDE_INT diff, gaps = 0;
2555 /* By construction, all group members have INTEGER_CST DR_INITs. */
2556 while (next)
2558 /* Skip same data-refs. In case two or more stmts share a
2559 data-ref (supported only for loads), we vectorize only the first
2560 stmt, and the rest get their vectorized loads from the first
2561 one. */
2562 if (!tree_int_cst_compare (DR_INIT (data_ref),
2563 DR_INIT (STMT_VINFO_DATA_REF (next))))
2565 if (DR_IS_WRITE (data_ref))
2567 if (dump_enabled_p ())
2568 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2569 "Two store stmts share the same dr.\n");
2570 return false;
2573 if (dump_enabled_p ())
2574 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2575 "Two or more load stmts share the same dr.\n");
2577 /* For load use the same data-ref load. */
2578 DR_GROUP_SAME_DR_STMT (next) = prev;
2580 prev = next;
2581 next = DR_GROUP_NEXT_ELEMENT (next);
2582 continue;
2585 prev = next;
2586 data_ref = STMT_VINFO_DATA_REF (next);
2588 /* All group members have the same STEP by construction. */
2589 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2591 /* Check that the distance between two accesses is equal to the type
2592 size. Otherwise, we have gaps. */
2593 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2594 - TREE_INT_CST_LOW (prev_init)) / type_size;
2595 if (diff != 1)
2597 /* FORNOW: SLP of accesses with gaps is not supported. */
2598 slp_impossible = true;
2599 if (DR_IS_WRITE (data_ref))
2601 if (dump_enabled_p ())
2602 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2603 "interleaved store with gaps\n");
2604 return false;
2607 gaps += diff - 1;
2610 last_accessed_element += diff;
2612 /* Store the gap from the previous member of the group. If there is no
2613 gap in the access, DR_GROUP_GAP is always 1. */
2614 DR_GROUP_GAP (next) = diff;
2616 prev_init = DR_INIT (data_ref);
2617 next = DR_GROUP_NEXT_ELEMENT (next);
2618 /* Count the number of data-refs in the chain. */
2619 count++;
2622 if (groupsize == 0)
2623 groupsize = count + gaps;
2625 /* This could be UINT_MAX but as we are generating code in a very
2626 inefficient way we have to cap earlier. See PR78699 for example. */
2627 if (groupsize > 4096)
2629 if (dump_enabled_p ())
2630 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2631 "group is too large\n");
2632 return false;
2635 /* Check that the size of the interleaving is equal to count for stores,
2636 i.e., that there are no gaps. */
2637 if (groupsize != count
2638 && !DR_IS_READ (dr))
2640 if (dump_enabled_p ())
2641 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2642 "interleaved store with gaps\n");
2643 return false;
2646 /* If there is a gap after the last load in the group it is the
2647 difference between the groupsize and the last accessed
2648 element.
2649 When there is no gap, this difference should be 0. */
2650 DR_GROUP_GAP (stmt_info) = groupsize - last_accessed_element;
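/* Sketch with assumed numbers: a load group with GROUPSIZE 4 whose
   last accessed element is 3 gets a trailing DR_GROUP_GAP of
   4 - 3 == 1.  */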
2652 DR_GROUP_SIZE (stmt_info) = groupsize;
2653 if (dump_enabled_p ())
2655 dump_printf_loc (MSG_NOTE, vect_location,
2656 "Detected interleaving ");
2657 if (DR_IS_READ (dr))
2658 dump_printf (MSG_NOTE, "load ");
2659 else
2660 dump_printf (MSG_NOTE, "store ");
2661 dump_printf (MSG_NOTE, "of size %u starting with ",
2662 (unsigned)groupsize);
2663 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt_info->stmt, 0);
2664 if (DR_GROUP_GAP (stmt_info) != 0)
2665 dump_printf_loc (MSG_NOTE, vect_location,
2666 "There is a gap of %u elements after the group\n",
2667 DR_GROUP_GAP (stmt_info));
2670 /* SLP: create an SLP data structure for every interleaving group of
2671 stores for further analysis in vect_analyze_slp.  */
2672 if (DR_IS_WRITE (dr) && !slp_impossible)
2674 if (loop_vinfo)
2675 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt_info);
2676 if (bb_vinfo)
2677 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
2681 return true;
2684 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2685 accesses of legal size, step, etc. Detect gaps, single element
2686 interleaving, and other special cases. Set grouped access info.
2687 Collect groups of strided stores for further use in SLP analysis. */
2689 static bool
2690 vect_analyze_group_access (dr_vec_info *dr_info)
2692 if (!vect_analyze_group_access_1 (dr_info))
2694 /* Dissolve the group if present. */
2695 stmt_vec_info stmt_info = DR_GROUP_FIRST_ELEMENT (dr_info->stmt);
2696 while (stmt_info)
2698 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2699 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2700 DR_GROUP_NEXT_ELEMENT (stmt_info) = NULL;
2701 stmt_info = next;
2703 return false;
2705 return true;
2708 /* Analyze the access pattern of the data-reference DR_INFO.
2709 In case of non-consecutive accesses call vect_analyze_group_access() to
2710 analyze groups of accesses. */
2712 static bool
2713 vect_analyze_data_ref_access (dr_vec_info *dr_info)
2715 data_reference *dr = dr_info->dr;
2716 tree step = DR_STEP (dr);
2717 tree scalar_type = TREE_TYPE (DR_REF (dr));
2718 stmt_vec_info stmt_info = dr_info->stmt;
2719 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2720 struct loop *loop = NULL;
2722 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
2723 return true;
2725 if (loop_vinfo)
2726 loop = LOOP_VINFO_LOOP (loop_vinfo);
2728 if (loop_vinfo && !step)
2730 if (dump_enabled_p ())
2731 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2732 "bad data-ref access in loop\n");
2733 return false;
2736 /* Allow loads with zero step in inner-loop vectorization. */
2737 if (loop_vinfo && integer_zerop (step))
2739 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2740 if (!nested_in_vect_loop_p (loop, stmt_info))
2741 return DR_IS_READ (dr);
2742 /* Allow references with zero step for outer loops marked
2743 with pragma omp simd only - it guarantees absence of
2744 loop-carried dependencies between inner loop iterations. */
2745 if (loop->safelen < 2)
2747 if (dump_enabled_p ())
2748 dump_printf_loc (MSG_NOTE, vect_location,
2749 "zero step in inner loop of nest\n");
2750 return false;
2754 if (loop && nested_in_vect_loop_p (loop, stmt_info))
2756 /* Interleaved accesses are not yet supported within outer-loop
2757 vectorization for references in the inner-loop. */
2758 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2760 /* For the rest of the analysis we use the outer-loop step. */
2761 step = STMT_VINFO_DR_STEP (stmt_info);
2762 if (integer_zerop (step))
2764 if (dump_enabled_p ())
2765 dump_printf_loc (MSG_NOTE, vect_location,
2766 "zero step in outer loop.\n");
2767 return DR_IS_READ (dr);
2771 /* Consecutive? */
2772 if (TREE_CODE (step) == INTEGER_CST)
2774 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2775 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2776 || (dr_step < 0
2777 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2779 /* Mark that it is not interleaving. */
2780 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2781 return true;
2785 if (loop && nested_in_vect_loop_p (loop, stmt_info))
2787 if (dump_enabled_p ())
2788 dump_printf_loc (MSG_NOTE, vect_location,
2789 "grouped access in outer loop.\n");
2790 return false;
2794 /* Assume this is a DR handled by non-constant strided load case. */
2795 if (TREE_CODE (step) != INTEGER_CST)
2796 return (STMT_VINFO_STRIDED_P (stmt_info)
2797 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2798 || vect_analyze_group_access (dr_info)));
2800 /* Non-consecutive access - check if it is part of an interleaving group.  */
2801 return vect_analyze_group_access (dr_info);
2804 /* Compare two data-references DRA and DRB, so that they can be sorted
2805 into chunks suitable for grouping.  */
2807 static int
2808 dr_group_sort_cmp (const void *dra_, const void *drb_)
2810 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2811 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2812 int cmp;
2814 /* Stabilize sort. */
2815 if (dra == drb)
2816 return 0;
2818 /* DRs in different loops never belong to the same group. */
2819 loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father;
2820 loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father;
2821 if (loopa != loopb)
2822 return loopa->num < loopb->num ? -1 : 1;
2824 /* Ordering of DRs according to base. */
2825 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2826 DR_BASE_ADDRESS (drb));
2827 if (cmp != 0)
2828 return cmp;
2830 /* And according to DR_OFFSET. */
2831 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2832 if (cmp != 0)
2833 return cmp;
2835 /* Put reads before writes. */
2836 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2837 return DR_IS_READ (dra) ? -1 : 1;
2839 /* Then sort by access size. */
2840 cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2841 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2842 if (cmp != 0)
2843 return cmp;
2845 /* And by step. */
2846 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2847 if (cmp != 0)
2848 return cmp;
2850 /* Then sort by DR_INIT. In case of identical DRs sort by stmt UID. */
2851 cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
2852 if (cmp == 0)
2853 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2854 return cmp;
2857 /* If OP is the result of a conversion, return the unconverted value,
2858 otherwise return null. */
2860 static tree
2861 strip_conversion (tree op)
2863 if (TREE_CODE (op) != SSA_NAME)
2864 return NULL_TREE;
2865 gimple *stmt = SSA_NAME_DEF_STMT (op);
2866 if (!is_gimple_assign (stmt)
2867 || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt)))
2868 return NULL_TREE;
2869 return gimple_assign_rhs1 (stmt);
2872 /* Return true if vectorizable_* routines can handle statements STMT1_INFO
2873 and STMT2_INFO being in a single group. */
2875 static bool
2876 can_group_stmts_p (stmt_vec_info stmt1_info, stmt_vec_info stmt2_info)
2878 if (gimple_assign_single_p (stmt1_info->stmt))
2879 return gimple_assign_single_p (stmt2_info->stmt);
2881 gcall *call1 = dyn_cast <gcall *> (stmt1_info->stmt);
2882 if (call1 && gimple_call_internal_p (call1))
2884 /* Check for two masked loads or two masked stores. */
2885 gcall *call2 = dyn_cast <gcall *> (stmt2_info->stmt);
2886 if (!call2 || !gimple_call_internal_p (call2))
2887 return false;
2888 internal_fn ifn = gimple_call_internal_fn (call1);
2889 if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE)
2890 return false;
2891 if (ifn != gimple_call_internal_fn (call2))
2892 return false;
2894 /* Check that the masks are the same. Cope with casts of masks,
2895 like those created by build_mask_conversion. */
2896 tree mask1 = gimple_call_arg (call1, 2);
2897 tree mask2 = gimple_call_arg (call2, 2);
2898 if (!operand_equal_p (mask1, mask2, 0))
2900 mask1 = strip_conversion (mask1);
2901 if (!mask1)
2902 return false;
2903 mask2 = strip_conversion (mask2);
2904 if (!mask2)
2905 return false;
2906 if (!operand_equal_p (mask1, mask2, 0))
2907 return false;
2909 return true;
2912 return false;
2915 /* Function vect_analyze_data_ref_accesses.
2917 Analyze the access pattern of all the data references in the loop.
2919 FORNOW: the only access pattern that is considered vectorizable is a
2920 simple step 1 (consecutive) access.
2922 FORNOW: handle only arrays and pointer accesses. */
2924 bool
2925 vect_analyze_data_ref_accesses (vec_info *vinfo)
2927 unsigned int i;
2928 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
2929 struct data_reference *dr;
2931 DUMP_VECT_SCOPE ("vect_analyze_data_ref_accesses");
2933 if (datarefs.is_empty ())
2934 return true;
2936 /* Sort the array of datarefs to make building the interleaving chains
2937 linear. Don't modify the original vector's order; it is needed for
2938 determining what dependencies are reversed. */
2939 vec<data_reference_p> datarefs_copy = datarefs.copy ();
2940 datarefs_copy.qsort (dr_group_sort_cmp);
2942 /* Build the interleaving chains. */
2943 for (i = 0; i < datarefs_copy.length () - 1;)
2945 data_reference_p dra = datarefs_copy[i];
2946 dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
2947 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
2948 stmt_vec_info lastinfo = NULL;
2949 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
2950 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a))
2952 ++i;
2953 continue;
2955 for (i = i + 1; i < datarefs_copy.length (); ++i)
2957 data_reference_p drb = datarefs_copy[i];
2958 dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
2959 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
2960 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b)
2961 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
2962 break;
2964 /* ??? Imperfect sorting (non-compatible types, non-modulo
2965 accesses, same accesses) can lead to a group being artificially
2966 split here as we don't just skip over those. If it really
2967 matters we can push those to a worklist and re-iterate
2968 over them. Then we can just skip ahead to the next DR here. */
2970 /* DRs in different loops should not be put into the same
2971 interleaving group. */
2972 if (gimple_bb (DR_STMT (dra))->loop_father
2973 != gimple_bb (DR_STMT (drb))->loop_father)
2974 break;
2976 /* Check that the data-refs have the same first location (except init)
2977 and that they are both either stores or loads (not a load and a store,
2978 and not incompatible masked loads or stores).  */
2979 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2980 || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2981 DR_BASE_ADDRESS (drb)) != 0
2982 || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
2983 || !can_group_stmts_p (stmtinfo_a, stmtinfo_b))
2984 break;
2986 /* Check that the data-refs have the same constant size. */
2987 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2988 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2989 if (!tree_fits_uhwi_p (sza)
2990 || !tree_fits_uhwi_p (szb)
2991 || !tree_int_cst_equal (sza, szb))
2992 break;
2994 /* Check that the data-refs have the same step. */
2995 if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
2996 break;
2998 /* Check the types are compatible.
2999 ??? We don't distinguish this during sorting. */
3000 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
3001 TREE_TYPE (DR_REF (drb))))
3002 break;
3004 /* Check that the DR_INITs are compile-time constants. */
3005 if (TREE_CODE (DR_INIT (dra)) != INTEGER_CST
3006 || TREE_CODE (DR_INIT (drb)) != INTEGER_CST)
3007 break;
3009 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
3010 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
3011 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
3012 HOST_WIDE_INT init_prev
3013 = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]));
3014 gcc_assert (init_a <= init_b
3015 && init_a <= init_prev
3016 && init_prev <= init_b);
3018 /* Do not place the same access in the interleaving chain twice. */
3019 if (init_b == init_prev)
3021 gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]))
3022 < gimple_uid (DR_STMT (drb)));
3023 /* ??? For now we simply "drop" the later reference which is
3024 otherwise the same rather than finishing off this group.
3025 In the end we'd want to re-process duplicates forming
3026 multiple groups from the refs, likely by just collecting
3027 all candidates (including duplicates and split points
3028 below) in a vector and then processing them together.  */
3029 continue;
3032 /* If init_b == init_a + the size of the type * k, we have an
3033 interleaving, and DRA is accessed before DRB. */
3034 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
3035 if (type_size_a == 0
3036 || (init_b - init_a) % type_size_a != 0)
3037 break;
3039 /* If we have a store, the accesses are adjacent. This splits
3040 groups into chunks we support (we don't support vectorization
3041 of stores with gaps). */
3042 if (!DR_IS_READ (dra) && init_b - init_prev != type_size_a)
3043 break;
3045 /* Requiring the step (when it is constant and nonzero) to be greater
3046 than the difference between the data-refs' inits splits groups into
3047 suitable sizes.  */
3048 if (tree_fits_shwi_p (DR_STEP (dra)))
3050 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
3051 if (step != 0 && step <= (init_b - init_a))
3052 break;
3055 if (dump_enabled_p ())
3057 dump_printf_loc (MSG_NOTE, vect_location,
3058 "Detected interleaving ");
3059 if (DR_IS_READ (dra))
3060 dump_printf (MSG_NOTE, "load ");
3061 else
3062 dump_printf (MSG_NOTE, "store ");
3063 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
3064 dump_printf (MSG_NOTE, " and ");
3065 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
3066 dump_printf (MSG_NOTE, "\n");
3069 /* Link the found element into the group list. */
3070 if (!DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3072 DR_GROUP_FIRST_ELEMENT (stmtinfo_a) = stmtinfo_a;
3073 lastinfo = stmtinfo_a;
3075 DR_GROUP_FIRST_ELEMENT (stmtinfo_b) = stmtinfo_a;
3076 DR_GROUP_NEXT_ELEMENT (lastinfo) = stmtinfo_b;
3077 lastinfo = stmtinfo_b;
3081 FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
3083 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
3084 if (STMT_VINFO_VECTORIZABLE (dr_info->stmt)
3085 && !vect_analyze_data_ref_access (dr_info))
3087 if (dump_enabled_p ())
3088 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3089 "not vectorized: complicated access pattern.\n");
3091 if (is_a <bb_vec_info> (vinfo))
3093 /* Mark the statement as not vectorizable. */
3094 STMT_VINFO_VECTORIZABLE (dr_info->stmt) = false;
3095 continue;
3097 else
3099 datarefs_copy.release ();
3100 return false;
3105 datarefs_copy.release ();
3106 return true;
3109 /* Function vect_vfa_segment_size.
3111 Input:
3112 DR_INFO: The data reference.
3113 LENGTH_FACTOR: segment length to consider.
3115 Return a value suitable for the dr_with_seg_len::seg_len field.
3116 This is the "distance travelled" by the pointer from the first
3117 iteration in the segment to the last. Note that it does not include
3118 the size of the access; in effect it only describes the first byte. */
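/* A minimal worked example (numbers assumed): for a 4-byte DR_STEP
   and a LENGTH_FACTOR of 8, the segment length is
   4 * (8 - 1) == 28 bytes -- the distance from the first to the last
   scalar access, excluding the access size itself.  */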
3120 static tree
3121 vect_vfa_segment_size (dr_vec_info *dr_info, tree length_factor)
3123 length_factor = size_binop (MINUS_EXPR,
3124 fold_convert (sizetype, length_factor),
3125 size_one_node);
3126 return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr_info->dr)),
3127 length_factor);
3130 /* Return a value that, when added to abs (vect_vfa_segment_size (DR_INFO)),
3131 gives the worst-case number of bytes covered by the segment. */
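/* Illustration (assumed values): a 4-byte scalar reference leading a
   group with DR_GROUP_SIZE 4 and DR_GROUP_GAP 1 yields an access size
   of 4 * (4 - 1) == 12 bytes, before the realignment adjustment made
   below for dr_explicit_realign_optimized.  */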
3133 static unsigned HOST_WIDE_INT
3134 vect_vfa_access_size (dr_vec_info *dr_info)
3136 stmt_vec_info stmt_vinfo = dr_info->stmt;
3137 tree ref_type = TREE_TYPE (DR_REF (dr_info->dr));
3138 unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type));
3139 unsigned HOST_WIDE_INT access_size = ref_size;
3140 if (DR_GROUP_FIRST_ELEMENT (stmt_vinfo))
3142 gcc_assert (DR_GROUP_FIRST_ELEMENT (stmt_vinfo) == stmt_vinfo);
3143 access_size *= DR_GROUP_SIZE (stmt_vinfo) - DR_GROUP_GAP (stmt_vinfo);
3145 if (STMT_VINFO_VEC_STMT (stmt_vinfo)
3146 && (vect_supportable_dr_alignment (dr_info, false)
3147 == dr_explicit_realign_optimized))
3149 /* We might access a full vector's worth. */
3150 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
3151 access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size;
3153 return access_size;
3156 /* Get the minimum alignment for all the scalar accesses that DR_INFO
3157 describes. */
3159 static unsigned int
3160 vect_vfa_align (dr_vec_info *dr_info)
3162 return TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr_info->dr)));
3165 /* Function vect_compile_time_alias.
3167 Given data references A and B with equal base and offset, see whether
3168 the alias relation can be decided at compilation time. Return 1 if
3169 it can and the references alias, 0 if it can and the references do
3170 not alias, and -1 if we cannot decide at compile time. SEGMENT_LENGTH_A,
3171 SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
3172 of dr_with_seg_len::{seg_len,access_size} for A and B. */
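/* Worked example with assumed values: forward steps, DR_INITs of 0 and
   16, segment lengths of 12 bytes and access sizes of 4 bytes give the
   byte ranges [0, 16) and [16, 32), so the result is 0 (no alias);
   DR_INITs of 0 and 8 would instead give overlapping ranges and a
   result of 1.  */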
3174 static int
3175 vect_compile_time_alias (dr_vec_info *a, dr_vec_info *b,
3176 tree segment_length_a, tree segment_length_b,
3177 unsigned HOST_WIDE_INT access_size_a,
3178 unsigned HOST_WIDE_INT access_size_b)
3180 poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a->dr));
3181 poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b->dr));
3182 poly_uint64 const_length_a;
3183 poly_uint64 const_length_b;
3185 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3186 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3187 [a, a+12) */
3188 if (tree_int_cst_compare (DR_STEP (a->dr), size_zero_node) < 0)
3190 const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
3191 offset_a = (offset_a + access_size_a) - const_length_a;
3193 else
3194 const_length_a = tree_to_poly_uint64 (segment_length_a);
3195 if (tree_int_cst_compare (DR_STEP (b->dr), size_zero_node) < 0)
3197 const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
3198 offset_b = (offset_b + access_size_b) - const_length_b;
3200 else
3201 const_length_b = tree_to_poly_uint64 (segment_length_b);
3203 const_length_a += access_size_a;
3204 const_length_b += access_size_b;
3206 if (ranges_known_overlap_p (offset_a, const_length_a,
3207 offset_b, const_length_b))
3208 return 1;
3210 if (!ranges_maybe_overlap_p (offset_a, const_length_a,
3211 offset_b, const_length_b))
3212 return 0;
3214 return -1;
3217 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3218 in DDR is >= VF. */
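/* For instance (assumed numbers): with VF 4, a distance-vector entry
   of 5 at LOOP_DEPTH passes the test below, while an entry of 3
   (with the dependence not reversed) makes the function return
   false.  */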
3220 static bool
3221 dependence_distance_ge_vf (data_dependence_relation *ddr,
3222 unsigned int loop_depth, poly_uint64 vf)
3224 if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3225 || DDR_NUM_DIST_VECTS (ddr) == 0)
3226 return false;
3228 /* If the dependence is exact, we should have limited the VF instead. */
3229 gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3231 unsigned int i;
3232 lambda_vector dist_v;
3233 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3235 HOST_WIDE_INT dist = dist_v[loop_depth];
3236 if (dist != 0
3237 && !(dist > 0 && DDR_REVERSED_P (ddr))
3238 && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
3239 return false;
3242 if (dump_enabled_p ())
3244 dump_printf_loc (MSG_NOTE, vect_location,
3245 "dependence distance between ");
3246 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
3247 dump_printf (MSG_NOTE, " and ");
3248 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
3249 dump_printf (MSG_NOTE, " is >= VF\n");
3252 return true;
3255 /* Dump LOWER_BOUND using flags DUMP_KIND. Dumps are known to be enabled. */
3257 static void
3258 dump_lower_bound (dump_flags_t dump_kind, const vec_lower_bound &lower_bound)
3260 dump_printf (dump_kind, "%s (", lower_bound.unsigned_p ? "unsigned" : "abs");
3261 dump_generic_expr (dump_kind, TDF_SLIM, lower_bound.expr);
3262 dump_printf (dump_kind, ") >= ");
3263 dump_dec (dump_kind, lower_bound.min_value);
3266 /* Record that the vectorized loop requires the vec_lower_bound described
3267 by EXPR, UNSIGNED_P and MIN_VALUE. */
3269 static void
3270 vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p,
3271 poly_uint64 min_value)
3273 vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3274 for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3275 if (operand_equal_p (lower_bounds[i].expr, expr, 0))
3277 unsigned_p &= lower_bounds[i].unsigned_p;
3278 min_value = upper_bound (lower_bounds[i].min_value, min_value);
3279 if (lower_bounds[i].unsigned_p != unsigned_p
3280 || maybe_lt (lower_bounds[i].min_value, min_value))
3282 lower_bounds[i].unsigned_p = unsigned_p;
3283 lower_bounds[i].min_value = min_value;
3284 if (dump_enabled_p ())
3286 dump_printf_loc (MSG_NOTE, vect_location,
3287 "updating run-time check to ");
3288 dump_lower_bound (MSG_NOTE, lower_bounds[i]);
3289 dump_printf (MSG_NOTE, "\n");
3292 return;
3295 vec_lower_bound lower_bound (expr, unsigned_p, min_value);
3296 if (dump_enabled_p ())
3298 dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that ");
3299 dump_lower_bound (MSG_NOTE, lower_bound);
3300 dump_printf (MSG_NOTE, "\n");
3302 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (lower_bound);
3305 /* Return true if it's unlikely that the step of the vectorized form of DR_INFO
3306 will span fewer than GAP bytes. */
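/* Rough illustration (assumed values): with an estimated VF of 4, a
   group size of 2 and 4-byte scalars, the vectorized step covers about
   4 * 2 * 4 == 32 bytes, so any GAP of at most 32 bytes counts as
   small.  */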
3308 static bool
3309 vect_small_gap_p (loop_vec_info loop_vinfo, dr_vec_info *dr_info,
3310 poly_int64 gap)
3312 stmt_vec_info stmt_info = dr_info->stmt;
3313 HOST_WIDE_INT count
3314 = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
3315 if (DR_GROUP_FIRST_ELEMENT (stmt_info))
3316 count *= DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info));
3317 return (estimated_poly_value (gap)
3318 <= count * vect_get_scalar_dr_size (dr_info));
3321 /* Return true if we know that there is no alias between DR_INFO_A and
3322 DR_INFO_B when abs (DR_STEP (DR_INFO_A->dr)) >= N for some N.
3323 When returning true, set *LOWER_BOUND_OUT to this N. */
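/* Example under assumed values: two 4-byte accesses at byte offsets 0
   and 8 from the same base give *LOWER_BOUND_OUT == 8 + 4 - 0 == 12,
   i.e. any step of at least 12 bytes in absolute value keeps the
   accesses from overlapping across iterations.  */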
3325 static bool
3326 vectorizable_with_step_bound_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b,
3327 poly_uint64 *lower_bound_out)
3329 /* Check that there is a constant gap of known sign between DR_A
3330 and DR_B. */
3331 data_reference *dr_a = dr_info_a->dr;
3332 data_reference *dr_b = dr_info_b->dr;
3333 poly_int64 init_a, init_b;
3334 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0)
3335 || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0)
3336 || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)
3337 || !poly_int_tree_p (DR_INIT (dr_a), &init_a)
3338 || !poly_int_tree_p (DR_INIT (dr_b), &init_b)
3339 || !ordered_p (init_a, init_b))
3340 return false;
3342 /* Sort DR_A and DR_B by the address they access. */
3343 if (maybe_lt (init_b, init_a))
3345 std::swap (init_a, init_b);
3346 std::swap (dr_info_a, dr_info_b);
3347 std::swap (dr_a, dr_b);
3350 /* If the two accesses could be dependent within a scalar iteration,
3351 make sure that we'd retain their order. */
3352 if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_info_a), init_b)
3353 && !vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
3354 return false;
3356 /* There is no alias if abs (DR_STEP) is greater than or equal to
3357 the bytes spanned by the combination of the two accesses. */
3358 *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_info_b) - init_a;
3359 return true;
3362 /* Function vect_prune_runtime_alias_test_list.
3364 Prune a list of ddrs to be tested at run-time by versioning for alias.
3365 Merge several alias checks into one if possible.
3366 Return FALSE if the resulting list of ddrs is longer than allowed by
3367 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
3369 bool
3370 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3372 typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3373 hash_set <tree_pair_hash> compared_objects;
3375 vec<ddr_p> may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3376 vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3377 = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3378 vec<vec_object_pair> &check_unequal_addrs
3379 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3380 poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3381 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3383 ddr_p ddr;
3384 unsigned int i;
3385 tree length_factor;
3387 DUMP_VECT_SCOPE ("vect_prune_runtime_alias_test_list");
3389 /* Step values are irrelevant for aliasing if the number of vector
3390 iterations is equal to the number of scalar iterations (which can
3391 happen for fully-SLP loops). */
3392 bool ignore_step_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U);
3394 if (!ignore_step_p)
3396 /* Convert the checks for nonzero steps into bound tests. */
3397 tree value;
3398 FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value)
3399 vect_check_lower_bound (loop_vinfo, value, true, 1);
3402 if (may_alias_ddrs.is_empty ())
3403 return true;
3405 comp_alias_ddrs.create (may_alias_ddrs.length ());
3407 unsigned int loop_depth
3408 = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3409 LOOP_VINFO_LOOP_NEST (loop_vinfo));
3411 /* First, we collect all data ref pairs for aliasing checks. */
3412 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3414 int comp_res;
3415 poly_uint64 lower_bound;
3416 tree segment_length_a, segment_length_b;
3417 unsigned HOST_WIDE_INT access_size_a, access_size_b;
3418 unsigned int align_a, align_b;
3420 /* Ignore the alias if the VF we chose ended up being no greater
3421 than the dependence distance. */
3422 if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3423 continue;
3425 if (DDR_OBJECT_A (ddr))
3427 vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3428 if (!compared_objects.add (new_pair))
3430 if (dump_enabled_p ())
3432 dump_printf_loc (MSG_NOTE, vect_location, "checking that ");
3433 dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.first);
3434 dump_printf (MSG_NOTE, " and ");
3435 dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.second);
3436 dump_printf (MSG_NOTE, " have different addresses\n");
3438 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3440 continue;
3443 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
3444 stmt_vec_info stmt_info_a = dr_info_a->stmt;
3446 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
3447 stmt_vec_info stmt_info_b = dr_info_b->stmt;
3449 /* Skip the pair if inter-iteration dependencies are irrelevant
3450 and intra-iteration dependencies are guaranteed to be honored. */
3451 if (ignore_step_p
3452 && (vect_preserves_scalar_order_p (dr_info_a, dr_info_b)
3453 || vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3454 &lower_bound)))
3456 if (dump_enabled_p ())
3458 dump_printf_loc (MSG_NOTE, vect_location,
3459 "no need for alias check between ");
3460 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_info_a->dr));
3461 dump_printf (MSG_NOTE, " and ");
3462 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_info_b->dr));
3463 dump_printf (MSG_NOTE, " when VF is 1\n");
3465 continue;
3468 /* See whether we can handle the alias using a bounds check on
3469 the step, and whether that's likely to be the best approach.
3470 (It might not be, for example, if the minimum step is much larger
3471 than the number of bytes handled by one vector iteration.) */
3472 if (!ignore_step_p
3473 && TREE_CODE (DR_STEP (dr_info_a->dr)) != INTEGER_CST
3474 && vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3475 &lower_bound)
3476 && (vect_small_gap_p (loop_vinfo, dr_info_a, lower_bound)
3477 || vect_small_gap_p (loop_vinfo, dr_info_b, lower_bound)))
3479 bool unsigned_p = dr_known_forward_stride_p (dr_info_a->dr);
3480 if (dump_enabled_p ())
3482 dump_printf_loc (MSG_NOTE, vect_location, "no alias between ");
3483 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_info_a->dr));
3484 dump_printf (MSG_NOTE, " and ");
3485 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_info_b->dr));
3486 dump_printf (MSG_NOTE, " when the step ");
3487 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_STEP (dr_info_a->dr));
3488 dump_printf (MSG_NOTE, " is outside ");
3489 if (unsigned_p)
3490 dump_printf (MSG_NOTE, "[0");
3491 else
3493 dump_printf (MSG_NOTE, "(");
3494 dump_dec (MSG_NOTE, poly_int64 (-lower_bound));
3496 dump_printf (MSG_NOTE, ", ");
3497 dump_dec (MSG_NOTE, lower_bound);
3498 dump_printf (MSG_NOTE, ")\n");
3500 vect_check_lower_bound (loop_vinfo, DR_STEP (dr_info_a->dr),
3501 unsigned_p, lower_bound);
3502 continue;
3505 stmt_vec_info dr_group_first_a = DR_GROUP_FIRST_ELEMENT (stmt_info_a);
3506 if (dr_group_first_a)
3508 stmt_info_a = dr_group_first_a;
3509 dr_info_a = STMT_VINFO_DR_INFO (stmt_info_a);
3512 stmt_vec_info dr_group_first_b = DR_GROUP_FIRST_ELEMENT (stmt_info_b);
3513 if (dr_group_first_b)
3515 stmt_info_b = dr_group_first_b;
3516 dr_info_b = STMT_VINFO_DR_INFO (stmt_info_b);
3519 if (ignore_step_p)
3521 segment_length_a = size_zero_node;
3522 segment_length_b = size_zero_node;
3524 else
3526 if (!operand_equal_p (DR_STEP (dr_info_a->dr),
3527 DR_STEP (dr_info_b->dr), 0))
3528 length_factor = scalar_loop_iters;
3529 else
3530 length_factor = size_int (vect_factor);
3531 segment_length_a = vect_vfa_segment_size (dr_info_a, length_factor);
3532 segment_length_b = vect_vfa_segment_size (dr_info_b, length_factor);
3534 access_size_a = vect_vfa_access_size (dr_info_a);
3535 access_size_b = vect_vfa_access_size (dr_info_b);
3536 align_a = vect_vfa_align (dr_info_a);
3537 align_b = vect_vfa_align (dr_info_b);
3539 comp_res = data_ref_compare_tree (DR_BASE_ADDRESS (dr_info_a->dr),
3540 DR_BASE_ADDRESS (dr_info_b->dr));
3541 if (comp_res == 0)
3542 comp_res = data_ref_compare_tree (DR_OFFSET (dr_info_a->dr),
3543 DR_OFFSET (dr_info_b->dr));
3545 /* See whether the alias is known at compilation time. */
3546 if (comp_res == 0
3547 && TREE_CODE (DR_STEP (dr_info_a->dr)) == INTEGER_CST
3548 && TREE_CODE (DR_STEP (dr_info_b->dr)) == INTEGER_CST
3549 && poly_int_tree_p (segment_length_a)
3550 && poly_int_tree_p (segment_length_b))
3552 int res = vect_compile_time_alias (dr_info_a, dr_info_b,
3553 segment_length_a,
3554 segment_length_b,
3555 access_size_a,
3556 access_size_b);
3557 if (res >= 0 && dump_enabled_p ())
3559 dump_printf_loc (MSG_NOTE, vect_location,
3560 "can tell at compile time that ");
3561 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_info_a->dr));
3562 dump_printf (MSG_NOTE, " and ");
3563 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_info_b->dr));
3564 if (res == 0)
3565 dump_printf (MSG_NOTE, " do not alias\n");
3566 else
3567 dump_printf (MSG_NOTE, " alias\n");
3570 if (res == 0)
3571 continue;
3573 if (res == 1)
3575 if (dump_enabled_p ())
3576 dump_printf_loc (MSG_NOTE, vect_location,
3577 "not vectorized: compilation time alias.\n");
3578 return false;
3582 dr_with_seg_len_pair_t dr_with_seg_len_pair
3583 (dr_with_seg_len (dr_info_a->dr, segment_length_a,
3584 access_size_a, align_a),
3585 dr_with_seg_len (dr_info_b->dr, segment_length_b,
3586 access_size_b, align_b));
3588 /* Canonicalize pairs by sorting the two DR members. */
3589 if (comp_res > 0)
3590 std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
3592 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3595 prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);
3597 unsigned int count = (comp_alias_ddrs.length ()
3598 + check_unequal_addrs.length ());
3600 dump_printf_loc (MSG_NOTE, vect_location,
3601 "improved number of alias checks from %d to %d\n",
3602 may_alias_ddrs.length (), count);
3603 if ((int) count > PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
3605 if (dump_enabled_p ())
3606 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3607 "number of versioning for alias "
3608 "run-time tests exceeds %d "
3609 "(--param vect-max-version-for-alias-checks)\n",
3610 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
3611 return false;
3614 return true;
3617 /* Check whether we can use an internal function for a gather load
3618 or scatter store. READ_P is true for loads and false for stores.
3619 MASKED_P is true if the load or store is conditional. MEMORY_TYPE is
3620 the type of the memory elements being loaded or stored. OFFSET_BITS
3621 is the number of bits in each scalar offset and OFFSET_SIGN is the
3622 sign of the offset. SCALE is the amount by which the offset should
3623 be multiplied *after* it has been converted to address width.
3625 Return true if the function is supported, storing the function
3626 id in *IFN_OUT and the type of a vector element in *ELEMENT_TYPE_OUT. */
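/* For example, if this returns true for an unmasked gather of V4SF
elements with 32-bit integer offsets and SCALE == 4, the vectorizer can
later emit an internal call along the lines of

vect__1 = .GATHER_LOAD (base_ptr, vect_offsets, 4);

where the operand names are hypothetical; this function itself only
checks target support and reports the function id and element type. */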
3628 bool
3629 vect_gather_scatter_fn_p (bool read_p, bool masked_p, tree vectype,
3630 tree memory_type, unsigned int offset_bits,
3631 signop offset_sign, int scale,
3632 internal_fn *ifn_out, tree *element_type_out)
3634 unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
3635 unsigned int element_bits = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (vectype)));
3636 if (offset_bits > element_bits)
3637 /* Internal functions require the offset to be the same width as
3638 the vector elements. We can extend narrower offsets, but it isn't
3639 safe to truncate wider offsets. */
3640 return false;
3642 if (element_bits != memory_bits)
3643 /* For now the vector elements must be the same width as the
3644 memory elements. */
3645 return false;
3647 /* Work out which function we need. */
3648 internal_fn ifn;
3649 if (read_p)
3650 ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
3651 else
3652 ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
3654 /* Test whether the target supports this combination. */
3655 if (!internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
3656 offset_sign, scale))
3657 return false;
3659 *ifn_out = ifn;
3660 *element_type_out = TREE_TYPE (vectype);
3661 return true;
3664 /* STMT_INFO is a call to an internal gather load or scatter store function.
3665 Describe the operation in INFO. */
3667 static void
3668 vect_describe_gather_scatter_call (stmt_vec_info stmt_info,
3669 gather_scatter_info *info)
3671 gcall *call = as_a <gcall *> (stmt_info->stmt);
3672 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3673 data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3675 info->ifn = gimple_call_internal_fn (call);
3676 info->decl = NULL_TREE;
3677 info->base = gimple_call_arg (call, 0);
3678 info->offset = gimple_call_arg (call, 1);
3679 info->offset_dt = vect_unknown_def_type;
3680 info->offset_vectype = NULL_TREE;
3681 info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
3682 info->element_type = TREE_TYPE (vectype);
3683 info->memory_type = TREE_TYPE (DR_REF (dr));
3686 /* Return true if a non-affine read or write in STMT_INFO is suitable for a
3687 gather load or scatter store. Describe the operation in *INFO if so. */
3689 bool
3690 vect_check_gather_scatter (stmt_vec_info stmt_info, loop_vec_info loop_vinfo,
3691 gather_scatter_info *info)
3693 HOST_WIDE_INT scale = 1;
3694 poly_int64 pbitpos, pbitsize;
3695 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3696 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3697 tree offtype = NULL_TREE;
3698 tree decl = NULL_TREE, base, off;
3699 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3700 tree memory_type = TREE_TYPE (DR_REF (dr));
3701 machine_mode pmode;
3702 int punsignedp, reversep, pvolatilep = 0;
3703 internal_fn ifn;
3704 tree element_type;
3705 bool masked_p = false;
3707 /* See whether this is already a call to a gather/scatter internal function.
3708 If not, see whether it's a masked load or store. */
3709 gcall *call = dyn_cast <gcall *> (stmt_info->stmt);
3710 if (call && gimple_call_internal_p (call))
3712 ifn = gimple_call_internal_fn (call);
3713 if (internal_gather_scatter_fn_p (ifn))
3715 vect_describe_gather_scatter_call (stmt_info, info);
3716 return true;
3718 masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE);
3721 /* True if we should aim to use internal functions rather than
3722 built-in functions. */
3723 bool use_ifn_p = (DR_IS_READ (dr)
3724 ? supports_vec_gather_load_p ()
3725 : supports_vec_scatter_store_p ());
3727 base = DR_REF (dr);
3728 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3729 see if we can use the def stmt of the address. */
3730 if (masked_p
3731 && TREE_CODE (base) == MEM_REF
3732 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3733 && integer_zerop (TREE_OPERAND (base, 1))
3734 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3736 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3737 if (is_gimple_assign (def_stmt)
3738 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3739 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3742 /* The gather and scatter builtins need an address of the form
3743 loop_invariant + vector * {1, 2, 4, 8}
3744 or
3745 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3746 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3747 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3748 multiplications and additions in it. To get a vector, we need
3749 a single SSA_NAME that will be defined in the loop and will
3750 contain everything that is not loop invariant and that can be
3751 vectorized. The following code attempts to find such a preexisting
3752 SSA_NAME OFF and put the loop invariants into a tree BASE
3753 that can be gimplified before the loop. */
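/* For instance, for a scalar gather such as

for (int i = 0; i < n; i++)
sum += a[idx[i]];

with 4-byte elements, the loop-invariant BASE ends up holding the start
of "a", OFF becomes the SSA_NAME carrying the (sign-extended) value of
idx[i], and SCALE becomes 4. The names "a" and "idx" are illustrative
only. */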
3754 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3755 &punsignedp, &reversep, &pvolatilep);
3756 if (reversep)
3757 return false;
3759 poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
3761 if (TREE_CODE (base) == MEM_REF)
3763 if (!integer_zerop (TREE_OPERAND (base, 1)))
3765 if (off == NULL_TREE)
3766 off = wide_int_to_tree (sizetype, mem_ref_offset (base));
3767 else
3768 off = size_binop (PLUS_EXPR, off,
3769 fold_convert (sizetype, TREE_OPERAND (base, 1)));
3771 base = TREE_OPERAND (base, 0);
3773 else
3774 base = build_fold_addr_expr (base);
3776 if (off == NULL_TREE)
3777 off = size_zero_node;
3779 /* If BASE is not loop invariant, then if OFF is 0 we start with just
3780 the constant offset in the loop-invariant BASE and continue with BASE
3781 as OFF; otherwise give up.
3782 We could handle that case by gimplifying the addition of BASE + OFF
3783 into some SSA_NAME and using that as OFF, but for now punt. */
3784 if (!expr_invariant_in_loop_p (loop, base))
3786 if (!integer_zerop (off))
3787 return false;
3788 off = base;
3789 base = size_int (pbytepos);
3791 /* Otherwise put base + constant offset into the loop invariant BASE
3792 and continue with OFF. */
3793 else
3795 base = fold_convert (sizetype, base);
3796 base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
3799 /* OFF at this point may be either a SSA_NAME or some tree expression
3800 from get_inner_reference. Try to peel off loop invariants from it
3801 into BASE as long as possible. */
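/* A hypothetical example of the peeling below, assuming a 64-bit
sizetype: if OFF is ((sizetype) j_5 * 4) + 32 with j_5 defined in the
loop, the PLUS_EXPR case moves 32 into BASE, the MULT_EXPR case turns
SCALE into 4 (when the target supports that scale), and the conversion
case records OFFTYPE, leaving OFF as the SSA_NAME j_5. */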
3802 STRIP_NOPS (off);
3803 while (offtype == NULL_TREE)
3805 enum tree_code code;
3806 tree op0, op1, add = NULL_TREE;
3808 if (TREE_CODE (off) == SSA_NAME)
3810 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3812 if (expr_invariant_in_loop_p (loop, off))
3813 return false;
3815 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3816 break;
3818 op0 = gimple_assign_rhs1 (def_stmt);
3819 code = gimple_assign_rhs_code (def_stmt);
3820 op1 = gimple_assign_rhs2 (def_stmt);
3822 else
3824 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3825 return false;
3826 code = TREE_CODE (off);
3827 extract_ops_from_tree (off, &code, &op0, &op1);
3829 switch (code)
3831 case POINTER_PLUS_EXPR:
3832 case PLUS_EXPR:
3833 if (expr_invariant_in_loop_p (loop, op0))
3835 add = op0;
3836 off = op1;
3837 do_add:
3838 add = fold_convert (sizetype, add);
3839 if (scale != 1)
3840 add = size_binop (MULT_EXPR, add, size_int (scale));
3841 base = size_binop (PLUS_EXPR, base, add);
3842 continue;
3844 if (expr_invariant_in_loop_p (loop, op1))
3846 add = op1;
3847 off = op0;
3848 goto do_add;
3850 break;
3851 case MINUS_EXPR:
3852 if (expr_invariant_in_loop_p (loop, op1))
3854 add = fold_convert (sizetype, op1);
3855 add = size_binop (MINUS_EXPR, size_zero_node, add);
3856 off = op0;
3857 goto do_add;
3859 break;
3860 case MULT_EXPR:
3861 if (scale == 1 && tree_fits_shwi_p (op1))
3863 int new_scale = tree_to_shwi (op1);
3864 /* Only treat this as a scaling operation if the target
3865 supports it. */
3866 if (use_ifn_p
3867 && !vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p,
3868 vectype, memory_type, 1,
3869 TYPE_SIGN (TREE_TYPE (op0)),
3870 new_scale, &ifn,
3871 &element_type))
3872 break;
3873 scale = new_scale;
3874 off = op0;
3875 continue;
3877 break;
3878 case SSA_NAME:
3879 off = op0;
3880 continue;
3881 CASE_CONVERT:
3882 if (!POINTER_TYPE_P (TREE_TYPE (op0))
3883 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3884 break;
3885 if (TYPE_PRECISION (TREE_TYPE (op0))
3886 == TYPE_PRECISION (TREE_TYPE (off)))
3888 off = op0;
3889 continue;
3892 /* The internal functions need the offset to be the same width
3893 as the elements of VECTYPE. Don't include operations that
3894 cast the offset from that width to a different width. */
3895 if (use_ifn_p
3896 && (int_size_in_bytes (TREE_TYPE (vectype))
3897 == int_size_in_bytes (TREE_TYPE (off))))
3898 break;
3900 if (TYPE_PRECISION (TREE_TYPE (op0))
3901 < TYPE_PRECISION (TREE_TYPE (off)))
3903 off = op0;
3904 offtype = TREE_TYPE (off);
3905 STRIP_NOPS (off);
3906 continue;
3908 break;
3909 default:
3910 break;
3912 break;
3915 /* If at the end OFF still isn't a SSA_NAME or isn't
3916 defined in the loop, punt. */
3917 if (TREE_CODE (off) != SSA_NAME
3918 || expr_invariant_in_loop_p (loop, off))
3919 return false;
3921 if (offtype == NULL_TREE)
3922 offtype = TREE_TYPE (off);
3924 if (use_ifn_p)
3926 if (!vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p, vectype,
3927 memory_type, TYPE_PRECISION (offtype),
3928 TYPE_SIGN (offtype), scale, &ifn,
3929 &element_type))
3930 return false;
3932 else
3934 if (DR_IS_READ (dr))
3936 if (targetm.vectorize.builtin_gather)
3937 decl = targetm.vectorize.builtin_gather (vectype, offtype, scale);
3939 else
3941 if (targetm.vectorize.builtin_scatter)
3942 decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale);
3945 if (!decl)
3946 return false;
3948 ifn = IFN_LAST;
3949 element_type = TREE_TYPE (vectype);
3952 info->ifn = ifn;
3953 info->decl = decl;
3954 info->base = base;
3955 info->offset = off;
3956 info->offset_dt = vect_unknown_def_type;
3957 info->offset_vectype = NULL_TREE;
3958 info->scale = scale;
3959 info->element_type = element_type;
3960 info->memory_type = memory_type;
3961 return true;
3964 /* Find the data references in STMT, analyze them with respect to LOOP and
3965 append them to DATAREFS. Return false if datarefs in this stmt cannot
3966 be handled. */
3968 bool
3969 vect_find_stmt_data_reference (loop_p loop, gimple *stmt,
3970 vec<data_reference_p> *datarefs)
3972 /* We can ignore clobbers for dataref analysis - they are removed during
3973 loop vectorization and BB vectorization checks dependences with a
3974 stmt walk. */
3975 if (gimple_clobber_p (stmt))
3976 return true;
3978 if (gimple_has_volatile_ops (stmt))
3980 if (dump_enabled_p ())
3982 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3983 "not vectorized: volatile type ");
3984 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3986 return false;
3989 if (stmt_can_throw_internal (stmt))
3991 if (dump_enabled_p ())
3993 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3994 "not vectorized: statement can throw an "
3995 "exception ");
3996 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3998 return false;
4001 auto_vec<data_reference_p, 2> refs;
4002 if (!find_data_references_in_stmt (loop, stmt, &refs))
4003 return false;
4005 if (refs.is_empty ())
4006 return true;
4008 if (refs.length () > 1)
4010 if (dump_enabled_p ())
4012 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4013 "not vectorized: more than one data ref "
4014 "in stmt: ");
4015 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4017 return false;
4020 if (gcall *call = dyn_cast <gcall *> (stmt))
4021 if (!gimple_call_internal_p (call)
4022 || (gimple_call_internal_fn (call) != IFN_MASK_LOAD
4023 && gimple_call_internal_fn (call) != IFN_MASK_STORE))
4025 if (dump_enabled_p ())
4027 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4028 "not vectorized: dr in a call ");
4029 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4031 return false;
4034 data_reference_p dr = refs.pop ();
4035 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
4036 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
4038 if (dump_enabled_p ())
4040 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4041 "not vectorized: statement is bitfield "
4042 "access ");
4043 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4045 return false;
4048 if (DR_BASE_ADDRESS (dr)
4049 && TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
4051 if (dump_enabled_p ())
4052 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4053 "not vectorized: base addr of dr is a "
4054 "constant\n");
4055 return false;
4058 /* Check whether this may be a SIMD lane access and adjust the
4059 DR to make it easier for us to handle it. */
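/* This targets the arrays introduced by OpenMP SIMD privatization:
inside a "#pragma omp simd" body a privatized scalar is lowered to an
access like D.priv[_lane], where _lane comes from a call to
.GOMP_SIMD_LANE (simduid). Such a DR initially has a zero step and an
offset of the form _lane * <element size>; the code below rewrites it
to offset 0 and step <element size> and marks it as a SIMD-lane access.
The names D.priv and _lane are illustrative. */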
4060 if (loop
4061 && loop->simduid
4062 && (!DR_BASE_ADDRESS (dr)
4063 || !DR_OFFSET (dr)
4064 || !DR_INIT (dr)
4065 || !DR_STEP (dr)))
4067 struct data_reference *newdr
4068 = create_data_ref (NULL, loop_containing_stmt (stmt), DR_REF (dr), stmt,
4069 DR_IS_READ (dr), DR_IS_CONDITIONAL_IN_STMT (dr));
4070 if (DR_BASE_ADDRESS (newdr)
4071 && DR_OFFSET (newdr)
4072 && DR_INIT (newdr)
4073 && DR_STEP (newdr)
4074 && integer_zerop (DR_STEP (newdr)))
4076 tree off = DR_OFFSET (newdr);
4077 STRIP_NOPS (off);
4078 if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
4079 && TREE_CODE (off) == MULT_EXPR
4080 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
4082 tree step = TREE_OPERAND (off, 1);
4083 off = TREE_OPERAND (off, 0);
4084 STRIP_NOPS (off);
4085 if (CONVERT_EXPR_P (off)
4086 && (TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off, 0)))
4087 < TYPE_PRECISION (TREE_TYPE (off))))
4088 off = TREE_OPERAND (off, 0);
4089 if (TREE_CODE (off) == SSA_NAME)
4091 gimple *def = SSA_NAME_DEF_STMT (off);
4092 tree reft = TREE_TYPE (DR_REF (newdr));
4093 if (is_gimple_call (def)
4094 && gimple_call_internal_p (def)
4095 && (gimple_call_internal_fn (def) == IFN_GOMP_SIMD_LANE))
4097 tree arg = gimple_call_arg (def, 0);
4098 gcc_assert (TREE_CODE (arg) == SSA_NAME);
4099 arg = SSA_NAME_VAR (arg);
4100 if (arg == loop->simduid
4101 /* For now. */
4102 && tree_int_cst_equal (TYPE_SIZE_UNIT (reft), step))
4104 DR_OFFSET (newdr) = ssize_int (0);
4105 DR_STEP (newdr) = step;
4106 DR_OFFSET_ALIGNMENT (newdr) = BIGGEST_ALIGNMENT;
4107 DR_STEP_ALIGNMENT (newdr)
4108 = highest_pow2_factor (step);
4109 /* Mark as simd-lane access. */
4110 newdr->aux = (void *)-1;
4111 free_data_ref (dr);
4112 datarefs->safe_push (newdr);
4113 return true;
4119 free_data_ref (newdr);
4122 datarefs->safe_push (dr);
4123 return true;
4126 /* Function vect_analyze_data_refs.
4128 Find all the data references in the loop or basic block.
4130 The general structure of the analysis of data refs in the vectorizer is as
4131 follows:
4132 1- vect_analyze_data_refs(loop/bb): call
4133 compute_data_dependences_for_loop/bb to find and analyze all data-refs
4134 in the loop/bb and their dependences.
4135 2- vect_analyze_dependences(): apply dependence testing using ddrs.
4136 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
4137 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
4141 bool
4142 vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf)
4144 struct loop *loop = NULL;
4145 unsigned int i;
4146 struct data_reference *dr;
4147 tree scalar_type;
4149 DUMP_VECT_SCOPE ("vect_analyze_data_refs");
4151 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4152 loop = LOOP_VINFO_LOOP (loop_vinfo);
4154 /* Go through the data-refs, check that the analysis succeeded. Update
4155 pointer from stmt_vec_info struct to DR and vectype. */
4157 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
4158 FOR_EACH_VEC_ELT (datarefs, i, dr)
4160 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
4161 poly_uint64 vf;
4163 gcc_assert (DR_REF (dr));
4164 stmt_vec_info stmt_info = vinfo->lookup_stmt (DR_STMT (dr));
4165 gcc_assert (!stmt_info->dr_aux.dr);
4166 stmt_info->dr_aux.dr = dr;
4167 stmt_info->dr_aux.stmt = stmt_info;
4169 /* Check that analysis of the data-ref succeeded. */
4170 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
4171 || !DR_STEP (dr))
4173 bool maybe_gather
4174 = DR_IS_READ (dr)
4175 && !TREE_THIS_VOLATILE (DR_REF (dr))
4176 && (targetm.vectorize.builtin_gather != NULL
4177 || supports_vec_gather_load_p ());
4178 bool maybe_scatter
4179 = DR_IS_WRITE (dr)
4180 && !TREE_THIS_VOLATILE (DR_REF (dr))
4181 && (targetm.vectorize.builtin_scatter != NULL
4182 || supports_vec_scatter_store_p ());
4184 /* If the target supports vector gather loads or scatter stores,
4185 see if they can be used. */
4186 if (is_a <loop_vec_info> (vinfo)
4187 && !nested_in_vect_loop_p (loop, stmt_info))
4189 if (maybe_gather || maybe_scatter)
4191 if (maybe_gather)
4192 gatherscatter = GATHER;
4193 else
4194 gatherscatter = SCATTER;
4198 if (gatherscatter == SG_NONE)
4200 if (dump_enabled_p ())
4202 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4203 "not vectorized: data ref analysis "
4204 "failed ");
4205 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
4206 stmt_info->stmt, 0);
4208 if (is_a <bb_vec_info> (vinfo))
4210 /* In BB vectorization the ref can still participate
4211 in dependence analysis, we just can't vectorize it. */
4212 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4213 continue;
4215 return false;
4219 /* See if this was detected as SIMD lane access. */
4220 if (dr->aux == (void *)-1)
4222 if (nested_in_vect_loop_p (loop, stmt_info))
4224 if (dump_enabled_p ())
4226 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4227 "not vectorized: data ref analysis "
4228 "failed ");
4229 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
4230 stmt_info->stmt, 0);
4232 return false;
4234 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
4237 tree base = get_base_address (DR_REF (dr));
4238 if (base && VAR_P (base) && DECL_NONALIASED (base))
4240 if (dump_enabled_p ())
4242 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4243 "not vectorized: base object not addressable "
4244 "for stmt: ");
4245 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
4246 stmt_info->stmt, 0);
4248 if (is_a <bb_vec_info> (vinfo))
4250 /* In BB vectorization the ref can still participate
4251 in dependence analysis, we just can't vectorize it. */
4252 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4253 continue;
4255 return false;
4258 if (is_a <loop_vec_info> (vinfo)
4259 && DR_STEP (dr)
4260 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
4262 if (nested_in_vect_loop_p (loop, stmt_info))
4264 if (dump_enabled_p ())
4266 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4267 "not vectorized: not suitable for strided "
4268 "load ");
4269 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
4270 stmt_info->stmt, 0);
4272 return false;
4274 STMT_VINFO_STRIDED_P (stmt_info) = true;
4277 /* Update DR field in stmt_vec_info struct. */
4279 /* If the dataref is in an inner-loop of the loop that is considered
4280 for vectorization, we also want to analyze the access relative to
4281 the outer-loop (DR contains information only relative to the
4282 inner-most enclosing loop). We do that by building a reference to the
4283 first location accessed by the inner-loop, and analyze it relative to
4284 the outer-loop. */
4285 if (loop && nested_in_vect_loop_p (loop, stmt_info))
4287 /* Build a reference to the first location accessed by the
4288 inner loop: *(BASE + INIT + OFFSET). By construction,
4289 this address must be invariant in the inner loop, so we
4290 can consider it as being used in the outer loop. */
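/* For example, for a dataref like in[i+j] accessed in an inner j-loop
(with "in" pointing to short), the inner-loop DR has, roughly, base
"in", offset i*2 and init 0, so the reference built here is
*(in + i*2 + 0), which is invariant in the j-loop and can be analyzed
with respect to the outer i-loop. */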
4291 tree base = unshare_expr (DR_BASE_ADDRESS (dr));
4292 tree offset = unshare_expr (DR_OFFSET (dr));
4293 tree init = unshare_expr (DR_INIT (dr));
4294 tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
4295 init, offset);
4296 tree init_addr = fold_build_pointer_plus (base, init_offset);
4297 tree init_ref = build_fold_indirect_ref (init_addr);
4299 if (dump_enabled_p ())
4301 dump_printf_loc (MSG_NOTE, vect_location,
4302 "analyze in outer loop: ");
4303 dump_generic_expr (MSG_NOTE, TDF_SLIM, init_ref);
4304 dump_printf (MSG_NOTE, "\n");
4307 if (!dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
4308 init_ref, loop))
4309 /* dr_analyze_innermost already explained the failure. */
4310 return false;
4312 if (dump_enabled_p ())
4314 dump_printf_loc (MSG_NOTE, vect_location,
4315 "\touter base_address: ");
4316 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4317 STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
4318 dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
4319 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4320 STMT_VINFO_DR_OFFSET (stmt_info));
4321 dump_printf (MSG_NOTE,
4322 "\n\touter constant offset from base address: ");
4323 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4324 STMT_VINFO_DR_INIT (stmt_info));
4325 dump_printf (MSG_NOTE, "\n\touter step: ");
4326 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4327 STMT_VINFO_DR_STEP (stmt_info));
4328 dump_printf (MSG_NOTE, "\n\touter base alignment: %d\n",
4329 STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info));
4330 dump_printf (MSG_NOTE, "\n\touter base misalignment: %d\n",
4331 STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info));
4332 dump_printf (MSG_NOTE, "\n\touter offset alignment: %d\n",
4333 STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info));
4334 dump_printf (MSG_NOTE, "\n\touter step alignment: %d\n",
4335 STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
4339 /* Set vectype for STMT. */
4340 scalar_type = TREE_TYPE (DR_REF (dr));
4341 STMT_VINFO_VECTYPE (stmt_info)
4342 = get_vectype_for_scalar_type (scalar_type);
4343 if (!STMT_VINFO_VECTYPE (stmt_info))
4345 if (dump_enabled_p ())
4347 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4348 "not vectorized: no vectype for stmt: ");
4349 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
4350 stmt_info->stmt, 0);
4351 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
4352 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
4353 scalar_type);
4354 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
4357 if (is_a <bb_vec_info> (vinfo))
4359 /* No vector type is fine, the ref can still participate
4360 in dependence analysis, we just can't vectorize it. */
4361 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4362 continue;
4364 return false;
4366 else
4368 if (dump_enabled_p ())
4370 dump_printf_loc (MSG_NOTE, vect_location,
4371 "got vectype for stmt: ");
4372 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt_info->stmt, 0);
4373 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4374 STMT_VINFO_VECTYPE (stmt_info));
4375 dump_printf (MSG_NOTE, "\n");
4379 /* Adjust the minimal vectorization factor according to the
4380 vector type. */
4381 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
4382 *min_vf = upper_bound (*min_vf, vf);
4384 if (gatherscatter != SG_NONE)
4386 gather_scatter_info gs_info;
4387 if (!vect_check_gather_scatter (stmt_info,
4388 as_a <loop_vec_info> (vinfo),
4389 &gs_info)
4390 || !get_vectype_for_scalar_type (TREE_TYPE (gs_info.offset)))
4392 if (dump_enabled_p ())
4394 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4395 (gatherscatter == GATHER) ?
4396 "not vectorized: not suitable for gather "
4397 "load " :
4398 "not vectorized: not suitable for scatter "
4399 "store ");
4400 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
4401 stmt_info->stmt, 0);
4403 return false;
4405 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
4409 /* We used to stop processing and prune the list here. Verify we no
4410 longer need to. */
4411 gcc_assert (i == datarefs.length ());
4413 return true;
4417 /* Function vect_get_new_vect_var.
4419 Returns a new temporary variable. The current naming scheme prepends
4420 a prefix chosen by VAR_KIND ("vect", "stmp", "mask" or "vectp") to
4421 the names of vectorizer-generated variables, followed by an underscore
4422 and NAME if NAME is provided. */
4424 tree
4425 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
4427 const char *prefix;
4428 tree new_vect_var;
4430 switch (var_kind)
4432 case vect_simple_var:
4433 prefix = "vect";
4434 break;
4435 case vect_scalar_var:
4436 prefix = "stmp";
4437 break;
4438 case vect_mask_var:
4439 prefix = "mask";
4440 break;
4441 case vect_pointer_var:
4442 prefix = "vectp";
4443 break;
4444 default:
4445 gcc_unreachable ();
4448 if (name)
4450 char* tmp = concat (prefix, "_", name, NULL);
4451 new_vect_var = create_tmp_reg (type, tmp);
4452 free (tmp);
4454 else
4455 new_vect_var = create_tmp_reg (type, prefix);
4457 return new_vect_var;
4460 /* Like vect_get_new_vect_var but return an SSA name. */
4462 tree
4463 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
4465 const char *prefix;
4466 tree new_vect_var;
4468 switch (var_kind)
4470 case vect_simple_var:
4471 prefix = "vect";
4472 break;
4473 case vect_scalar_var:
4474 prefix = "stmp";
4475 break;
4476 case vect_pointer_var:
4477 prefix = "vectp";
4478 break;
4479 default:
4480 gcc_unreachable ();
4483 if (name)
4485 char* tmp = concat (prefix, "_", name, NULL);
4486 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4487 free (tmp);
4489 else
4490 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4492 return new_vect_var;
4495 /* Duplicate ptr info and set alignment/misalignment on NAME from DR_INFO. */
4497 static void
4498 vect_duplicate_ssa_name_ptr_info (tree name, dr_vec_info *dr_info)
4500 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr_info->dr));
4501 int misalign = DR_MISALIGNMENT (dr_info);
4502 if (misalign == DR_MISALIGNMENT_UNKNOWN)
4503 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4504 else
4505 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name),
4506 DR_TARGET_ALIGNMENT (dr_info), misalign);
4509 /* Function vect_create_addr_base_for_vector_ref.
4511 Create an expression that computes the address of the first memory location
4512 that will be accessed for a data reference.
4514 Input:
4515 STMT_INFO: The statement containing the data reference.
4516 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4517 OFFSET: Optional. If supplied, it is added to the initial address.
4518 LOOP: Specify relative to which loop-nest should the address be computed.
4519 For example, when the dataref is in an inner-loop nested in an
4520 outer-loop that is now being vectorized, LOOP can be either the
4521 outer-loop, or the inner-loop. The first memory location accessed
4522 by the following dataref ('in' points to short):
4524 for (i=0; i<N; i++)
4525 for (j=0; j<M; j++)
4526 s += in[i+j]
4528 is as follows:
4529 if LOOP=i_loop: &in (relative to i_loop)
4530 if LOOP=j_loop: &in+i*2B (relative to j_loop)
4531 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
4532 initial address. Unlike OFFSET, which is a number of elements to
4533 be added, BYTE_OFFSET is measured in bytes.
4535 Output:
4536 1. Return an SSA_NAME whose value is the address of the memory location of
4537 the first vector of the data reference.
4538 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4539 these statement(s) which define the returned SSA_NAME.
4541 FORNOW: We are only handling array accesses with step 1. */
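/* To make the two offsets concrete (values are illustrative): with the
short array above, OFFSET = 4 advances the returned address by four
elements, i.e. 8 bytes, whereas BYTE_OFFSET = 4 advances it by exactly
4 bytes. */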
4543 tree
4544 vect_create_addr_base_for_vector_ref (stmt_vec_info stmt_info,
4545 gimple_seq *new_stmt_list,
4546 tree offset,
4547 tree byte_offset)
4549 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
4550 struct data_reference *dr = dr_info->dr;
4551 const char *base_name;
4552 tree addr_base;
4553 tree dest;
4554 gimple_seq seq = NULL;
4555 tree vect_ptr_type;
4556 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
4557 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4558 innermost_loop_behavior *drb = vect_dr_behavior (dr_info);
4560 tree data_ref_base = unshare_expr (drb->base_address);
4561 tree base_offset = unshare_expr (drb->offset);
4562 tree init = unshare_expr (drb->init);
4564 if (loop_vinfo)
4565 base_name = get_name (data_ref_base);
4566 else
4568 base_offset = ssize_int (0);
4569 init = ssize_int (0);
4570 base_name = get_name (DR_REF (dr));
4573 /* Create base_offset */
4574 base_offset = size_binop (PLUS_EXPR,
4575 fold_convert (sizetype, base_offset),
4576 fold_convert (sizetype, init));
4578 if (offset)
4580 offset = fold_build2 (MULT_EXPR, sizetype,
4581 fold_convert (sizetype, offset), step);
4582 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4583 base_offset, offset);
4585 if (byte_offset)
4587 byte_offset = fold_convert (sizetype, byte_offset);
4588 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4589 base_offset, byte_offset);
4592 /* base + base_offset */
4593 if (loop_vinfo)
4594 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4595 else
4597 addr_base = build1 (ADDR_EXPR,
4598 build_pointer_type (TREE_TYPE (DR_REF (dr))),
4599 unshare_expr (DR_REF (dr)));
4602 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4603 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4604 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4605 gimple_seq_add_seq (new_stmt_list, seq);
4607 if (DR_PTR_INFO (dr)
4608 && TREE_CODE (addr_base) == SSA_NAME
4609 && !SSA_NAME_PTR_INFO (addr_base))
4611 vect_duplicate_ssa_name_ptr_info (addr_base, dr_info);
4612 if (offset || byte_offset)
4613 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4616 if (dump_enabled_p ())
4618 dump_printf_loc (MSG_NOTE, vect_location, "created ");
4619 dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
4620 dump_printf (MSG_NOTE, "\n");
4623 return addr_base;
4627 /* Function vect_create_data_ref_ptr.
4629 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4630 location accessed in the loop by STMT_INFO, along with the def-use update
4631 chain to appropriately advance the pointer through the loop iterations.
4632 Also set aliasing information for the pointer. This pointer is used by
4633 the callers to this function to create a memory reference expression for
4634 vector load/store access.
4636 Input:
4637 1. STMT_INFO: a stmt that references memory. Expected to be of the form
4638 GIMPLE_ASSIGN <name, data-ref> or
4639 GIMPLE_ASSIGN <data-ref, name>.
4640 2. AGGR_TYPE: the type of the reference, which should be either a vector
4641 or an array.
4642 3. AT_LOOP: the loop where the vector memref is to be created.
4643 4. OFFSET (optional): an offset to be added to the initial address accessed
4644 by the data-ref in STMT_INFO.
4645 5. BSI: location where the new stmts are to be placed if there is no loop
4646 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4647 pointing to the initial address.
4648 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4649 to the initial address accessed by the data-ref in STMT_INFO. This is
4650 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4651 in bytes.
4652 8. IV_STEP (optional, defaults to NULL): the amount that should be added
4653 to the IV during each iteration of the loop. NULL says to move
4654 by one copy of AGGR_TYPE up or down, depending on the step of the
4655 data reference.
4657 Output:
4658 1. Declare a new ptr to vector_type, and have it point to the base of the
4659 data reference (the initial address accessed by the data reference).
4660 For example, for vector of type V8HI, the following code is generated:
4662 v8hi *ap;
4663 ap = (v8hi *)initial_address;
4665 if OFFSET is not supplied:
4666 initial_address = &a[init];
4667 if OFFSET is supplied:
4668 initial_address = &a[init + OFFSET];
4669 if BYTE_OFFSET is supplied:
4670 initial_address = &a[init] + BYTE_OFFSET;
4672 Return the initial_address in INITIAL_ADDRESS.
4674 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4675 update the pointer in each iteration of the loop.
4677 Return the increment stmt that updates the pointer in PTR_INCR.
4679 3. Set INV_P to true if the access pattern of the data reference in the
4680 vectorized loop is invariant. Set it to false otherwise.
4682 4. Return the pointer. */
4684 tree
4685 vect_create_data_ref_ptr (stmt_vec_info stmt_info, tree aggr_type,
4686 struct loop *at_loop, tree offset,
4687 tree *initial_address, gimple_stmt_iterator *gsi,
4688 gimple **ptr_incr, bool only_init, bool *inv_p,
4689 tree byte_offset, tree iv_step)
4691 const char *base_name;
4692 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4693 struct loop *loop = NULL;
4694 bool nested_in_vect_loop = false;
4695 struct loop *containing_loop = NULL;
4696 tree aggr_ptr_type;
4697 tree aggr_ptr;
4698 tree new_temp;
4699 gimple_seq new_stmt_list = NULL;
4700 edge pe = NULL;
4701 basic_block new_bb;
4702 tree aggr_ptr_init;
4703 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
4704 struct data_reference *dr = dr_info->dr;
4705 tree aptr;
4706 gimple_stmt_iterator incr_gsi;
4707 bool insert_after;
4708 tree indx_before_incr, indx_after_incr;
4709 gimple *incr;
4710 tree step;
4711 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4713 gcc_assert (iv_step != NULL_TREE
4714 || TREE_CODE (aggr_type) == ARRAY_TYPE
4715 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4717 if (loop_vinfo)
4719 loop = LOOP_VINFO_LOOP (loop_vinfo);
4720 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
4721 containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
4722 pe = loop_preheader_edge (loop);
4724 else
4726 gcc_assert (bb_vinfo);
4727 only_init = true;
4728 *ptr_incr = NULL;
4731 /* Check the step (evolution) of the load in LOOP, and record
4732 whether it's invariant. */
4733 step = vect_dr_behavior (dr_info)->step;
4734 if (integer_zerop (step))
4735 *inv_p = true;
4736 else
4737 *inv_p = false;
4739 /* Create an expression for the first address accessed by this load
4740 in LOOP. */
4741 base_name = get_name (DR_BASE_ADDRESS (dr));
4743 if (dump_enabled_p ())
4745 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4746 dump_printf_loc (MSG_NOTE, vect_location,
4747 "create %s-pointer variable to type: ",
4748 get_tree_code_name (TREE_CODE (aggr_type)));
4749 dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4750 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4751 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4752 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4753 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4754 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4755 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4756 else
4757 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4758 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4759 dump_printf (MSG_NOTE, "\n");
4762 /* (1) Create the new aggregate-pointer variable.
4763 Vector and array types inherit the alias set of their component
4764 type by default, so we need to use a ref-all pointer if the alias set
4765 of the data reference does not conflict with that of the created
4766 aggregate reference (for example because its base object is not addressable). */
4767 bool need_ref_all = false;
4768 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4769 get_alias_set (DR_REF (dr))))
4770 need_ref_all = true;
4771 /* Likewise for any of the data references in the stmt group. */
4772 else if (DR_GROUP_SIZE (stmt_info) > 1)
4774 stmt_vec_info sinfo = DR_GROUP_FIRST_ELEMENT (stmt_info);
4775 do
4777 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4778 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4779 get_alias_set (DR_REF (sdr))))
4781 need_ref_all = true;
4782 break;
4784 sinfo = DR_GROUP_NEXT_ELEMENT (sinfo);
4786 while (sinfo);
4788 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4789 need_ref_all);
4790 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4793 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4794 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4795 def-use update cycles for the pointer: one relative to the outer-loop
4796 (LOOP), which is what steps (3) and (4) below do. The other is relative
4797 to the inner-loop (which is the inner-most loop containing the dataref),
4798 and this is done by step (5) below.
4800 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4801 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4802 redundant. Steps (3),(4) create the following:
4804 vp0 = &base_addr;
4805 LOOP: vp1 = phi(vp0,vp2)
4808 vp2 = vp1 + step
4809 goto LOOP
4811 If there is an inner-loop nested in loop, then step (5) will also be
4812 applied, and an additional update in the inner-loop will be created:
4814 vp0 = &base_addr;
4815 LOOP: vp1 = phi(vp0,vp2)
4817 inner: vp3 = phi(vp1,vp4)
4818 vp4 = vp3 + inner_step
4819 if () goto inner
4821 vp2 = vp1 + step
4822 if () goto LOOP */
4824 /* (2) Calculate the initial address of the aggregate-pointer, and set
4825 the aggregate-pointer to point to it before the loop. */
4827 /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader. */
4829 new_temp = vect_create_addr_base_for_vector_ref (stmt_info, &new_stmt_list,
4830 offset, byte_offset);
4831 if (new_stmt_list)
4833 if (pe)
4835 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4836 gcc_assert (!new_bb);
4838 else
4839 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4842 *initial_address = new_temp;
4843 aggr_ptr_init = new_temp;
4845 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4846 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4847 inner-loop nested in LOOP (during outer-loop vectorization). */
4849 /* No update in loop is required. */
4850 if (only_init && (!loop_vinfo || at_loop == loop))
4851 aptr = aggr_ptr_init;
4852 else
4854 if (iv_step == NULL_TREE)
4856 /* The step of the aggregate pointer is the type size. */
4857 iv_step = TYPE_SIZE_UNIT (aggr_type);
4858 /* One exception to the above is when the scalar step of the load in
4859 LOOP is zero. In this case the step here is also zero. */
4860 if (*inv_p)
4861 iv_step = size_zero_node;
4862 else if (tree_int_cst_sgn (step) == -1)
4863 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4866 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4868 create_iv (aggr_ptr_init,
4869 fold_convert (aggr_ptr_type, iv_step),
4870 aggr_ptr, loop, &incr_gsi, insert_after,
4871 &indx_before_incr, &indx_after_incr);
4872 incr = gsi_stmt (incr_gsi);
4873 loop_vinfo->add_stmt (incr);
4875 /* Copy the points-to information if it exists. */
4876 if (DR_PTR_INFO (dr))
4878 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
4879 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
4881 if (ptr_incr)
4882 *ptr_incr = incr;
4884 aptr = indx_before_incr;
4887 if (!nested_in_vect_loop || only_init)
4888 return aptr;
4891 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4892 nested in LOOP, if it exists. */
4894 gcc_assert (nested_in_vect_loop);
4895 if (!only_init)
4897 standard_iv_increment_position (containing_loop, &incr_gsi,
4898 &insert_after);
4899 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4900 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4901 &indx_after_incr);
4902 incr = gsi_stmt (incr_gsi);
4903 loop_vinfo->add_stmt (incr);
4905 /* Copy the points-to information if it exists. */
4906 if (DR_PTR_INFO (dr))
4908 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
4909 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
4911 if (ptr_incr)
4912 *ptr_incr = incr;
4914 return indx_before_incr;
4916 else
4917 gcc_unreachable ();
4921 /* Function bump_vector_ptr
4923 Increment a pointer (to a vector type) by vector-size. If requested,
4924 i.e. if PTR-INCR is given, then also connect the new increment stmt
4925 to the existing def-use update-chain of the pointer, by modifying
4926 the PTR_INCR as illustrated below:
4928 The pointer def-use update-chain before this function:
4929 DATAREF_PTR = phi (p_0, p_2)
4930 ....
4931 PTR_INCR: p_2 = DATAREF_PTR + step
4933 The pointer def-use update-chain after this function:
4934 DATAREF_PTR = phi (p_0, p_2)
4935 ....
4936 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4937 ....
4938 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4940 Input:
4941 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4942 in the loop.
4943 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4944 the loop. The increment amount across iterations is expected
4945 to be vector_size.
4946 BSI - location where the new update stmt is to be placed.
4947 STMT_INFO - the original scalar memory-access stmt that is being vectorized.
4948 BUMP - optional. The offset by which to bump the pointer. If not given,
4949 the offset is assumed to be vector_size.
4951 Output: Return NEW_DATAREF_PTR as illustrated above.
4955 tree
4956 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4957 stmt_vec_info stmt_info, tree bump)
4959 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4960 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4961 tree update = TYPE_SIZE_UNIT (vectype);
4962 gassign *incr_stmt;
4963 ssa_op_iter iter;
4964 use_operand_p use_p;
4965 tree new_dataref_ptr;
4967 if (bump)
4968 update = bump;
4970 if (TREE_CODE (dataref_ptr) == SSA_NAME)
4971 new_dataref_ptr = copy_ssa_name (dataref_ptr);
4972 else
4973 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4974 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4975 dataref_ptr, update);
4976 vect_finish_stmt_generation (stmt_info, incr_stmt, gsi);
4978 /* Copy the points-to information if it exists. */
4979 if (DR_PTR_INFO (dr))
4981 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4982 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4985 if (!ptr_incr)
4986 return new_dataref_ptr;
4988 /* Update the vector-pointer's cross-iteration increment. */
4989 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4991 tree use = USE_FROM_PTR (use_p);
4993 if (use == dataref_ptr)
4994 SET_USE (use_p, new_dataref_ptr);
4995 else
4996 gcc_assert (operand_equal_p (use, update, 0));
4999 return new_dataref_ptr;
5003 /* Copy memory reference info such as base/clique from the SRC reference
5004 to the DEST MEM_REF. */
5006 void
5007 vect_copy_ref_info (tree dest, tree src)
5009 if (TREE_CODE (dest) != MEM_REF)
5010 return;
5012 tree src_base = src;
5013 while (handled_component_p (src_base))
5014 src_base = TREE_OPERAND (src_base, 0);
5015 if (TREE_CODE (src_base) != MEM_REF
5016 && TREE_CODE (src_base) != TARGET_MEM_REF)
5017 return;
5019 MR_DEPENDENCE_CLIQUE (dest) = MR_DEPENDENCE_CLIQUE (src_base);
5020 MR_DEPENDENCE_BASE (dest) = MR_DEPENDENCE_BASE (src_base);
5024 /* Function vect_create_destination_var.
5026 Create a new temporary of type VECTYPE. */
5028 tree
5029 vect_create_destination_var (tree scalar_dest, tree vectype)
5031 tree vec_dest;
5032 const char *name;
5033 char *new_name;
5034 tree type;
5035 enum vect_var_kind kind;
5037 kind = vectype
5038 ? VECTOR_BOOLEAN_TYPE_P (vectype)
5039 ? vect_mask_var
5040 : vect_simple_var
5041 : vect_scalar_var;
5042 type = vectype ? vectype : TREE_TYPE (scalar_dest);
5044 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
5046 name = get_name (scalar_dest);
5047 if (name)
5048 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
5049 else
5050 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
5051 vec_dest = vect_get_new_vect_var (type, kind, new_name);
5052 free (new_name);
5054 return vec_dest;
5057 /* Function vect_grouped_store_supported.
5059 Returns TRUE if interleave high and interleave low permutations
5060 are supported, and FALSE otherwise. */
5062 bool
5063 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
5065 machine_mode mode = TYPE_MODE (vectype);
5067 /* vect_permute_store_chain requires the group size to be equal to 3 or
5068 be a power of two. */
5069 if (count != 3 && exact_log2 (count) == -1)
5071 if (dump_enabled_p ())
5072 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5073 "the size of the group of accesses"
5074 " is not a power of 2 or not eqaul to 3\n");
5075 return false;
5078 /* Check that the permutation is supported. */
5079 if (VECTOR_MODE_P (mode))
5081 unsigned int i;
5082 if (count == 3)
5084 unsigned int j0 = 0, j1 = 0, j2 = 0;
5085 unsigned int i, j;
5087 unsigned int nelt;
5088 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5090 if (dump_enabled_p ())
5091 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5092 "cannot handle groups of 3 stores for"
5093 " variable-length vectors\n");
5094 return false;
5097 vec_perm_builder sel (nelt, nelt, 1);
5098 sel.quick_grow (nelt);
5099 vec_perm_indices indices;
5100 for (j = 0; j < 3; j++)
5102 int nelt0 = ((3 - j) * nelt) % 3;
5103 int nelt1 = ((3 - j) * nelt + 1) % 3;
5104 int nelt2 = ((3 - j) * nelt + 2) % 3;
5105 for (i = 0; i < nelt; i++)
5107 if (3 * i + nelt0 < nelt)
5108 sel[3 * i + nelt0] = j0++;
5109 if (3 * i + nelt1 < nelt)
5110 sel[3 * i + nelt1] = nelt + j1++;
5111 if (3 * i + nelt2 < nelt)
5112 sel[3 * i + nelt2] = 0;
5114 indices.new_vector (sel, 2, nelt);
5115 if (!can_vec_perm_const_p (mode, indices))
5117 if (dump_enabled_p ())
5118 dump_printf (MSG_MISSED_OPTIMIZATION,
5119 "permutation op not supported by target.\n");
5120 return false;
5123 for (i = 0; i < nelt; i++)
5125 if (3 * i + nelt0 < nelt)
5126 sel[3 * i + nelt0] = 3 * i + nelt0;
5127 if (3 * i + nelt1 < nelt)
5128 sel[3 * i + nelt1] = 3 * i + nelt1;
5129 if (3 * i + nelt2 < nelt)
5130 sel[3 * i + nelt2] = nelt + j2++;
5132 indices.new_vector (sel, 2, nelt);
5133 if (!can_vec_perm_const_p (mode, indices))
5135 if (dump_enabled_p ())
5136 dump_printf (MSG_MISSED_OPTIMIZATION,
5137 "permutation op not supported by target.\n");
5138 return false;
5141 return true;
5143 else
5145 /* If length is not equal to 3 then only power of 2 is supported. */
5146 gcc_assert (pow2p_hwi (count));
5147 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5149 /* The encoding has 2 interleaved stepped patterns. */
5150 vec_perm_builder sel (nelt, 2, 3);
5151 sel.quick_grow (6);
5152 for (i = 0; i < 3; i++)
5154 sel[i * 2] = i;
5155 sel[i * 2 + 1] = i + nelt;
5157 vec_perm_indices indices (sel, 2, nelt);
5158 if (can_vec_perm_const_p (mode, indices))
5160 for (i = 0; i < 6; i++)
5161 sel[i] += exact_div (nelt, 2);
5162 indices.new_vector (sel, 2, nelt);
5163 if (can_vec_perm_const_p (mode, indices))
5164 return true;
5169 if (dump_enabled_p ())
5170 dump_printf (MSG_MISSED_OPTIMIZATION,
5171 "permutaion op not supported by target.\n");
5172 return false;
5176 /* Return TRUE if vec_{mask_}store_lanes is available for COUNT vectors of
5177 type VECTYPE. MASKED_P says whether the masked form is needed. */
5179 bool
5180 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5181 bool masked_p)
5183 if (masked_p)
5184 return vect_lanes_optab_supported_p ("vec_mask_store_lanes",
5185 vec_mask_store_lanes_optab,
5186 vectype, count);
5187 else
5188 return vect_lanes_optab_supported_p ("vec_store_lanes",
5189 vec_store_lanes_optab,
5190 vectype, count);
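/* As an illustration, on targets that provide lane-interleaving stores
(for example AArch64 ST2/ST3/ST4), a group of COUNT == 4 V4SI stores
can be emitted as a single vec_store_lanes operation, avoiding the
explicit permutations built by vect_permute_store_chain below. */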
5194 /* Function vect_permute_store_chain.
5196 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
5197 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
5198 the data correctly for the stores. Return the final references for stores
5199 in RESULT_CHAIN.
5201 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5202 The input is 4 vectors each containing 8 elements. We assign a number to
5203 each element, the input sequence is:
5205 1st vec: 0 1 2 3 4 5 6 7
5206 2nd vec: 8 9 10 11 12 13 14 15
5207 3rd vec: 16 17 18 19 20 21 22 23
5208 4th vec: 24 25 26 27 28 29 30 31
5210 The output sequence should be:
5212 1st vec: 0 8 16 24 1 9 17 25
5213 2nd vec: 2 10 18 26 3 11 19 27
5214 3rd vec: 4 12 20 28 5 13 21 29
5215 4th vec: 6 14 22 30 7 15 23 31
5217 i.e., we interleave the contents of the four vectors in their order.
5219 We use interleave_high/low instructions to create such output. The input of
5220 each interleave_high/low operation is two vectors:
5221 1st vec 2nd vec
5222 0 1 2 3 4 5 6 7
5223 the even elements of the result vector are obtained left-to-right from the
5224 high/low elements of the first vector. The odd elements of the result are
5225 obtained left-to-right from the high/low elements of the second vector.
5226 The output of interleave_high will be: 0 4 1 5
5227 and of interleave_low: 2 6 3 7
5230 The permutation is done in log LENGTH stages. In each stage interleave_high
5231 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5232 where the first argument is taken from the first half of DR_CHAIN and the
5233 second argument from its second half.
5234 In our example,
5236 I1: interleave_high (1st vec, 3rd vec)
5237 I2: interleave_low (1st vec, 3rd vec)
5238 I3: interleave_high (2nd vec, 4th vec)
5239 I4: interleave_low (2nd vec, 4th vec)
5241 The output for the first stage is:
5243 I1: 0 16 1 17 2 18 3 19
5244 I2: 4 20 5 21 6 22 7 23
5245 I3: 8 24 9 25 10 26 11 27
5246 I4: 12 28 13 29 14 30 15 31
5248 The output of the second stage, i.e. the final result is:
5250 I1: 0 8 16 24 1 9 17 25
5251 I2: 2 10 18 26 3 11 19 27
5252 I3: 4 12 20 28 5 13 21 29
5253 I4: 6 14 22 30 7 15 23 31. */
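/* Concretely, for V8HI (NELT == 8) the two selectors built on the
power-of-two path below are

interleave_high: { 0, 8, 1, 9, 2, 10, 3, 11 }
interleave_low: { 4, 12, 5, 13, 6, 14, 7, 15 }

while the LENGTH == 3 path instead builds the two three-way shuffle
masks element by element. */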
5255 void
5256 vect_permute_store_chain (vec<tree> dr_chain,
5257 unsigned int length,
5258 stmt_vec_info stmt_info,
5259 gimple_stmt_iterator *gsi,
5260 vec<tree> *result_chain)
5262 tree vect1, vect2, high, low;
5263 gimple *perm_stmt;
5264 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5265 tree perm_mask_low, perm_mask_high;
5266 tree data_ref;
5267 tree perm3_mask_low, perm3_mask_high;
5268 unsigned int i, j, n, log_length = exact_log2 (length);
5270 result_chain->quick_grow (length);
5271 memcpy (result_chain->address (), dr_chain.address (),
5272 length * sizeof (tree));
5274 if (length == 3)
5276 /* vect_grouped_store_supported ensures that this is constant. */
5277 unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5278 unsigned int j0 = 0, j1 = 0, j2 = 0;
5280 vec_perm_builder sel (nelt, nelt, 1);
5281 sel.quick_grow (nelt);
5282 vec_perm_indices indices;
5283 for (j = 0; j < 3; j++)
5285 int nelt0 = ((3 - j) * nelt) % 3;
5286 int nelt1 = ((3 - j) * nelt + 1) % 3;
5287 int nelt2 = ((3 - j) * nelt + 2) % 3;
5289 for (i = 0; i < nelt; i++)
5291 if (3 * i + nelt0 < nelt)
5292 sel[3 * i + nelt0] = j0++;
5293 if (3 * i + nelt1 < nelt)
5294 sel[3 * i + nelt1] = nelt + j1++;
5295 if (3 * i + nelt2 < nelt)
5296 sel[3 * i + nelt2] = 0;
5298 indices.new_vector (sel, 2, nelt);
5299 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5301 for (i = 0; i < nelt; i++)
5303 if (3 * i + nelt0 < nelt)
5304 sel[3 * i + nelt0] = 3 * i + nelt0;
5305 if (3 * i + nelt1 < nelt)
5306 sel[3 * i + nelt1] = 3 * i + nelt1;
5307 if (3 * i + nelt2 < nelt)
5308 sel[3 * i + nelt2] = nelt + j2++;
5310 indices.new_vector (sel, 2, nelt);
5311 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5313 vect1 = dr_chain[0];
5314 vect2 = dr_chain[1];
5316 /* Create interleaving stmt:
5317 low = VEC_PERM_EXPR <vect1, vect2,
5318 {j, nelt, *, j + 1, nelt + j + 1, *,
5319 j + 2, nelt + j + 2, *, ...}> */
5320 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5321 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5322 vect2, perm3_mask_low);
5323 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5325 vect1 = data_ref;
5326 vect2 = dr_chain[2];
5327 /* Create interleaving stmt:
5328 low = VEC_PERM_EXPR <vect1, vect2,
5329 {0, 1, nelt + j, 3, 4, nelt + j + 1,
5330 6, 7, nelt + j + 2, ...}> */
5331 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5332 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5333 vect2, perm3_mask_high);
5334 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5335 (*result_chain)[j] = data_ref;
5338 else
5340 /* If length is not equal to 3 then only a power of 2 is supported. */
5341 gcc_assert (pow2p_hwi (length));
5343 /* The encoding has 2 interleaved stepped patterns. */
5344 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5345 vec_perm_builder sel (nelt, 2, 3);
5346 sel.quick_grow (6);
5347 for (i = 0; i < 3; i++)
5349 sel[i * 2] = i;
5350 sel[i * 2 + 1] = i + nelt;
5352 vec_perm_indices indices (sel, 2, nelt);
5353 perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5355 for (i = 0; i < 6; i++)
5356 sel[i] += exact_div (nelt, 2);
5357 indices.new_vector (sel, 2, nelt);
5358 perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5360 for (i = 0, n = log_length; i < n; i++)
5362 for (j = 0; j < length/2; j++)
5364 vect1 = dr_chain[j];
5365 vect2 = dr_chain[j+length/2];
5367 /* Create interleaving stmt:
5368 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
5369 ...}> */
5370 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
5371 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
5372 vect2, perm_mask_high);
5373 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5374 (*result_chain)[2*j] = high;
5376 /* Create interleaving stmt:
5377 low = VEC_PERM_EXPR <vect1, vect2,
5378 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
5379 ...}> */
5380 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
5381 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
5382 vect2, perm_mask_low);
5383 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5384 (*result_chain)[2*j+1] = low;
5386 memcpy (dr_chain.address (), result_chain->address (),
5387 length * sizeof (tree));
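/* Editorial sketch (hedged, illustration only; not part of the
   vectorizer): the power-of-2 branch above encodes just the first three
   index pairs of each permute mask and lets vec_perm_indices extend the
   stepped pattern.  Fully expanded, the masks it relies on are
     perm_mask_high = { 0, nelt, 1, nelt + 1, ... }
     perm_mask_low  = { nelt/2, nelt*3/2, nelt/2 + 1, nelt*3/2 + 1, ... }
   The guarded helper below spells out that expansion for a constant
   NELT; indices 0..nelt-1 select from the first VEC_PERM_EXPR operand,
   nelt..2*nelt-1 from the second.  */
#if 0
static void
example_store_interleave_masks (unsigned int nelt,
				unsigned int *mask_high,
				unsigned int *mask_low)
{
  for (unsigned int i = 0; i < nelt / 2; i++)
    {
      /* Interleave the first halves of both inputs.  */
      mask_high[2 * i] = i;
      mask_high[2 * i + 1] = nelt + i;
      /* Interleave the second halves of both inputs.  */
      mask_low[2 * i] = nelt / 2 + i;
      mask_low[2 * i + 1] = nelt + nelt / 2 + i;
    }
}
#endif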
5392 /* Function vect_setup_realignment
5394 This function is called when vectorizing an unaligned load using
5395 the dr_explicit_realign[_optimized] scheme.
5396 This function generates the following code at the loop prolog:
5398 p = initial_addr;
5399 x msq_init = *(floor(p)); # prolog load
5400 realignment_token = call target_builtin;
5401 loop:
5402 x msq = phi (msq_init, ---)
5404 The stmts marked with x are generated only for the case of
5405 dr_explicit_realign_optimized.
5407 The code above sets up a new (vector) pointer, pointing to the first
5408 location accessed by STMT_INFO, and a "floor-aligned" load using that
5409 pointer. It also generates code to compute the "realignment-token"
5410 (if the relevant target hook was defined), and creates a phi-node at the
5411 loop-header bb whose arguments are the result of the prolog-load (created
5412 by this function) and the result of a load that takes place in the loop
5413 (to be created by the caller to this function).
5415 For the case of dr_explicit_realign_optimized:
5416 The caller to this function uses the phi-result (msq) to create the
5417 realignment code inside the loop, and sets up the missing phi argument,
5418 as follows:
5419 loop:
5420 msq = phi (msq_init, lsq)
5421 lsq = *(floor(p')); # load in loop
5422 result = realign_load (msq, lsq, realignment_token);
5424 For the case of dr_explicit_realign:
5425 loop:
5426 msq = *(floor(p)); # load in loop
5427 p' = p + (VS-1);
5428 lsq = *(floor(p')); # load in loop
5429 result = realign_load (msq, lsq, realignment_token);
5431 Input:
5432 STMT_INFO - (scalar) load stmt to be vectorized. This load accesses
5433 a memory location that may be unaligned.
5434 BSI - place where new code is to be inserted.
5435 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5436 is used.
5438 Output:
5439 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5440 target hook, if defined.
5441 Return value - the result of the loop-header phi node. */
5443 tree
5444 vect_setup_realignment (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
5445 tree *realignment_token,
5446 enum dr_alignment_support alignment_support_scheme,
5447 tree init_addr,
5448 struct loop **at_loop)
5450 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5451 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5452 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
5453 struct data_reference *dr = dr_info->dr;
5454 struct loop *loop = NULL;
5455 edge pe = NULL;
5456 tree scalar_dest = gimple_assign_lhs (stmt_info->stmt);
5457 tree vec_dest;
5458 gimple *inc;
5459 tree ptr;
5460 tree data_ref;
5461 basic_block new_bb;
5462 tree msq_init = NULL_TREE;
5463 tree new_temp;
5464 gphi *phi_stmt;
5465 tree msq = NULL_TREE;
5466 gimple_seq stmts = NULL;
5467 bool inv_p;
5468 bool compute_in_loop = false;
5469 bool nested_in_vect_loop = false;
5470 struct loop *containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
5471 struct loop *loop_for_initial_load = NULL;
5473 if (loop_vinfo)
5475 loop = LOOP_VINFO_LOOP (loop_vinfo);
5476 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
5479 gcc_assert (alignment_support_scheme == dr_explicit_realign
5480 || alignment_support_scheme == dr_explicit_realign_optimized);
5482 /* We need to generate three things:
5483 1. the misalignment computation
5484 2. the extra vector load (for the optimized realignment scheme).
5485 3. the phi node for the two vectors from which the realignment is
5486 done (for the optimized realignment scheme). */
5488 /* 1. Determine where to generate the misalignment computation.
5490 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5491 calculation will be generated by this function, outside the loop (in the
5492 preheader). Otherwise, INIT_ADDR had already been computed for us by the
5493 caller, inside the loop.
5495 Background: If the misalignment remains fixed throughout the iterations of
5496 the loop, then both realignment schemes are applicable, and also the
5497 misalignment computation can be done outside LOOP. This is because we are
5498 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5499 are a multiple of VS (the Vector Size), and therefore the misalignment in
5500 different vectorized LOOP iterations is always the same.
5501 The problem arises only if the memory access is in an inner-loop nested
5502 inside LOOP, which is now being vectorized using outer-loop vectorization.
5503 This is the only case when the misalignment of the memory access may not
5504 remain fixed throughout the iterations of the inner-loop (as explained in
5505 detail in vect_supportable_dr_alignment). In this case, not only is the
5506 optimized realignment scheme not applicable, but also the misalignment
5507 computation (and generation of the realignment token that is passed to
5508 REALIGN_LOAD) has to be done inside the loop.
5510 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5511 or not, which in turn determines if the misalignment is computed inside
5512 the inner-loop, or outside LOOP. */
5514 if (init_addr != NULL_TREE || !loop_vinfo)
5516 compute_in_loop = true;
5517 gcc_assert (alignment_support_scheme == dr_explicit_realign);
5521 /* 2. Determine where to generate the extra vector load.
5523 For the optimized realignment scheme, instead of generating two vector
5524 loads in each iteration, we generate a single extra vector load in the
5525 preheader of the loop, and in each iteration reuse the result of the
5526 vector load from the previous iteration. In case the memory access is in
5527 an inner-loop nested inside LOOP, which is now being vectorized using
5528 outer-loop vectorization, we need to determine whether this initial vector
5529 load should be generated at the preheader of the inner-loop, or can be
5530 generated at the preheader of LOOP. If the memory access has no evolution
5531 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5532 to be generated inside LOOP (in the preheader of the inner-loop). */
5534 if (nested_in_vect_loop)
5536 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5537 bool invariant_in_outerloop =
5538 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5539 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5541 else
5542 loop_for_initial_load = loop;
5543 if (at_loop)
5544 *at_loop = loop_for_initial_load;
5546 if (loop_for_initial_load)
5547 pe = loop_preheader_edge (loop_for_initial_load);
5549 /* 3. For the case of the optimized realignment, create the first vector
5550 load at the loop preheader. */
5552 if (alignment_support_scheme == dr_explicit_realign_optimized)
5554 /* Create msq_init = *(floor(p1)) in the loop preheader */
5555 gassign *new_stmt;
5557 gcc_assert (!compute_in_loop);
5558 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5559 ptr = vect_create_data_ref_ptr (stmt_info, vectype,
5560 loop_for_initial_load, NULL_TREE,
5561 &init_addr, NULL, &inc, true, &inv_p);
5562 if (TREE_CODE (ptr) == SSA_NAME)
5563 new_temp = copy_ssa_name (ptr);
5564 else
5565 new_temp = make_ssa_name (TREE_TYPE (ptr));
5566 unsigned int align = DR_TARGET_ALIGNMENT (dr_info);
5567 new_stmt = gimple_build_assign
5568 (new_temp, BIT_AND_EXPR, ptr,
5569 build_int_cst (TREE_TYPE (ptr), -(HOST_WIDE_INT) align));
5570 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5571 gcc_assert (!new_bb);
5572 data_ref
5573 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5574 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5575 vect_copy_ref_info (data_ref, DR_REF (dr));
5576 new_stmt = gimple_build_assign (vec_dest, data_ref);
5577 new_temp = make_ssa_name (vec_dest, new_stmt);
5578 gimple_assign_set_lhs (new_stmt, new_temp);
5579 if (pe)
5581 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5582 gcc_assert (!new_bb);
5584 else
5585 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5587 msq_init = gimple_assign_lhs (new_stmt);
5590 /* 4. Create realignment token using a target builtin, if available.
5591 It is done either inside the containing loop, or before LOOP (as
5592 determined above). */
5594 if (targetm.vectorize.builtin_mask_for_load)
5596 gcall *new_stmt;
5597 tree builtin_decl;
5599 /* Compute INIT_ADDR - the initial address accessed by this memref. */
5600 if (!init_addr)
5602 /* Generate the INIT_ADDR computation outside LOOP. */
5603 init_addr = vect_create_addr_base_for_vector_ref (stmt_info, &stmts,
5604 NULL_TREE);
5605 if (loop)
5607 pe = loop_preheader_edge (loop);
5608 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5609 gcc_assert (!new_bb);
5611 else
5612 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5615 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5616 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5617 vec_dest =
5618 vect_create_destination_var (scalar_dest,
5619 gimple_call_return_type (new_stmt));
5620 new_temp = make_ssa_name (vec_dest, new_stmt);
5621 gimple_call_set_lhs (new_stmt, new_temp);
5623 if (compute_in_loop)
5624 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5625 else
5627 /* Generate the misalignment computation outside LOOP. */
5628 pe = loop_preheader_edge (loop);
5629 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5630 gcc_assert (!new_bb);
5633 *realignment_token = gimple_call_lhs (new_stmt);
5635 /* The result of the CALL_EXPR to this builtin is determined from
5636 the value of the parameter and no global variables are touched
5637 which makes the builtin a "const" function. Requiring the
5638 builtin to have the "const" attribute makes it unnecessary
5639 to call mark_call_clobbered. */
5640 gcc_assert (TREE_READONLY (builtin_decl));
5643 if (alignment_support_scheme == dr_explicit_realign)
5644 return msq;
5646 gcc_assert (!compute_in_loop);
5647 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5650 /* 5. Create msq = phi <msq_init, lsq> in loop */
5652 pe = loop_preheader_edge (containing_loop);
5653 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5654 msq = make_ssa_name (vec_dest);
5655 phi_stmt = create_phi_node (msq, containing_loop->header);
5656 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5658 return msq;
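/* Editorial sketch (hedged, illustration only): "floor (p)" in the
   comments above is realized by the BIT_AND_EXPR built earlier in this
   function, which masks the pointer down to the target alignment.  The
   guarded helper restates that arithmetic on a plain integer address;
   ALIGN must be a power of two.  */
#if 0
static inline unsigned HOST_WIDE_INT
example_floor_address (unsigned HOST_WIDE_INT addr,
		       unsigned HOST_WIDE_INT align)
{
  /* Equivalent to the generated ptr & -(HOST_WIDE_INT) align.  */
  return addr & -align;
}
#endif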
5662 /* Function vect_grouped_load_supported.
5664 COUNT is the size of the load group (the number of statements plus the
5665 number of gaps). SINGLE_ELEMENT_P is true if there is actually
5666 only one statement, with a gap of COUNT - 1.
5668 Returns true if a suitable permute exists. */
5670 bool
5671 vect_grouped_load_supported (tree vectype, bool single_element_p,
5672 unsigned HOST_WIDE_INT count)
5674 machine_mode mode = TYPE_MODE (vectype);
5676 /* If this is single-element interleaving with an element distance
5677 that leaves unused vector loads around, punt - we at least create
5678 very sub-optimal code in that case (and blow up memory,
5679 see PR65518). */
5680 if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype)))
5682 if (dump_enabled_p ())
5683 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5684 "single-element interleaving not supported "
5685 "for not adjacent vector loads\n");
5686 return false;
5689 /* vect_permute_load_chain requires the group size to be equal to 3 or
5690 be a power of two. */
5691 if (count != 3 && exact_log2 (count) == -1)
5693 if (dump_enabled_p ())
5694 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5695 "the size of the group of accesses"
5696 " is not a power of 2 or not equal to 3\n");
5697 return false;
5700 /* Check that the permutation is supported. */
5701 if (VECTOR_MODE_P (mode))
5703 unsigned int i, j;
5704 if (count == 3)
5706 unsigned int nelt;
5707 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5709 if (dump_enabled_p ())
5710 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5711 "cannot handle groups of 3 loads for"
5712 " variable-length vectors\n");
5713 return false;
5716 vec_perm_builder sel (nelt, nelt, 1);
5717 sel.quick_grow (nelt);
5718 vec_perm_indices indices;
5719 unsigned int k;
5720 for (k = 0; k < 3; k++)
5722 for (i = 0; i < nelt; i++)
5723 if (3 * i + k < 2 * nelt)
5724 sel[i] = 3 * i + k;
5725 else
5726 sel[i] = 0;
5727 indices.new_vector (sel, 2, nelt);
5728 if (!can_vec_perm_const_p (mode, indices))
5730 if (dump_enabled_p ())
5731 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5732 "shuffle of 3 loads is not supported by"
5733 " target\n");
5734 return false;
5736 for (i = 0, j = 0; i < nelt; i++)
5737 if (3 * i + k < 2 * nelt)
5738 sel[i] = i;
5739 else
5740 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5741 indices.new_vector (sel, 2, nelt);
5742 if (!can_vec_perm_const_p (mode, indices))
5744 if (dump_enabled_p ())
5745 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5746 "shuffle of 3 loads is not supported by"
5747 " target\n");
5748 return false;
5751 return true;
5753 else
5755 /* If length is not equal to 3 then only a power of 2 is supported. */
5756 gcc_assert (pow2p_hwi (count));
5757 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5759 /* The encoding has a single stepped pattern. */
5760 vec_perm_builder sel (nelt, 1, 3);
5761 sel.quick_grow (3);
5762 for (i = 0; i < 3; i++)
5763 sel[i] = i * 2;
5764 vec_perm_indices indices (sel, 2, nelt);
5765 if (can_vec_perm_const_p (mode, indices))
5767 for (i = 0; i < 3; i++)
5768 sel[i] = i * 2 + 1;
5769 indices.new_vector (sel, 2, nelt);
5770 if (can_vec_perm_const_p (mode, indices))
5771 return true;
5776 if (dump_enabled_p ())
5777 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5778 "extract even/odd not supported by target\n");
5779 return false;
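/* Editorial sketch (hedged, illustration only): for the power-of-2 case
   above, the two permutations whose support is checked are the full
   even/odd extraction masks
     even: { 0, 2, 4, ..., 2 * nelt - 2 }
     odd:  { 1, 3, 5, ..., 2 * nelt - 1 }
   applied to the 2*nelt-element concatenation of two input vectors.
   The guarded helper spells them out for a constant NELT.  */
#if 0
static void
example_extract_even_odd_masks (unsigned int nelt,
				unsigned int *mask_even,
				unsigned int *mask_odd)
{
  for (unsigned int i = 0; i < nelt; i++)
    {
      mask_even[i] = 2 * i;
      mask_odd[i] = 2 * i + 1;
    }
}
#endif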
5782 /* Return TRUE if vec_{masked_}load_lanes is available for COUNT vectors of
5783 type VECTYPE. MASKED_P says whether the masked form is needed. */
5785 bool
5786 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5787 bool masked_p)
5789 if (masked_p)
5790 return vect_lanes_optab_supported_p ("vec_mask_load_lanes",
5791 vec_mask_load_lanes_optab,
5792 vectype, count);
5793 else
5794 return vect_lanes_optab_supported_p ("vec_load_lanes",
5795 vec_load_lanes_optab,
5796 vectype, count);
5799 /* Function vect_permute_load_chain.
5801 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5802 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5803 the input data correctly. Return the final references for loads in
5804 RESULT_CHAIN.
5806 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5807 The input is 4 vectors each containing 8 elements. We assign a number to each
5808 element; the input sequence is:
5810 1st vec: 0 1 2 3 4 5 6 7
5811 2nd vec: 8 9 10 11 12 13 14 15
5812 3rd vec: 16 17 18 19 20 21 22 23
5813 4th vec: 24 25 26 27 28 29 30 31
5815 The output sequence should be:
5817 1st vec: 0 4 8 12 16 20 24 28
5818 2nd vec: 1 5 9 13 17 21 25 29
5819 3rd vec: 2 6 10 14 18 22 26 30
5820 4th vec: 3 7 11 15 19 23 27 31
5822 i.e., the first output vector should contain the first elements of each
5823 interleaving group, etc.
5825 We use extract_even/odd instructions to create such output. The input of
5826 each extract_even/odd operation is two vectors
5827 1st vec 2nd vec
5828 0 1 2 3 4 5 6 7
5830 and the output is the vector of extracted even/odd elements. The output of
5831 extract_even will be: 0 2 4 6
5832 and of extract_odd: 1 3 5 7
5835 The permutation is done in log LENGTH stages. In each stage extract_even
5836 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5837 their order. In our example,
5839 E1: extract_even (1st vec, 2nd vec)
5840 E2: extract_odd (1st vec, 2nd vec)
5841 E3: extract_even (3rd vec, 4th vec)
5842 E4: extract_odd (3rd vec, 4th vec)
5844 The output for the first stage will be:
5846 E1: 0 2 4 6 8 10 12 14
5847 E2: 1 3 5 7 9 11 13 15
5848 E3: 16 18 20 22 24 26 28 30
5849 E4: 17 19 21 23 25 27 29 31
5851 In order to proceed and create the correct sequence for the next stage (or
5852 for the correct output, if the second stage is the last one, as in our
5853 example), we first put the output of extract_even operation and then the
5854 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5855 The input for the second stage is:
5857 1st vec (E1): 0 2 4 6 8 10 12 14
5858 2nd vec (E3): 16 18 20 22 24 26 28 30
5859 3rd vec (E2): 1 3 5 7 9 11 13 15
5860 4th vec (E4): 17 19 21 23 25 27 29 31
5862 The output of the second stage:
5864 E1: 0 4 8 12 16 20 24 28
5865 E2: 2 6 10 14 18 22 26 30
5866 E3: 1 5 9 13 17 21 25 29
5867 E4: 3 7 11 15 19 23 27 31
5869 And RESULT_CHAIN after reordering:
5871 1st vec (E1): 0 4 8 12 16 20 24 28
5872 2nd vec (E3): 1 5 9 13 17 21 25 29
5873 3rd vec (E2): 2 6 10 14 18 22 26 30
5874 4th vec (E4): 3 7 11 15 19 23 27 31. */
5876 static void
5877 vect_permute_load_chain (vec<tree> dr_chain,
5878 unsigned int length,
5879 stmt_vec_info stmt_info,
5880 gimple_stmt_iterator *gsi,
5881 vec<tree> *result_chain)
5883 tree data_ref, first_vect, second_vect;
5884 tree perm_mask_even, perm_mask_odd;
5885 tree perm3_mask_low, perm3_mask_high;
5886 gimple *perm_stmt;
5887 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5888 unsigned int i, j, log_length = exact_log2 (length);
5890 result_chain->quick_grow (length);
5891 memcpy (result_chain->address (), dr_chain.address (),
5892 length * sizeof (tree));
5894 if (length == 3)
5896 /* vect_grouped_load_supported ensures that this is constant. */
5897 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5898 unsigned int k;
5900 vec_perm_builder sel (nelt, nelt, 1);
5901 sel.quick_grow (nelt);
5902 vec_perm_indices indices;
5903 for (k = 0; k < 3; k++)
5905 for (i = 0; i < nelt; i++)
5906 if (3 * i + k < 2 * nelt)
5907 sel[i] = 3 * i + k;
5908 else
5909 sel[i] = 0;
5910 indices.new_vector (sel, 2, nelt);
5911 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5913 for (i = 0, j = 0; i < nelt; i++)
5914 if (3 * i + k < 2 * nelt)
5915 sel[i] = i;
5916 else
5917 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5918 indices.new_vector (sel, 2, nelt);
5919 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5921 first_vect = dr_chain[0];
5922 second_vect = dr_chain[1];
5924 /* Create interleaving stmt (low part of):
5925 low = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5926 ...}> */
5927 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5928 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5929 second_vect, perm3_mask_low);
5930 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5932 /* Create interleaving stmt (high part of):
5933 high = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5934 ...}> */
5935 first_vect = data_ref;
5936 second_vect = dr_chain[2];
5937 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5938 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5939 second_vect, perm3_mask_high);
5940 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5941 (*result_chain)[k] = data_ref;
5944 else
5946 /* If length is not equal to 3 then only a power of 2 is supported. */
5947 gcc_assert (pow2p_hwi (length));
5949 /* The encoding has a single stepped pattern. */
5950 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5951 vec_perm_builder sel (nelt, 1, 3);
5952 sel.quick_grow (3);
5953 for (i = 0; i < 3; ++i)
5954 sel[i] = i * 2;
5955 vec_perm_indices indices (sel, 2, nelt);
5956 perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);
5958 for (i = 0; i < 3; ++i)
5959 sel[i] = i * 2 + 1;
5960 indices.new_vector (sel, 2, nelt);
5961 perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
5963 for (i = 0; i < log_length; i++)
5965 for (j = 0; j < length; j += 2)
5967 first_vect = dr_chain[j];
5968 second_vect = dr_chain[j+1];
5970 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5971 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5972 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5973 first_vect, second_vect,
5974 perm_mask_even);
5975 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5976 (*result_chain)[j/2] = data_ref;
5978 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5979 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5980 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5981 first_vect, second_vect,
5982 perm_mask_odd);
5983 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5984 (*result_chain)[j/2+length/2] = data_ref;
5986 memcpy (dr_chain.address (), result_chain->address (),
5987 length * sizeof (tree));
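/* Editorial sketch (hedged, not vectorizer code): the power-of-2 branch
   above can be restated on plain arrays.  Using the sizes from the
   function comment (LENGTH 4, vectors of 8 elements), each stage
   extracts the even and the odd elements of the concatenation of every
   vector pair; after log2 (LENGTH) stages the chain is de-interleaved
   exactly as described above.  Guarded out, illustration only.  */
#if 0
static void
example_permute_load_chain (int chain[4][8])
{
  const unsigned int length = 4, nelt = 8;
  int result[4][8];
  for (unsigned int stage = 0; stage < 2 /* log2 (length) */; stage++)
    {
      for (unsigned int j = 0; j < length; j += 2)
	for (unsigned int i = 0; i < nelt / 2; i++)
	  {
	    /* Even elements of the concatenation go to slot j/2 ...  */
	    result[j / 2][i] = chain[j][2 * i];
	    result[j / 2][nelt / 2 + i] = chain[j + 1][2 * i];
	    /* ... and odd elements to slot j/2 + length/2.  */
	    result[j / 2 + length / 2][i] = chain[j][2 * i + 1];
	    result[j / 2 + length / 2][nelt / 2 + i] = chain[j + 1][2 * i + 1];
	  }
      memcpy (chain, result, sizeof (result));
    }
}
#endif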
5992 /* Function vect_shift_permute_load_chain.
5994 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
5995 a sequence of stmts to reorder the input data accordingly.
5996 Return the final references for loads in RESULT_CHAIN.
5997 Return true if successful, false otherwise.
5999 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
6000 The input is 3 vectors each containing 8 elements. We assign a
6001 number to each element; the input sequence is:
6003 1st vec: 0 1 2 3 4 5 6 7
6004 2nd vec: 8 9 10 11 12 13 14 15
6005 3rd vec: 16 17 18 19 20 21 22 23
6007 The output sequence should be:
6009 1st vec: 0 3 6 9 12 15 18 21
6010 2nd vec: 1 4 7 10 13 16 19 22
6011 3rd vec: 2 5 8 11 14 17 20 23
6013 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
6015 First we shuffle all 3 vectors to get correct elements order:
6017 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
6018 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
6019 3rd vec: (16 19 22) (17 20 23) (18 21)
6021 Next we unite and shift vector 3 times:
6023 1st step:
6024 shift right by 6 the concatenation of:
6025 "1st vec" and "2nd vec"
6026 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
6027 "2nd vec" and "3rd vec"
6028 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
6029 "3rd vec" and "1st vec"
6030 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
6031 | New vectors |
6033 So that now new vectors are:
6035 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
6036 2nd vec: (10 13) (16 19 22) (17 20 23)
6037 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
6039 2nd step:
6040 shift right by 5 the concatenation of:
6041 "1st vec" and "3rd vec"
6042 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
6043 "2nd vec" and "1st vec"
6044 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
6045 "3rd vec" and "2nd vec"
6046 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
6047 | New vectors |
6049 So that now new vectors are:
6051 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
6052 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
6053 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
6055 3rd step:
6056 shift right by 5 the concatenation of:
6057 "1st vec" and "1st vec"
6058 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
6059 shift right by 3 the concatenation of:
6060 "2nd vec" and "2nd vec"
6061 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
6062 | New vectors |
6064 So that now all vectors are READY:
6065 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
6066 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
6067 3rd vec: ( 1 4 7) (10 13) (16 19 22)
6069 This algorithm is faster than one in vect_permute_load_chain if:
6070 1. "shift of a concatination" is faster than general permutation.
6071 This is usually so.
6072 2. The TARGET machine can't execute vector instructions in parallel.
6073 This is because each step of the algorithm depends on the previous one.
6074 The algorithm in vect_permute_load_chain is much more parallel.
6076 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
6079 static bool
6080 vect_shift_permute_load_chain (vec<tree> dr_chain,
6081 unsigned int length,
6082 stmt_vec_info stmt_info,
6083 gimple_stmt_iterator *gsi,
6084 vec<tree> *result_chain)
6086 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
6087 tree perm2_mask1, perm2_mask2, perm3_mask;
6088 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
6089 gimple *perm_stmt;
6091 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6092 unsigned int i;
6093 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6095 unsigned HOST_WIDE_INT nelt, vf;
6096 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt)
6097 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
6098 /* Not supported for variable-length vectors. */
6099 return false;
6101 vec_perm_builder sel (nelt, nelt, 1);
6102 sel.quick_grow (nelt);
6104 result_chain->quick_grow (length);
6105 memcpy (result_chain->address (), dr_chain.address (),
6106 length * sizeof (tree));
6108 if (pow2p_hwi (length) && vf > 4)
6110 unsigned int j, log_length = exact_log2 (length);
6111 for (i = 0; i < nelt / 2; ++i)
6112 sel[i] = i * 2;
6113 for (i = 0; i < nelt / 2; ++i)
6114 sel[nelt / 2 + i] = i * 2 + 1;
6115 vec_perm_indices indices (sel, 2, nelt);
6116 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6118 if (dump_enabled_p ())
6119 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6120 "shuffle of 2 fields structure is not \
6121 supported by target\n");
6122 return false;
6124 perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);
6126 for (i = 0; i < nelt / 2; ++i)
6127 sel[i] = i * 2 + 1;
6128 for (i = 0; i < nelt / 2; ++i)
6129 sel[nelt / 2 + i] = i * 2;
6130 indices.new_vector (sel, 2, nelt);
6131 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6133 if (dump_enabled_p ())
6134 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6135 "shuffle of 2 fields structure is not \
6136 supported by target\n");
6137 return false;
6139 perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);
6141 /* Generating permutation constant to shift all elements.
6142 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
6143 for (i = 0; i < nelt; i++)
6144 sel[i] = nelt / 2 + i;
6145 indices.new_vector (sel, 2, nelt);
6146 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6148 if (dump_enabled_p ())
6149 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6150 "shift permutation is not supported by target\n");
6151 return false;
6153 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6155 /* Generating permutation constant to select vector from 2.
6156 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
6157 for (i = 0; i < nelt / 2; i++)
6158 sel[i] = i;
6159 for (i = nelt / 2; i < nelt; i++)
6160 sel[i] = nelt + i;
6161 indices.new_vector (sel, 2, nelt);
6162 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6164 if (dump_enabled_p ())
6165 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6166 "select is not supported by target\n");
6167 return false;
6169 select_mask = vect_gen_perm_mask_checked (vectype, indices);
6171 for (i = 0; i < log_length; i++)
6173 for (j = 0; j < length; j += 2)
6175 first_vect = dr_chain[j];
6176 second_vect = dr_chain[j + 1];
6178 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6179 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6180 first_vect, first_vect,
6181 perm2_mask1);
6182 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6183 vect[0] = data_ref;
6185 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6186 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6187 second_vect, second_vect,
6188 perm2_mask2);
6189 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6190 vect[1] = data_ref;
6192 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
6193 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6194 vect[0], vect[1], shift1_mask);
6195 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6196 (*result_chain)[j/2 + length/2] = data_ref;
6198 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
6199 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6200 vect[0], vect[1], select_mask);
6201 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6202 (*result_chain)[j/2] = data_ref;
6204 memcpy (dr_chain.address (), result_chain->address (),
6205 length * sizeof (tree));
6207 return true;
6209 if (length == 3 && vf > 2)
6211 unsigned int k = 0, l = 0;
6213 /* Generating permutation constant to get all elements in right order.
6214 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
6215 for (i = 0; i < nelt; i++)
6217 if (3 * k + (l % 3) >= nelt)
6219 k = 0;
6220 l += (3 - (nelt % 3));
6222 sel[i] = 3 * k + (l % 3);
6223 k++;
6225 vec_perm_indices indices (sel, 2, nelt);
6226 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6228 if (dump_enabled_p ())
6229 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6230 "shuffle of 3 fields structure is not \
6231 supported by target\n");
6232 return false;
6234 perm3_mask = vect_gen_perm_mask_checked (vectype, indices);
6236 /* Generating permutation constant to shift all elements.
6237 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
6238 for (i = 0; i < nelt; i++)
6239 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
6240 indices.new_vector (sel, 2, nelt);
6241 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6243 if (dump_enabled_p ())
6244 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6245 "shift permutation is not supported by target\n");
6246 return false;
6248 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6250 /* Generating permutation constant to shift all elements.
6251 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6252 for (i = 0; i < nelt; i++)
6253 sel[i] = 2 * (nelt / 3) + 1 + i;
6254 indices.new_vector (sel, 2, nelt);
6255 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6257 if (dump_enabled_p ())
6258 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6259 "shift permutation is not supported by target\n");
6260 return false;
6262 shift2_mask = vect_gen_perm_mask_checked (vectype, indices);
6264 /* Generating permutation constant to shift all elements.
6265 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
6266 for (i = 0; i < nelt; i++)
6267 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
6268 indices.new_vector (sel, 2, nelt);
6269 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6271 if (dump_enabled_p ())
6272 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6273 "shift permutation is not supported by target\n");
6274 return false;
6276 shift3_mask = vect_gen_perm_mask_checked (vectype, indices);
6278 /* Generating permutation constant to shift all elements.
6279 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6280 for (i = 0; i < nelt; i++)
6281 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
6282 indices.new_vector (sel, 2, nelt);
6283 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6285 if (dump_enabled_p ())
6286 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6287 "shift permutation is not supported by target\n");
6288 return false;
6290 shift4_mask = vect_gen_perm_mask_checked (vectype, indices);
6292 for (k = 0; k < 3; k++)
6294 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
6295 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6296 dr_chain[k], dr_chain[k],
6297 perm3_mask);
6298 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6299 vect[k] = data_ref;
6302 for (k = 0; k < 3; k++)
6304 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
6305 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6306 vect[k % 3], vect[(k + 1) % 3],
6307 shift1_mask);
6308 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6309 vect_shift[k] = data_ref;
6312 for (k = 0; k < 3; k++)
6314 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
6315 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6316 vect_shift[(4 - k) % 3],
6317 vect_shift[(3 - k) % 3],
6318 shift2_mask);
6319 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6320 vect[k] = data_ref;
6323 (*result_chain)[3 - (nelt % 3)] = vect[2];
6325 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
6326 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
6327 vect[0], shift3_mask);
6328 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6329 (*result_chain)[nelt % 3] = data_ref;
6331 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
6332 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
6333 vect[1], shift4_mask);
6334 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6335 (*result_chain)[0] = data_ref;
6336 return true;
6338 return false;
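/* Editorial sketch (hedged, illustration only): every "shift right by S
   of a concatenation" step described above is a single VEC_PERM_EXPR
   whose mask is the consecutive index run { S, S + 1, ..., S + nelt - 1 }
   into the 2*nelt-element concatenation of the two operands.  The
   guarded helper builds such a mask; e.g. nelt = 8, shift = 6 gives
   { 6, 7, ..., 13 }, matching the first shift in the function comment.  */
#if 0
static void
example_shift_mask (unsigned int nelt, unsigned int shift,
		    unsigned int *mask)
{
  for (unsigned int i = 0; i < nelt; i++)
    mask[i] = shift + i;
}
#endif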
6341 /* Function vect_transform_grouped_load.
6343 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
6344 to perform their permutation and ascribe the result vectorized statements to
6345 the scalar statements.
6348 void
6349 vect_transform_grouped_load (stmt_vec_info stmt_info, vec<tree> dr_chain,
6350 int size, gimple_stmt_iterator *gsi)
6352 machine_mode mode;
6353 vec<tree> result_chain = vNULL;
6355 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
6356 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
6357 vectors, that are ready for vector computation. */
6358 result_chain.create (size);
6360 /* If the reassociation width for the vector type is 2 or greater, the target
6361 machine can execute 2 or more vector instructions in parallel. Otherwise try
6362 to get the chain for the load group using vect_shift_permute_load_chain. */
6363 mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info));
6364 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
6365 || pow2p_hwi (size)
6366 || !vect_shift_permute_load_chain (dr_chain, size, stmt_info,
6367 gsi, &result_chain))
6368 vect_permute_load_chain (dr_chain, size, stmt_info, gsi, &result_chain);
6369 vect_record_grouped_load_vectors (stmt_info, result_chain);
6370 result_chain.release ();
6373 /* RESULT_CHAIN contains the output of a group of grouped loads that were
6374 generated as part of the vectorization of STMT_INFO. Assign the statement
6375 for each vector to the associated scalar statement. */
6377 void
6378 vect_record_grouped_load_vectors (stmt_vec_info stmt_info,
6379 vec<tree> result_chain)
6381 vec_info *vinfo = stmt_info->vinfo;
6382 stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
6383 unsigned int i, gap_count;
6384 tree tmp_data_ref;
6386 /* Put a permuted data-ref in the VECTORIZED_STMT field.
6387 Since we scan the chain starting from its first node, their order
6388 corresponds to the order of data-refs in RESULT_CHAIN. */
6389 stmt_vec_info next_stmt_info = first_stmt_info;
6390 gap_count = 1;
6391 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
6393 if (!next_stmt_info)
6394 break;
6396 /* Skip the gaps. Loads created for the gaps will be removed by dead
6397 code elimination pass later. No need to check for the first stmt in
6398 the group, since it always exists.
6399 DR_GROUP_GAP is the number of steps in elements from the previous
6400 access (if there is no gap DR_GROUP_GAP is 1). We skip loads that
6401 correspond to the gaps. */
6402 if (next_stmt_info != first_stmt_info
6403 && gap_count < DR_GROUP_GAP (next_stmt_info))
6405 gap_count++;
6406 continue;
6409 while (next_stmt_info)
6411 stmt_vec_info new_stmt_info = vinfo->lookup_def (tmp_data_ref);
6412 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
6413 copies, and we put the new vector statement in the first available
6414 RELATED_STMT. */
6415 if (!STMT_VINFO_VEC_STMT (next_stmt_info))
6416 STMT_VINFO_VEC_STMT (next_stmt_info) = new_stmt_info;
6417 else
6419 if (!DR_GROUP_SAME_DR_STMT (next_stmt_info))
6421 stmt_vec_info prev_stmt_info
6422 = STMT_VINFO_VEC_STMT (next_stmt_info);
6423 stmt_vec_info rel_stmt_info
6424 = STMT_VINFO_RELATED_STMT (prev_stmt_info);
6425 while (rel_stmt_info)
6427 prev_stmt_info = rel_stmt_info;
6428 rel_stmt_info = STMT_VINFO_RELATED_STMT (rel_stmt_info);
6431 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info;
6435 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
6436 gap_count = 1;
6437 /* If NEXT_STMT_INFO accesses the same DR as the previous statement,
6438 put the same TMP_DATA_REF as its vectorized statement; otherwise
6439 get the next data-ref from RESULT_CHAIN. */
6440 if (!next_stmt_info || !DR_GROUP_SAME_DR_STMT (next_stmt_info))
6441 break;
6446 /* Function vect_can_force_dr_alignment_p.
6448 Returns whether the alignment of a DECL can be forced to be aligned
6449 on ALIGNMENT bit boundary. */
6451 bool
6452 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
6454 if (!VAR_P (decl))
6455 return false;
6457 if (decl_in_symtab_p (decl)
6458 && !symtab_node::get (decl)->can_increase_alignment_p ())
6459 return false;
6461 if (TREE_STATIC (decl))
6462 return (alignment <= MAX_OFILE_ALIGNMENT);
6463 else
6464 return (alignment <= MAX_STACK_ALIGNMENT);
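/* Editorial usage sketch (hedged, not taken from this file): a caller
   that wants to raise the alignment of a vectorized array DECL would
   typically guard the change with the predicate above.  ALIGN_IN_BITS
   is an illustrative name; SET_DECL_ALIGN is the existing tree
   accessor.  */
#if 0
static void
example_force_decl_alignment (tree decl, unsigned int align_in_bits)
{
  if (vect_can_force_dr_alignment_p (decl, align_in_bits))
    SET_DECL_ALIGN (decl, align_in_bits);
}
#endif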
6468 /* Return whether the data reference DR_INFO is supported with respect to its
6469 alignment.
6470 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
6471 it is aligned, i.e., check if it is possible to vectorize it with different
6472 alignment. */
6474 enum dr_alignment_support
6475 vect_supportable_dr_alignment (dr_vec_info *dr_info,
6476 bool check_aligned_accesses)
6478 data_reference *dr = dr_info->dr;
6479 stmt_vec_info stmt_info = dr_info->stmt;
6480 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6481 machine_mode mode = TYPE_MODE (vectype);
6482 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6483 struct loop *vect_loop = NULL;
6484 bool nested_in_vect_loop = false;
6486 if (aligned_access_p (dr_info) && !check_aligned_accesses)
6487 return dr_aligned;
6489 /* For now assume all conditional loads/stores support unaligned
6490 access without any special code. */
6491 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
6492 if (gimple_call_internal_p (stmt)
6493 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
6494 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
6495 return dr_unaligned_supported;
6497 if (loop_vinfo)
6499 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
6500 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt_info);
6503 /* Possibly unaligned access. */
6505 /* We can choose between using the implicit realignment scheme (generating
6506 a misaligned_move stmt) and the explicit realignment scheme (generating
6507 aligned loads with a REALIGN_LOAD). There are two variants to the
6508 explicit realignment scheme: optimized, and unoptimized.
6509 We can optimize the realignment only if the step between consecutive
6510 vector loads is equal to the vector size. Since the vector memory
6511 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
6512 is guaranteed that the misalignment amount remains the same throughout the
6513 execution of the vectorized loop. Therefore, we can create the
6514 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
6515 at the loop preheader.
6517 However, in the case of outer-loop vectorization, when vectorizing a
6518 memory access in the inner-loop nested within the LOOP that is now being
6519 vectorized, while it is guaranteed that the misalignment of the
6520 vectorized memory access will remain the same in different outer-loop
6521 iterations, it is *not* guaranteed that it will remain the same throughout
6522 the execution of the inner-loop. This is because the inner-loop advances
6523 with the original scalar step (and not in steps of VS). If the inner-loop
6524 step happens to be a multiple of VS, then the misalignment remains fixed
6525 and we can use the optimized realignment scheme. For example:
6527 for (i=0; i<N; i++)
6528 for (j=0; j<M; j++)
6529 s += a[i+j];
6531 When vectorizing the i-loop in the above example, the step between
6532 consecutive vector loads is 1, and so the misalignment does not remain
6533 fixed across the execution of the inner-loop, and the realignment cannot
6534 be optimized (as illustrated in the following pseudo vectorized loop):
6536 for (i=0; i<N; i+=4)
6537 for (j=0; j<M; j++){
6538 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
6539 // when j is {0,1,2,3,4,5,6,7,...} respectively.
6540 // (assuming that we start from an aligned address).
6543 We therefore have to use the unoptimized realignment scheme:
6545 for (i=0; i<N; i+=4)
6546 for (j=k; j<M; j+=4)
6547 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
6548 // that the misalignment of the initial address is
6549 // 0).
6551 The loop can then be vectorized as follows:
6553 for (k=0; k<4; k++){
6554 rt = get_realignment_token (&vp[k]);
6555 for (i=0; i<N; i+=4){
6556 v1 = vp[i+k];
6557 for (j=k; j<M; j+=4){
6558 v2 = vp[i+j+VS-1];
6559 va = REALIGN_LOAD <v1,v2,rt>;
6560 vs += va;
6561 v1 = v2;
6564 } */
6566 if (DR_IS_READ (dr))
6568 bool is_packed = false;
6569 tree type = (TREE_TYPE (DR_REF (dr)));
6571 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
6572 && (!targetm.vectorize.builtin_mask_for_load
6573 || targetm.vectorize.builtin_mask_for_load ()))
6575 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6577 /* If we are doing SLP then the accesses need not have the
6578 same alignment, instead it depends on the SLP group size. */
6579 if (loop_vinfo
6580 && STMT_SLP_TYPE (stmt_info)
6581 && !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6582 * (DR_GROUP_SIZE
6583 (DR_GROUP_FIRST_ELEMENT (stmt_info))),
6584 TYPE_VECTOR_SUBPARTS (vectype)))
6586 else if (!loop_vinfo
6587 || (nested_in_vect_loop
6588 && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
6589 GET_MODE_SIZE (TYPE_MODE (vectype)))))
6590 return dr_explicit_realign;
6591 else
6592 return dr_explicit_realign_optimized;
6594 if (!known_alignment_for_access_p (dr_info))
6595 is_packed = not_size_aligned (DR_REF (dr));
6597 if (targetm.vectorize.support_vector_misalignment
6598 (mode, type, DR_MISALIGNMENT (dr_info), is_packed))
6599 /* Can't software pipeline the loads, but can at least do them. */
6600 return dr_unaligned_supported;
6602 else
6604 bool is_packed = false;
6605 tree type = (TREE_TYPE (DR_REF (dr)));
6607 if (!known_alignment_for_access_p (dr_info))
6608 is_packed = not_size_aligned (DR_REF (dr));
6610 if (targetm.vectorize.support_vector_misalignment
6611 (mode, type, DR_MISALIGNMENT (dr_info), is_packed))
6612 return dr_unaligned_supported;
6615 /* Unsupported. */
6616 return dr_unaligned_unsupported;
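/* Editorial sketch (hedged): callers generally dispatch on the value
   returned above to choose an access strategy.  The guarded helper is
   illustrative only and is not how the vectorizer itself is written.  */
#if 0
static bool
example_pick_alignment_scheme (dr_vec_info *dr_info)
{
  switch (vect_supportable_dr_alignment (dr_info, false))
    {
    case dr_aligned:
    case dr_unaligned_supported:
      /* An ordinary (possibly misaligned) vector access will do.  */
      return true;
    case dr_explicit_realign:
    case dr_explicit_realign_optimized:
      /* Use aligned loads plus REALIGN_LOAD; see vect_setup_realignment.  */
      return true;
    default:
      /* dr_unaligned_unsupported: the access cannot be vectorized.  */
      return false;
    }
}
#endif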