gcc/tree-vect-data-refs.c
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2018 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "memmodel.h"
32 #include "tm_p.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "cgraph.h"
36 #include "dumpfile.h"
37 #include "alias.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "tree-eh.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop.h"
47 #include "cfgloop.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
50 #include "expr.h"
51 #include "builtins.h"
52 #include "params.h"
53 #include "tree-cfg.h"
54 #include "tree-hash-traits.h"
55 #include "vec-perm-indices.h"
56 #include "internal-fn.h"
58 /* Return true if load- or store-lanes optab OPTAB is implemented for
59 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
61 static bool
62 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
63 tree vectype, unsigned HOST_WIDE_INT count)
65 machine_mode mode, array_mode;
66 bool limit_p;
68 mode = TYPE_MODE (vectype);
69 if (!targetm.array_mode (mode, count).exists (&array_mode))
71 poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
72 limit_p = !targetm.array_mode_supported_p (mode, count);
73 if (!int_mode_for_size (bits, limit_p).exists (&array_mode))
75 if (dump_enabled_p ())
76 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
77 "no array mode for %s["
78 HOST_WIDE_INT_PRINT_DEC "]\n",
79 GET_MODE_NAME (mode), count);
80 return false;
84 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
86 if (dump_enabled_p ())
87 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
88 "cannot use %s<%s><%s>\n", name,
89 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
90 return false;
93 if (dump_enabled_p ())
94 dump_printf_loc (MSG_NOTE, vect_location,
95 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
96 GET_MODE_NAME (mode));
98 return true;
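/* Illustrative sketch (for exposition only; the loop and the names are
   assumptions, not part of this file): a two-lane grouped access like

     for (int i = 0; i < n; i++)
       {
         x[i] = a[2 * i];        (lane 0)
         y[i] = a[2 * i + 1];    (lane 1)
       }

   would be a candidate for a load-lanes instruction; here COUNT == 2
   and the optab is queried for an array mode that holds two vectors
   of the access type.  */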
102 /* Return the smallest scalar part of STMT_INFO.
103 This is used to determine the vectype of the stmt. We generally set the
104 vectype according to the type of the result (lhs). For stmts whose
105 result-type is different than the type of the arguments (e.g., demotion,
106 promotion), vectype will be reset appropriately (later). Note that we have
107 to visit the smallest datatype in this function, because that determines the
108 VF. If the smallest datatype in the loop is present only as the rhs of a
109 promotion operation, we would miss it.
110 Such a case, where a variable of this datatype does not appear in the lhs
111 anywhere in the loop, can only occur if it's an invariant: e.g.:
112 'int_x = (int) short_inv', which we'd expect to have been optimized away by
113 invariant motion. However, we cannot rely on invariant motion to always
114 take invariants out of the loop, and so in the case of promotion we also
115 have to check the rhs.
116 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
117 types. */
119 tree
120 vect_get_smallest_scalar_type (stmt_vec_info stmt_info,
121 HOST_WIDE_INT *lhs_size_unit,
122 HOST_WIDE_INT *rhs_size_unit)
124 tree scalar_type = gimple_expr_type (stmt_info->stmt);
125 HOST_WIDE_INT lhs, rhs;
127 /* During the analysis phase, this function is called on arbitrary
128 statements that might not have scalar results. */
129 if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
130 return scalar_type;
132 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
134 gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
135 if (assign
136 && (gimple_assign_cast_p (assign)
137 || gimple_assign_rhs_code (assign) == DOT_PROD_EXPR
138 || gimple_assign_rhs_code (assign) == WIDEN_SUM_EXPR
139 || gimple_assign_rhs_code (assign) == WIDEN_MULT_EXPR
140 || gimple_assign_rhs_code (assign) == WIDEN_LSHIFT_EXPR
141 || gimple_assign_rhs_code (assign) == FLOAT_EXPR))
143 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (assign));
145 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
146 if (rhs < lhs)
147 scalar_type = rhs_type;
150 *lhs_size_unit = lhs;
151 *rhs_size_unit = rhs;
152 return scalar_type;
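/* Illustrative example (assumed for exposition, not part of this file):
   in a loop like

     short *s; int *d;
     for (i = 0; i < n; i++)
       d[i] = (int) s[i] * 2;

   the lhs type is int but the smallest scalar type is short, and it is
   the short elements that must determine the vectorization factor; this
   is why the rhs of promotions is inspected above.  */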
156 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
157 tested at run-time. Return TRUE if DDR was successfully inserted.
158 Return false if versioning is not supported. */
160 static bool
161 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
163 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
165 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
166 return false;
168 if (!runtime_alias_check_p (ddr, loop,
169 optimize_loop_nest_for_speed_p (loop)))
170 return false;
172 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
173 return true;
176 /* Record that loop LOOP_VINFO needs to check that VALUE is nonzero. */
178 static void
179 vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value)
181 vec<tree> checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo);
182 for (unsigned int i = 0; i < checks.length(); ++i)
183 if (checks[i] == value)
184 return;
186 if (dump_enabled_p ())
188 dump_printf_loc (MSG_NOTE, vect_location, "need run-time check that ");
189 dump_generic_expr (MSG_NOTE, TDF_SLIM, value);
190 dump_printf (MSG_NOTE, " is nonzero\n");
192 LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value);
195 /* Return true if we know that the order of vectorized DR_INFO_A and
196 vectorized DR_INFO_B will be the same as the order of DR_INFO_A and
197 DR_INFO_B. At least one of the accesses is a write. */
199 static bool
200 vect_preserves_scalar_order_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b)
202 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
203 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
205 /* Single statements are always kept in their original order. */
206 if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
207 && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
208 return true;
210 /* STMT_A and STMT_B belong to overlapping groups. All loads in a
211 group are emitted at the position of the first scalar load and all
212 stores in a group are emitted at the position of the last scalar store.
213 Thus writes will happen no earlier than their current position
214 (but could happen later) while reads will happen no later than their
215 current position (but could happen earlier). Reordering is therefore
216 only possible if the first access is a write. */
217 stmtinfo_a = vect_orig_stmt (stmtinfo_a);
218 stmtinfo_b = vect_orig_stmt (stmtinfo_b);
219 stmt_vec_info earlier_stmt_info = get_earlier_stmt (stmtinfo_a, stmtinfo_b);
220 return !DR_IS_WRITE (STMT_VINFO_DATA_REF (earlier_stmt_info));
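/* Illustrative example (assumed): for a pair of grouped accesses

     .. = a[i];   (load group, emitted at the position of the first load)
     a[i] = ..;   (store group, emitted at the position of the last store)

   loads can only move earlier and stores can only move later, so the
   scalar read-before-write order is preserved.  If the earlier access
   of the pair were the write, vectorization could reorder them, which
   is what the check above rejects.  */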
223 /* A subroutine of vect_analyze_data_ref_dependence. Handle
224 DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
225 distances. These distances are conservatively correct but they don't
226 reflect a guaranteed dependence.
228 Return true if this function does all the work necessary to avoid
229 an alias or false if the caller should use the dependence distances
230 to limit the vectorization factor in the usual way. LOOP_DEPTH is
231 the depth of the loop described by LOOP_VINFO and the other arguments
232 are as for vect_analyze_data_ref_dependence. */
234 static bool
235 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
236 loop_vec_info loop_vinfo,
237 int loop_depth, unsigned int *max_vf)
239 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
240 lambda_vector dist_v;
241 unsigned int i;
242 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
244 int dist = dist_v[loop_depth];
245 if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
247 /* If the user asserted safelen >= DIST consecutive iterations
248 can be executed concurrently, assume independence.
250 ??? An alternative would be to add the alias check even
251 in this case, and vectorize the fallback loop with the
252 maximum VF set to safelen. However, if the user has
253 explicitly given a length, it's less likely that that
254 would be a win. */
255 if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
257 if ((unsigned int) loop->safelen < *max_vf)
258 *max_vf = loop->safelen;
259 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
260 continue;
263 /* For dependence distances of 2 or more, we have the option
264 of limiting VF or checking for an alias at runtime.
265 Prefer to check at runtime if we can, to avoid limiting
266 the VF unnecessarily when the bases are in fact independent.
268 Note that the alias checks will be removed if the VF ends up
269 being small enough. */
270 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
271 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
272 return (!STMT_VINFO_GATHER_SCATTER_P (dr_info_a->stmt)
273 && !STMT_VINFO_GATHER_SCATTER_P (dr_info_b->stmt)
274 && vect_mark_for_runtime_alias_test (ddr, loop_vinfo));
277 return true;
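/* Usage note (illustrative; the directive and loop are assumptions):
   the safelen guarantee used by these dependence checks typically comes
   from a user annotation such as

     #pragma omp simd safelen(8)
     for (i = 0; i < n; i++)
       a[i] = a[i + k] + 1;

   which promises that up to 8 consecutive iterations may execute
   concurrently; rather than giving up on a dependence it cannot prove
   harmless, the vectorizer then merely caps *max_vf at 8.  */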
281 /* Function vect_analyze_data_ref_dependence.
283 Return TRUE if there (might) exist a dependence between a memory-reference
284 DRA and a memory-reference DRB. When versioning for alias can check the
285 dependence at run time, return FALSE instead. Adjust *MAX_VF according to
286 the data dependence. */
288 static bool
289 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
290 loop_vec_info loop_vinfo,
291 unsigned int *max_vf)
293 unsigned int i;
294 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
295 struct data_reference *dra = DDR_A (ddr);
296 struct data_reference *drb = DDR_B (ddr);
297 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (dra);
298 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (drb);
299 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
300 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
301 lambda_vector dist_v;
302 unsigned int loop_depth;
304 /* In loop analysis all data references should be vectorizable. */
305 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
306 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
307 gcc_unreachable ();
309 /* Independent data accesses. */
310 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
311 return false;
313 if (dra == drb
314 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
315 return false;
317 /* We do not have to consider dependences between accesses that belong
318 to the same group, unless the stride could be smaller than the
319 group size. */
320 if (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
321 && (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
322 == DR_GROUP_FIRST_ELEMENT (stmtinfo_b))
323 && !STMT_VINFO_STRIDED_P (stmtinfo_a))
324 return false;
326 /* Even if we have an anti-dependence then, as the vectorized loop covers at
327 least two scalar iterations, there is always also a true dependence.
328 As the vectorizer does not re-order loads and stores we can ignore
329 the anti-dependence if TBAA can disambiguate both DRs similar to the
330 case with known negative distance anti-dependences (positive
331 distance anti-dependences would violate TBAA constraints). */
332 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
333 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
334 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
335 get_alias_set (DR_REF (drb))))
336 return false;
338 /* Unknown data dependence. */
339 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
341 /* If user asserted safelen consecutive iterations can be
342 executed concurrently, assume independence. */
343 if (loop->safelen >= 2)
345 if ((unsigned int) loop->safelen < *max_vf)
346 *max_vf = loop->safelen;
347 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
348 return false;
351 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
352 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
354 if (dump_enabled_p ())
356 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
357 "versioning for alias not supported for: "
358 "can't determine dependence between ");
359 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
360 DR_REF (dra));
361 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
362 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
363 DR_REF (drb));
364 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
366 return true;
369 if (dump_enabled_p ())
371 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
372 "versioning for alias required: "
373 "can't determine dependence between ");
374 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
375 DR_REF (dra));
376 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
377 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
378 DR_REF (drb));
379 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
382 /* Add to list of ddrs that need to be tested at run-time. */
383 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
386 /* Known data dependence. */
387 if (DDR_NUM_DIST_VECTS (ddr) == 0)
389 /* If user asserted safelen consecutive iterations can be
390 executed concurrently, assume independence. */
391 if (loop->safelen >= 2)
393 if ((unsigned int) loop->safelen < *max_vf)
394 *max_vf = loop->safelen;
395 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
396 return false;
399 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
400 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
402 if (dump_enabled_p ())
404 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
405 "versioning for alias not supported for: "
406 "bad dist vector for ");
407 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
408 DR_REF (dra));
409 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
410 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
411 DR_REF (drb));
412 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
414 return true;
417 if (dump_enabled_p ())
419 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
420 "versioning for alias required: "
421 "bad dist vector for ");
422 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
423 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
424 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
425 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
427 /* Add to list of ddrs that need to be tested at run-time. */
428 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
431 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
433 if (DDR_COULD_BE_INDEPENDENT_P (ddr)
434 && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
435 loop_depth, max_vf))
436 return false;
438 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
440 int dist = dist_v[loop_depth];
442 if (dump_enabled_p ())
443 dump_printf_loc (MSG_NOTE, vect_location,
444 "dependence distance = %d.\n", dist);
446 if (dist == 0)
448 if (dump_enabled_p ())
450 dump_printf_loc (MSG_NOTE, vect_location,
451 "dependence distance == 0 between ");
452 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
453 dump_printf (MSG_NOTE, " and ");
454 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
455 dump_printf (MSG_NOTE, "\n");
458 /* When we perform grouped accesses and perform implicit CSE
459 by detecting equal accesses and doing disambiguation with
460 runtime alias tests like for
461 .. = a[i];
462 .. = a[i+1];
463 a[i] = ..;
464 a[i+1] = ..;
465 *p = ..;
466 .. = a[i];
467 .. = a[i+1];
468 where we will end up loading { a[i], a[i+1] } once, make
469 sure that inserting group loads before the first load and
470 stores after the last store will do the right thing.
471 Similar for groups like
472 a[i] = ...;
473 ... = a[i];
474 a[i+1] = ...;
475 where loads from the group interleave with the store. */
476 if (!vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
478 if (dump_enabled_p ())
479 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
480 "READ_WRITE dependence in interleaving.\n");
481 return true;
484 if (loop->safelen < 2)
486 tree indicator = dr_zero_step_indicator (dra);
487 if (!indicator || integer_zerop (indicator))
489 if (dump_enabled_p ())
490 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
491 "access also has a zero step\n");
492 return true;
494 else if (TREE_CODE (indicator) != INTEGER_CST)
495 vect_check_nonzero_value (loop_vinfo, indicator);
497 continue;
500 if (dist > 0 && DDR_REVERSED_P (ddr))
502 /* If DDR_REVERSED_P the order of the data-refs in DDR was
503 reversed (to make distance vector positive), and the actual
504 distance is negative. */
505 if (dump_enabled_p ())
506 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
507 "dependence distance negative.\n");
508 /* Record a negative dependence distance to later limit the
509 amount of stmt copying / unrolling we can perform.
510 Only need to handle read-after-write dependence. */
511 if (DR_IS_READ (drb)
512 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
513 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
514 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
515 continue;
518 unsigned int abs_dist = abs (dist);
519 if (abs_dist >= 2 && abs_dist < *max_vf)
521 /* The dependence distance requires reduction of the maximal
522 vectorization factor. */
523 *max_vf = abs (dist);
524 if (dump_enabled_p ())
525 dump_printf_loc (MSG_NOTE, vect_location,
526 "adjusting maximal vectorization factor to %i\n",
527 *max_vf);
530 if (abs_dist >= *max_vf)
532 /* Dependence distance does not create dependence, as far as
533 vectorization is concerned, in this case. */
534 if (dump_enabled_p ())
535 dump_printf_loc (MSG_NOTE, vect_location,
536 "dependence distance >= VF.\n");
537 continue;
540 if (dump_enabled_p ())
542 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
543 "not vectorized, possible dependence "
544 "between data-refs ");
545 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
546 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
547 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
548 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
551 return true;
554 return false;
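/* Worked example (illustrative): in

     for (i = 0; i < n; i++)
       a[i + 3] = a[i] + 1;

   the dependence distance is 3, so the loop above reduces *max_vf to 3;
   with at most three elements processed per vector iteration every a[i]
   is still read before the corresponding a[i + 3] is written.  Distances
   greater than or equal to the VF are ignored, and unprovable dependences
   fall back to the run-time alias checks recorded earlier.  */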
557 /* Function vect_analyze_data_ref_dependences.
559 Examine all the data references in the loop, and make sure there do not
560 exist any data dependences between them. Set *MAX_VF according to
561 the maximum vectorization factor the data dependences allow. */
563 bool
564 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
565 unsigned int *max_vf)
567 unsigned int i;
568 struct data_dependence_relation *ddr;
570 DUMP_VECT_SCOPE ("vect_analyze_data_ref_dependences");
572 if (!LOOP_VINFO_DDRS (loop_vinfo).exists ())
574 LOOP_VINFO_DDRS (loop_vinfo)
575 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
576 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
577 /* We need read-read dependences to compute
578 STMT_VINFO_SAME_ALIGN_REFS. */
579 bool res = compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
580 &LOOP_VINFO_DDRS (loop_vinfo),
581 LOOP_VINFO_LOOP_NEST (loop_vinfo),
582 true);
583 gcc_assert (res);
586 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
588 /* For epilogues we either have no aliases or alias versioning
589 was applied to the original loop. Therefore we may just get max_vf
590 using the VF of the original loop. */
591 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
592 *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
593 else
594 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
595 if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
596 return false;
598 return true;
602 /* Function vect_slp_analyze_data_ref_dependence.
604 Return TRUE if there (might) exist a dependence between a memory-reference
605 DRA and a memory-reference DRB for VINFO. When versioning for alias
606 can check the dependence at run time, return FALSE. */
609 static bool
610 vect_slp_analyze_data_ref_dependence (vec_info *vinfo,
611 struct data_dependence_relation *ddr)
613 struct data_reference *dra = DDR_A (ddr);
614 struct data_reference *drb = DDR_B (ddr);
615 dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
616 dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
618 /* We need to check dependences of statements marked as unvectorizable
619 as well, they still can prohibit vectorization. */
621 /* Independent data accesses. */
622 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
623 return false;
625 if (dra == drb)
626 return false;
628 /* Read-read is OK. */
629 if (DR_IS_READ (dra) && DR_IS_READ (drb))
630 return false;
632 /* If dra and drb are part of the same interleaving chain consider
633 them independent. */
634 if (STMT_VINFO_GROUPED_ACCESS (dr_info_a->stmt)
635 && (DR_GROUP_FIRST_ELEMENT (dr_info_a->stmt)
636 == DR_GROUP_FIRST_ELEMENT (dr_info_b->stmt)))
637 return false;
639 /* Unknown data dependence. */
640 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
642 if (dump_enabled_p ())
644 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
645 "can't determine dependence between ");
646 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
647 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
648 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
649 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
652 else if (dump_enabled_p ())
654 dump_printf_loc (MSG_NOTE, vect_location,
655 "determined dependence between ");
656 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
657 dump_printf (MSG_NOTE, " and ");
658 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
659 dump_printf (MSG_NOTE, "\n");
662 return true;
666 /* Analyze dependences involved in the transform of SLP NODE. STORES
667 contain the vector of scalar stores of this instance if we are
668 disambiguating the loads. */
670 static bool
671 vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
672 vec<stmt_vec_info> stores,
673 stmt_vec_info last_store_info)
675 /* This walks over all stmts involved in the SLP load/store done
676 in NODE verifying we can sink them up to the last stmt in the
677 group. */
678 stmt_vec_info last_access_info = vect_find_last_scalar_stmt_in_slp (node);
679 vec_info *vinfo = last_access_info->vinfo;
680 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
682 stmt_vec_info access_info = SLP_TREE_SCALAR_STMTS (node)[k];
683 if (access_info == last_access_info)
684 continue;
685 data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
686 ao_ref ref;
687 bool ref_initialized_p = false;
688 for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
689 gsi_stmt (gsi) != last_access_info->stmt; gsi_next (&gsi))
691 gimple *stmt = gsi_stmt (gsi);
692 if (! gimple_vuse (stmt)
693 || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
694 continue;
696 /* If we couldn't record a (single) data reference for this
697 stmt we have to resort to the alias oracle. */
698 stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
699 data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
700 if (!dr_b)
702 /* We are moving a store or sinking a load - this means
703 we cannot use TBAA for disambiguation. */
704 if (!ref_initialized_p)
705 ao_ref_init (&ref, DR_REF (dr_a));
706 if (stmt_may_clobber_ref_p_1 (stmt, &ref, false)
707 || ref_maybe_used_by_stmt_p (stmt, &ref, false))
708 return false;
709 continue;
712 bool dependent = false;
713 /* If we run into a store of this same instance (we've just
714 marked those) then delay dependence checking until we run
715 into the last store because this is where it will have
716 been sunk to (and we verify if we can do that as well). */
717 if (gimple_visited_p (stmt))
719 if (stmt_info != last_store_info)
720 continue;
721 unsigned i;
722 stmt_vec_info store_info;
723 FOR_EACH_VEC_ELT (stores, i, store_info)
725 data_reference *store_dr = STMT_VINFO_DATA_REF (store_info);
726 ddr_p ddr = initialize_data_dependence_relation
727 (dr_a, store_dr, vNULL);
728 dependent
729 = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
730 free_dependence_relation (ddr);
731 if (dependent)
732 break;
735 else
737 ddr_p ddr = initialize_data_dependence_relation (dr_a,
738 dr_b, vNULL);
739 dependent = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
740 free_dependence_relation (ddr);
742 if (dependent)
743 return false;
746 return true;
750 /* Function vect_slp_analyze_instance_dependence.
752 Examine all the data references in the SLP instance, and make sure there
753 do not exist any data dependences between them that would prevent the
754 instance from being vectorized. */
756 bool
757 vect_slp_analyze_instance_dependence (slp_instance instance)
759 DUMP_VECT_SCOPE ("vect_slp_analyze_instance_dependence");
761 /* The stores of this instance are at the root of the SLP tree. */
762 slp_tree store = SLP_INSTANCE_TREE (instance);
763 if (! STMT_VINFO_DATA_REF (SLP_TREE_SCALAR_STMTS (store)[0]))
764 store = NULL;
766 /* Verify we can sink stores to the vectorized stmt insert location. */
767 stmt_vec_info last_store_info = NULL;
768 if (store)
770 if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
771 return false;
773 /* Mark stores in this instance and remember the last one. */
774 last_store_info = vect_find_last_scalar_stmt_in_slp (store);
775 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
776 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, true);
779 bool res = true;
781 /* Verify we can sink loads to the vectorized stmt insert location,
782 special-casing stores of this instance. */
783 slp_tree load;
784 unsigned int i;
785 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
786 if (! vect_slp_analyze_node_dependences (instance, load,
787 store
788 ? SLP_TREE_SCALAR_STMTS (store)
789 : vNULL, last_store_info))
791 res = false;
792 break;
795 /* Unset the visited flag. */
796 if (store)
797 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
798 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, false);
800 return res;
803 /* Record the base alignment guarantee given by DRB, which occurs
804 in STMT_INFO. */
806 static void
807 vect_record_base_alignment (stmt_vec_info stmt_info,
808 innermost_loop_behavior *drb)
810 vec_info *vinfo = stmt_info->vinfo;
811 bool existed;
812 innermost_loop_behavior *&entry
813 = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
814 if (!existed || entry->base_alignment < drb->base_alignment)
816 entry = drb;
817 if (dump_enabled_p ())
819 dump_printf_loc (MSG_NOTE, vect_location,
820 "recording new base alignment for ");
821 dump_generic_expr (MSG_NOTE, TDF_SLIM, drb->base_address);
822 dump_printf (MSG_NOTE, "\n");
823 dump_printf_loc (MSG_NOTE, vect_location,
824 " alignment: %d\n", drb->base_alignment);
825 dump_printf_loc (MSG_NOTE, vect_location,
826 " misalignment: %d\n", drb->base_misalignment);
827 dump_printf_loc (MSG_NOTE, vect_location,
828 " based on: ");
829 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt_info->stmt, 0);
834 /* If the region we're going to vectorize is reached, all unconditional
835 data references occur at least once. We can therefore pool the base
836 alignment guarantees from each unconditional reference. Do this by
837 going through all the data references in VINFO and checking whether
838 the containing statement makes the reference unconditionally. If so,
839 record the alignment of the base address in VINFO so that it can be
840 used for all other references with the same base. */
842 void
843 vect_record_base_alignments (vec_info *vinfo)
845 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
846 struct loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
847 data_reference *dr;
848 unsigned int i;
849 FOR_EACH_VEC_ELT (vinfo->shared->datarefs, i, dr)
851 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
852 stmt_vec_info stmt_info = dr_info->stmt;
853 if (!DR_IS_CONDITIONAL_IN_STMT (dr)
854 && STMT_VINFO_VECTORIZABLE (stmt_info)
855 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
857 vect_record_base_alignment (stmt_info, &DR_INNERMOST (dr));
859 /* If DR is nested in the loop that is being vectorized, we can also
860 record the alignment of the base wrt the outer loop. */
861 if (loop && nested_in_vect_loop_p (loop, stmt_info))
862 vect_record_base_alignment
863 (stmt_info, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
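/* Illustrative example (assumed): if the region contains

     p[i] = 0;                   (unconditional store through base p)
     if (c[i]) t = p[i + 1];     (conditional load from the same base)

   the unconditional reference is executed whenever the region is
   reached, so its base-alignment guarantee is pooled and can also be
   used for the conditional reference that shares the base address.  */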
868 /* Return the target alignment for the vectorized form of DR_INFO. */
870 static unsigned int
871 vect_calculate_target_alignment (dr_vec_info *dr_info)
873 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
874 return targetm.vectorize.preferred_vector_alignment (vectype);
877 /* Function vect_compute_data_ref_alignment
879 Compute the misalignment of the data reference DR_INFO.
881 Output:
882 1. DR_MISALIGNMENT (DR_INFO) is defined.
884 FOR NOW: No analysis is actually performed. Misalignment is calculated
885 only for trivial cases. TODO. */
887 static void
888 vect_compute_data_ref_alignment (dr_vec_info *dr_info)
890 stmt_vec_info stmt_info = dr_info->stmt;
891 vec_base_alignments *base_alignments = &stmt_info->vinfo->base_alignments;
892 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
893 struct loop *loop = NULL;
894 tree ref = DR_REF (dr_info->dr);
895 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
897 if (dump_enabled_p ())
898 dump_printf_loc (MSG_NOTE, vect_location,
899 "vect_compute_data_ref_alignment:\n");
901 if (loop_vinfo)
902 loop = LOOP_VINFO_LOOP (loop_vinfo);
904 /* Initialize misalignment to unknown. */
905 SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
907 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
908 return;
910 innermost_loop_behavior *drb = vect_dr_behavior (dr_info);
911 bool step_preserves_misalignment_p;
913 unsigned HOST_WIDE_INT vector_alignment
914 = vect_calculate_target_alignment (dr_info) / BITS_PER_UNIT;
915 DR_TARGET_ALIGNMENT (dr_info) = vector_alignment;
917 /* No step for BB vectorization. */
918 if (!loop)
920 gcc_assert (integer_zerop (drb->step));
921 step_preserves_misalignment_p = true;
924 /* In case the dataref is in an inner-loop of the loop that is being
925 vectorized (LOOP), we use the base and misalignment information
926 relative to the outer-loop (LOOP). This is ok only if the misalignment
927 stays the same throughout the execution of the inner-loop, which is why
928 we have to check that the stride of the dataref in the inner-loop evenly
929 divides by the vector alignment. */
930 else if (nested_in_vect_loop_p (loop, stmt_info))
932 step_preserves_misalignment_p
933 = (DR_STEP_ALIGNMENT (dr_info->dr) % vector_alignment) == 0;
935 if (dump_enabled_p ())
937 if (step_preserves_misalignment_p)
938 dump_printf_loc (MSG_NOTE, vect_location,
939 "inner step divides the vector alignment.\n");
940 else
941 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
942 "inner step doesn't divide the vector"
943 " alignment.\n");
947 /* Similarly we can only use base and misalignment information relative to
948 an innermost loop if the misalignment stays the same throughout the
949 execution of the loop. As above, this is the case if the stride of
950 the dataref evenly divides by the alignment. */
951 else
953 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
954 step_preserves_misalignment_p
955 = multiple_p (DR_STEP_ALIGNMENT (dr_info->dr) * vf, vector_alignment);
957 if (!step_preserves_misalignment_p && dump_enabled_p ())
958 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
959 "step doesn't divide the vector alignment.\n");
962 unsigned int base_alignment = drb->base_alignment;
963 unsigned int base_misalignment = drb->base_misalignment;
965 /* Calculate the maximum of the pooled base address alignment and the
966 alignment that we can compute for DR itself. */
967 innermost_loop_behavior **entry = base_alignments->get (drb->base_address);
968 if (entry && base_alignment < (*entry)->base_alignment)
970 base_alignment = (*entry)->base_alignment;
971 base_misalignment = (*entry)->base_misalignment;
974 if (drb->offset_alignment < vector_alignment
975 || !step_preserves_misalignment_p
976 /* We need to know whether the step wrt the vectorized loop is
977 negative when computing the starting misalignment below. */
978 || TREE_CODE (drb->step) != INTEGER_CST)
980 if (dump_enabled_p ())
982 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
983 "Unknown alignment for access: ");
984 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
985 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
987 return;
990 if (base_alignment < vector_alignment)
992 unsigned int max_alignment;
993 tree base = get_base_for_alignment (drb->base_address, &max_alignment);
994 if (max_alignment < vector_alignment
995 || !vect_can_force_dr_alignment_p (base,
996 vector_alignment * BITS_PER_UNIT))
998 if (dump_enabled_p ())
1000 dump_printf_loc (MSG_NOTE, vect_location,
1001 "can't force alignment of ref: ");
1002 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
1003 dump_printf (MSG_NOTE, "\n");
1005 return;
1008 /* Force the alignment of the decl.
1009 NOTE: This is the only change to the code we make during
1010 the analysis phase, before deciding to vectorize the loop. */
1011 if (dump_enabled_p ())
1013 dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
1014 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
1015 dump_printf (MSG_NOTE, "\n");
1018 dr_info->base_decl = base;
1019 dr_info->base_misaligned = true;
1020 base_misalignment = 0;
1022 poly_int64 misalignment
1023 = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();
1025 /* If this is a backward running DR then first access in the larger
1026 vectype actually is N-1 elements before the address in the DR.
1027 Adjust misalign accordingly. */
1028 if (tree_int_cst_sgn (drb->step) < 0)
1029 /* PLUS because STEP is negative. */
1030 misalignment += ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1031 * TREE_INT_CST_LOW (drb->step));
1033 unsigned int const_misalignment;
1034 if (!known_misalignment (misalignment, vector_alignment,
1035 &const_misalignment))
1037 if (dump_enabled_p ())
1039 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1040 "Non-constant misalignment for access: ");
1041 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
1042 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1044 return;
1047 SET_DR_MISALIGNMENT (dr_info, const_misalignment);
1049 if (dump_enabled_p ())
1051 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1052 "misalign = %d bytes of ref ",
1053 DR_MISALIGNMENT (dr_info));
1054 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
1055 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1058 return;
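/* Worked example (illustrative, with assumed numbers): for a reference
   a[i] with 4-byte elements, a 16-byte preferred vector alignment, a
   16-byte-aligned base and a DR_INIT of 12 bytes, the computed
   misalignment is 12.  With a backward step of -4 and 4 elements per
   vector, the adjustment above adds (4 - 1) * -4 = -12, giving a
   misalignment of 0 for the first vector access.  */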
1061 /* Function vect_update_misalignment_for_peel.
1062 Sets DR_INFO's misalignment
1063 - to 0 if it has the same alignment as DR_PEEL_INFO,
1064 - to the misalignment computed using NPEEL if DR_INFO's misalignment is known,
1065 - to -1 (unknown) otherwise.
1067 DR_INFO - the data reference whose misalignment is to be adjusted.
1068 DR_PEEL_INFO - the data reference whose misalignment is being made
1069 zero in the vector loop by the peel.
1070 NPEEL - the number of iterations in the peel loop if the misalignment
1071 of DR_PEEL_INFO is known at compile time. */
1073 static void
1074 vect_update_misalignment_for_peel (dr_vec_info *dr_info,
1075 dr_vec_info *dr_peel_info, int npeel)
1077 unsigned int i;
1078 vec<dr_p> same_aligned_drs;
1079 struct data_reference *current_dr;
1080 int dr_size = vect_get_scalar_dr_size (dr_info);
1081 int dr_peel_size = vect_get_scalar_dr_size (dr_peel_info);
1082 stmt_vec_info stmt_info = dr_info->stmt;
1083 stmt_vec_info peel_stmt_info = dr_peel_info->stmt;
1085 /* For interleaved data accesses the step in the loop must be multiplied by
1086 the size of the interleaving group. */
1087 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1088 dr_size *= DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info));
1089 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
1090 dr_peel_size *= DR_GROUP_SIZE (peel_stmt_info);
1092 /* It can be assumed that the data refs with the same alignment as dr_peel
1093 are aligned in the vector loop. */
1094 same_aligned_drs = STMT_VINFO_SAME_ALIGN_REFS (peel_stmt_info);
1095 FOR_EACH_VEC_ELT (same_aligned_drs, i, current_dr)
1097 if (current_dr != dr_info->dr)
1098 continue;
1099 gcc_assert (!known_alignment_for_access_p (dr_info)
1100 || !known_alignment_for_access_p (dr_peel_info)
1101 || (DR_MISALIGNMENT (dr_info) / dr_size
1102 == DR_MISALIGNMENT (dr_peel_info) / dr_peel_size));
1103 SET_DR_MISALIGNMENT (dr_info, 0);
1104 return;
1107 if (known_alignment_for_access_p (dr_info)
1108 && known_alignment_for_access_p (dr_peel_info))
1110 bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
1111 size_zero_node) < 0;
1112 int misal = DR_MISALIGNMENT (dr_info);
1113 misal += negative ? -npeel * dr_size : npeel * dr_size;
1114 misal &= DR_TARGET_ALIGNMENT (dr_info) - 1;
1115 SET_DR_MISALIGNMENT (dr_info, misal);
1116 return;
1119 if (dump_enabled_p ())
1120 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1121 "to unknown (-1).\n");
1122 SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
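/* Worked example (illustrative): for a reference with a known
   misalignment of 12 bytes, 4-byte elements, a positive step and a
   16-byte target alignment, peeling NPEEL = 3 iterations updates the
   misalignment to (12 + 3 * 4) & (16 - 1) == 8.  */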
1126 /* Function verify_data_ref_alignment
1128 Return TRUE if DR_INFO can be handled with respect to alignment. */
1130 static bool
1131 verify_data_ref_alignment (dr_vec_info *dr_info)
1133 enum dr_alignment_support supportable_dr_alignment
1134 = vect_supportable_dr_alignment (dr_info, false);
1135 if (!supportable_dr_alignment)
1137 if (dump_enabled_p ())
1139 if (DR_IS_READ (dr_info->dr))
1140 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1141 "not vectorized: unsupported unaligned load.");
1142 else
1143 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1144 "not vectorized: unsupported unaligned "
1145 "store.");
1147 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
1148 DR_REF (dr_info->dr));
1149 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1151 return false;
1154 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
1155 dump_printf_loc (MSG_NOTE, vect_location,
1156 "Vectorizing an unaligned access.\n");
1158 return true;
1161 /* Function vect_verify_datarefs_alignment
1163 Return TRUE if all data references in the loop can be
1164 handled with respect to alignment. */
1166 bool
1167 vect_verify_datarefs_alignment (loop_vec_info vinfo)
1169 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
1170 struct data_reference *dr;
1171 unsigned int i;
1173 FOR_EACH_VEC_ELT (datarefs, i, dr)
1175 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
1176 stmt_vec_info stmt_info = dr_info->stmt;
1178 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1179 continue;
1181 /* For interleaving, only the alignment of the first access matters. */
1182 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1183 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1184 continue;
1186 /* Strided accesses perform only component accesses, alignment is
1187 irrelevant for them. */
1188 if (STMT_VINFO_STRIDED_P (stmt_info)
1189 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1190 continue;
1192 if (! verify_data_ref_alignment (dr_info))
1193 return false;
1196 return true;
1199 /* Given a memory reference EXP return whether its alignment is less
1200 than its size. */
1202 static bool
1203 not_size_aligned (tree exp)
1205 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1206 return true;
1208 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1209 > get_object_alignment (exp));
1212 /* Function vector_alignment_reachable_p
1214 Return true if vector alignment for DR_INFO is reachable by peeling
1215 a few loop iterations. Return false otherwise. */
1217 static bool
1218 vector_alignment_reachable_p (dr_vec_info *dr_info)
1220 stmt_vec_info stmt_info = dr_info->stmt;
1221 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1223 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1225 /* For interleaved access we peel only if number of iterations in
1226 the prolog loop ({VF - misalignment}), is a multiple of the
1227 number of the interleaved accesses. */
1228 int elem_size, mis_in_elements;
1230 /* FORNOW: handle only known alignment. */
1231 if (!known_alignment_for_access_p (dr_info))
1232 return false;
1234 poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1235 poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
1236 elem_size = vector_element_size (vector_size, nelements);
1237 mis_in_elements = DR_MISALIGNMENT (dr_info) / elem_size;
1239 if (!multiple_p (nelements - mis_in_elements, DR_GROUP_SIZE (stmt_info)))
1240 return false;
1243 /* If misalignment is known at the compile time then allow peeling
1244 only if natural alignment is reachable through peeling. */
1245 if (known_alignment_for_access_p (dr_info) && !aligned_access_p (dr_info))
1247 HOST_WIDE_INT elmsize =
1248 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1249 if (dump_enabled_p ())
1251 dump_printf_loc (MSG_NOTE, vect_location,
1252 "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1253 dump_printf (MSG_NOTE,
1254 ". misalignment = %d.\n", DR_MISALIGNMENT (dr_info));
1256 if (DR_MISALIGNMENT (dr_info) % elmsize)
1258 if (dump_enabled_p ())
1259 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1260 "data size does not divide the misalignment.\n");
1261 return false;
1265 if (!known_alignment_for_access_p (dr_info))
1267 tree type = TREE_TYPE (DR_REF (dr_info->dr));
1268 bool is_packed = not_size_aligned (DR_REF (dr_info->dr));
1269 if (dump_enabled_p ())
1270 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1271 "Unknown misalignment, %snaturally aligned\n",
1272 is_packed ? "not " : "");
1273 return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1276 return true;
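/* Illustrative example (assumed): a reference with 4-byte elements that
   is misaligned by 6 bytes can never reach vector alignment by peeling
   whole scalar iterations, since each peeled iteration changes the
   misalignment by a multiple of 4; this is why misalignments that the
   element size does not divide are rejected in the function above.  */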
1280 /* Calculate the cost of the memory access represented by DR_INFO. */
1282 static void
1283 vect_get_data_access_cost (dr_vec_info *dr_info,
1284 unsigned int *inside_cost,
1285 unsigned int *outside_cost,
1286 stmt_vector_for_cost *body_cost_vec,
1287 stmt_vector_for_cost *prologue_cost_vec)
1289 stmt_vec_info stmt_info = dr_info->stmt;
1290 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1291 int ncopies;
1293 if (PURE_SLP_STMT (stmt_info))
1294 ncopies = 1;
1295 else
1296 ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1298 if (DR_IS_READ (dr_info->dr))
1299 vect_get_load_cost (stmt_info, ncopies, true, inside_cost, outside_cost,
1300 prologue_cost_vec, body_cost_vec, false);
1301 else
1302 vect_get_store_cost (stmt_info, ncopies, inside_cost, body_cost_vec);
1304 if (dump_enabled_p ())
1305 dump_printf_loc (MSG_NOTE, vect_location,
1306 "vect_get_data_access_cost: inside_cost = %d, "
1307 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1311 typedef struct _vect_peel_info
1313 dr_vec_info *dr_info;
1314 int npeel;
1315 unsigned int count;
1316 } *vect_peel_info;
1318 typedef struct _vect_peel_extended_info
1320 struct _vect_peel_info peel_info;
1321 unsigned int inside_cost;
1322 unsigned int outside_cost;
1323 } *vect_peel_extended_info;
1326 /* Peeling hashtable helpers. */
1328 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1330 static inline hashval_t hash (const _vect_peel_info *);
1331 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1334 inline hashval_t
1335 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1337 return (hashval_t) peel_info->npeel;
1340 inline bool
1341 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1343 return (a->npeel == b->npeel);
1347 /* Insert DR_INFO into peeling hash table with NPEEL as key. */
1349 static void
1350 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1351 loop_vec_info loop_vinfo, dr_vec_info *dr_info,
1352 int npeel)
1354 struct _vect_peel_info elem, *slot;
1355 _vect_peel_info **new_slot;
1356 bool supportable_dr_alignment
1357 = vect_supportable_dr_alignment (dr_info, true);
1359 elem.npeel = npeel;
1360 slot = peeling_htab->find (&elem);
1361 if (slot)
1362 slot->count++;
1363 else
1365 slot = XNEW (struct _vect_peel_info);
1366 slot->npeel = npeel;
1367 slot->dr_info = dr_info;
1368 slot->count = 1;
1369 new_slot = peeling_htab->find_slot (slot, INSERT);
1370 *new_slot = slot;
1373 if (!supportable_dr_alignment
1374 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1375 slot->count += VECT_MAX_COST;
1379 /* Traverse peeling hash table to find peeling option that aligns maximum
1380 number of data accesses. */
1383 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1384 _vect_peel_extended_info *max)
1386 vect_peel_info elem = *slot;
1388 if (elem->count > max->peel_info.count
1389 || (elem->count == max->peel_info.count
1390 && max->peel_info.npeel > elem->npeel))
1392 max->peel_info.npeel = elem->npeel;
1393 max->peel_info.count = elem->count;
1394 max->peel_info.dr_info = elem->dr_info;
1397 return 1;
1400 /* Get the costs of peeling NPEEL iterations for LOOP_VINFO, checking
1401 data access costs for all data refs. If UNKNOWN_MISALIGNMENT is true,
1402 we assume DR0_INFO's misalignment will be zero after peeling. */
1404 static void
1405 vect_get_peeling_costs_all_drs (loop_vec_info loop_vinfo,
1406 dr_vec_info *dr0_info,
1407 unsigned int *inside_cost,
1408 unsigned int *outside_cost,
1409 stmt_vector_for_cost *body_cost_vec,
1410 stmt_vector_for_cost *prologue_cost_vec,
1411 unsigned int npeel,
1412 bool unknown_misalignment)
1414 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1415 unsigned i;
1416 data_reference *dr;
1418 FOR_EACH_VEC_ELT (datarefs, i, dr)
1420 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1421 stmt_vec_info stmt_info = dr_info->stmt;
1422 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1423 continue;
1425 /* For interleaving, only the alignment of the first access
1426 matters. */
1427 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1428 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1429 continue;
1431 /* Strided accesses perform only component accesses, alignment is
1432 irrelevant for them. */
1433 if (STMT_VINFO_STRIDED_P (stmt_info)
1434 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1435 continue;
1437 int save_misalignment;
1438 save_misalignment = DR_MISALIGNMENT (dr_info);
1439 if (npeel == 0)
1440 ;
1441 else if (unknown_misalignment && dr_info == dr0_info)
1442 SET_DR_MISALIGNMENT (dr_info, 0);
1443 else
1444 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
1445 vect_get_data_access_cost (dr_info, inside_cost, outside_cost,
1446 body_cost_vec, prologue_cost_vec);
1447 SET_DR_MISALIGNMENT (dr_info, save_misalignment);
1451 /* Traverse peeling hash table and calculate cost for each peeling option.
1452 Find the one with the lowest cost. */
1455 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1456 _vect_peel_extended_info *min)
1458 vect_peel_info elem = *slot;
1459 int dummy;
1460 unsigned int inside_cost = 0, outside_cost = 0;
1461 stmt_vec_info stmt_info = elem->dr_info->stmt;
1462 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1463 stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1464 epilogue_cost_vec;
1466 prologue_cost_vec.create (2);
1467 body_cost_vec.create (2);
1468 epilogue_cost_vec.create (2);
1470 vect_get_peeling_costs_all_drs (loop_vinfo, elem->dr_info, &inside_cost,
1471 &outside_cost, &body_cost_vec,
1472 &prologue_cost_vec, elem->npeel, false);
1474 body_cost_vec.release ();
1476 outside_cost += vect_get_known_peeling_cost
1477 (loop_vinfo, elem->npeel, &dummy,
1478 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1479 &prologue_cost_vec, &epilogue_cost_vec);
1481 /* Prologue and epilogue costs are added to the target model later.
1482 These costs depend only on the scalar iteration cost, the
1483 number of peeling iterations finally chosen, and the number of
1484 misaligned statements. So discard the information found here. */
1485 prologue_cost_vec.release ();
1486 epilogue_cost_vec.release ();
1488 if (inside_cost < min->inside_cost
1489 || (inside_cost == min->inside_cost
1490 && outside_cost < min->outside_cost))
1492 min->inside_cost = inside_cost;
1493 min->outside_cost = outside_cost;
1494 min->peel_info.dr_info = elem->dr_info;
1495 min->peel_info.npeel = elem->npeel;
1496 min->peel_info.count = elem->count;
1499 return 1;
1503 /* Choose best peeling option by traversing peeling hash table and either
1504 choosing an option with the lowest cost (if cost model is enabled) or the
1505 option that aligns as many accesses as possible. */
1507 static struct _vect_peel_extended_info
1508 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1509 loop_vec_info loop_vinfo)
1511 struct _vect_peel_extended_info res;
1513 res.peel_info.dr_info = NULL;
1515 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1517 res.inside_cost = INT_MAX;
1518 res.outside_cost = INT_MAX;
1519 peeling_htab->traverse <_vect_peel_extended_info *,
1520 vect_peeling_hash_get_lowest_cost> (&res);
1522 else
1524 res.peel_info.count = 0;
1525 peeling_htab->traverse <_vect_peel_extended_info *,
1526 vect_peeling_hash_get_most_frequent> (&res);
1527 res.inside_cost = 0;
1528 res.outside_cost = 0;
1531 return res;
1534 /* Return true if the new peeling NPEEL is supported. */
1536 static bool
1537 vect_peeling_supportable (loop_vec_info loop_vinfo, dr_vec_info *dr0_info,
1538 unsigned npeel)
1540 unsigned i;
1541 struct data_reference *dr = NULL;
1542 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1543 enum dr_alignment_support supportable_dr_alignment;
1545 /* Ensure that all data refs can be vectorized after the peel. */
1546 FOR_EACH_VEC_ELT (datarefs, i, dr)
1548 int save_misalignment;
1550 if (dr == dr0_info->dr)
1551 continue;
1553 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1554 stmt_vec_info stmt_info = dr_info->stmt;
1555 /* For interleaving, only the alignment of the first access
1556 matters. */
1557 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1558 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1559 continue;
1561 /* Strided accesses perform only component accesses, alignment is
1562 irrelevant for them. */
1563 if (STMT_VINFO_STRIDED_P (stmt_info)
1564 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1565 continue;
1567 save_misalignment = DR_MISALIGNMENT (dr_info);
1568 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
1569 supportable_dr_alignment
1570 = vect_supportable_dr_alignment (dr_info, false);
1571 SET_DR_MISALIGNMENT (dr_info, save_misalignment);
1573 if (!supportable_dr_alignment)
1574 return false;
1577 return true;
1580 /* Function vect_enhance_data_refs_alignment
1582 This pass will use loop versioning and loop peeling in order to enhance
1583 the alignment of data references in the loop.
1585 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1586 original loop is to be vectorized. Any other loops that are created by
1587 the transformations performed in this pass - are not supposed to be
1588 vectorized. This restriction will be relaxed.
1590 This pass will require a cost model to guide it whether to apply peeling
1591 or versioning or a combination of the two. For example, the scheme that
1592 Intel uses when given a loop with several memory accesses is as follows:
1593 choose one memory access ('p') whose alignment you want to force by doing
1594 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1595 other accesses are not necessarily aligned, or (2) use loop versioning to
1596 generate one loop in which all accesses are aligned, and another loop in
1597 which only 'p' is necessarily aligned.
1599 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1600 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1601 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1603 Devising a cost model is the most critical aspect of this work. It will
1604 guide us on which access to peel for, whether to use loop versioning, how
1605 many versions to create, etc. The cost model will probably consist of
1606 generic considerations as well as target specific considerations (on
1607 powerpc for example, misaligned stores are more painful than misaligned
1608 loads).
1610 Here are the general steps involved in alignment enhancements:
1612 -- original loop, before alignment analysis:
1613 for (i=0; i<N; i++){
1614 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1615 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1618 -- After vect_compute_data_refs_alignment:
1619 for (i=0; i<N; i++){
1620 x = q[i]; # DR_MISALIGNMENT(q) = 3
1621 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1624 -- Possibility 1: we do loop versioning:
1625 if (p is aligned) {
1626 for (i=0; i<N; i++){ # loop 1A
1627 x = q[i]; # DR_MISALIGNMENT(q) = 3
1628 p[i] = y; # DR_MISALIGNMENT(p) = 0
1631 else {
1632 for (i=0; i<N; i++){ # loop 1B
1633 x = q[i]; # DR_MISALIGNMENT(q) = 3
1634 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1638 -- Possibility 2: we do loop peeling:
1639 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1640 x = q[i];
1641 p[i] = y;
1643 for (i = 3; i < N; i++){ # loop 2A
1644 x = q[i]; # DR_MISALIGNMENT(q) = 0
1645 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1648 -- Possibility 3: combination of loop peeling and versioning:
1649 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1650 x = q[i];
1651 p[i] = y;
1653 if (p is aligned) {
1654 for (i = 3; i<N; i++){ # loop 3A
1655 x = q[i]; # DR_MISALIGNMENT(q) = 0
1656 p[i] = y; # DR_MISALIGNMENT(p) = 0
1659 else {
1660 for (i = 3; i<N; i++){ # loop 3B
1661 x = q[i]; # DR_MISALIGNMENT(q) = 0
1662 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1666 These loops are later passed to loop_transform to be vectorized. The
1667 vectorizer will use the alignment information to guide the transformation
1668 (whether to generate regular loads/stores, or with special handling for
1669 misalignment). */
1671 bool
1672 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1674 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1675 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1676 enum dr_alignment_support supportable_dr_alignment;
1677 dr_vec_info *first_store = NULL;
1678 dr_vec_info *dr0_info = NULL;
1679 struct data_reference *dr;
1680 unsigned int i, j;
1681 bool do_peeling = false;
1682 bool do_versioning = false;
1683 bool stat;
1684 unsigned int npeel = 0;
1685 bool one_misalignment_known = false;
1686 bool one_misalignment_unknown = false;
1687 bool one_dr_unsupportable = false;
1688 dr_vec_info *unsupportable_dr_info = NULL;
1689 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1690 unsigned possible_npeel_number = 1;
1691 tree vectype;
1692 unsigned int mis, same_align_drs_max = 0;
1693 hash_table<peel_info_hasher> peeling_htab (1);
1695 DUMP_VECT_SCOPE ("vect_enhance_data_refs_alignment");
1697 /* Reset data so we can safely be called multiple times. */
1698 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1699 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1701 /* While cost model enhancements are expected in the future, the high level
1702 view of the code at this time is as follows:
1704 A) If there is a misaligned access then see if peeling to align
1705 this access can make all data references satisfy
1706 vect_supportable_dr_alignment. If so, update data structures
1707 as needed and return true.
1709 B) If peeling wasn't possible and there is a data reference with an
1710 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1711 then see if loop versioning checks can be used to make all data
1712 references satisfy vect_supportable_dr_alignment. If so, update
1713 data structures as needed and return true.
1715 C) If neither peeling nor versioning were successful then return false if
1716 any data reference does not satisfy vect_supportable_dr_alignment.
1718 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1720 Note, Possibility 3 above (which is peeling and versioning together) is not
1721 being done at this time. */
1723 /* (1) Peeling to force alignment. */
1725 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1726 Considerations:
1727 + How many accesses will become aligned due to the peeling
1728 - How many accesses will become unaligned due to the peeling,
1729 and the cost of misaligned accesses.
1730 - The cost of peeling (the extra runtime checks, the increase
1731 in code size). */
1733 FOR_EACH_VEC_ELT (datarefs, i, dr)
1735 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1736 stmt_vec_info stmt_info = dr_info->stmt;
1738 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1739 continue;
1741 /* For interleaving, only the alignment of the first access
1742 matters. */
1743 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1744 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1745 continue;
1747 /* For scatter-gather or invariant accesses there is nothing
1748 to enhance. */
1749 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1750 || integer_zerop (DR_STEP (dr)))
1751 continue;
1753 /* Strided accesses perform only component accesses, alignment is
1754 irrelevant for them. */
1755 if (STMT_VINFO_STRIDED_P (stmt_info)
1756 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1757 continue;
1759 supportable_dr_alignment = vect_supportable_dr_alignment (dr_info, true);
1760 do_peeling = vector_alignment_reachable_p (dr_info);
1761 if (do_peeling)
1763 if (known_alignment_for_access_p (dr_info))
1765 unsigned int npeel_tmp = 0;
1766 bool negative = tree_int_cst_compare (DR_STEP (dr),
1767 size_zero_node) < 0;
1769 vectype = STMT_VINFO_VECTYPE (stmt_info);
1770 unsigned int target_align = DR_TARGET_ALIGNMENT (dr_info);
1771 unsigned int dr_size = vect_get_scalar_dr_size (dr_info);
1772 mis = (negative
1773 ? DR_MISALIGNMENT (dr_info)
1774 : -DR_MISALIGNMENT (dr_info));
1775 if (DR_MISALIGNMENT (dr_info) != 0)
1776 npeel_tmp = (mis & (target_align - 1)) / dr_size;
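                  /* Illustrative arithmetic only (hypothetical numbers, not
                     taken from this loop): with a forward step, a 4-byte
                     element, a 16-byte target alignment and DR_MISALIGNMENT
                     of 12 bytes, MIS is -12, (-12 & 15) == 4, so NPEEL_TMP is
                     4 / 4 == 1: peeling one scalar iteration advances the
                     address by 4 bytes and makes it 16-byte aligned.  */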
1778 /* For multiple types, it is possible that the bigger type access
1779 will have more than one peeling option. E.g., a loop with two
1780 types: one of size (vector size / 4), and the other one of
1780              size (vector size / 8). The vectorization factor will be 8. If both
1782 accesses are misaligned by 3, the first one needs one scalar
1783 iteration to be aligned, and the second one needs 5. But the
1784 first one will be aligned also by peeling 5 scalar
1785 iterations, and in that case both accesses will be aligned.
1786              Hence, in addition to the immediate peeling amount, we also want
1787              to try adding full vector sizes, as long as we don't exceed the
1788              vectorization factor.
1789              We do this automatically for the cost model, since we calculate
1790              the cost for every peeling option. */
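                  /* A sketch of the hash-table bookkeeping below, using
                     hypothetical numbers: if NPEEL_TMP starts at 1,
                     TARGET_ALIGN / DR_SIZE is 4 and POSSIBLE_NPEEL_NUMBER is 2,
                     then peeling amounts 1 and 5 are recorded for this data
                     reference, matching the two-types example above.  */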
1791 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1793 poly_uint64 nscalars = (STMT_SLP_TYPE (stmt_info)
1794 ? vf * DR_GROUP_SIZE (stmt_info) : vf);
1795 possible_npeel_number
1796 = vect_get_num_vectors (nscalars, vectype);
1798 /* NPEEL_TMP is 0 when there is no misalignment, but also
1799 allow peeling NELEMENTS. */
1800 if (DR_MISALIGNMENT (dr_info) == 0)
1801 possible_npeel_number++;
1804 /* Save info about DR in the hash table. Also include peeling
1805 amounts according to the explanation above. */
1806 for (j = 0; j < possible_npeel_number; j++)
1808 vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1809 dr_info, npeel_tmp);
1810 npeel_tmp += target_align / dr_size;
1813 one_misalignment_known = true;
1815 else
1817          /* If we don't know any misalignment values, we prefer
1818             peeling for the data-ref that has the maximum number of data-refs
1819             with the same alignment, unless the target prefers to align
1820             stores over loads. */
1821 unsigned same_align_drs
1822 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1823 if (!dr0_info
1824 || same_align_drs_max < same_align_drs)
1826 same_align_drs_max = same_align_drs;
1827 dr0_info = dr_info;
1829 /* For data-refs with the same number of related
1830 accesses prefer the one where the misalign
1831 computation will be invariant in the outermost loop. */
1832 else if (same_align_drs_max == same_align_drs)
1834 struct loop *ivloop0, *ivloop;
1835 ivloop0 = outermost_invariant_loop_for_expr
1836 (loop, DR_BASE_ADDRESS (dr0_info->dr));
1837 ivloop = outermost_invariant_loop_for_expr
1838 (loop, DR_BASE_ADDRESS (dr));
1839 if ((ivloop && !ivloop0)
1840 || (ivloop && ivloop0
1841 && flow_loop_nested_p (ivloop, ivloop0)))
1842 dr0_info = dr_info;
1845 one_misalignment_unknown = true;
1847 /* Check for data refs with unsupportable alignment that
1848 can be peeled. */
1849 if (!supportable_dr_alignment)
1851 one_dr_unsupportable = true;
1852 unsupportable_dr_info = dr_info;
1855 if (!first_store && DR_IS_WRITE (dr))
1856 first_store = dr_info;
1859 else
1861 if (!aligned_access_p (dr_info))
1863 if (dump_enabled_p ())
1864 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1865 "vector alignment may not be reachable\n");
1866 break;
1871 /* Check if we can possibly peel the loop. */
1872 if (!vect_can_advance_ivs_p (loop_vinfo)
1873 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1874 || loop->inner)
1875 do_peeling = false;
1877 struct _vect_peel_extended_info peel_for_known_alignment;
1878 struct _vect_peel_extended_info peel_for_unknown_alignment;
1879 struct _vect_peel_extended_info best_peel;
1881 peel_for_unknown_alignment.inside_cost = INT_MAX;
1882 peel_for_unknown_alignment.outside_cost = INT_MAX;
1883 peel_for_unknown_alignment.peel_info.count = 0;
1885 if (do_peeling
1886 && one_misalignment_unknown)
1888      /* Check whether the target prefers stores over loads, i.e., whether
1889         misaligned stores are more expensive than misaligned loads (taking
1890         drs with the same alignment into account). */
1891 unsigned int load_inside_cost = 0;
1892 unsigned int load_outside_cost = 0;
1893 unsigned int store_inside_cost = 0;
1894 unsigned int store_outside_cost = 0;
1895 unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;
1897 stmt_vector_for_cost dummy;
1898 dummy.create (2);
1899 vect_get_peeling_costs_all_drs (loop_vinfo, dr0_info,
1900 &load_inside_cost,
1901 &load_outside_cost,
1902 &dummy, &dummy, estimated_npeels, true);
1903 dummy.release ();
1905 if (first_store)
1907 dummy.create (2);
1908 vect_get_peeling_costs_all_drs (loop_vinfo, first_store,
1909 &store_inside_cost,
1910 &store_outside_cost,
1911 &dummy, &dummy,
1912 estimated_npeels, true);
1913 dummy.release ();
1915 else
1917 store_inside_cost = INT_MAX;
1918 store_outside_cost = INT_MAX;
1921 if (load_inside_cost > store_inside_cost
1922 || (load_inside_cost == store_inside_cost
1923 && load_outside_cost > store_outside_cost))
1925 dr0_info = first_store;
1926 peel_for_unknown_alignment.inside_cost = store_inside_cost;
1927 peel_for_unknown_alignment.outside_cost = store_outside_cost;
1929 else
1931 peel_for_unknown_alignment.inside_cost = load_inside_cost;
1932 peel_for_unknown_alignment.outside_cost = load_outside_cost;
1935 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1936 prologue_cost_vec.create (2);
1937 epilogue_cost_vec.create (2);
1939 int dummy2;
1940 peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
1941 (loop_vinfo, estimated_npeels, &dummy2,
1942 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1943 &prologue_cost_vec, &epilogue_cost_vec);
1945 prologue_cost_vec.release ();
1946 epilogue_cost_vec.release ();
1948 peel_for_unknown_alignment.peel_info.count = 1
1949 + STMT_VINFO_SAME_ALIGN_REFS (dr0_info->stmt).length ();
1952 peel_for_unknown_alignment.peel_info.npeel = 0;
1953 peel_for_unknown_alignment.peel_info.dr_info = dr0_info;
1955 best_peel = peel_for_unknown_alignment;
1957 peel_for_known_alignment.inside_cost = INT_MAX;
1958 peel_for_known_alignment.outside_cost = INT_MAX;
1959 peel_for_known_alignment.peel_info.count = 0;
1960 peel_for_known_alignment.peel_info.dr_info = NULL;
1962 if (do_peeling && one_misalignment_known)
1964      /* Peeling is possible, and every data access is supported even when
1965         misaligned.  So we try to choose the best possible peeling from
1966         the hash table. */
1967 peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
1968 (&peeling_htab, loop_vinfo);
1971 /* Compare costs of peeling for known and unknown alignment. */
1972 if (peel_for_known_alignment.peel_info.dr_info != NULL
1973 && peel_for_unknown_alignment.inside_cost
1974 >= peel_for_known_alignment.inside_cost)
1976 best_peel = peel_for_known_alignment;
1978 /* If the best peeling for known alignment has NPEEL == 0, perform no
1979 peeling at all except if there is an unsupportable dr that we can
1980 align. */
1981 if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
1982 do_peeling = false;
1985 /* If there is an unsupportable data ref, prefer this over all choices so far
1986 since we'd have to discard a chosen peeling except when it accidentally
1987 aligned the unsupportable data ref. */
1988 if (one_dr_unsupportable)
1989 dr0_info = unsupportable_dr_info;
1990 else if (do_peeling)
1992 /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
1993 TODO: Use nopeel_outside_cost or get rid of it? */
1994 unsigned nopeel_inside_cost = 0;
1995 unsigned nopeel_outside_cost = 0;
1997 stmt_vector_for_cost dummy;
1998 dummy.create (2);
1999 vect_get_peeling_costs_all_drs (loop_vinfo, NULL, &nopeel_inside_cost,
2000 &nopeel_outside_cost, &dummy, &dummy,
2001 0, false);
2002 dummy.release ();
2004 /* Add epilogue costs. As we do not peel for alignment here, no prologue
2005 costs will be recorded. */
2006 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2007 prologue_cost_vec.create (2);
2008 epilogue_cost_vec.create (2);
2010 int dummy2;
2011 nopeel_outside_cost += vect_get_known_peeling_cost
2012 (loop_vinfo, 0, &dummy2,
2013 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2014 &prologue_cost_vec, &epilogue_cost_vec);
2016 prologue_cost_vec.release ();
2017 epilogue_cost_vec.release ();
2019 npeel = best_peel.peel_info.npeel;
2020 dr0_info = best_peel.peel_info.dr_info;
2022      /* If not peeling at all is no more expensive than the best peeling
2023         we have found so far, don't perform any peeling. */
2024 if (nopeel_inside_cost <= best_peel.inside_cost)
2025 do_peeling = false;
2028 if (do_peeling)
2030 stmt_vec_info stmt_info = dr0_info->stmt;
2031 vectype = STMT_VINFO_VECTYPE (stmt_info);
2033 if (known_alignment_for_access_p (dr0_info))
2035 bool negative = tree_int_cst_compare (DR_STEP (dr0_info->dr),
2036 size_zero_node) < 0;
2037 if (!npeel)
2039 /* Since it's known at compile time, compute the number of
2040 iterations in the peeled loop (the peeling factor) for use in
2041 updating DR_MISALIGNMENT values. The peeling factor is the
2042 vectorization factor minus the misalignment as an element
2043 count. */
2044 mis = (negative
2045 ? DR_MISALIGNMENT (dr0_info)
2046 : -DR_MISALIGNMENT (dr0_info));
2047 unsigned int target_align = DR_TARGET_ALIGNMENT (dr0_info);
2048 npeel = ((mis & (target_align - 1))
2049 / vect_get_scalar_dr_size (dr0_info));
2052 /* For interleaved data access every iteration accesses all the
2053 members of the group, therefore we divide the number of iterations
2054 by the group size. */
2055 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2056 npeel /= DR_GROUP_SIZE (stmt_info);
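              /* Worked example (hypothetical numbers): for an interleaved
                 group of size 4 whose element-count peeling amount computed
                 above is 8, each scalar iteration touches all 4 group
                 members, so NPEEL becomes 8 / 4 == 2 peeled scalar
                 iterations.  */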
2058 if (dump_enabled_p ())
2059 dump_printf_loc (MSG_NOTE, vect_location,
2060 "Try peeling by %d\n", npeel);
2063 /* Ensure that all datarefs can be vectorized after the peel. */
2064 if (!vect_peeling_supportable (loop_vinfo, dr0_info, npeel))
2065 do_peeling = false;
2067 /* Check if all datarefs are supportable and log. */
2068 if (do_peeling && known_alignment_for_access_p (dr0_info) && npeel == 0)
2070 stat = vect_verify_datarefs_alignment (loop_vinfo);
2071 if (!stat)
2072 do_peeling = false;
2073 else
2074 return stat;
2077 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
2078 if (do_peeling)
2080 unsigned max_allowed_peel
2081 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
2082 if (max_allowed_peel != (unsigned)-1)
2084 unsigned max_peel = npeel;
2085 if (max_peel == 0)
2087 unsigned int target_align = DR_TARGET_ALIGNMENT (dr0_info);
2088 max_peel = (target_align
2089 / vect_get_scalar_dr_size (dr0_info) - 1);
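              /* For instance (hypothetical numbers): with a 16-byte target
                 alignment and 4-byte elements, the unknown peeling amount can
                 be at most 16 / 4 - 1 == 3 scalar iterations, and that is
                 what is compared against
                 --param vect-max-peeling-for-alignment below.  */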
2091 if (max_peel > max_allowed_peel)
2093 do_peeling = false;
2094 if (dump_enabled_p ())
2095 dump_printf_loc (MSG_NOTE, vect_location,
2096 "Disable peeling, max peels reached: %d\n", max_peel);
2101 /* Cost model #2 - if peeling may result in a remaining loop not
2102 iterating enough to be vectorized then do not peel. Since this
2103 is a cost heuristic rather than a correctness decision, use the
2104 most likely runtime value for variable vectorization factors. */
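      /* As a numeric sketch (hypothetical numbers): with an assumed VF of 4
         and NPEEL of 3, the check below requires at least 4 + 3 = 7 known
         scalar iterations; with fewer, the vectorized loop might not run at
         all after peeling, so peeling is abandoned.  */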
2105 if (do_peeling
2106 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2108 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2109 unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
2110 if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
2111 < assumed_vf + max_peel)
2112 do_peeling = false;
2115 if (do_peeling)
2117 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2118 If the misalignment of DR_i is identical to that of dr0 then set
2119 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
2120 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2121 by the peeling factor times the element size of DR_i (MOD the
2122 vectorization factor times the size). Otherwise, the
2123 misalignment of DR_i must be set to unknown. */
2124 FOR_EACH_VEC_ELT (datarefs, i, dr)
2125 if (dr != dr0_info->dr)
2127 /* Strided accesses perform only component accesses, alignment
2128 is irrelevant for them. */
2129 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2130 stmt_info = dr_info->stmt;
2131 if (STMT_VINFO_STRIDED_P (stmt_info)
2132 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2133 continue;
2135 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
2138 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0_info;
2139 if (npeel)
2140 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2141 else
2142 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2143 = DR_MISALIGNMENT (dr0_info);
2144 SET_DR_MISALIGNMENT (dr0_info, 0);
2145 if (dump_enabled_p ())
2147 dump_printf_loc (MSG_NOTE, vect_location,
2148 "Alignment of access forced using peeling.\n");
2149 dump_printf_loc (MSG_NOTE, vect_location,
2150 "Peeling for alignment will be applied.\n");
2153 /* The inside-loop cost will be accounted for in vectorizable_load
2154 and vectorizable_store correctly with adjusted alignments.
2155 Drop the body_cst_vec on the floor here. */
2156 stat = vect_verify_datarefs_alignment (loop_vinfo);
2157 gcc_assert (stat);
2158 return stat;
2162 /* (2) Versioning to force alignment. */
2164 /* Try versioning if:
2165 1) optimize loop for speed
2166 2) there is at least one unsupported misaligned data ref with an unknown
2167 misalignment, and
2168 3) all misaligned data refs with a known misalignment are supported, and
2169 4) the number of runtime alignment checks is within reason. */
2171 do_versioning =
2172 optimize_loop_nest_for_speed_p (loop)
2173 && (!loop->inner); /* FORNOW */
2175 if (do_versioning)
2177 FOR_EACH_VEC_ELT (datarefs, i, dr)
2179 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2180 stmt_vec_info stmt_info = dr_info->stmt;
2182 /* For interleaving, only the alignment of the first access
2183 matters. */
2184 if (aligned_access_p (dr_info)
2185 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2186 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info))
2187 continue;
2189 if (STMT_VINFO_STRIDED_P (stmt_info))
2191 /* Strided loads perform only component accesses, alignment is
2192 irrelevant for them. */
2193 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
2194 continue;
2195 do_versioning = false;
2196 break;
2199 supportable_dr_alignment
2200 = vect_supportable_dr_alignment (dr_info, false);
2202 if (!supportable_dr_alignment)
2204 int mask;
2205 tree vectype;
2207 if (known_alignment_for_access_p (dr_info)
2208 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2209 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
2211 do_versioning = false;
2212 break;
2215 vectype = STMT_VINFO_VECTYPE (stmt_info);
2216 gcc_assert (vectype);
2218 /* At present we don't support versioning for alignment
2219 with variable VF, since there's no guarantee that the
2220 VF is a power of two. We could relax this if we added
2221 a way of enforcing a power-of-two size. */
2222 unsigned HOST_WIDE_INT size;
2223 if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
2225 do_versioning = false;
2226 break;
2229 /* The rightmost bits of an aligned address must be zeros.
2230 Construct the mask needed for this test. For example,
2231 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2232 mask must be 15 = 0xf. */
2233 mask = size - 1;
2235 /* FORNOW: use the same mask to test all potentially unaligned
2236 references in the loop. The vectorizer currently supports
2237 a single vector size, see the reference to
2238 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
2239 vectorization factor is computed. */
2240 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
2241 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
2242 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2243 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (stmt_info);
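              /* In outline (a sketch, not the exact code generated
                 elsewhere): the runtime guard of the versioned loop later
                 tests something like
                   ((addr_1 | addr_2 | ... | addr_n) & LOOP_VINFO_PTR_MASK) == 0
                 over the addresses of the statements collected here, which is
                 why a single mask only works while all of them share one
                 vector size.  */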
2247 /* Versioning requires at least one misaligned data reference. */
2248 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2249 do_versioning = false;
2250 else if (!do_versioning)
2251 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2254 if (do_versioning)
2256 vec<stmt_vec_info> may_misalign_stmts
2257 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2258 stmt_vec_info stmt_info;
2260 /* It can now be assumed that the data references in the statements
2261 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2262 of the loop being vectorized. */
2263 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
2265 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
2266 SET_DR_MISALIGNMENT (dr_info, 0);
2267 if (dump_enabled_p ())
2268 dump_printf_loc (MSG_NOTE, vect_location,
2269 "Alignment of access forced using versioning.\n");
2272 if (dump_enabled_p ())
2273 dump_printf_loc (MSG_NOTE, vect_location,
2274 "Versioning for alignment will be applied.\n");
2276 /* Peeling and versioning can't be done together at this time. */
2277 gcc_assert (! (do_peeling && do_versioning));
2279 stat = vect_verify_datarefs_alignment (loop_vinfo);
2280 gcc_assert (stat);
2281 return stat;
2284 /* This point is reached if neither peeling nor versioning is being done. */
2285 gcc_assert (! (do_peeling || do_versioning));
2287 stat = vect_verify_datarefs_alignment (loop_vinfo);
2288 return stat;
2292 /* Function vect_find_same_alignment_drs.
2294 Update group and alignment relations in VINFO according to the chosen
2295 vectorization factor. */
2297 static void
2298 vect_find_same_alignment_drs (vec_info *vinfo, data_dependence_relation *ddr)
2300 struct data_reference *dra = DDR_A (ddr);
2301 struct data_reference *drb = DDR_B (ddr);
2302 dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
2303 dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
2304 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
2305 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
2307 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
2308 return;
2310 if (dra == drb)
2311 return;
2313 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
2314 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
2315 return;
2317 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)
2318 || !operand_equal_p (DR_OFFSET (dra), DR_OFFSET (drb), 0)
2319 || !operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2320 return;
2322 /* Two references with distance zero have the same alignment. */
2323 poly_offset_int diff = (wi::to_poly_offset (DR_INIT (dra))
2324 - wi::to_poly_offset (DR_INIT (drb)));
2325 if (maybe_ne (diff, 0))
2327 /* Get the wider of the two alignments. */
2328 unsigned int align_a = (vect_calculate_target_alignment (dr_info_a)
2329 / BITS_PER_UNIT);
2330 unsigned int align_b = (vect_calculate_target_alignment (dr_info_b)
2331 / BITS_PER_UNIT);
2332 unsigned int max_align = MAX (align_a, align_b);
2334 /* Require the gap to be a multiple of the larger vector alignment. */
2335 if (!multiple_p (diff, max_align))
2336 return;
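      /* Small example (hypothetical numbers): two accesses whose DR_INITs
         differ by 32 bytes, with target alignments of 16 and 8 bytes, have
         MAX_ALIGN == 16; since 32 is a multiple of 16 the two references
         always have the same misalignment and are recorded below as having
         the same alignment.  */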
2339 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2340 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
2341 if (dump_enabled_p ())
2343 dump_printf_loc (MSG_NOTE, vect_location,
2344 "accesses have the same alignment: ");
2345 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2346 dump_printf (MSG_NOTE, " and ");
2347 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2348 dump_printf (MSG_NOTE, "\n");
2353 /* Function vect_analyze_data_refs_alignment
2355 Analyze the alignment of the data-references in the loop.
2356 Return FALSE if a data reference is found that cannot be vectorized. */
2358 bool
2359 vect_analyze_data_refs_alignment (loop_vec_info vinfo)
2361 DUMP_VECT_SCOPE ("vect_analyze_data_refs_alignment");
2363 /* Mark groups of data references with same alignment using
2364 data dependence information. */
2365 vec<ddr_p> ddrs = vinfo->shared->ddrs;
2366 struct data_dependence_relation *ddr;
2367 unsigned int i;
2369 FOR_EACH_VEC_ELT (ddrs, i, ddr)
2370 vect_find_same_alignment_drs (vinfo, ddr);
2372 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
2373 struct data_reference *dr;
2375 vect_record_base_alignments (vinfo);
2376 FOR_EACH_VEC_ELT (datarefs, i, dr)
2378 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
2379 if (STMT_VINFO_VECTORIZABLE (dr_info->stmt))
2380 vect_compute_data_ref_alignment (dr_info);
2383 return true;
2387 /* Analyze alignment of DRs of stmts in NODE. */
2389 static bool
2390 vect_slp_analyze_and_verify_node_alignment (slp_tree node)
2392 /* We vectorize from the first scalar stmt in the node unless
2393 the node is permuted in which case we start from the first
2394 element in the group. */
2395 stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2396 dr_vec_info *first_dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2397 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2398 first_stmt_info = DR_GROUP_FIRST_ELEMENT (first_stmt_info);
2400 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2401 vect_compute_data_ref_alignment (dr_info);
2402 /* For creating the data-ref pointer we need alignment of the
2403 first element anyway. */
2404 if (dr_info != first_dr_info)
2405 vect_compute_data_ref_alignment (first_dr_info);
2406 if (! verify_data_ref_alignment (dr_info))
2408 if (dump_enabled_p ())
2409 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2410 "not vectorized: bad data alignment in basic "
2411 "block.\n");
2412 return false;
2415 return true;
2418 /* Function vect_slp_analyze_and_verify_instance_alignment
2420 Analyze the alignment of the data-references in the SLP instance.
2421 Return FALSE if a data reference is found that cannot be vectorized. */
2423 bool
2424 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
2426 DUMP_VECT_SCOPE ("vect_slp_analyze_and_verify_instance_alignment");
2428 slp_tree node;
2429 unsigned i;
2430 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2431 if (! vect_slp_analyze_and_verify_node_alignment (node))
2432 return false;
2434 node = SLP_INSTANCE_TREE (instance);
2435 if (STMT_VINFO_DATA_REF (SLP_TREE_SCALAR_STMTS (node)[0])
2436 && ! vect_slp_analyze_and_verify_node_alignment
2437 (SLP_INSTANCE_TREE (instance)))
2438 return false;
2440 return true;
2444 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2445 accesses of legal size, step, etc. Detect gaps, single element
2446 interleaving, and other special cases. Set grouped access info.
2447 Collect groups of strided stores for further use in SLP analysis.
2448 Worker for vect_analyze_group_access. */
2450 static bool
2451 vect_analyze_group_access_1 (dr_vec_info *dr_info)
2453 data_reference *dr = dr_info->dr;
2454 tree step = DR_STEP (dr);
2455 tree scalar_type = TREE_TYPE (DR_REF (dr));
2456 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2457 stmt_vec_info stmt_info = dr_info->stmt;
2458 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2459 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2460 HOST_WIDE_INT dr_step = -1;
2461 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2462 bool slp_impossible = false;
2464 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2465 size of the interleaving group (including gaps). */
2466 if (tree_fits_shwi_p (step))
2468 dr_step = tree_to_shwi (step);
2469 /* Check that STEP is a multiple of type size. Otherwise there is
2470 a non-element-sized gap at the end of the group which we
2471 cannot represent in DR_GROUP_GAP or DR_GROUP_SIZE.
2472 ??? As we can handle non-constant step fine here we should
2473 simply remove uses of DR_GROUP_GAP between the last and first
2474 element and instead rely on DR_STEP. DR_GROUP_SIZE then would
2475 simply not include that gap. */
2476 if ((dr_step % type_size) != 0)
2478 if (dump_enabled_p ())
2480 dump_printf_loc (MSG_NOTE, vect_location,
2481 "Step ");
2482 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2483 dump_printf (MSG_NOTE,
2484 " is not a multiple of the element size for ");
2485 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2486 dump_printf (MSG_NOTE, "\n");
2488 return false;
2490 groupsize = absu_hwi (dr_step) / type_size;
2492 else
2493 groupsize = 0;
2495   /* A non-consecutive access is possible only if it is part of an interleaving group. */
2496 if (!DR_GROUP_FIRST_ELEMENT (stmt_info))
2498       /* Check whether this DR is part of an interleaving group, and is a single
2499          element of the group that is accessed in the loop. */
2501 /* Gaps are supported only for loads. STEP must be a multiple of the type
2502 size. */
2503 if (DR_IS_READ (dr)
2504 && (dr_step % type_size) == 0
2505 && groupsize > 0)
2507 DR_GROUP_FIRST_ELEMENT (stmt_info) = stmt_info;
2508 DR_GROUP_SIZE (stmt_info) = groupsize;
2509 DR_GROUP_GAP (stmt_info) = groupsize - 1;
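          /* Example of the single-element-interleaving case (hypothetical
             loop):
               for (i = 0; i < n; i++)
                 x += a[4 * i];
             with 4-byte elements, DR_STEP is 16, GROUPSIZE is 16 / 4 == 4 and
             DR_GROUP_GAP is 3: each iteration reads one element and skips the
             next three.  */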
2510 if (dump_enabled_p ())
2512 dump_printf_loc (MSG_NOTE, vect_location,
2513 "Detected single element interleaving ");
2514 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2515 dump_printf (MSG_NOTE, " step ");
2516 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2517 dump_printf (MSG_NOTE, "\n");
2520 return true;
2523 if (dump_enabled_p ())
2525 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2526 "not consecutive access ");
2527 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
2528 stmt_info->stmt, 0);
2531 if (bb_vinfo)
2533 /* Mark the statement as unvectorizable. */
2534 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
2535 return true;
2538 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2539 STMT_VINFO_STRIDED_P (stmt_info) = true;
2540 return true;
2543 if (DR_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info)
2545 /* First stmt in the interleaving chain. Check the chain. */
2546 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2547 struct data_reference *data_ref = dr;
2548 unsigned int count = 1;
2549 tree prev_init = DR_INIT (data_ref);
2550 stmt_vec_info prev = stmt_info;
2551 HOST_WIDE_INT diff, gaps = 0;
2553 /* By construction, all group members have INTEGER_CST DR_INITs. */
2554 while (next)
2556          /* Skip identical data-refs.  If two or more stmts share a
2557             data-ref (supported only for loads), we vectorize only the first
2558             stmt, and the rest get their vectorized loads from the first
2559             one. */
2560 if (!tree_int_cst_compare (DR_INIT (data_ref),
2561 DR_INIT (STMT_VINFO_DATA_REF (next))))
2563 if (DR_IS_WRITE (data_ref))
2565 if (dump_enabled_p ())
2566 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2567 "Two store stmts share the same dr.\n");
2568 return false;
2571 if (dump_enabled_p ())
2572 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2573 "Two or more load stmts share the same dr.\n");
2575 /* For load use the same data-ref load. */
2576 DR_GROUP_SAME_DR_STMT (next) = prev;
2578 prev = next;
2579 next = DR_GROUP_NEXT_ELEMENT (next);
2580 continue;
2583 prev = next;
2584 data_ref = STMT_VINFO_DATA_REF (next);
2586 /* All group members have the same STEP by construction. */
2587 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2589 /* Check that the distance between two accesses is equal to the type
2590 size. Otherwise, we have gaps. */
2591 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2592 - TREE_INT_CST_LOW (prev_init)) / type_size;
2593 if (diff != 1)
2595 /* FORNOW: SLP of accesses with gaps is not supported. */
2596 slp_impossible = true;
2597 if (DR_IS_WRITE (data_ref))
2599 if (dump_enabled_p ())
2600 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2601 "interleaved store with gaps\n");
2602 return false;
2605 gaps += diff - 1;
2608 last_accessed_element += diff;
2610 /* Store the gap from the previous member of the group. If there is no
2611 gap in the access, DR_GROUP_GAP is always 1. */
2612 DR_GROUP_GAP (next) = diff;
2614 prev_init = DR_INIT (data_ref);
2615 next = DR_GROUP_NEXT_ELEMENT (next);
2616 /* Count the number of data-refs in the chain. */
2617 count++;
2620 if (groupsize == 0)
2621 groupsize = count + gaps;
2623 /* This could be UINT_MAX but as we are generating code in a very
2624 inefficient way we have to cap earlier. See PR78699 for example. */
2625 if (groupsize > 4096)
2627 if (dump_enabled_p ())
2628 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2629 "group is too large\n");
2630 return false;
2633 /* Check that the size of the interleaving is equal to count for stores,
2634 i.e., that there are no gaps. */
2635 if (groupsize != count
2636 && !DR_IS_READ (dr))
2638 if (dump_enabled_p ())
2639 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2640 "interleaved store with gaps\n");
2641 return false;
2644 /* If there is a gap after the last load in the group it is the
2645 difference between the groupsize and the last accessed
2646 element.
2647 When there is no gap, this difference should be 0. */
2648 DR_GROUP_GAP (stmt_info) = groupsize - last_accessed_element;
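      /* For example (hypothetical numbers): a load group with GROUPSIZE 4
         whose members cover only the first three element slots has
         LAST_ACCESSED_ELEMENT == 3 and therefore a trailing DR_GROUP_GAP of
         4 - 3 == 1 element after the group.  */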
2650 DR_GROUP_SIZE (stmt_info) = groupsize;
2651 if (dump_enabled_p ())
2653 dump_printf_loc (MSG_NOTE, vect_location,
2654 "Detected interleaving ");
2655 if (DR_IS_READ (dr))
2656 dump_printf (MSG_NOTE, "load ");
2657 else
2658 dump_printf (MSG_NOTE, "store ");
2659 dump_printf (MSG_NOTE, "of size %u starting with ",
2660 (unsigned)groupsize);
2661 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt_info->stmt, 0);
2662 if (DR_GROUP_GAP (stmt_info) != 0)
2663 dump_printf_loc (MSG_NOTE, vect_location,
2664 "There is a gap of %u elements after the group\n",
2665 DR_GROUP_GAP (stmt_info));
2668 /* SLP: create an SLP data structure for every interleaving group of
2669 stores for further analysis in vect_analyse_slp. */
2670 if (DR_IS_WRITE (dr) && !slp_impossible)
2672 if (loop_vinfo)
2673 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt_info);
2674 if (bb_vinfo)
2675 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
2679 return true;
2682 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2683 accesses of legal size, step, etc. Detect gaps, single element
2684 interleaving, and other special cases. Set grouped access info.
2685 Collect groups of strided stores for further use in SLP analysis. */
2687 static bool
2688 vect_analyze_group_access (dr_vec_info *dr_info)
2690 if (!vect_analyze_group_access_1 (dr_info))
2692 /* Dissolve the group if present. */
2693 stmt_vec_info stmt_info = DR_GROUP_FIRST_ELEMENT (dr_info->stmt);
2694 while (stmt_info)
2696 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2697 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2698 DR_GROUP_NEXT_ELEMENT (stmt_info) = NULL;
2699 stmt_info = next;
2701 return false;
2703 return true;
2706 /* Analyze the access pattern of the data-reference DR_INFO.
2707 In case of non-consecutive accesses call vect_analyze_group_access() to
2708 analyze groups of accesses. */
2710 static bool
2711 vect_analyze_data_ref_access (dr_vec_info *dr_info)
2713 data_reference *dr = dr_info->dr;
2714 tree step = DR_STEP (dr);
2715 tree scalar_type = TREE_TYPE (DR_REF (dr));
2716 stmt_vec_info stmt_info = dr_info->stmt;
2717 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2718 struct loop *loop = NULL;
2720 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
2721 return true;
2723 if (loop_vinfo)
2724 loop = LOOP_VINFO_LOOP (loop_vinfo);
2726 if (loop_vinfo && !step)
2728 if (dump_enabled_p ())
2729 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2730 "bad data-ref access in loop\n");
2731 return false;
2734 /* Allow loads with zero step in inner-loop vectorization. */
2735 if (loop_vinfo && integer_zerop (step))
2737 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2738 if (!nested_in_vect_loop_p (loop, stmt_info))
2739 return DR_IS_READ (dr);
2740 /* Allow references with zero step for outer loops marked
2741 with pragma omp simd only - it guarantees absence of
2742 loop-carried dependencies between inner loop iterations. */
2743 if (loop->safelen < 2)
2745 if (dump_enabled_p ())
2746 dump_printf_loc (MSG_NOTE, vect_location,
2747 "zero step in inner loop of nest\n");
2748 return false;
2752 if (loop && nested_in_vect_loop_p (loop, stmt_info))
2754 /* Interleaved accesses are not yet supported within outer-loop
2755 vectorization for references in the inner-loop. */
2756 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2758 /* For the rest of the analysis we use the outer-loop step. */
2759 step = STMT_VINFO_DR_STEP (stmt_info);
2760 if (integer_zerop (step))
2762 if (dump_enabled_p ())
2763 dump_printf_loc (MSG_NOTE, vect_location,
2764 "zero step in outer loop.\n");
2765 return DR_IS_READ (dr);
2769 /* Consecutive? */
2770 if (TREE_CODE (step) == INTEGER_CST)
2772 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2773 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2774 || (dr_step < 0
2775 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2777 /* Mark that it is not interleaving. */
2778 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2779 return true;
2783 if (loop && nested_in_vect_loop_p (loop, stmt_info))
2785 if (dump_enabled_p ())
2786 dump_printf_loc (MSG_NOTE, vect_location,
2787 "grouped access in outer loop.\n");
2788 return false;
2792 /* Assume this is a DR handled by non-constant strided load case. */
2793 if (TREE_CODE (step) != INTEGER_CST)
2794 return (STMT_VINFO_STRIDED_P (stmt_info)
2795 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2796 || vect_analyze_group_access (dr_info)));
2798   /* Non-consecutive access - check whether it is part of an interleaving group. */
2799 return vect_analyze_group_access (dr_info);
2802 /* Compare two data-references DRA and DRB so that they can be sorted
2803    into chunks suitable for grouping. */
2805 static int
2806 dr_group_sort_cmp (const void *dra_, const void *drb_)
2808 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2809 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2810 int cmp;
2812 /* Stabilize sort. */
2813 if (dra == drb)
2814 return 0;
2816 /* DRs in different loops never belong to the same group. */
2817 loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father;
2818 loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father;
2819 if (loopa != loopb)
2820 return loopa->num < loopb->num ? -1 : 1;
2822 /* Ordering of DRs according to base. */
2823 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2824 DR_BASE_ADDRESS (drb));
2825 if (cmp != 0)
2826 return cmp;
2828 /* And according to DR_OFFSET. */
2829 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2830 if (cmp != 0)
2831 return cmp;
2833 /* Put reads before writes. */
2834 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2835 return DR_IS_READ (dra) ? -1 : 1;
2837   /* Then sort by access size. */
2838 cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2839 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2840 if (cmp != 0)
2841 return cmp;
2843   /* And by step. */
2844 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2845 if (cmp != 0)
2846 return cmp;
2848   /* Then sort by DR_INIT.  In case of identical DRs, sort by stmt UID. */
2849 cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
2850 if (cmp == 0)
2851 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2852 return cmp;
2855 /* If OP is the result of a conversion, return the unconverted value,
2856 otherwise return null. */
2858 static tree
2859 strip_conversion (tree op)
2861 if (TREE_CODE (op) != SSA_NAME)
2862 return NULL_TREE;
2863 gimple *stmt = SSA_NAME_DEF_STMT (op);
2864 if (!is_gimple_assign (stmt)
2865 || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt)))
2866 return NULL_TREE;
2867 return gimple_assign_rhs1 (stmt);
2870 /* Return true if vectorizable_* routines can handle statements STMT1_INFO
2871 and STMT2_INFO being in a single group. */
2873 static bool
2874 can_group_stmts_p (stmt_vec_info stmt1_info, stmt_vec_info stmt2_info)
2876 if (gimple_assign_single_p (stmt1_info->stmt))
2877 return gimple_assign_single_p (stmt2_info->stmt);
2879 gcall *call1 = dyn_cast <gcall *> (stmt1_info->stmt);
2880 if (call1 && gimple_call_internal_p (call1))
2882 /* Check for two masked loads or two masked stores. */
2883 gcall *call2 = dyn_cast <gcall *> (stmt2_info->stmt);
2884 if (!call2 || !gimple_call_internal_p (call2))
2885 return false;
2886 internal_fn ifn = gimple_call_internal_fn (call1);
2887 if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE)
2888 return false;
2889 if (ifn != gimple_call_internal_fn (call2))
2890 return false;
2892 /* Check that the masks are the same. Cope with casts of masks,
2893 like those created by build_mask_conversion. */
2894 tree mask1 = gimple_call_arg (call1, 2);
2895 tree mask2 = gimple_call_arg (call2, 2);
2896 if (!operand_equal_p (mask1, mask2, 0))
2898 mask1 = strip_conversion (mask1);
2899 if (!mask1)
2900 return false;
2901 mask2 = strip_conversion (mask2);
2902 if (!mask2)
2903 return false;
2904 if (!operand_equal_p (mask1, mask2, 0))
2905 return false;
2907 return true;
2910 return false;
2913 /* Function vect_analyze_data_ref_accesses.
2915 Analyze the access pattern of all the data references in the loop.
2917 FORNOW: the only access pattern that is considered vectorizable is a
2918 simple step 1 (consecutive) access.
2920 FORNOW: handle only arrays and pointer accesses. */
2922 bool
2923 vect_analyze_data_ref_accesses (vec_info *vinfo)
2925 unsigned int i;
2926 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
2927 struct data_reference *dr;
2929 DUMP_VECT_SCOPE ("vect_analyze_data_ref_accesses");
2931 if (datarefs.is_empty ())
2932 return true;
2934 /* Sort the array of datarefs to make building the interleaving chains
2935 linear. Don't modify the original vector's order, it is needed for
2936 determining what dependencies are reversed. */
2937 vec<data_reference_p> datarefs_copy = datarefs.copy ();
2938 datarefs_copy.qsort (dr_group_sort_cmp);
2940 /* Build the interleaving chains. */
2941 for (i = 0; i < datarefs_copy.length () - 1;)
2943 data_reference_p dra = datarefs_copy[i];
2944 dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
2945 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
2946 stmt_vec_info lastinfo = NULL;
2947 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
2948 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a))
2950 ++i;
2951 continue;
2953 for (i = i + 1; i < datarefs_copy.length (); ++i)
2955 data_reference_p drb = datarefs_copy[i];
2956 dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
2957 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
2958 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b)
2959 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
2960 break;
2962          /* ??? Imperfect sorting (non-compatible types, non-modulo
2963             accesses, same accesses) can lead to a group being artificially
2964             split here as we don't just skip over those.  If it really
2965             matters we can push those to a worklist and re-iterate
2966             over them.  Then we can just skip ahead to the next DR here. */
2968 /* DRs in a different loop should not be put into the same
2969 interleaving group. */
2970 if (gimple_bb (DR_STMT (dra))->loop_father
2971 != gimple_bb (DR_STMT (drb))->loop_father)
2972 break;
2974 /* Check that the data-refs have same first location (except init)
2975 and they are both either store or load (not load and store,
2976 not masked loads or stores). */
2977 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2978 || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2979 DR_BASE_ADDRESS (drb)) != 0
2980 || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
2981 || !can_group_stmts_p (stmtinfo_a, stmtinfo_b))
2982 break;
2984 /* Check that the data-refs have the same constant size. */
2985 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2986 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2987 if (!tree_fits_uhwi_p (sza)
2988 || !tree_fits_uhwi_p (szb)
2989 || !tree_int_cst_equal (sza, szb))
2990 break;
2992 /* Check that the data-refs have the same step. */
2993 if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
2994 break;
2996 /* Check the types are compatible.
2997 ??? We don't distinguish this during sorting. */
2998 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2999 TREE_TYPE (DR_REF (drb))))
3000 break;
3002 /* Check that the DR_INITs are compile-time constants. */
3003 if (TREE_CODE (DR_INIT (dra)) != INTEGER_CST
3004 || TREE_CODE (DR_INIT (drb)) != INTEGER_CST)
3005 break;
3007 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
3008 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
3009 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
3010 HOST_WIDE_INT init_prev
3011 = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]));
3012 gcc_assert (init_a <= init_b
3013 && init_a <= init_prev
3014 && init_prev <= init_b);
3016 /* Do not place the same access in the interleaving chain twice. */
3017 if (init_b == init_prev)
3019 gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]))
3020 < gimple_uid (DR_STMT (drb)));
3021 /* ??? For now we simply "drop" the later reference which is
3022 otherwise the same rather than finishing off this group.
3023 In the end we'd want to re-process duplicates forming
3024 multiple groups from the refs, likely by just collecting
3025 all candidates (including duplicates and split points
3026 below) in a vector and then process them together. */
3027 continue;
3030 /* If init_b == init_a + the size of the type * k, we have an
3031 interleaving, and DRA is accessed before DRB. */
3032 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
3033 if (type_size_a == 0
3034 || (init_b - init_a) % type_size_a != 0)
3035 break;
3037 /* If we have a store, the accesses are adjacent. This splits
3038 groups into chunks we support (we don't support vectorization
3039 of stores with gaps). */
3040 if (!DR_IS_READ (dra) && init_b - init_prev != type_size_a)
3041 break;
3043          /* If the step (when constant and nonzero) is no greater than the
3044             difference between the data-refs' inits, this splits groups into
3045             suitable sizes. */
3046 if (tree_fits_shwi_p (DR_STEP (dra)))
3048 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
3049 if (step != 0 && step <= (init_b - init_a))
3050 break;
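          /* A sketch of what survives all of the checks above (hypothetical
             struct accesses): a[i].x with DR_INIT 0 and a[i].y with DR_INIT 4,
             both 4-byte reads with DR_STEP 8, pass every test (the init
             difference 4 is a multiple of the type size and smaller than the
             step), so the two statements are chained into one interleaving
             group below.  */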
3053 if (dump_enabled_p ())
3055 dump_printf_loc (MSG_NOTE, vect_location,
3056 "Detected interleaving ");
3057 if (DR_IS_READ (dra))
3058 dump_printf (MSG_NOTE, "load ");
3059 else
3060 dump_printf (MSG_NOTE, "store ");
3061 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
3062 dump_printf (MSG_NOTE, " and ");
3063 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
3064 dump_printf (MSG_NOTE, "\n");
3067 /* Link the found element into the group list. */
3068 if (!DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3070 DR_GROUP_FIRST_ELEMENT (stmtinfo_a) = stmtinfo_a;
3071 lastinfo = stmtinfo_a;
3073 DR_GROUP_FIRST_ELEMENT (stmtinfo_b) = stmtinfo_a;
3074 DR_GROUP_NEXT_ELEMENT (lastinfo) = stmtinfo_b;
3075 lastinfo = stmtinfo_b;
3079 FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
3081 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
3082 if (STMT_VINFO_VECTORIZABLE (dr_info->stmt)
3083 && !vect_analyze_data_ref_access (dr_info))
3085 if (dump_enabled_p ())
3086 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3087 "not vectorized: complicated access pattern.\n");
3089 if (is_a <bb_vec_info> (vinfo))
3091 /* Mark the statement as not vectorizable. */
3092 STMT_VINFO_VECTORIZABLE (dr_info->stmt) = false;
3093 continue;
3095 else
3097 datarefs_copy.release ();
3098 return false;
3103 datarefs_copy.release ();
3104 return true;
3107 /* Function vect_vfa_segment_size.
3109 Input:
3110 DR_INFO: The data reference.
3111 LENGTH_FACTOR: segment length to consider.
3113 Return a value suitable for the dr_with_seg_len::seg_len field.
3114 This is the "distance travelled" by the pointer from the first
3115 iteration in the segment to the last. Note that it does not include
3116 the size of the access; in effect it only describes the first byte. */
3118 static tree
3119 vect_vfa_segment_size (dr_vec_info *dr_info, tree length_factor)
3121 length_factor = size_binop (MINUS_EXPR,
3122 fold_convert (sizetype, length_factor),
3123 size_one_node);
3124 return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr_info->dr)),
3125 length_factor);
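  /* Worked example (hypothetical numbers): for a DR_STEP of 4 bytes and a
     LENGTH_FACTOR of 8 iterations, the segment length is 4 * (8 - 1) == 28
     bytes: the distance from the address used in the first iteration of the
     segment to the address used in the last, excluding the access size
     itself.  */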
3128 /* Return a value that, when added to abs (vect_vfa_segment_size (DR_INFO)),
3129 gives the worst-case number of bytes covered by the segment. */
3131 static unsigned HOST_WIDE_INT
3132 vect_vfa_access_size (dr_vec_info *dr_info)
3134 stmt_vec_info stmt_vinfo = dr_info->stmt;
3135 tree ref_type = TREE_TYPE (DR_REF (dr_info->dr));
3136 unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type));
3137 unsigned HOST_WIDE_INT access_size = ref_size;
3138 if (DR_GROUP_FIRST_ELEMENT (stmt_vinfo))
3140 gcc_assert (DR_GROUP_FIRST_ELEMENT (stmt_vinfo) == stmt_vinfo);
3141 access_size *= DR_GROUP_SIZE (stmt_vinfo) - DR_GROUP_GAP (stmt_vinfo);
3143 if (STMT_VINFO_VEC_STMT (stmt_vinfo)
3144 && (vect_supportable_dr_alignment (dr_info, false)
3145 == dr_explicit_realign_optimized))
3147 /* We might access a full vector's worth. */
3148 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
3149 access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size;
3151 return access_size;
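  /* Worked example (hypothetical numbers): a 4-byte scalar reference that is
     the first element of a group with DR_GROUP_SIZE 4 and DR_GROUP_GAP 1
     yields 4 * (4 - 1) == 12 bytes; if it is also accessed with
     dr_explicit_realign_optimized and a 16-byte vector type, a further
     16 - 4 == 12 bytes are added for the worst case.  */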
3154 /* Get the minimum alignment for all the scalar accesses that DR_INFO
3155 describes. */
3157 static unsigned int
3158 vect_vfa_align (dr_vec_info *dr_info)
3160 return TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr_info->dr)));
3163 /* Function vect_no_alias_p.
3165 Given data references A and B with equal base and offset, see whether
3166 the alias relation can be decided at compilation time. Return 1 if
3167 it can and the references alias, 0 if it can and the references do
3168 not alias, and -1 if we cannot decide at compile time. SEGMENT_LENGTH_A,
3169 SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
3170 of dr_with_seg_len::{seg_len,access_size} for A and B. */
3172 static int
3173 vect_compile_time_alias (dr_vec_info *a, dr_vec_info *b,
3174 tree segment_length_a, tree segment_length_b,
3175 unsigned HOST_WIDE_INT access_size_a,
3176 unsigned HOST_WIDE_INT access_size_b)
3178 poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a->dr));
3179 poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b->dr));
3180 poly_uint64 const_length_a;
3181 poly_uint64 const_length_b;
3183 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3184 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3185 [a, a+12) */
3186 if (tree_int_cst_compare (DR_STEP (a->dr), size_zero_node) < 0)
3188 const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
3189 offset_a = (offset_a + access_size_a) - const_length_a;
3191 else
3192 const_length_a = tree_to_poly_uint64 (segment_length_a);
3193 if (tree_int_cst_compare (DR_STEP (b->dr), size_zero_node) < 0)
3195 const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
3196 offset_b = (offset_b + access_size_b) - const_length_b;
3198 else
3199 const_length_b = tree_to_poly_uint64 (segment_length_b);
3201 const_length_a += access_size_a;
3202 const_length_b += access_size_b;
3204 if (ranges_known_overlap_p (offset_a, const_length_a,
3205 offset_b, const_length_b))
3206 return 1;
3208 if (!ranges_maybe_overlap_p (offset_a, const_length_a,
3209 offset_b, const_length_b))
3210 return 0;
3212 return -1;
3215 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3216 in DDR is >= VF. */
3218 static bool
3219 dependence_distance_ge_vf (data_dependence_relation *ddr,
3220 unsigned int loop_depth, poly_uint64 vf)
3222 if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3223 || DDR_NUM_DIST_VECTS (ddr) == 0)
3224 return false;
3226 /* If the dependence is exact, we should have limited the VF instead. */
3227 gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3229 unsigned int i;
3230 lambda_vector dist_v;
3231 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3233 HOST_WIDE_INT dist = dist_v[loop_depth];
3234 if (dist != 0
3235 && !(dist > 0 && DDR_REVERSED_P (ddr))
3236 && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
3237 return false;
3240 if (dump_enabled_p ())
3242 dump_printf_loc (MSG_NOTE, vect_location,
3243 "dependence distance between ");
3244 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
3245 dump_printf (MSG_NOTE, " and ");
3246 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
3247 dump_printf (MSG_NOTE, " is >= VF\n");
3250 return true;
3253 /* Dump LOWER_BOUND using flags DUMP_KIND. Dumps are known to be enabled. */
3255 static void
3256 dump_lower_bound (dump_flags_t dump_kind, const vec_lower_bound &lower_bound)
3258 dump_printf (dump_kind, "%s (", lower_bound.unsigned_p ? "unsigned" : "abs");
3259 dump_generic_expr (dump_kind, TDF_SLIM, lower_bound.expr);
3260 dump_printf (dump_kind, ") >= ");
3261 dump_dec (dump_kind, lower_bound.min_value);
3264 /* Record that the vectorized loop requires the vec_lower_bound described
3265 by EXPR, UNSIGNED_P and MIN_VALUE. */
3267 static void
3268 vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p,
3269 poly_uint64 min_value)
3271 vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3272 for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3273 if (operand_equal_p (lower_bounds[i].expr, expr, 0))
3275 unsigned_p &= lower_bounds[i].unsigned_p;
3276 min_value = upper_bound (lower_bounds[i].min_value, min_value);
3277 if (lower_bounds[i].unsigned_p != unsigned_p
3278 || maybe_lt (lower_bounds[i].min_value, min_value))
3280 lower_bounds[i].unsigned_p = unsigned_p;
3281 lower_bounds[i].min_value = min_value;
3282 if (dump_enabled_p ())
3284 dump_printf_loc (MSG_NOTE, vect_location,
3285 "updating run-time check to ");
3286 dump_lower_bound (MSG_NOTE, lower_bounds[i]);
3287 dump_printf (MSG_NOTE, "\n");
3290 return;
3293 vec_lower_bound lower_bound (expr, unsigned_p, min_value);
3294 if (dump_enabled_p ())
3296 dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that ");
3297 dump_lower_bound (MSG_NOTE, lower_bound);
3298 dump_printf (MSG_NOTE, "\n");
3300 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (lower_bound);
3303 /* Return true if it's unlikely that the step of the vectorized form of DR_INFO
3304 will span fewer than GAP bytes. */
3306 static bool
3307 vect_small_gap_p (loop_vec_info loop_vinfo, dr_vec_info *dr_info,
3308 poly_int64 gap)
3310 stmt_vec_info stmt_info = dr_info->stmt;
3311 HOST_WIDE_INT count
3312 = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
3313 if (DR_GROUP_FIRST_ELEMENT (stmt_info))
3314 count *= DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info));
3315 return (estimated_poly_value (gap)
3316 <= count * vect_get_scalar_dr_size (dr_info));
3319 /* Return true if we know that there is no alias between DR_INFO_A and
3320 DR_INFO_B when abs (DR_STEP (DR_INFO_A->dr)) >= N for some N.
3321 When returning true, set *LOWER_BOUND_OUT to this N. */
3323 static bool
3324 vectorizable_with_step_bound_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b,
3325 poly_uint64 *lower_bound_out)
3327 /* Check that there is a constant gap of known sign between DR_A
3328 and DR_B. */
3329 data_reference *dr_a = dr_info_a->dr;
3330 data_reference *dr_b = dr_info_b->dr;
3331 poly_int64 init_a, init_b;
3332 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0)
3333 || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0)
3334 || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)
3335 || !poly_int_tree_p (DR_INIT (dr_a), &init_a)
3336 || !poly_int_tree_p (DR_INIT (dr_b), &init_b)
3337 || !ordered_p (init_a, init_b))
3338 return false;
3340 /* Sort DR_A and DR_B by the address they access. */
3341 if (maybe_lt (init_b, init_a))
3343 std::swap (init_a, init_b);
3344 std::swap (dr_info_a, dr_info_b);
3345 std::swap (dr_a, dr_b);
3348 /* If the two accesses could be dependent within a scalar iteration,
3349 make sure that we'd retain their order. */
3350 if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_info_a), init_b)
3351 && !vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
3352 return false;
3354 /* There is no alias if abs (DR_STEP) is greater than or equal to
3355 the bytes spanned by the combination of the two accesses. */
3356 *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_info_b) - init_a;
3357 return true;
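  /* Worked example (hypothetical numbers): two 4-byte accesses to the same
     base with DR_INITs 0 and 12 give *LOWER_BOUND_OUT == 12 + 4 - 0 == 16,
     so the two references do not alias whenever abs (DR_STEP) is at least
     16 bytes.  */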
3360 /* Function vect_prune_runtime_alias_test_list.
3362 Prune a list of ddrs to be tested at run-time by versioning for alias.
3363 Merge several alias checks into one if possible.
3364    Return FALSE if the resulting list of ddrs is longer than allowed by
3365 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
3367 bool
3368 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3370 typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3371 hash_set <tree_pair_hash> compared_objects;
3373 vec<ddr_p> may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3374 vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3375 = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3376 vec<vec_object_pair> &check_unequal_addrs
3377 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3378 poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3379 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3381 ddr_p ddr;
3382 unsigned int i;
3383 tree length_factor;
3385 DUMP_VECT_SCOPE ("vect_prune_runtime_alias_test_list");
3387 /* Step values are irrelevant for aliasing if the number of vector
3388 iterations is equal to the number of scalar iterations (which can
3389 happen for fully-SLP loops). */
3390 bool ignore_step_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U);
3392 if (!ignore_step_p)
3394 /* Convert the checks for nonzero steps into bound tests. */
3395 tree value;
3396 FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value)
3397 vect_check_lower_bound (loop_vinfo, value, true, 1);
3400 if (may_alias_ddrs.is_empty ())
3401 return true;
3403 comp_alias_ddrs.create (may_alias_ddrs.length ());
3405 unsigned int loop_depth
3406 = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3407 LOOP_VINFO_LOOP_NEST (loop_vinfo));
3409 /* First, we collect all data ref pairs for aliasing checks. */
3410 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3412 int comp_res;
3413 poly_uint64 lower_bound;
3414 tree segment_length_a, segment_length_b;
3415 unsigned HOST_WIDE_INT access_size_a, access_size_b;
3416 unsigned int align_a, align_b;
3418 /* Ignore the alias if the VF we chose ended up being no greater
3419 than the dependence distance. */
3420 if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3421 continue;
3423 if (DDR_OBJECT_A (ddr))
3425 vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3426 if (!compared_objects.add (new_pair))
3428 if (dump_enabled_p ())
3430 dump_printf_loc (MSG_NOTE, vect_location, "checking that ");
3431 dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.first);
3432 dump_printf (MSG_NOTE, " and ");
3433 dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.second);
3434 dump_printf (MSG_NOTE, " have different addresses\n");
3436 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3438 continue;
3441 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
3442 stmt_vec_info stmt_info_a = dr_info_a->stmt;
3444 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
3445 stmt_vec_info stmt_info_b = dr_info_b->stmt;
3447 /* Skip the pair if inter-iteration dependencies are irrelevant
3448 and intra-iteration dependencies are guaranteed to be honored. */
3449 if (ignore_step_p
3450 && (vect_preserves_scalar_order_p (dr_info_a, dr_info_b)
3451 || vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3452 &lower_bound)))
3454 if (dump_enabled_p ())
3456 dump_printf_loc (MSG_NOTE, vect_location,
3457 "no need for alias check between ");
3458 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_info_a->dr));
3459 dump_printf (MSG_NOTE, " and ");
3460 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_info_b->dr));
3461 dump_printf (MSG_NOTE, " when VF is 1\n");
3463 continue;
3466 /* See whether we can handle the alias using a bounds check on
3467 the step, and whether that's likely to be the best approach.
3468 (It might not be, for example, if the minimum step is much larger
3469 than the number of bytes handled by one vector iteration.) */
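/* Worked example (sketch only): with 4-byte elements, two accesses *p and
   p[-1] within one scalar iteration together span LOWER_BOUND = 8 bytes,
   so a runtime step of 8 or more (or at most -8, when the stride may be
   backward) guarantees the iterations never overlap.  In that case
   vect_check_lower_bound records a simple bound test on the step instead
   of a full segment-overlap check.  */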
3470 if (!ignore_step_p
3471 && TREE_CODE (DR_STEP (dr_info_a->dr)) != INTEGER_CST
3472 && vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3473 &lower_bound)
3474 && (vect_small_gap_p (loop_vinfo, dr_info_a, lower_bound)
3475 || vect_small_gap_p (loop_vinfo, dr_info_b, lower_bound)))
3477 bool unsigned_p = dr_known_forward_stride_p (dr_info_a->dr);
3478 if (dump_enabled_p ())
3480 dump_printf_loc (MSG_NOTE, vect_location, "no alias between ");
3481 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_info_a->dr));
3482 dump_printf (MSG_NOTE, " and ");
3483 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_info_b->dr));
3484 dump_printf (MSG_NOTE, " when the step ");
3485 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_STEP (dr_info_a->dr));
3486 dump_printf (MSG_NOTE, " is outside ");
3487 if (unsigned_p)
3488 dump_printf (MSG_NOTE, "[0");
3489 else
3491 dump_printf (MSG_NOTE, "(");
3492 dump_dec (MSG_NOTE, poly_int64 (-lower_bound));
3494 dump_printf (MSG_NOTE, ", ");
3495 dump_dec (MSG_NOTE, lower_bound);
3496 dump_printf (MSG_NOTE, ")\n");
3498 vect_check_lower_bound (loop_vinfo, DR_STEP (dr_info_a->dr),
3499 unsigned_p, lower_bound);
3500 continue;
3503 stmt_vec_info dr_group_first_a = DR_GROUP_FIRST_ELEMENT (stmt_info_a);
3504 if (dr_group_first_a)
3506 stmt_info_a = dr_group_first_a;
3507 dr_info_a = STMT_VINFO_DR_INFO (stmt_info_a);
3510 stmt_vec_info dr_group_first_b = DR_GROUP_FIRST_ELEMENT (stmt_info_b);
3511 if (dr_group_first_b)
3513 stmt_info_b = dr_group_first_b;
3514 dr_info_b = STMT_VINFO_DR_INFO (stmt_info_b);
3517 if (ignore_step_p)
3519 segment_length_a = size_zero_node;
3520 segment_length_b = size_zero_node;
3522 else
3524 if (!operand_equal_p (DR_STEP (dr_info_a->dr),
3525 DR_STEP (dr_info_b->dr), 0))
3526 length_factor = scalar_loop_iters;
3527 else
3528 length_factor = size_int (vect_factor);
3529 segment_length_a = vect_vfa_segment_size (dr_info_a, length_factor);
3530 segment_length_b = vect_vfa_segment_size (dr_info_b, length_factor);
3532 access_size_a = vect_vfa_access_size (dr_info_a);
3533 access_size_b = vect_vfa_access_size (dr_info_b);
3534 align_a = vect_vfa_align (dr_info_a);
3535 align_b = vect_vfa_align (dr_info_b);
3537 comp_res = data_ref_compare_tree (DR_BASE_ADDRESS (dr_info_a->dr),
3538 DR_BASE_ADDRESS (dr_info_b->dr));
3539 if (comp_res == 0)
3540 comp_res = data_ref_compare_tree (DR_OFFSET (dr_info_a->dr),
3541 DR_OFFSET (dr_info_b->dr));
3543 /* See whether the alias is known at compilation time. */
3544 if (comp_res == 0
3545 && TREE_CODE (DR_STEP (dr_info_a->dr)) == INTEGER_CST
3546 && TREE_CODE (DR_STEP (dr_info_b->dr)) == INTEGER_CST
3547 && poly_int_tree_p (segment_length_a)
3548 && poly_int_tree_p (segment_length_b))
3550 int res = vect_compile_time_alias (dr_info_a, dr_info_b,
3551 segment_length_a,
3552 segment_length_b,
3553 access_size_a,
3554 access_size_b);
3555 if (res >= 0 && dump_enabled_p ())
3557 dump_printf_loc (MSG_NOTE, vect_location,
3558 "can tell at compile time that ");
3559 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_info_a->dr));
3560 dump_printf (MSG_NOTE, " and ");
3561 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_info_b->dr));
3562 if (res == 0)
3563 dump_printf (MSG_NOTE, " do not alias\n");
3564 else
3565 dump_printf (MSG_NOTE, " alias\n");
3568 if (res == 0)
3569 continue;
3571 if (res == 1)
3573 if (dump_enabled_p ())
3574 dump_printf_loc (MSG_NOTE, vect_location,
3575 "not vectorized: compilation time alias.\n");
3576 return false;
3580 dr_with_seg_len_pair_t dr_with_seg_len_pair
3581 (dr_with_seg_len (dr_info_a->dr, segment_length_a,
3582 access_size_a, align_a),
3583 dr_with_seg_len (dr_info_b->dr, segment_length_b,
3584 access_size_b, align_b));
3586 /* Canonicalize pairs by sorting the two DR members. */
3587 if (comp_res > 0)
3588 std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
3590 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3593 prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);
3595 unsigned int count = (comp_alias_ddrs.length ()
3596 + check_unequal_addrs.length ());
3598 dump_printf_loc (MSG_NOTE, vect_location,
3599 "improved number of alias checks from %d to %d\n",
3600 may_alias_ddrs.length (), count);
3601 if ((int) count > PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
3603 if (dump_enabled_p ())
3604 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3605 "number of versioning for alias "
3606 "run-time tests exceeds %d "
3607 "(--param vect-max-version-for-alias-checks)\n",
3608 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
3609 return false;
3612 return true;
3615 /* Check whether we can use an internal function for a gather load
3616 or scatter store. READ_P is true for loads and false for stores.
3617 MASKED_P is true if the load or store is conditional. MEMORY_TYPE is
3618 the type of the memory elements being loaded or stored. OFFSET_BITS
3619 is the number of bits in each scalar offset and OFFSET_SIGN is the
3620 sign of the offset. SCALE is the amount by which the offset should
3621 be multiplied *after* it has been converted to address width.
3623 Return true if the function is supported, storing the function
3624 id in *IFN_OUT and the type of a vector element in *ELEMENT_TYPE_OUT. */
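/* For instance (illustration only), a gather such as

     for (int i = 0; i < n; i++)
       out[i] = base[idx[i]];

   with 32-bit elements can use the internal function when IDX is 32 bits
   or narrower (narrower offsets are extended), but not when IDX is
   64 bits wide, since truncating the offsets to the element width would
   not be safe.  */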
3626 bool
3627 vect_gather_scatter_fn_p (bool read_p, bool masked_p, tree vectype,
3628 tree memory_type, unsigned int offset_bits,
3629 signop offset_sign, int scale,
3630 internal_fn *ifn_out, tree *element_type_out)
3632 unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
3633 unsigned int element_bits = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (vectype)));
3634 if (offset_bits > element_bits)
3635 /* Internal functions require the offset to be the same width as
3636 the vector elements. We can extend narrower offsets, but it isn't
3637 safe to truncate wider offsets. */
3638 return false;
3640 if (element_bits != memory_bits)
3641 /* For now the vector elements must be the same width as the
3642 memory elements. */
3643 return false;
3645 /* Work out which function we need. */
3646 internal_fn ifn;
3647 if (read_p)
3648 ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
3649 else
3650 ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
3652 /* Test whether the target supports this combination. */
3653 if (!internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
3654 offset_sign, scale))
3655 return false;
3657 *ifn_out = ifn;
3658 *element_type_out = TREE_TYPE (vectype);
3659 return true;
3662 /* STMT_INFO is a call to an internal gather load or scatter store function.
3663 Describe the operation in INFO. */
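/* As read back below (a sketch of the call shape only): argument 0 of
   such an internal call is the base address, argument 1 the vector of
   offsets and argument 2 the scale, e.g.

     vect__1 = .GATHER_LOAD (base_2, vect_off_3, 4);  */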
3665 static void
3666 vect_describe_gather_scatter_call (stmt_vec_info stmt_info,
3667 gather_scatter_info *info)
3669 gcall *call = as_a <gcall *> (stmt_info->stmt);
3670 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3671 data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3673 info->ifn = gimple_call_internal_fn (call);
3674 info->decl = NULL_TREE;
3675 info->base = gimple_call_arg (call, 0);
3676 info->offset = gimple_call_arg (call, 1);
3677 info->offset_dt = vect_unknown_def_type;
3678 info->offset_vectype = NULL_TREE;
3679 info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
3680 info->element_type = TREE_TYPE (vectype);
3681 info->memory_type = TREE_TYPE (DR_REF (dr));
3684 /* Return true if a non-affine read or write in STMT_INFO is suitable for a
3685 gather load or scatter store. Describe the operation in *INFO if so. */
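/* A typical candidate (illustration) is a non-affine access such as

     for (i = 0; i < n; i++)
       sum += data[ind[i]];

   where the loop-invariant DATA becomes the base and the vectorized
   values of IND[i], scaled by the element size, become the offsets.  */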
3687 bool
3688 vect_check_gather_scatter (stmt_vec_info stmt_info, loop_vec_info loop_vinfo,
3689 gather_scatter_info *info)
3691 HOST_WIDE_INT scale = 1;
3692 poly_int64 pbitpos, pbitsize;
3693 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3694 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3695 tree offtype = NULL_TREE;
3696 tree decl = NULL_TREE, base, off;
3697 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3698 tree memory_type = TREE_TYPE (DR_REF (dr));
3699 machine_mode pmode;
3700 int punsignedp, reversep, pvolatilep = 0;
3701 internal_fn ifn;
3702 tree element_type;
3703 bool masked_p = false;
3705 /* See whether this is already a call to a gather/scatter internal function.
3706 If not, see whether it's a masked load or store. */
3707 gcall *call = dyn_cast <gcall *> (stmt_info->stmt);
3708 if (call && gimple_call_internal_p (call))
3710 ifn = gimple_call_internal_fn (call);
3711 if (internal_gather_scatter_fn_p (ifn))
3713 vect_describe_gather_scatter_call (stmt_info, info);
3714 return true;
3716 masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE);
3719 /* True if we should aim to use internal functions rather than
3720 built-in functions. */
3721 bool use_ifn_p = (DR_IS_READ (dr)
3722 ? supports_vec_gather_load_p ()
3723 : supports_vec_scatter_store_p ());
3725 base = DR_REF (dr);
3726 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3727 see if we can use the def stmt of the address. */
3728 if (masked_p
3729 && TREE_CODE (base) == MEM_REF
3730 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3731 && integer_zerop (TREE_OPERAND (base, 1))
3732 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3734 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3735 if (is_gimple_assign (def_stmt)
3736 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3737 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3740 /* The gather and scatter builtins need address of the form
3741 loop_invariant + vector * {1, 2, 4, 8}
3742 or
3743 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3744 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3745 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3746 multiplications and additions in it. To get a vector, we need
3747 a single SSA_NAME that will be defined in the loop and will
3748 contain everything that is not loop invariant and that can be
3749 vectorized. The following code attempts to find such a preexisting
3750 SSA_NAME OFF and put the loop invariants into a tree BASE
3751 that can be gimplified before the loop. */
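/* For instance (sketch), given

     for (i = 0; i < n; i++)
       x[i] = p[ind[i] + 4];

   we would like OFF to end up as the SSA_NAME holding ind[i], with the
   loop-invariant part, p plus 4 scaled by the element size, folded into
   BASE.  */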
3752 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3753 &punsignedp, &reversep, &pvolatilep);
3754 if (reversep)
3755 return false;
3757 poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
3759 if (TREE_CODE (base) == MEM_REF)
3761 if (!integer_zerop (TREE_OPERAND (base, 1)))
3763 if (off == NULL_TREE)
3764 off = wide_int_to_tree (sizetype, mem_ref_offset (base));
3765 else
3766 off = size_binop (PLUS_EXPR, off,
3767 fold_convert (sizetype, TREE_OPERAND (base, 1)));
3769 base = TREE_OPERAND (base, 0);
3771 else
3772 base = build_fold_addr_expr (base);
3774 if (off == NULL_TREE)
3775 off = size_zero_node;
3777 /* If BASE is not loop invariant, then if OFF is 0 we start with just
3778 the constant offset in the loop-invariant BASE and continue with BASE
3779 as OFF, otherwise we give up.
3780 We could handle that case by gimplifying the addition of base + off
3781 into some SSA_NAME and use that as off, but for now punt. */
3782 if (!expr_invariant_in_loop_p (loop, base))
3784 if (!integer_zerop (off))
3785 return false;
3786 off = base;
3787 base = size_int (pbytepos);
3789 /* Otherwise put base + constant offset into the loop invariant BASE
3790 and continue with OFF. */
3791 else
3793 base = fold_convert (sizetype, base);
3794 base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
3797 /* OFF at this point may be either a SSA_NAME or some tree expression
3798 from get_inner_reference. Try to peel off loop invariants from it
3799 into BASE as long as possible. */
3800 STRIP_NOPS (off);
3801 while (offtype == NULL_TREE)
3803 enum tree_code code;
3804 tree op0, op1, add = NULL_TREE;
3806 if (TREE_CODE (off) == SSA_NAME)
3808 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3810 if (expr_invariant_in_loop_p (loop, off))
3811 return false;
3813 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3814 break;
3816 op0 = gimple_assign_rhs1 (def_stmt);
3817 code = gimple_assign_rhs_code (def_stmt);
3818 op1 = gimple_assign_rhs2 (def_stmt);
3820 else
3822 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3823 return false;
3824 code = TREE_CODE (off);
3825 extract_ops_from_tree (off, &code, &op0, &op1);
3827 switch (code)
3829 case POINTER_PLUS_EXPR:
3830 case PLUS_EXPR:
3831 if (expr_invariant_in_loop_p (loop, op0))
3833 add = op0;
3834 off = op1;
3835 do_add:
3836 add = fold_convert (sizetype, add);
3837 if (scale != 1)
3838 add = size_binop (MULT_EXPR, add, size_int (scale));
3839 base = size_binop (PLUS_EXPR, base, add);
3840 continue;
3842 if (expr_invariant_in_loop_p (loop, op1))
3844 add = op1;
3845 off = op0;
3846 goto do_add;
3848 break;
3849 case MINUS_EXPR:
3850 if (expr_invariant_in_loop_p (loop, op1))
3852 add = fold_convert (sizetype, op1);
3853 add = size_binop (MINUS_EXPR, size_zero_node, add);
3854 off = op0;
3855 goto do_add;
3857 break;
3858 case MULT_EXPR:
3859 if (scale == 1 && tree_fits_shwi_p (op1))
3861 int new_scale = tree_to_shwi (op1);
3862 /* Only treat this as a scaling operation if the target
3863 supports it. */
3864 if (use_ifn_p
3865 && !vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p,
3866 vectype, memory_type, 1,
3867 TYPE_SIGN (TREE_TYPE (op0)),
3868 new_scale, &ifn,
3869 &element_type))
3870 break;
3871 scale = new_scale;
3872 off = op0;
3873 continue;
3875 break;
3876 case SSA_NAME:
3877 off = op0;
3878 continue;
3879 CASE_CONVERT:
3880 if (!POINTER_TYPE_P (TREE_TYPE (op0))
3881 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3882 break;
3883 if (TYPE_PRECISION (TREE_TYPE (op0))
3884 == TYPE_PRECISION (TREE_TYPE (off)))
3886 off = op0;
3887 continue;
3890 /* The internal functions need the offset to be the same width
3891 as the elements of VECTYPE. Don't include operations that
3892 cast the offset from that width to a different width. */
3893 if (use_ifn_p
3894 && (int_size_in_bytes (TREE_TYPE (vectype))
3895 == int_size_in_bytes (TREE_TYPE (off))))
3896 break;
3898 if (TYPE_PRECISION (TREE_TYPE (op0))
3899 < TYPE_PRECISION (TREE_TYPE (off)))
3901 off = op0;
3902 offtype = TREE_TYPE (off);
3903 STRIP_NOPS (off);
3904 continue;
3906 break;
3907 default:
3908 break;
3910 break;
3913 /* If at the end OFF still isn't a SSA_NAME or isn't
3914 defined in the loop, punt. */
3915 if (TREE_CODE (off) != SSA_NAME
3916 || expr_invariant_in_loop_p (loop, off))
3917 return false;
3919 if (offtype == NULL_TREE)
3920 offtype = TREE_TYPE (off);
3922 if (use_ifn_p)
3924 if (!vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p, vectype,
3925 memory_type, TYPE_PRECISION (offtype),
3926 TYPE_SIGN (offtype), scale, &ifn,
3927 &element_type))
3928 return false;
3930 else
3932 if (DR_IS_READ (dr))
3934 if (targetm.vectorize.builtin_gather)
3935 decl = targetm.vectorize.builtin_gather (vectype, offtype, scale);
3937 else
3939 if (targetm.vectorize.builtin_scatter)
3940 decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale);
3943 if (!decl)
3944 return false;
3946 ifn = IFN_LAST;
3947 element_type = TREE_TYPE (vectype);
3950 info->ifn = ifn;
3951 info->decl = decl;
3952 info->base = base;
3953 info->offset = off;
3954 info->offset_dt = vect_unknown_def_type;
3955 info->offset_vectype = NULL_TREE;
3956 info->scale = scale;
3957 info->element_type = element_type;
3958 info->memory_type = memory_type;
3959 return true;
3962 /* Find the data references in STMT, analyze them with respect to LOOP and
3963 append them to DATAREFS. Return false if datarefs in this stmt cannot
3964 be handled. */
3966 bool
3967 vect_find_stmt_data_reference (loop_p loop, gimple *stmt,
3968 vec<data_reference_p> *datarefs)
3970 /* We can ignore clobbers for dataref analysis - they are removed during
3971 loop vectorization and BB vectorization checks dependences with a
3972 stmt walk. */
3973 if (gimple_clobber_p (stmt))
3974 return true;
3976 if (gimple_has_volatile_ops (stmt))
3978 if (dump_enabled_p ())
3980 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3981 "not vectorized: volatile type ");
3982 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3984 return false;
3987 if (stmt_can_throw_internal (stmt))
3989 if (dump_enabled_p ())
3991 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3992 "not vectorized: statement can throw an "
3993 "exception ");
3994 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3996 return false;
3999 auto_vec<data_reference_p, 2> refs;
4000 if (!find_data_references_in_stmt (loop, stmt, &refs))
4001 return false;
4003 if (refs.is_empty ())
4004 return true;
4006 if (refs.length () > 1)
4008 if (dump_enabled_p ())
4010 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4011 "not vectorized: more than one data ref "
4012 "in stmt: ");
4013 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4015 return false;
4018 if (gcall *call = dyn_cast <gcall *> (stmt))
4019 if (!gimple_call_internal_p (call)
4020 || (gimple_call_internal_fn (call) != IFN_MASK_LOAD
4021 && gimple_call_internal_fn (call) != IFN_MASK_STORE))
4023 if (dump_enabled_p ())
4025 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4026 "not vectorized: dr in a call ");
4027 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4029 return false;
4032 data_reference_p dr = refs.pop ();
4033 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
4034 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
4036 if (dump_enabled_p ())
4038 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4039 "not vectorized: statement is bitfield "
4040 "access ");
4041 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4043 return false;
4046 if (DR_BASE_ADDRESS (dr)
4047 && TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
4049 if (dump_enabled_p ())
4050 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4051 "not vectorized: base addr of dr is a "
4052 "constant\n");
4053 return false;
4056 /* Check whether this may be a SIMD lane access and adjust the
4057 DR to make it easier for us to handle it. */
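/* Such accesses typically come from OpenMP SIMD lowering, e.g. (sketch,
   with hypothetical names)

     #pragma omp simd
     for (i = 0; i < n; i++)
       { ... magic_array[__simd_lane] ... }

   where the DR's offset is __simd_lane * sizeof (element) and
   __simd_lane is defined by an IFN_GOMP_SIMD_LANE call.  The code below
   rewrites such a DR to a zero offset and an element-sized step, and
   marks the rewritten DR in its aux field as a simd-lane access.  */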
4058 if (loop
4059 && loop->simduid
4060 && (!DR_BASE_ADDRESS (dr)
4061 || !DR_OFFSET (dr)
4062 || !DR_INIT (dr)
4063 || !DR_STEP (dr)))
4065 struct data_reference *newdr
4066 = create_data_ref (NULL, loop_containing_stmt (stmt), DR_REF (dr), stmt,
4067 DR_IS_READ (dr), DR_IS_CONDITIONAL_IN_STMT (dr));
4068 if (DR_BASE_ADDRESS (newdr)
4069 && DR_OFFSET (newdr)
4070 && DR_INIT (newdr)
4071 && DR_STEP (newdr)
4072 && integer_zerop (DR_STEP (newdr)))
4074 tree off = DR_OFFSET (newdr);
4075 STRIP_NOPS (off);
4076 if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
4077 && TREE_CODE (off) == MULT_EXPR
4078 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
4080 tree step = TREE_OPERAND (off, 1);
4081 off = TREE_OPERAND (off, 0);
4082 STRIP_NOPS (off);
4083 if (CONVERT_EXPR_P (off)
4084 && (TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off, 0)))
4085 < TYPE_PRECISION (TREE_TYPE (off))))
4086 off = TREE_OPERAND (off, 0);
4087 if (TREE_CODE (off) == SSA_NAME)
4089 gimple *def = SSA_NAME_DEF_STMT (off);
4090 tree reft = TREE_TYPE (DR_REF (newdr));
4091 if (is_gimple_call (def)
4092 && gimple_call_internal_p (def)
4093 && (gimple_call_internal_fn (def) == IFN_GOMP_SIMD_LANE))
4095 tree arg = gimple_call_arg (def, 0);
4096 gcc_assert (TREE_CODE (arg) == SSA_NAME);
4097 arg = SSA_NAME_VAR (arg);
4098 if (arg == loop->simduid
4099 /* For now. */
4100 && tree_int_cst_equal (TYPE_SIZE_UNIT (reft), step))
4102 DR_OFFSET (newdr) = ssize_int (0);
4103 DR_STEP (newdr) = step;
4104 DR_OFFSET_ALIGNMENT (newdr) = BIGGEST_ALIGNMENT;
4105 DR_STEP_ALIGNMENT (newdr)
4106 = highest_pow2_factor (step);
4107 /* Mark as simd-lane access. */
4108 newdr->aux = (void *)-1;
4109 free_data_ref (dr);
4110 datarefs->safe_push (newdr);
4111 return true;
4117 free_data_ref (newdr);
4120 datarefs->safe_push (dr);
4121 return true;
4124 /* Function vect_analyze_data_refs.
4126 Find all the data references in the loop or basic block.
4128 The general structure of the analysis of data refs in the vectorizer is as
4129 follows:
4130 1- vect_analyze_data_refs(loop/bb): call
4131 compute_data_dependences_for_loop/bb to find and analyze all data-refs
4132 in the loop/bb and their dependences.
4133 2- vect_analyze_dependences(): apply dependence testing using ddrs.
4134 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
4135 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
4139 bool
4140 vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf)
4142 struct loop *loop = NULL;
4143 unsigned int i;
4144 struct data_reference *dr;
4145 tree scalar_type;
4147 DUMP_VECT_SCOPE ("vect_analyze_data_refs");
4149 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4150 loop = LOOP_VINFO_LOOP (loop_vinfo);
4152 /* Go through the data-refs, check that the analysis succeeded. Update
4153 pointer from stmt_vec_info struct to DR and vectype. */
4155 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
4156 FOR_EACH_VEC_ELT (datarefs, i, dr)
4158 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
4159 poly_uint64 vf;
4161 gcc_assert (DR_REF (dr));
4162 stmt_vec_info stmt_info = vinfo->lookup_stmt (DR_STMT (dr));
4163 gcc_assert (!stmt_info->dr_aux.dr);
4164 stmt_info->dr_aux.dr = dr;
4165 stmt_info->dr_aux.stmt = stmt_info;
4167 /* Check that analysis of the data-ref succeeded. */
4168 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
4169 || !DR_STEP (dr))
4171 bool maybe_gather
4172 = DR_IS_READ (dr)
4173 && !TREE_THIS_VOLATILE (DR_REF (dr))
4174 && (targetm.vectorize.builtin_gather != NULL
4175 || supports_vec_gather_load_p ());
4176 bool maybe_scatter
4177 = DR_IS_WRITE (dr)
4178 && !TREE_THIS_VOLATILE (DR_REF (dr))
4179 && (targetm.vectorize.builtin_scatter != NULL
4180 || supports_vec_scatter_store_p ());
4182 /* If the target supports vector gather loads or scatter stores,
4183 see if they can be used. */
4184 if (is_a <loop_vec_info> (vinfo)
4185 && !nested_in_vect_loop_p (loop, stmt_info))
4187 if (maybe_gather || maybe_scatter)
4189 if (maybe_gather)
4190 gatherscatter = GATHER;
4191 else
4192 gatherscatter = SCATTER;
4196 if (gatherscatter == SG_NONE)
4198 if (dump_enabled_p ())
4200 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4201 "not vectorized: data ref analysis "
4202 "failed ");
4203 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
4204 stmt_info->stmt, 0);
4206 if (is_a <bb_vec_info> (vinfo))
4208 /* In BB vectorization the ref can still participate
4209 in dependence analysis, we just can't vectorize it. */
4210 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4211 continue;
4213 return false;
4217 /* See if this was detected as SIMD lane access. */
4218 if (dr->aux == (void *)-1)
4220 if (nested_in_vect_loop_p (loop, stmt_info))
4222 if (dump_enabled_p ())
4224 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4225 "not vectorized: data ref analysis "
4226 "failed ");
4227 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
4228 stmt_info->stmt, 0);
4230 return false;
4232 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
4235 tree base = get_base_address (DR_REF (dr));
4236 if (base && VAR_P (base) && DECL_NONALIASED (base))
4238 if (dump_enabled_p ())
4240 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4241 "not vectorized: base object not addressable "
4242 "for stmt: ");
4243 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
4244 stmt_info->stmt, 0);
4246 if (is_a <bb_vec_info> (vinfo))
4248 /* In BB vectorization the ref can still participate
4249 in dependence analysis, we just can't vectorize it. */
4250 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4251 continue;
4253 return false;
4256 if (is_a <loop_vec_info> (vinfo)
4257 && DR_STEP (dr)
4258 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
4260 if (nested_in_vect_loop_p (loop, stmt_info))
4262 if (dump_enabled_p ())
4264 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4265 "not vectorized: not suitable for strided "
4266 "load ");
4267 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
4268 stmt_info->stmt, 0);
4270 return false;
4272 STMT_VINFO_STRIDED_P (stmt_info) = true;
4275 /* Update DR field in stmt_vec_info struct. */
4277 /* If the dataref is in an inner-loop of the loop that is considered
4278 for vectorization, we also want to analyze the access relative to
4279 the outer-loop (DR contains information only relative to the
4280 inner-most enclosing loop). We do that by building a reference to the
4281 first location accessed by the inner-loop, and analyze it relative to
4282 the outer-loop. */
4283 if (loop && nested_in_vect_loop_p (loop, stmt_info))
4285 /* Build a reference to the first location accessed by the
4286 inner loop: *(BASE + INIT + OFFSET). By construction,
4287 this address must be invariant in the inner loop, so we
4288 can consider it as being used in the outer loop. */
4289 tree base = unshare_expr (DR_BASE_ADDRESS (dr));
4290 tree offset = unshare_expr (DR_OFFSET (dr));
4291 tree init = unshare_expr (DR_INIT (dr));
4292 tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
4293 init, offset);
4294 tree init_addr = fold_build_pointer_plus (base, init_offset);
4295 tree init_ref = build_fold_indirect_ref (init_addr);
4297 if (dump_enabled_p ())
4299 dump_printf_loc (MSG_NOTE, vect_location,
4300 "analyze in outer loop: ");
4301 dump_generic_expr (MSG_NOTE, TDF_SLIM, init_ref);
4302 dump_printf (MSG_NOTE, "\n");
4305 if (!dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
4306 init_ref, loop))
4307 /* dr_analyze_innermost already explained the failure. */
4308 return false;
4310 if (dump_enabled_p ())
4312 dump_printf_loc (MSG_NOTE, vect_location,
4313 "\touter base_address: ");
4314 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4315 STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
4316 dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
4317 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4318 STMT_VINFO_DR_OFFSET (stmt_info));
4319 dump_printf (MSG_NOTE,
4320 "\n\touter constant offset from base address: ");
4321 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4322 STMT_VINFO_DR_INIT (stmt_info));
4323 dump_printf (MSG_NOTE, "\n\touter step: ");
4324 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4325 STMT_VINFO_DR_STEP (stmt_info));
4326 dump_printf (MSG_NOTE, "\n\touter base alignment: %d\n",
4327 STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info));
4328 dump_printf (MSG_NOTE, "\n\touter base misalignment: %d\n",
4329 STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info));
4330 dump_printf (MSG_NOTE, "\n\touter offset alignment: %d\n",
4331 STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info));
4332 dump_printf (MSG_NOTE, "\n\touter step alignment: %d\n",
4333 STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
4337 /* Set vectype for STMT. */
4338 scalar_type = TREE_TYPE (DR_REF (dr));
4339 STMT_VINFO_VECTYPE (stmt_info)
4340 = get_vectype_for_scalar_type (scalar_type);
4341 if (!STMT_VINFO_VECTYPE (stmt_info))
4343 if (dump_enabled_p ())
4345 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4346 "not vectorized: no vectype for stmt: ");
4347 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
4348 stmt_info->stmt, 0);
4349 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
4350 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
4351 scalar_type);
4352 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
4355 if (is_a <bb_vec_info> (vinfo))
4357 /* Not having a vector type is fine; the ref can still participate
4358 in dependence analysis, we just can't vectorize it. */
4359 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4360 continue;
4362 return false;
4364 else
4366 if (dump_enabled_p ())
4368 dump_printf_loc (MSG_NOTE, vect_location,
4369 "got vectype for stmt: ");
4370 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt_info->stmt, 0);
4371 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4372 STMT_VINFO_VECTYPE (stmt_info));
4373 dump_printf (MSG_NOTE, "\n");
4377 /* Adjust the minimal vectorization factor according to the
4378 vector type. */
4379 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
4380 *min_vf = upper_bound (*min_vf, vf);
4382 if (gatherscatter != SG_NONE)
4384 gather_scatter_info gs_info;
4385 if (!vect_check_gather_scatter (stmt_info,
4386 as_a <loop_vec_info> (vinfo),
4387 &gs_info)
4388 || !get_vectype_for_scalar_type (TREE_TYPE (gs_info.offset)))
4390 if (dump_enabled_p ())
4392 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4393 (gatherscatter == GATHER) ?
4394 "not vectorized: not suitable for gather "
4395 "load " :
4396 "not vectorized: not suitable for scatter "
4397 "store ");
4398 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
4399 stmt_info->stmt, 0);
4401 return false;
4403 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
4407 /* We used to stop processing and prune the list here. Verify we no
4408 longer need to. */
4409 gcc_assert (i == datarefs.length ());
4411 return true;
4415 /* Function vect_get_new_vect_var.
4417 Returns a name for a new variable. The current naming scheme prepends a
4418 prefix that depends on VAR_KIND ("vect", "stmp", "mask" or "vectp",
4419 see the switch below) to NAME if NAME is provided, and otherwise uses
4420 the prefix alone. */
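/* For example (sketch): a vect_simple_var with NAME "foo" becomes a
   temporary named "vect_foo", a vect_pointer_var becomes "vectp_foo",
   and with no NAME the bare prefix is used.  */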
4422 tree
4423 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
4425 const char *prefix;
4426 tree new_vect_var;
4428 switch (var_kind)
4430 case vect_simple_var:
4431 prefix = "vect";
4432 break;
4433 case vect_scalar_var:
4434 prefix = "stmp";
4435 break;
4436 case vect_mask_var:
4437 prefix = "mask";
4438 break;
4439 case vect_pointer_var:
4440 prefix = "vectp";
4441 break;
4442 default:
4443 gcc_unreachable ();
4446 if (name)
4448 char* tmp = concat (prefix, "_", name, NULL);
4449 new_vect_var = create_tmp_reg (type, tmp);
4450 free (tmp);
4452 else
4453 new_vect_var = create_tmp_reg (type, prefix);
4455 return new_vect_var;
4458 /* Like vect_get_new_vect_var but return an SSA name. */
4460 tree
4461 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
4463 const char *prefix;
4464 tree new_vect_var;
4466 switch (var_kind)
4468 case vect_simple_var:
4469 prefix = "vect";
4470 break;
4471 case vect_scalar_var:
4472 prefix = "stmp";
4473 break;
4474 case vect_pointer_var:
4475 prefix = "vectp";
4476 break;
4477 default:
4478 gcc_unreachable ();
4481 if (name)
4483 char* tmp = concat (prefix, "_", name, NULL);
4484 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4485 free (tmp);
4487 else
4488 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4490 return new_vect_var;
4493 /* Duplicate ptr info and set alignment/misalignment on NAME from DR_INFO. */
4495 static void
4496 vect_duplicate_ssa_name_ptr_info (tree name, dr_vec_info *dr_info)
4498 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr_info->dr));
4499 int misalign = DR_MISALIGNMENT (dr_info);
4500 if (misalign == DR_MISALIGNMENT_UNKNOWN)
4501 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4502 else
4503 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name),
4504 DR_TARGET_ALIGNMENT (dr_info), misalign);
4507 /* Function vect_create_addr_base_for_vector_ref.
4509 Create an expression that computes the address of the first memory location
4510 that will be accessed for a data reference.
4512 Input:
4513 STMT_INFO: The statement containing the data reference.
4514 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4515 OFFSET: Optional. If supplied, it is added to the initial address.
4516 LOOP: Specify relative to which loop-nest the address should be computed.
4517 For example, when the dataref is in an inner-loop nested in an
4518 outer-loop that is now being vectorized, LOOP can be either the
4519 outer-loop, or the inner-loop. The first memory location accessed
4520 by the following dataref ('in' points to short):
4522 for (i=0; i<N; i++)
4523 for (j=0; j<M; j++)
4524 s += in[i+j]
4526 is as follows:
4527 if LOOP=i_loop: &in (relative to i_loop)
4528 if LOOP=j_loop: &in+i*2B (relative to j_loop)
4529 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
4530 initial address. Unlike OFFSET, which is a number of elements to
4531 be added, BYTE_OFFSET is measured in bytes.
4533 Output:
4534 1. Return an SSA_NAME whose value is the address of the memory location of
4535 the first vector of the data reference.
4536 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4537 these statement(s) which define the returned SSA_NAME.
4539 FORNOW: We are only handling array accesses with step 1. */
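/* As a sketch of the value built below, the returned address is
   conceptually

     addr = BASE_ADDRESS + (OFFSET_FROM_BASE + INIT)
	    + OFFSET * sizeof (element) + BYTE_OFFSET;

   where BASE_ADDRESS, OFFSET_FROM_BASE and INIT come from the dataref's
   innermost behavior (all byte quantities), emitted as gimplified
   statements appended to NEW_STMT_LIST.  */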
4541 tree
4542 vect_create_addr_base_for_vector_ref (stmt_vec_info stmt_info,
4543 gimple_seq *new_stmt_list,
4544 tree offset,
4545 tree byte_offset)
4547 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
4548 struct data_reference *dr = dr_info->dr;
4549 const char *base_name;
4550 tree addr_base;
4551 tree dest;
4552 gimple_seq seq = NULL;
4553 tree vect_ptr_type;
4554 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
4555 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4556 innermost_loop_behavior *drb = vect_dr_behavior (dr_info);
4558 tree data_ref_base = unshare_expr (drb->base_address);
4559 tree base_offset = unshare_expr (drb->offset);
4560 tree init = unshare_expr (drb->init);
4562 if (loop_vinfo)
4563 base_name = get_name (data_ref_base);
4564 else
4566 base_offset = ssize_int (0);
4567 init = ssize_int (0);
4568 base_name = get_name (DR_REF (dr));
4571 /* Create base_offset */
4572 base_offset = size_binop (PLUS_EXPR,
4573 fold_convert (sizetype, base_offset),
4574 fold_convert (sizetype, init));
4576 if (offset)
4578 offset = fold_build2 (MULT_EXPR, sizetype,
4579 fold_convert (sizetype, offset), step);
4580 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4581 base_offset, offset);
4583 if (byte_offset)
4585 byte_offset = fold_convert (sizetype, byte_offset);
4586 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4587 base_offset, byte_offset);
4590 /* base + base_offset */
4591 if (loop_vinfo)
4592 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4593 else
4595 addr_base = build1 (ADDR_EXPR,
4596 build_pointer_type (TREE_TYPE (DR_REF (dr))),
4597 unshare_expr (DR_REF (dr)));
4600 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4601 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4602 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4603 gimple_seq_add_seq (new_stmt_list, seq);
4605 if (DR_PTR_INFO (dr)
4606 && TREE_CODE (addr_base) == SSA_NAME
4607 && !SSA_NAME_PTR_INFO (addr_base))
4609 vect_duplicate_ssa_name_ptr_info (addr_base, dr_info);
4610 if (offset || byte_offset)
4611 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4614 if (dump_enabled_p ())
4616 dump_printf_loc (MSG_NOTE, vect_location, "created ");
4617 dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
4618 dump_printf (MSG_NOTE, "\n");
4621 return addr_base;
4625 /* Function vect_create_data_ref_ptr.
4627 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4628 location accessed in the loop by STMT_INFO, along with the def-use update
4629 chain to appropriately advance the pointer through the loop iterations.
4630 Also set aliasing information for the pointer. This pointer is used by
4631 the callers to this function to create a memory reference expression for
4632 vector load/store access.
4634 Input:
4635 1. STMT_INFO: a stmt that references memory. Expected to be of the form
4636 GIMPLE_ASSIGN <name, data-ref> or
4637 GIMPLE_ASSIGN <data-ref, name>.
4638 2. AGGR_TYPE: the type of the reference, which should be either a vector
4639 or an array.
4640 3. AT_LOOP: the loop where the vector memref is to be created.
4641 4. OFFSET (optional): an offset to be added to the initial address accessed
4642 by the data-ref in STMT_INFO.
4643 5. BSI: location where the new stmts are to be placed if there is no loop
4644 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4645 pointing to the initial address.
4646 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4647 to the initial address accessed by the data-ref in STMT_INFO. This is
4648 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4649 in bytes.
4650 8. IV_STEP (optional, defaults to NULL): the amount that should be added
4651 to the IV during each iteration of the loop. NULL says to move
4652 by one copy of AGGR_TYPE up or down, depending on the step of the
4653 data reference.
4655 Output:
4656 1. Declare a new ptr to vector_type, and have it point to the base of the
4657 data reference (the initial address accessed by the data reference).
4658 For example, for vector of type V8HI, the following code is generated:
4660 v8hi *ap;
4661 ap = (v8hi *)initial_address;
4663 if OFFSET is not supplied:
4664 initial_address = &a[init];
4665 if OFFSET is supplied:
4666 initial_address = &a[init + OFFSET];
4667 if BYTE_OFFSET is supplied:
4668 initial_address = &a[init] + BYTE_OFFSET;
4670 Return the initial_address in INITIAL_ADDRESS.
4672 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4673 update the pointer in each iteration of the loop.
4675 Return the increment stmt that updates the pointer in PTR_INCR.
4677 3. Return the pointer. */
4679 tree
4680 vect_create_data_ref_ptr (stmt_vec_info stmt_info, tree aggr_type,
4681 struct loop *at_loop, tree offset,
4682 tree *initial_address, gimple_stmt_iterator *gsi,
4683 gimple **ptr_incr, bool only_init,
4684 tree byte_offset, tree iv_step)
4686 const char *base_name;
4687 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4688 struct loop *loop = NULL;
4689 bool nested_in_vect_loop = false;
4690 struct loop *containing_loop = NULL;
4691 tree aggr_ptr_type;
4692 tree aggr_ptr;
4693 tree new_temp;
4694 gimple_seq new_stmt_list = NULL;
4695 edge pe = NULL;
4696 basic_block new_bb;
4697 tree aggr_ptr_init;
4698 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
4699 struct data_reference *dr = dr_info->dr;
4700 tree aptr;
4701 gimple_stmt_iterator incr_gsi;
4702 bool insert_after;
4703 tree indx_before_incr, indx_after_incr;
4704 gimple *incr;
4705 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4707 gcc_assert (iv_step != NULL_TREE
4708 || TREE_CODE (aggr_type) == ARRAY_TYPE
4709 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4711 if (loop_vinfo)
4713 loop = LOOP_VINFO_LOOP (loop_vinfo);
4714 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
4715 containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
4716 pe = loop_preheader_edge (loop);
4718 else
4720 gcc_assert (bb_vinfo);
4721 only_init = true;
4722 *ptr_incr = NULL;
4725 /* Create an expression for the first address accessed by this load
4726 in LOOP. */
4727 base_name = get_name (DR_BASE_ADDRESS (dr));
4729 if (dump_enabled_p ())
4731 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4732 dump_printf_loc (MSG_NOTE, vect_location,
4733 "create %s-pointer variable to type: ",
4734 get_tree_code_name (TREE_CODE (aggr_type)));
4735 dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4736 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4737 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4738 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4739 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4740 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4741 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4742 else
4743 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4744 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4745 dump_printf (MSG_NOTE, "\n");
4748 /* (1) Create the new aggregate-pointer variable.
4749 Vector and array types inherit the alias set of their component
4750 type by default so we need to use a ref-all pointer if the data
4751 reference does not conflict with the created aggregated data
4752 reference because it is not addressable. */
4753 bool need_ref_all = false;
4754 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4755 get_alias_set (DR_REF (dr))))
4756 need_ref_all = true;
4757 /* Likewise for any of the data references in the stmt group. */
4758 else if (DR_GROUP_SIZE (stmt_info) > 1)
4760 stmt_vec_info sinfo = DR_GROUP_FIRST_ELEMENT (stmt_info);
4761 do
4763 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4764 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4765 get_alias_set (DR_REF (sdr))))
4767 need_ref_all = true;
4768 break;
4770 sinfo = DR_GROUP_NEXT_ELEMENT (sinfo);
4772 while (sinfo);
4774 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4775 need_ref_all);
4776 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4779 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4780 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4781 def-use update cycles for the pointer: one relative to the outer-loop
4782 (LOOP), which is what steps (2) and (3) below do. The other is relative
4783 to the inner-loop (which is the inner-most loop containing the dataref),
4784 and this is done by step (4) below.
4786 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4787 inner-most loop, and so steps (2),(3) work the same, and step (4) is
4788 redundant. Steps (2),(3) create the following:
4790 vp0 = &base_addr;
4791 LOOP: vp1 = phi(vp0,vp2)
4794 vp2 = vp1 + step
4795 goto LOOP
4797 If there is an inner-loop nested in loop, then step (4) will also be
4798 applied, and an additional update in the inner-loop will be created:
4800 vp0 = &base_addr;
4801 LOOP: vp1 = phi(vp0,vp2)
4803 inner: vp3 = phi(vp1,vp4)
4804 vp4 = vp3 + inner_step
4805 if () goto inner
4807 vp2 = vp1 + step
4808 if () goto LOOP */
4810 /* (2) Calculate the initial address of the aggregate-pointer, and set
4811 the aggregate-pointer to point to it before the loop. */
4813 /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader. */
4815 new_temp = vect_create_addr_base_for_vector_ref (stmt_info, &new_stmt_list,
4816 offset, byte_offset);
4817 if (new_stmt_list)
4819 if (pe)
4821 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4822 gcc_assert (!new_bb);
4824 else
4825 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4828 *initial_address = new_temp;
4829 aggr_ptr_init = new_temp;
4831 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4832 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4833 inner-loop nested in LOOP (during outer-loop vectorization). */
4835 /* No update in loop is required. */
4836 if (only_init && (!loop_vinfo || at_loop == loop))
4837 aptr = aggr_ptr_init;
4838 else
4840 /* Accesses to invariant addresses should be handled specially
4841 by the caller. */
4842 tree step = vect_dr_behavior (dr_info)->step;
4843 gcc_assert (!integer_zerop (step));
4845 if (iv_step == NULL_TREE)
4847 /* The step of the aggregate pointer is the type size,
4848 negated for downward accesses. */
4849 iv_step = TYPE_SIZE_UNIT (aggr_type);
4850 if (tree_int_cst_sgn (step) == -1)
4851 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4854 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4856 create_iv (aggr_ptr_init,
4857 fold_convert (aggr_ptr_type, iv_step),
4858 aggr_ptr, loop, &incr_gsi, insert_after,
4859 &indx_before_incr, &indx_after_incr);
4860 incr = gsi_stmt (incr_gsi);
4861 loop_vinfo->add_stmt (incr);
4863 /* Copy the points-to information if it exists. */
4864 if (DR_PTR_INFO (dr))
4866 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
4867 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
4869 if (ptr_incr)
4870 *ptr_incr = incr;
4872 aptr = indx_before_incr;
4875 if (!nested_in_vect_loop || only_init)
4876 return aptr;
4879 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4880 nested in LOOP, if exists. */
4882 gcc_assert (nested_in_vect_loop);
4883 if (!only_init)
4885 standard_iv_increment_position (containing_loop, &incr_gsi,
4886 &insert_after);
4887 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4888 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4889 &indx_after_incr);
4890 incr = gsi_stmt (incr_gsi);
4891 loop_vinfo->add_stmt (incr);
4893 /* Copy the points-to information if it exists. */
4894 if (DR_PTR_INFO (dr))
4896 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
4897 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
4899 if (ptr_incr)
4900 *ptr_incr = incr;
4902 return indx_before_incr;
4904 else
4905 gcc_unreachable ();
4909 /* Function bump_vector_ptr
4911 Increment a pointer (to a vector type) by vector-size. If requested,
4912 i.e. if PTR-INCR is given, then also connect the new increment stmt
4913 to the existing def-use update-chain of the pointer, by modifying
4914 the PTR_INCR as illustrated below:
4916 The pointer def-use update-chain before this function:
4917 DATAREF_PTR = phi (p_0, p_2)
4918 ....
4919 PTR_INCR: p_2 = DATAREF_PTR + step
4921 The pointer def-use update-chain after this function:
4922 DATAREF_PTR = phi (p_0, p_2)
4923 ....
4924 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4925 ....
4926 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4928 Input:
4929 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4930 in the loop.
4931 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4932 the loop. The increment amount across iterations is expected
4933 to be vector_size.
4934 BSI - location where the new update stmt is to be placed.
4935 STMT_INFO - the original scalar memory-access stmt that is being vectorized.
4936 BUMP - optional. The offset by which to bump the pointer. If not given,
4937 the offset is assumed to be vector_size.
4939 Output: Return NEW_DATAREF_PTR as illustrated above.
4943 tree
4944 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4945 stmt_vec_info stmt_info, tree bump)
4947 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4948 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4949 tree update = TYPE_SIZE_UNIT (vectype);
4950 gassign *incr_stmt;
4951 ssa_op_iter iter;
4952 use_operand_p use_p;
4953 tree new_dataref_ptr;
4955 if (bump)
4956 update = bump;
4958 if (TREE_CODE (dataref_ptr) == SSA_NAME)
4959 new_dataref_ptr = copy_ssa_name (dataref_ptr);
4960 else
4961 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4962 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4963 dataref_ptr, update);
4964 vect_finish_stmt_generation (stmt_info, incr_stmt, gsi);
4966 /* Copy the points-to information if it exists. */
4967 if (DR_PTR_INFO (dr))
4969 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4970 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4973 if (!ptr_incr)
4974 return new_dataref_ptr;
4976 /* Update the vector-pointer's cross-iteration increment. */
4977 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4979 tree use = USE_FROM_PTR (use_p);
4981 if (use == dataref_ptr)
4982 SET_USE (use_p, new_dataref_ptr);
4983 else
4984 gcc_assert (operand_equal_p (use, update, 0));
4987 return new_dataref_ptr;
4991 /* Copy memory reference info such as base/clique from the SRC reference
4992 to the DEST MEM_REF. */
4994 void
4995 vect_copy_ref_info (tree dest, tree src)
4997 if (TREE_CODE (dest) != MEM_REF)
4998 return;
5000 tree src_base = src;
5001 while (handled_component_p (src_base))
5002 src_base = TREE_OPERAND (src_base, 0);
5003 if (TREE_CODE (src_base) != MEM_REF
5004 && TREE_CODE (src_base) != TARGET_MEM_REF)
5005 return;
5007 MR_DEPENDENCE_CLIQUE (dest) = MR_DEPENDENCE_CLIQUE (src_base);
5008 MR_DEPENDENCE_BASE (dest) = MR_DEPENDENCE_BASE (src_base);
5012 /* Function vect_create_destination_var.
5014 Create a new temporary of type VECTYPE. */
5016 tree
5017 vect_create_destination_var (tree scalar_dest, tree vectype)
5019 tree vec_dest;
5020 const char *name;
5021 char *new_name;
5022 tree type;
5023 enum vect_var_kind kind;
5025 kind = vectype
5026 ? VECTOR_BOOLEAN_TYPE_P (vectype)
5027 ? vect_mask_var
5028 : vect_simple_var
5029 : vect_scalar_var;
5030 type = vectype ? vectype : TREE_TYPE (scalar_dest);
5032 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
5034 name = get_name (scalar_dest);
5035 if (name)
5036 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
5037 else
5038 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
5039 vec_dest = vect_get_new_vect_var (type, kind, new_name);
5040 free (new_name);
5042 return vec_dest;
5045 /* Function vect_grouped_store_supported.
5047 Returns TRUE if interleave high and interleave low permutations
5048 are supported, and FALSE otherwise. */
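/* As a concrete illustration (target-independent): interleaving two
   4-element vectors {0 1 2 3} and {4 5 6 7} uses the constant
   permutations

     interleave_high: 0 4 1 5
     interleave_low:  2 6 3 7

   which is what the power-of-two branch below asks the target about;
   the COUNT == 3 branch instead builds the two three-way VEC_PERM_EXPR
   masks used by vect_permute_store_chain.  */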
5050 bool
5051 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
5053 machine_mode mode = TYPE_MODE (vectype);
5055 /* vect_permute_store_chain requires the group size to be equal to 3 or
5056 be a power of two. */
5057 if (count != 3 && exact_log2 (count) == -1)
5059 if (dump_enabled_p ())
5060 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5061 "the size of the group of accesses"
5062 " is not a power of 2 or not eqaul to 3\n");
5063 return false;
5066 /* Check that the permutation is supported. */
5067 if (VECTOR_MODE_P (mode))
5069 unsigned int i;
5070 if (count == 3)
5072 unsigned int j0 = 0, j1 = 0, j2 = 0;
5073 unsigned int i, j;
5075 unsigned int nelt;
5076 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5078 if (dump_enabled_p ())
5079 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5080 "cannot handle groups of 3 stores for"
5081 " variable-length vectors\n");
5082 return false;
5085 vec_perm_builder sel (nelt, nelt, 1);
5086 sel.quick_grow (nelt);
5087 vec_perm_indices indices;
5088 for (j = 0; j < 3; j++)
5090 int nelt0 = ((3 - j) * nelt) % 3;
5091 int nelt1 = ((3 - j) * nelt + 1) % 3;
5092 int nelt2 = ((3 - j) * nelt + 2) % 3;
5093 for (i = 0; i < nelt; i++)
5095 if (3 * i + nelt0 < nelt)
5096 sel[3 * i + nelt0] = j0++;
5097 if (3 * i + nelt1 < nelt)
5098 sel[3 * i + nelt1] = nelt + j1++;
5099 if (3 * i + nelt2 < nelt)
5100 sel[3 * i + nelt2] = 0;
5102 indices.new_vector (sel, 2, nelt);
5103 if (!can_vec_perm_const_p (mode, indices))
5105 if (dump_enabled_p ())
5106 dump_printf (MSG_MISSED_OPTIMIZATION,
5107 "permutation op not supported by target.\n");
5108 return false;
5111 for (i = 0; i < nelt; i++)
5113 if (3 * i + nelt0 < nelt)
5114 sel[3 * i + nelt0] = 3 * i + nelt0;
5115 if (3 * i + nelt1 < nelt)
5116 sel[3 * i + nelt1] = 3 * i + nelt1;
5117 if (3 * i + nelt2 < nelt)
5118 sel[3 * i + nelt2] = nelt + j2++;
5120 indices.new_vector (sel, 2, nelt);
5121 if (!can_vec_perm_const_p (mode, indices))
5123 if (dump_enabled_p ())
5124 dump_printf (MSG_MISSED_OPTIMIZATION,
5125 "permutation op not supported by target.\n");
5126 return false;
5129 return true;
5131 else
5133 /* If COUNT is not equal to 3 then only a power of 2 is supported. */
5134 gcc_assert (pow2p_hwi (count));
5135 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5137 /* The encoding has 2 interleaved stepped patterns. */
5138 vec_perm_builder sel (nelt, 2, 3);
5139 sel.quick_grow (6);
5140 for (i = 0; i < 3; i++)
5142 sel[i * 2] = i;
5143 sel[i * 2 + 1] = i + nelt;
5145 vec_perm_indices indices (sel, 2, nelt);
5146 if (can_vec_perm_const_p (mode, indices))
5148 for (i = 0; i < 6; i++)
5149 sel[i] += exact_div (nelt, 2);
5150 indices.new_vector (sel, 2, nelt);
5151 if (can_vec_perm_const_p (mode, indices))
5152 return true;
5157 if (dump_enabled_p ())
5158 dump_printf (MSG_MISSED_OPTIMIZATION,
5159 "permutaion op not supported by target.\n");
5160 return false;
5164 /* Return TRUE if vec_{mask_}store_lanes is available for COUNT vectors of
5165 type VECTYPE. MASKED_P says whether the masked form is needed. */
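/* For example (sketch): vec_store_lanes with COUNT == 2 stores the
   elements of two vectors {a0 a1 ...} and {b0 b1 ...} to memory in the
   interleaved order a0 b0 a1 b1 ..., so a grouped store of two
   interleaved sequences needs no separate permutation statements.  */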
5167 bool
5168 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5169 bool masked_p)
5171 if (masked_p)
5172 return vect_lanes_optab_supported_p ("vec_mask_store_lanes",
5173 vec_mask_store_lanes_optab,
5174 vectype, count);
5175 else
5176 return vect_lanes_optab_supported_p ("vec_store_lanes",
5177 vec_store_lanes_optab,
5178 vectype, count);
5182 /* Function vect_permute_store_chain.
5184 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
5185 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
5186 the data correctly for the stores. Return the final references for stores
5187 in RESULT_CHAIN.
5189 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5190 The input is 4 vectors each containing 8 elements. We assign a number to
5191 each element, the input sequence is:
5193 1st vec: 0 1 2 3 4 5 6 7
5194 2nd vec: 8 9 10 11 12 13 14 15
5195 3rd vec: 16 17 18 19 20 21 22 23
5196 4th vec: 24 25 26 27 28 29 30 31
5198 The output sequence should be:
5200 1st vec: 0 8 16 24 1 9 17 25
5201 2nd vec: 2 10 18 26 3 11 19 27
5202 3rd vec: 4 12 20 28 5 13 21 29
5203 4th vec: 6 14 22 30 7 15 23 31
5205 i.e., we interleave the contents of the four vectors in their order.
5207 We use interleave_high/low instructions to create such output. The input of
5208 each interleave_high/low operation is two vectors:
5209 1st vec 2nd vec
5210 0 1 2 3 4 5 6 7
5211 the even elements of the result vector are obtained left-to-right from the
5212 high/low elements of the first vector. The odd elements of the result are
5213 obtained left-to-right from the high/low elements of the second vector.
5214 The output of interleave_high will be: 0 4 1 5
5215 and of interleave_low: 2 6 3 7
5218 The permutation is done in log LENGTH stages. In each stage interleave_high
5219 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5220 where the first argument is taken from the first half of DR_CHAIN and the
5221 second argument from its second half.
5222 In our example,
5224 I1: interleave_high (1st vec, 3rd vec)
5225 I2: interleave_low (1st vec, 3rd vec)
5226 I3: interleave_high (2nd vec, 4th vec)
5227 I4: interleave_low (2nd vec, 4th vec)
5229 The output for the first stage is:
5231 I1: 0 16 1 17 2 18 3 19
5232 I2: 4 20 5 21 6 22 7 23
5233 I3: 8 24 9 25 10 26 11 27
5234 I4: 12 28 13 29 14 30 15 31
5236 The output of the second stage, i.e. the final result is:
5238 I1: 0 8 16 24 1 9 17 25
5239 I2: 2 10 18 26 3 11 19 27
5240 I3: 4 12 20 28 5 13 21 29
5241 I4: 6 14 22 30 7 15 23 31. */
5243 void
5244 vect_permute_store_chain (vec<tree> dr_chain,
5245 unsigned int length,
5246 stmt_vec_info stmt_info,
5247 gimple_stmt_iterator *gsi,
5248 vec<tree> *result_chain)
5250 tree vect1, vect2, high, low;
5251 gimple *perm_stmt;
5252 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5253 tree perm_mask_low, perm_mask_high;
5254 tree data_ref;
5255 tree perm3_mask_low, perm3_mask_high;
5256 unsigned int i, j, n, log_length = exact_log2 (length);
5258 result_chain->quick_grow (length);
5259 memcpy (result_chain->address (), dr_chain.address (),
5260 length * sizeof (tree));
5262 if (length == 3)
5264 /* vect_grouped_store_supported ensures that this is constant. */
5265 unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5266 unsigned int j0 = 0, j1 = 0, j2 = 0;
5268 vec_perm_builder sel (nelt, nelt, 1);
5269 sel.quick_grow (nelt);
5270 vec_perm_indices indices;
5271 for (j = 0; j < 3; j++)
5273 int nelt0 = ((3 - j) * nelt) % 3;
5274 int nelt1 = ((3 - j) * nelt + 1) % 3;
5275 int nelt2 = ((3 - j) * nelt + 2) % 3;
5277 for (i = 0; i < nelt; i++)
5279 if (3 * i + nelt0 < nelt)
5280 sel[3 * i + nelt0] = j0++;
5281 if (3 * i + nelt1 < nelt)
5282 sel[3 * i + nelt1] = nelt + j1++;
5283 if (3 * i + nelt2 < nelt)
5284 sel[3 * i + nelt2] = 0;
5286 indices.new_vector (sel, 2, nelt);
5287 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5289 for (i = 0; i < nelt; i++)
5291 if (3 * i + nelt0 < nelt)
5292 sel[3 * i + nelt0] = 3 * i + nelt0;
5293 if (3 * i + nelt1 < nelt)
5294 sel[3 * i + nelt1] = 3 * i + nelt1;
5295 if (3 * i + nelt2 < nelt)
5296 sel[3 * i + nelt2] = nelt + j2++;
5298 indices.new_vector (sel, 2, nelt);
5299 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5301 vect1 = dr_chain[0];
5302 vect2 = dr_chain[1];
5304 /* Create interleaving stmt:
5305 low = VEC_PERM_EXPR <vect1, vect2,
5306 {j, nelt + j, *, j + 1, nelt + j + 1, *,
5307 j + 2, nelt + j + 2, *, ...}> */
5308 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5309 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5310 vect2, perm3_mask_low);
5311 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5313 vect1 = data_ref;
5314 vect2 = dr_chain[2];
5315 /* Create interleaving stmt:
5316 high = VEC_PERM_EXPR <vect1, vect2,
5317 {0, 1, nelt + j, 3, 4, nelt + j + 1,
5318 6, 7, nelt + j + 2, ...}> */
5319 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5320 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5321 vect2, perm3_mask_high);
5322 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5323 (*result_chain)[j] = data_ref;
5326 else
5328 /* If length is not equal to 3, then only a power of 2 is supported. */
5329 gcc_assert (pow2p_hwi (length));
5331 /* The encoding has 2 interleaved stepped patterns. */
5332 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5333 vec_perm_builder sel (nelt, 2, 3);
5334 sel.quick_grow (6);
5335 for (i = 0; i < 3; i++)
5337 sel[i * 2] = i;
5338 sel[i * 2 + 1] = i + nelt;
5340 vec_perm_indices indices (sel, 2, nelt);
5341 perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5343 for (i = 0; i < 6; i++)
5344 sel[i] += exact_div (nelt, 2);
5345 indices.new_vector (sel, 2, nelt);
5346 perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5348 for (i = 0, n = log_length; i < n; i++)
5350 for (j = 0; j < length/2; j++)
5352 vect1 = dr_chain[j];
5353 vect2 = dr_chain[j+length/2];
5355 /* Create interleaving stmt:
5356 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
5357 ...}> */
5358 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
5359 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
5360 vect2, perm_mask_high);
5361 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5362 (*result_chain)[2*j] = high;
5364 /* Create interleaving stmt:
5365 low = VEC_PERM_EXPR <vect1, vect2,
5366 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
5367 ...}> */
5368 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
5369 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
5370 vect2, perm_mask_low);
5371 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5372 (*result_chain)[2*j+1] = low;
5374 memcpy (dr_chain.address (), result_chain->address (),
5375 length * sizeof (tree));
5380 /* Function vect_setup_realignment
5382 This function is called when vectorizing an unaligned load using
5383 the dr_explicit_realign[_optimized] scheme.
5384 This function generates the following code at the loop prolog:
5386 p = initial_addr;
5387 x msq_init = *(floor(p)); # prolog load
5388 realignment_token = call target_builtin;
5389 loop:
5390 x msq = phi (msq_init, ---)
5392 The stmts marked with x are generated only for the case of
5393 dr_explicit_realign_optimized.
5395 The code above sets up a new (vector) pointer, pointing to the first
5396 location accessed by STMT_INFO, and a "floor-aligned" load using that
5397 pointer. It also generates code to compute the "realignment-token"
5398 (if the relevant target hook was defined), and creates a phi-node at the
5399 loop-header bb whose arguments are the result of the prolog-load (created
5400 by this function) and the result of a load that takes place in the loop
5401 (to be created by the caller to this function).
5403 For the case of dr_explicit_realign_optimized:
5404 The caller to this function uses the phi-result (msq) to create the
5405 realignment code inside the loop, and sets up the missing phi argument,
5406 as follows:
5407 loop:
5408 msq = phi (msq_init, lsq)
5409 lsq = *(floor(p')); # load in loop
5410 result = realign_load (msq, lsq, realignment_token);
5412 For the case of dr_explicit_realign:
5413 loop:
5414 msq = *(floor(p)); # load in loop
5415 p' = p + (VS-1);
5416 lsq = *(floor(p')); # load in loop
5417 result = realign_load (msq, lsq, realignment_token);
5419 Input:
5420 STMT_INFO - (scalar) load stmt to be vectorized. This load accesses
5421 a memory location that may be unaligned.
5422 GSI - place where new code is to be inserted.
5423 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5424 is used.
5426 Output:
5427 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5428 target hook, if defined.
5429 Return value - the result of the loop-header phi node. */
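/* As an illustrative sketch (assuming a 16-byte target alignment), the
   prolog generated below for dr_explicit_realign_optimized amounts to

     vectp = <initial address>;                # vect_create_data_ref_ptr
     vectp_floor = vectp & -16;                # the BIT_AND_EXPR below
     msq_init = MEM[vectp_floor];              # prolog load
     realignment_token = <mask_for_load builtin> (init_addr);

   where the last call is only emitted if the target defines the
   builtin_mask_for_load hook.  The names above are placeholders; the
   actual SSA names come from make_ssa_name/copy_ssa_name.  */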
5431 tree
5432 vect_setup_realignment (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
5433 tree *realignment_token,
5434 enum dr_alignment_support alignment_support_scheme,
5435 tree init_addr,
5436 struct loop **at_loop)
5438 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5439 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5440 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
5441 struct data_reference *dr = dr_info->dr;
5442 struct loop *loop = NULL;
5443 edge pe = NULL;
5444 tree scalar_dest = gimple_assign_lhs (stmt_info->stmt);
5445 tree vec_dest;
5446 gimple *inc;
5447 tree ptr;
5448 tree data_ref;
5449 basic_block new_bb;
5450 tree msq_init = NULL_TREE;
5451 tree new_temp;
5452 gphi *phi_stmt;
5453 tree msq = NULL_TREE;
5454 gimple_seq stmts = NULL;
5455 bool compute_in_loop = false;
5456 bool nested_in_vect_loop = false;
5457 struct loop *containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
5458 struct loop *loop_for_initial_load = NULL;
5460 if (loop_vinfo)
5462 loop = LOOP_VINFO_LOOP (loop_vinfo);
5463 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
5466 gcc_assert (alignment_support_scheme == dr_explicit_realign
5467 || alignment_support_scheme == dr_explicit_realign_optimized);
5469 /* We need to generate three things:
5470 1. the misalignment computation
5471 2. the extra vector load (for the optimized realignment scheme).
5472 3. the phi node for the two vectors from which the realignment is
5473 done (for the optimized realignment scheme). */
5475 /* 1. Determine where to generate the misalignment computation.
5477 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5478 calculation will be generated by this function, outside the loop (in the
5479 preheader). Otherwise, INIT_ADDR has already been computed for us by the
5480 caller, inside the loop.
5482 Background: If the misalignment remains fixed throughout the iterations of
5483 the loop, then both realignment schemes are applicable, and also the
5484 misalignment computation can be done outside LOOP. This is because we are
5485 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5486 are a multiple of VS (the Vector Size), and therefore the misalignment in
5487 different vectorized LOOP iterations is always the same.
5488 The problem arises only if the memory access is in an inner-loop nested
5489 inside LOOP, which is now being vectorized using outer-loop vectorization.
5490 This is the only case when the misalignment of the memory access may not
5491 remain fixed throughout the iterations of the inner-loop (as explained in
5492 detail in vect_supportable_dr_alignment). In this case, not only is the
5493 optimized realignment scheme not applicable, but also the misalignment
5494 computation (and generation of the realignment token that is passed to
5495 REALIGN_LOAD) has to be done inside the loop.
5497 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5498 or not, which in turn determines if the misalignment is computed inside
5499 the inner-loop, or outside LOOP. */
5501 if (init_addr != NULL_TREE || !loop_vinfo)
5503 compute_in_loop = true;
5504 gcc_assert (alignment_support_scheme == dr_explicit_realign);
5508 /* 2. Determine where to generate the extra vector load.
5510 For the optimized realignment scheme, instead of generating two vector
5511 loads in each iteration, we generate a single extra vector load in the
5512 preheader of the loop, and in each iteration reuse the result of the
5513 vector load from the previous iteration. In case the memory access is in
5514 an inner-loop nested inside LOOP, which is now being vectorized using
5515 outer-loop vectorization, we need to determine whether this initial vector
5516 load should be generated at the preheader of the inner-loop, or can be
5517 generated at the preheader of LOOP. If the memory access has no evolution
5518 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5519 to be generated inside LOOP (in the preheader of the inner-loop). */
5521 if (nested_in_vect_loop)
5523 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5524 bool invariant_in_outerloop =
5525 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5526 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5528 else
5529 loop_for_initial_load = loop;
5530 if (at_loop)
5531 *at_loop = loop_for_initial_load;
5533 if (loop_for_initial_load)
5534 pe = loop_preheader_edge (loop_for_initial_load);
5536 /* 3. For the case of the optimized realignment, create the first vector
5537 load at the loop preheader. */
5539 if (alignment_support_scheme == dr_explicit_realign_optimized)
5541 /* Create msq_init = *(floor(p1)) in the loop preheader */
5542 gassign *new_stmt;
5544 gcc_assert (!compute_in_loop);
5545 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5546 ptr = vect_create_data_ref_ptr (stmt_info, vectype,
5547 loop_for_initial_load, NULL_TREE,
5548 &init_addr, NULL, &inc, true);
5549 if (TREE_CODE (ptr) == SSA_NAME)
5550 new_temp = copy_ssa_name (ptr);
5551 else
5552 new_temp = make_ssa_name (TREE_TYPE (ptr));
5553 unsigned int align = DR_TARGET_ALIGNMENT (dr_info);
5554 new_stmt = gimple_build_assign
5555 (new_temp, BIT_AND_EXPR, ptr,
5556 build_int_cst (TREE_TYPE (ptr), -(HOST_WIDE_INT) align));
5557 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5558 gcc_assert (!new_bb);
5559 data_ref
5560 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5561 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5562 vect_copy_ref_info (data_ref, DR_REF (dr));
5563 new_stmt = gimple_build_assign (vec_dest, data_ref);
5564 new_temp = make_ssa_name (vec_dest, new_stmt);
5565 gimple_assign_set_lhs (new_stmt, new_temp);
5566 if (pe)
5568 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5569 gcc_assert (!new_bb);
5571 else
5572 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5574 msq_init = gimple_assign_lhs (new_stmt);
5577 /* 4. Create realignment token using a target builtin, if available.
5578 It is done either inside the containing loop, or before LOOP (as
5579 determined above). */
5581 if (targetm.vectorize.builtin_mask_for_load)
5583 gcall *new_stmt;
5584 tree builtin_decl;
5586 /* Compute INIT_ADDR - the initial address accessed by this memref. */
5587 if (!init_addr)
5589 /* Generate the INIT_ADDR computation outside LOOP. */
5590 init_addr = vect_create_addr_base_for_vector_ref (stmt_info, &stmts,
5591 NULL_TREE);
5592 if (loop)
5594 pe = loop_preheader_edge (loop);
5595 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5596 gcc_assert (!new_bb);
5598 else
5599 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5602 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5603 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5604 vec_dest =
5605 vect_create_destination_var (scalar_dest,
5606 gimple_call_return_type (new_stmt));
5607 new_temp = make_ssa_name (vec_dest, new_stmt);
5608 gimple_call_set_lhs (new_stmt, new_temp);
5610 if (compute_in_loop)
5611 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5612 else
5614 /* Generate the misalignment computation outside LOOP. */
5615 pe = loop_preheader_edge (loop);
5616 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5617 gcc_assert (!new_bb);
5620 *realignment_token = gimple_call_lhs (new_stmt);
5622 /* The result of the CALL_EXPR to this builtin is determined from
5623 the value of the parameter and no global variables are touched
5624 which makes the builtin a "const" function. Requiring the
5625 builtin to have the "const" attribute makes it unnecessary
5626 to call mark_call_clobbered. */
5627 gcc_assert (TREE_READONLY (builtin_decl));
5630 if (alignment_support_scheme == dr_explicit_realign)
5631 return msq;
5633 gcc_assert (!compute_in_loop);
5634 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5637 /* 5. Create msq = phi <msq_init, lsq> in loop */
5639 pe = loop_preheader_edge (containing_loop);
5640 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5641 msq = make_ssa_name (vec_dest);
5642 phi_stmt = create_phi_node (msq, containing_loop->header);
5643 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5645 return msq;
5649 /* Function vect_grouped_load_supported.
5651 COUNT is the size of the load group (the number of statements plus the
5652 number of gaps). SINGLE_ELEMENT_P is true if there is actually
5653 only one statement, with a gap of COUNT - 1.
5655 Returns true if a suitable permute exists. */
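/* As a worked example, for COUNT == 3 with 4-element vectors (nelt == 4)
   the pair of permutations checked below for k == 0 is

     { 0, 3, 6, 0 }   every 3rd element of the first two vectors
     { 0, 1, 2, 5 }   fill the last slot from the third vector

   which together extract field 0 of a 3-field structure.  The power-of-2
   case instead checks the extract-even { 0, 2, 4, ... } and extract-odd
   { 1, 3, 5, ... } masks.  */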
5657 bool
5658 vect_grouped_load_supported (tree vectype, bool single_element_p,
5659 unsigned HOST_WIDE_INT count)
5661 machine_mode mode = TYPE_MODE (vectype);
5663 /* If this is single-element interleaving with an element distance
5664 that leaves unused vector loads around, punt - we at least create
5665 very sub-optimal code in that case (and blow up memory,
5666 see PR65518). */
5667 if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype)))
5669 if (dump_enabled_p ())
5670 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5671 "single-element interleaving not supported "
5672 "for not adjacent vector loads\n");
5673 return false;
5676 /* vect_permute_load_chain requires the group size to be equal to 3 or
5677 be a power of two. */
5678 if (count != 3 && exact_log2 (count) == -1)
5680 if (dump_enabled_p ())
5681 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5682 "the size of the group of accesses"
5683 " is not a power of 2 or not equal to 3\n");
5684 return false;
5687 /* Check that the permutation is supported. */
5688 if (VECTOR_MODE_P (mode))
5690 unsigned int i, j;
5691 if (count == 3)
5693 unsigned int nelt;
5694 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5696 if (dump_enabled_p ())
5697 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5698 "cannot handle groups of 3 loads for"
5699 " variable-length vectors\n");
5700 return false;
5703 vec_perm_builder sel (nelt, nelt, 1);
5704 sel.quick_grow (nelt);
5705 vec_perm_indices indices;
5706 unsigned int k;
5707 for (k = 0; k < 3; k++)
5709 for (i = 0; i < nelt; i++)
5710 if (3 * i + k < 2 * nelt)
5711 sel[i] = 3 * i + k;
5712 else
5713 sel[i] = 0;
5714 indices.new_vector (sel, 2, nelt);
5715 if (!can_vec_perm_const_p (mode, indices))
5717 if (dump_enabled_p ())
5718 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5719 "shuffle of 3 loads is not supported by"
5720 " target\n");
5721 return false;
5723 for (i = 0, j = 0; i < nelt; i++)
5724 if (3 * i + k < 2 * nelt)
5725 sel[i] = i;
5726 else
5727 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5728 indices.new_vector (sel, 2, nelt);
5729 if (!can_vec_perm_const_p (mode, indices))
5731 if (dump_enabled_p ())
5732 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5733 "shuffle of 3 loads is not supported by"
5734 " target\n");
5735 return false;
5738 return true;
5740 else
5742 /* If length is not equal to 3, then only a power of 2 is supported. */
5743 gcc_assert (pow2p_hwi (count));
5744 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5746 /* The encoding has a single stepped pattern. */
5747 vec_perm_builder sel (nelt, 1, 3);
5748 sel.quick_grow (3);
5749 for (i = 0; i < 3; i++)
5750 sel[i] = i * 2;
5751 vec_perm_indices indices (sel, 2, nelt);
5752 if (can_vec_perm_const_p (mode, indices))
5754 for (i = 0; i < 3; i++)
5755 sel[i] = i * 2 + 1;
5756 indices.new_vector (sel, 2, nelt);
5757 if (can_vec_perm_const_p (mode, indices))
5758 return true;
5763 if (dump_enabled_p ())
5764 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5765 "extract even/odd not supported by target\n");
5766 return false;
5769 /* Return TRUE if vec_{masked_}load_lanes is available for COUNT vectors of
5770 type VECTYPE. MASKED_P says whether the masked form is needed. */
5772 bool
5773 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5774 bool masked_p)
5776 if (masked_p)
5777 return vect_lanes_optab_supported_p ("vec_mask_load_lanes",
5778 vec_mask_load_lanes_optab,
5779 vectype, count);
5780 else
5781 return vect_lanes_optab_supported_p ("vec_load_lanes",
5782 vec_load_lanes_optab,
5783 vectype, count);
5786 /* Function vect_permute_load_chain.
5788 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5789 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5790 the input data correctly. Return the final references for loads in
5791 RESULT_CHAIN.
5793 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5794 The input is 4 vectors, each containing 8 elements. We assign a number to each
5795 element; the input sequence is:
5797 1st vec: 0 1 2 3 4 5 6 7
5798 2nd vec: 8 9 10 11 12 13 14 15
5799 3rd vec: 16 17 18 19 20 21 22 23
5800 4th vec: 24 25 26 27 28 29 30 31
5802 The output sequence should be:
5804 1st vec: 0 4 8 12 16 20 24 28
5805 2nd vec: 1 5 9 13 17 21 25 29
5806 3rd vec: 2 6 10 14 18 22 26 30
5807 4th vec: 3 7 11 15 19 23 27 31
5809 i.e., the first output vector should contain the first elements of each
5810 interleaving group, etc.
5812 We use extract_even/odd instructions to create such output. The input of
5813 each extract_even/odd operation is two vectors
5814 1st vec 2nd vec
5815 0 1 2 3 4 5 6 7
5817 and the output is the vector of extracted even/odd elements. The output of
5818 extract_even will be: 0 2 4 6
5819 and of extract_odd: 1 3 5 7
5822 The permutation is done in log LENGTH stages. In each stage extract_even
5823 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5824 their order. In our example,
5826 E1: extract_even (1st vec, 2nd vec)
5827 E2: extract_odd (1st vec, 2nd vec)
5828 E3: extract_even (3rd vec, 4th vec)
5829 E4: extract_odd (3rd vec, 4th vec)
5831 The output for the first stage will be:
5833 E1: 0 2 4 6 8 10 12 14
5834 E2: 1 3 5 7 9 11 13 15
5835 E3: 16 18 20 22 24 26 28 30
5836 E4: 17 19 21 23 25 27 29 31
5838 In order to proceed and create the correct sequence for the next stage (or
5839 for the correct output, if the second stage is the last one, as in our
5840 example), we first put the output of extract_even operation and then the
5841 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5842 The input for the second stage is:
5844 1st vec (E1): 0 2 4 6 8 10 12 14
5845 2nd vec (E3): 16 18 20 22 24 26 28 30
5846 3rd vec (E2): 1 3 5 7 9 11 13 15
5847 4th vec (E4): 17 19 21 23 25 27 29 31
5849 The output of the second stage:
5851 E1: 0 4 8 12 16 20 24 28
5852 E2: 2 6 10 14 18 22 26 30
5853 E3: 1 5 9 13 17 21 25 29
5854 E4: 3 7 11 15 19 23 27 31
5856 And RESULT_CHAIN after reordering:
5858 1st vec (E1): 0 4 8 12 16 20 24 28
5859 2nd vec (E3): 1 5 9 13 17 21 25 29
5860 3rd vec (E2): 2 6 10 14 18 22 26 30
5861 4th vec (E4): 3 7 11 15 19 23 27 31. */
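/* As a concrete illustration, assuming 8-element vectors (nelt == 8), the
   masks built below for the power-of-2 case are

     perm_mask_even = { 0, 2, 4, 6, 8, 10, 12, 14 }
     perm_mask_odd  = { 1, 3, 5, 7, 9, 11, 13, 15 }

   so E1 in the example above is VEC_PERM_EXPR <1st vec, 2nd vec,
   perm_mask_even> and E2 is the same with perm_mask_odd.  */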
5863 static void
5864 vect_permute_load_chain (vec<tree> dr_chain,
5865 unsigned int length,
5866 stmt_vec_info stmt_info,
5867 gimple_stmt_iterator *gsi,
5868 vec<tree> *result_chain)
5870 tree data_ref, first_vect, second_vect;
5871 tree perm_mask_even, perm_mask_odd;
5872 tree perm3_mask_low, perm3_mask_high;
5873 gimple *perm_stmt;
5874 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5875 unsigned int i, j, log_length = exact_log2 (length);
5877 result_chain->quick_grow (length);
5878 memcpy (result_chain->address (), dr_chain.address (),
5879 length * sizeof (tree));
5881 if (length == 3)
5883 /* vect_grouped_load_supported ensures that this is constant. */
5884 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5885 unsigned int k;
5887 vec_perm_builder sel (nelt, nelt, 1);
5888 sel.quick_grow (nelt);
5889 vec_perm_indices indices;
5890 for (k = 0; k < 3; k++)
5892 for (i = 0; i < nelt; i++)
5893 if (3 * i + k < 2 * nelt)
5894 sel[i] = 3 * i + k;
5895 else
5896 sel[i] = 0;
5897 indices.new_vector (sel, 2, nelt);
5898 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5900 for (i = 0, j = 0; i < nelt; i++)
5901 if (3 * i + k < 2 * nelt)
5902 sel[i] = i;
5903 else
5904 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5905 indices.new_vector (sel, 2, nelt);
5906 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5908 first_vect = dr_chain[0];
5909 second_vect = dr_chain[1];
5911 /* Create interleaving stmt (low part of):
5912 low = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5913 ...}> */
5914 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5915 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5916 second_vect, perm3_mask_low);
5917 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5919 /* Create interleaving stmt (high part of):
5920 high = VEC_PERM_EXPR <first_vect, second_vect, {0, 1, 2, ...,
5921 nelt + ((nelt + k) % 3), nelt + ((nelt + k) % 3) + 3, ...}> */
5922 first_vect = data_ref;
5923 second_vect = dr_chain[2];
5924 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5925 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5926 second_vect, perm3_mask_high);
5927 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5928 (*result_chain)[k] = data_ref;
5931 else
5933 /* If length is not equal to 3, then only a power of 2 is supported. */
5934 gcc_assert (pow2p_hwi (length));
5936 /* The encoding has a single stepped pattern. */
5937 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5938 vec_perm_builder sel (nelt, 1, 3);
5939 sel.quick_grow (3);
5940 for (i = 0; i < 3; ++i)
5941 sel[i] = i * 2;
5942 vec_perm_indices indices (sel, 2, nelt);
5943 perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);
5945 for (i = 0; i < 3; ++i)
5946 sel[i] = i * 2 + 1;
5947 indices.new_vector (sel, 2, nelt);
5948 perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
5950 for (i = 0; i < log_length; i++)
5952 for (j = 0; j < length; j += 2)
5954 first_vect = dr_chain[j];
5955 second_vect = dr_chain[j+1];
5957 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5958 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5959 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5960 first_vect, second_vect,
5961 perm_mask_even);
5962 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5963 (*result_chain)[j/2] = data_ref;
5965 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5966 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5967 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5968 first_vect, second_vect,
5969 perm_mask_odd);
5970 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5971 (*result_chain)[j/2+length/2] = data_ref;
5973 memcpy (dr_chain.address (), result_chain->address (),
5974 length * sizeof (tree));
5979 /* Function vect_shift_permute_load_chain.
5981 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate a
5982 sequence of stmts to reorder the input data accordingly.
5983 Return the final references for loads in RESULT_CHAIN.
5984 Return true if successful, false otherwise.
5986 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5987 The input is 3 vectors, each containing 8 elements. We assign a
5988 number to each element; the input sequence is:
5990 1st vec: 0 1 2 3 4 5 6 7
5991 2nd vec: 8 9 10 11 12 13 14 15
5992 3rd vec: 16 17 18 19 20 21 22 23
5994 The output sequence should be:
5996 1st vec: 0 3 6 9 12 15 18 21
5997 2nd vec: 1 4 7 10 13 16 19 22
5998 3rd vec: 2 5 8 11 14 17 20 23
6000 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
6002 First we shuffle all 3 vectors to get correct elements order:
6004 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
6005 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
6006 3rd vec: (16 19 22) (17 20 23) (18 21)
6008 Next we unite and shift vector 3 times:
6010 1st step:
6011 shift right by 6 the concatenation of:
6012 "1st vec" and "2nd vec"
6013 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
6014 "2nd vec" and "3rd vec"
6015 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
6016 "3rd vec" and "1st vec"
6017 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
6018 | New vectors |
6020 So that now new vectors are:
6022 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
6023 2nd vec: (10 13) (16 19 22) (17 20 23)
6024 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
6026 2nd step:
6027 shift right by 5 the concatenation of:
6028 "1st vec" and "3rd vec"
6029 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
6030 "2nd vec" and "1st vec"
6031 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
6032 "3rd vec" and "2nd vec"
6033 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
6034 | New vectors |
6036 So that now new vectors are:
6038 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
6039 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
6040 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
6042 3rd step:
6043 shift right by 5 the concatenation of:
6044 "1st vec" and "1st vec"
6045 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
6046 shift right by 3 the concatenation of:
6047 "2nd vec" and "2nd vec"
6048 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
6049 | New vectors |
6051 So that now all vectors are READY:
6052 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
6053 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
6054 3rd vec: ( 1 4 7) (10 13) (16 19 22)
6056 This algorithm is faster than the one in vect_permute_load_chain if:
6057 1. "shift of a concatenation" is faster than a general permutation.
6058 This is usually so.
6059 2. The TARGET machine can't execute vector instructions in parallel.
6060 This is because each step of the algorithm depends on the previous one.
6061 The algorithm in vect_permute_load_chain is much more parallel.
6063 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
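/* As a worked example of the power-of-2 path, for LENGTH == 2 with
   8-element vectors (nelt == 8) the masks built below are

     perm2_mask1 = { 0, 2, 4, 6, 1, 3, 5, 7 }   evens then odds of a vector
     perm2_mask2 = { 1, 3, 5, 7, 0, 2, 4, 6 }   odds then evens of a vector
     shift1_mask = { 4, 5, 6, 7, 8, 9, 10, 11 }
     select_mask = { 0, 1, 2, 3, 12, 13, 14, 15 }

   so for an interleaved input a0 b0 a1 b1 ... a7 b7 the SELECT_MASK
   permutation yields a0 ... a7 and the SHIFT1_MASK permutation yields
   b0 ... b7.  */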
6066 static bool
6067 vect_shift_permute_load_chain (vec<tree> dr_chain,
6068 unsigned int length,
6069 stmt_vec_info stmt_info,
6070 gimple_stmt_iterator *gsi,
6071 vec<tree> *result_chain)
6073 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
6074 tree perm2_mask1, perm2_mask2, perm3_mask;
6075 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
6076 gimple *perm_stmt;
6078 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6079 unsigned int i;
6080 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6082 unsigned HOST_WIDE_INT nelt, vf;
6083 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt)
6084 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
6085 /* Not supported for variable-length vectors. */
6086 return false;
6088 vec_perm_builder sel (nelt, nelt, 1);
6089 sel.quick_grow (nelt);
6091 result_chain->quick_grow (length);
6092 memcpy (result_chain->address (), dr_chain.address (),
6093 length * sizeof (tree));
6095 if (pow2p_hwi (length) && vf > 4)
6097 unsigned int j, log_length = exact_log2 (length);
6098 for (i = 0; i < nelt / 2; ++i)
6099 sel[i] = i * 2;
6100 for (i = 0; i < nelt / 2; ++i)
6101 sel[nelt / 2 + i] = i * 2 + 1;
6102 vec_perm_indices indices (sel, 2, nelt);
6103 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6105 if (dump_enabled_p ())
6106 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6107 "shuffle of 2 fields structure is not \
6108 supported by target\n");
6109 return false;
6111 perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);
6113 for (i = 0; i < nelt / 2; ++i)
6114 sel[i] = i * 2 + 1;
6115 for (i = 0; i < nelt / 2; ++i)
6116 sel[nelt / 2 + i] = i * 2;
6117 indices.new_vector (sel, 2, nelt);
6118 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6120 if (dump_enabled_p ())
6121 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6122 "shuffle of 2 fields structure is not \
6123 supported by target\n");
6124 return false;
6126 perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);
6128 /* Generating permutation constant to shift all elements.
6129 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
6130 for (i = 0; i < nelt; i++)
6131 sel[i] = nelt / 2 + i;
6132 indices.new_vector (sel, 2, nelt);
6133 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6135 if (dump_enabled_p ())
6136 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6137 "shift permutation is not supported by target\n");
6138 return false;
6140 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6142 /* Generating permutation constant to select vector from 2.
6143 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
6144 for (i = 0; i < nelt / 2; i++)
6145 sel[i] = i;
6146 for (i = nelt / 2; i < nelt; i++)
6147 sel[i] = nelt + i;
6148 indices.new_vector (sel, 2, nelt);
6149 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6151 if (dump_enabled_p ())
6152 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6153 "select is not supported by target\n");
6154 return false;
6156 select_mask = vect_gen_perm_mask_checked (vectype, indices);
6158 for (i = 0; i < log_length; i++)
6160 for (j = 0; j < length; j += 2)
6162 first_vect = dr_chain[j];
6163 second_vect = dr_chain[j + 1];
6165 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6166 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6167 first_vect, first_vect,
6168 perm2_mask1);
6169 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6170 vect[0] = data_ref;
6172 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6173 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6174 second_vect, second_vect,
6175 perm2_mask2);
6176 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6177 vect[1] = data_ref;
6179 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
6180 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6181 vect[0], vect[1], shift1_mask);
6182 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6183 (*result_chain)[j/2 + length/2] = data_ref;
6185 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
6186 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6187 vect[0], vect[1], select_mask);
6188 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6189 (*result_chain)[j/2] = data_ref;
6191 memcpy (dr_chain.address (), result_chain->address (),
6192 length * sizeof (tree));
6194 return true;
6196 if (length == 3 && vf > 2)
6198 unsigned int k = 0, l = 0;
6200 /* Generating permutation constant to get all elements in the right order.
6201 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
6202 for (i = 0; i < nelt; i++)
6204 if (3 * k + (l % 3) >= nelt)
6206 k = 0;
6207 l += (3 - (nelt % 3));
6209 sel[i] = 3 * k + (l % 3);
6210 k++;
6212 vec_perm_indices indices (sel, 2, nelt);
6213 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6215 if (dump_enabled_p ())
6216 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6217 "shuffle of 3 fields structure is not \
6218 supported by target\n");
6219 return false;
6221 perm3_mask = vect_gen_perm_mask_checked (vectype, indices);
6223 /* Generating permutation constant to shift all elements.
6224 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
6225 for (i = 0; i < nelt; i++)
6226 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
6227 indices.new_vector (sel, 2, nelt);
6228 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6230 if (dump_enabled_p ())
6231 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6232 "shift permutation is not supported by target\n");
6233 return false;
6235 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6237 /* Generating permutation constant to shift all elements.
6238 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6239 for (i = 0; i < nelt; i++)
6240 sel[i] = 2 * (nelt / 3) + 1 + i;
6241 indices.new_vector (sel, 2, nelt);
6242 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6244 if (dump_enabled_p ())
6245 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6246 "shift permutation is not supported by target\n");
6247 return false;
6249 shift2_mask = vect_gen_perm_mask_checked (vectype, indices);
6251 /* Generating permutation constant to shift all elements.
6252 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
6253 for (i = 0; i < nelt; i++)
6254 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
6255 indices.new_vector (sel, 2, nelt);
6256 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6258 if (dump_enabled_p ())
6259 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6260 "shift permutation is not supported by target\n");
6261 return false;
6263 shift3_mask = vect_gen_perm_mask_checked (vectype, indices);
6265 /* Generating permutation constant to shift all elements.
6266 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6267 for (i = 0; i < nelt; i++)
6268 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
6269 indices.new_vector (sel, 2, nelt);
6270 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6272 if (dump_enabled_p ())
6273 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6274 "shift permutation is not supported by target\n");
6275 return false;
6277 shift4_mask = vect_gen_perm_mask_checked (vectype, indices);
6279 for (k = 0; k < 3; k++)
6281 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
6282 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6283 dr_chain[k], dr_chain[k],
6284 perm3_mask);
6285 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6286 vect[k] = data_ref;
6289 for (k = 0; k < 3; k++)
6291 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
6292 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6293 vect[k % 3], vect[(k + 1) % 3],
6294 shift1_mask);
6295 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6296 vect_shift[k] = data_ref;
6299 for (k = 0; k < 3; k++)
6301 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
6302 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6303 vect_shift[(4 - k) % 3],
6304 vect_shift[(3 - k) % 3],
6305 shift2_mask);
6306 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6307 vect[k] = data_ref;
6310 (*result_chain)[3 - (nelt % 3)] = vect[2];
6312 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
6313 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
6314 vect[0], shift3_mask);
6315 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6316 (*result_chain)[nelt % 3] = data_ref;
6318 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
6319 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
6320 vect[1], shift4_mask);
6321 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6322 (*result_chain)[0] = data_ref;
6323 return true;
6325 return false;
6328 /* Function vect_transform_grouped_load.
6330 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
6331 to perform their permutation and ascribe the resulting vectorized statements to
6332 the scalar statements.
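/* In other words: when SIZE is a power of 2, or when the target's
   reassociation width for VEC_PERM_EXPR is greater than 1,
   vect_permute_load_chain is used directly; only when SIZE is not a power
   of 2 (i.e. SIZE == 3 for the supported groupings) on a target with
   reassociation width 1 is vect_shift_permute_load_chain tried first, with
   vect_permute_load_chain as the fallback if it cannot be used.  */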
6335 void
6336 vect_transform_grouped_load (stmt_vec_info stmt_info, vec<tree> dr_chain,
6337 int size, gimple_stmt_iterator *gsi)
6339 machine_mode mode;
6340 vec<tree> result_chain = vNULL;
6342 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
6343 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
6344 vectors, that are ready for vector computation. */
6345 result_chain.create (size);
6347 /* If the reassociation width for the vector type is 2 or greater, the target
6348 machine can execute 2 or more vector instructions in parallel. Otherwise
6349 try to get the chain for the load group using vect_shift_permute_load_chain. */
6350 mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info));
6351 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
6352 || pow2p_hwi (size)
6353 || !vect_shift_permute_load_chain (dr_chain, size, stmt_info,
6354 gsi, &result_chain))
6355 vect_permute_load_chain (dr_chain, size, stmt_info, gsi, &result_chain);
6356 vect_record_grouped_load_vectors (stmt_info, result_chain);
6357 result_chain.release ();
6360 /* RESULT_CHAIN contains the output of a group of grouped loads that were
6361 generated as part of the vectorization of STMT_INFO. Assign the statement
6362 for each vector to the associated scalar statement. */
6364 void
6365 vect_record_grouped_load_vectors (stmt_vec_info stmt_info,
6366 vec<tree> result_chain)
6368 vec_info *vinfo = stmt_info->vinfo;
6369 stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
6370 unsigned int i, gap_count;
6371 tree tmp_data_ref;
6373 /* Put a permuted data-ref in the VECTORIZED_STMT field.
6374 Since we scan the chain starting from its first node, their order
6375 corresponds to the order of data-refs in RESULT_CHAIN. */
6376 stmt_vec_info next_stmt_info = first_stmt_info;
6377 gap_count = 1;
6378 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
6380 if (!next_stmt_info)
6381 break;
6383 /* Skip the gaps. Loads created for the gaps will be removed by the dead
6384 code elimination pass later. No need to check for the first stmt in
6385 the group, since it always exists.
6386 DR_GROUP_GAP is the number of steps in elements from the previous
6387 access (if there is no gap DR_GROUP_GAP is 1). We skip loads that
6388 correspond to the gaps. */
6389 if (next_stmt_info != first_stmt_info
6390 && gap_count < DR_GROUP_GAP (next_stmt_info))
6392 gap_count++;
6393 continue;
6396 while (next_stmt_info)
6398 stmt_vec_info new_stmt_info = vinfo->lookup_def (tmp_data_ref);
6399 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
6400 copies, and we put the new vector statement in the first available
6401 RELATED_STMT. */
6402 if (!STMT_VINFO_VEC_STMT (next_stmt_info))
6403 STMT_VINFO_VEC_STMT (next_stmt_info) = new_stmt_info;
6404 else
6406 if (!DR_GROUP_SAME_DR_STMT (next_stmt_info))
6408 stmt_vec_info prev_stmt_info
6409 = STMT_VINFO_VEC_STMT (next_stmt_info);
6410 stmt_vec_info rel_stmt_info
6411 = STMT_VINFO_RELATED_STMT (prev_stmt_info);
6412 while (rel_stmt_info)
6414 prev_stmt_info = rel_stmt_info;
6415 rel_stmt_info = STMT_VINFO_RELATED_STMT (rel_stmt_info);
6418 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info;
6422 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
6423 gap_count = 1;
6424 /* If NEXT_STMT_INFO accesses the same DR as the previous statement,
6425 put the same TMP_DATA_REF as its vectorized statement; otherwise
6426 get the next data-ref from RESULT_CHAIN. */
6427 if (!next_stmt_info || !DR_GROUP_SAME_DR_STMT (next_stmt_info))
6428 break;
6433 /* Function vect_can_force_dr_alignment_p.
6435 Returns whether the alignment of a DECL can be forced to be aligned
6436 on an ALIGNMENT-bit boundary. */
6438 bool
6439 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
6441 if (!VAR_P (decl))
6442 return false;
6444 if (decl_in_symtab_p (decl)
6445 && !symtab_node::get (decl)->can_increase_alignment_p ())
6446 return false;
6448 if (TREE_STATIC (decl))
6449 return (alignment <= MAX_OFILE_ALIGNMENT);
6450 else
6451 return (alignment <= MAX_STACK_ALIGNMENT);
6455 /* Return whether the data reference DR_INFO is supported with respect to its
6456 alignment.
6457 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
6458 if it is aligned, i.e., check if it is possible to vectorize it with different
6459 alignment. */
6461 enum dr_alignment_support
6462 vect_supportable_dr_alignment (dr_vec_info *dr_info,
6463 bool check_aligned_accesses)
6465 data_reference *dr = dr_info->dr;
6466 stmt_vec_info stmt_info = dr_info->stmt;
6467 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6468 machine_mode mode = TYPE_MODE (vectype);
6469 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6470 struct loop *vect_loop = NULL;
6471 bool nested_in_vect_loop = false;
6473 if (aligned_access_p (dr_info) && !check_aligned_accesses)
6474 return dr_aligned;
6476 /* For now assume all conditional loads/stores support unaligned
6477 access without any special code. */
6478 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
6479 if (gimple_call_internal_p (stmt)
6480 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
6481 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
6482 return dr_unaligned_supported;
6484 if (loop_vinfo)
6486 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
6487 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt_info);
6490 /* Possibly unaligned access. */
6492 /* We can choose between using the implicit realignment scheme (generating
6493 a misaligned_move stmt) and the explicit realignment scheme (generating
6494 aligned loads with a REALIGN_LOAD). There are two variants to the
6495 explicit realignment scheme: optimized, and unoptimized.
6496 We can optimize the realignment only if the step between consecutive
6497 vector loads is equal to the vector size. Since the vector memory
6498 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
6499 is guaranteed that the misalignment amount remains the same throughout the
6500 execution of the vectorized loop. Therefore, we can create the
6501 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
6502 at the loop preheader.
6504 However, in the case of outer-loop vectorization, when vectorizing a
6505 memory access in the inner-loop nested within the LOOP that is now being
6506 vectorized, while it is guaranteed that the misalignment of the
6507 vectorized memory access will remain the same in different outer-loop
6508 iterations, it is *not* guaranteed that it will remain the same throughout
6509 the execution of the inner-loop. This is because the inner-loop advances
6510 with the original scalar step (and not in steps of VS). If the inner-loop
6511 step happens to be a multiple of VS, then the misalignment remains fixed
6512 and we can use the optimized realignment scheme. For example:
6514 for (i=0; i<N; i++)
6515 for (j=0; j<M; j++)
6516 s += a[i+j];
6518 When vectorizing the i-loop in the above example, the step between
6519 consecutive vector loads is 1, and so the misalignment does not remain
6520 fixed across the execution of the inner-loop, and the realignment cannot
6521 be optimized (as illustrated in the following pseudo vectorized loop):
6523 for (i=0; i<N; i+=4)
6524 for (j=0; j<M; j++){
6525 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
6526 // when j is {0,1,2,3,4,5,6,7,...} respectively.
6527 // (assuming that we start from an aligned address).
6530 We therefore have to use the unoptimized realignment scheme:
6532 for (i=0; i<N; i+=4)
6533 for (j=k; j<M; j+=4)
6534 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
6535 // that the misalignment of the initial address is
6536 // 0).
6538 The loop can then be vectorized as follows:
6540 for (k=0; k<4; k++){
6541 rt = get_realignment_token (&vp[k]);
6542 for (i=0; i<N; i+=4){
6543 v1 = vp[i+k];
6544 for (j=k; j<M; j+=4){
6545 v2 = vp[i+j+VS-1];
6546 va = REALIGN_LOAD <v1,v2,rt>;
6547 vs += va;
6548 v1 = v2;
6551 } */
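/* An informal summary of the result: known-aligned accesses return
   dr_aligned and IFN_MASK_LOAD/STORE return dr_unaligned_supported (both
   handled above); for other reads the explicit realignment schemes are
   considered first when the target provides vec_realign_load
   (dr_explicit_realign_optimized when the misalignment stays fixed,
   dr_explicit_realign otherwise); any remaining access returns
   dr_unaligned_supported if the target's support_vector_misalignment hook
   accepts it, and dr_unaligned_unsupported as the last resort.  */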
6553 if (DR_IS_READ (dr))
6555 bool is_packed = false;
6556 tree type = (TREE_TYPE (DR_REF (dr)));
6558 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
6559 && (!targetm.vectorize.builtin_mask_for_load
6560 || targetm.vectorize.builtin_mask_for_load ()))
6562 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6564 /* If we are doing SLP then the accesses need not have the
6565 same alignment; instead it depends on the SLP group size. */
6566 if (loop_vinfo
6567 && STMT_SLP_TYPE (stmt_info)
6568 && !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6569 * (DR_GROUP_SIZE
6570 (DR_GROUP_FIRST_ELEMENT (stmt_info))),
6571 TYPE_VECTOR_SUBPARTS (vectype)))
6573 else if (!loop_vinfo
6574 || (nested_in_vect_loop
6575 && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
6576 GET_MODE_SIZE (TYPE_MODE (vectype)))))
6577 return dr_explicit_realign;
6578 else
6579 return dr_explicit_realign_optimized;
6581 if (!known_alignment_for_access_p (dr_info))
6582 is_packed = not_size_aligned (DR_REF (dr));
6584 if (targetm.vectorize.support_vector_misalignment
6585 (mode, type, DR_MISALIGNMENT (dr_info), is_packed))
6586 /* Can't software pipeline the loads, but can at least do them. */
6587 return dr_unaligned_supported;
6589 else
6591 bool is_packed = false;
6592 tree type = (TREE_TYPE (DR_REF (dr)));
6594 if (!known_alignment_for_access_p (dr_info))
6595 is_packed = not_size_aligned (DR_REF (dr));
6597 if (targetm.vectorize.support_vector_misalignment
6598 (mode, type, DR_MISALIGNMENT (dr_info), is_packed))
6599 return dr_unaligned_supported;
6602 /* Unsupported. */
6603 return dr_unaligned_unsupported;