Avoid is_constant calls in vectorizable_bswap
gcc/tree-vect-data-refs.c (official-gcc.git, blob 9beb9d51f8730655dda09b5a909bbb2b9fa43456)
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2018 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "memmodel.h"
32 #include "tm_p.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "cgraph.h"
36 #include "dumpfile.h"
37 #include "alias.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "tree-eh.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop.h"
47 #include "cfgloop.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
50 #include "expr.h"
51 #include "builtins.h"
52 #include "params.h"
53 #include "tree-cfg.h"
54 #include "tree-hash-traits.h"
55 #include "vec-perm-indices.h"
56 #include "internal-fn.h"
58 /* Return true if load- or store-lanes optab OPTAB is implemented for
59 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
61 static bool
62 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
63 tree vectype, unsigned HOST_WIDE_INT count)
65 machine_mode mode, array_mode;
66 bool limit_p;
68 mode = TYPE_MODE (vectype);
69 if (!targetm.array_mode (mode, count).exists (&array_mode))
71 poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
72 limit_p = !targetm.array_mode_supported_p (mode, count);
73 if (!int_mode_for_size (bits, limit_p).exists (&array_mode))
75 if (dump_enabled_p ())
76 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
77 "no array mode for %s[%wu]\n",
78 GET_MODE_NAME (mode), count);
79 return false;
83 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
85 if (dump_enabled_p ())
86 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
87 "cannot use %s<%s><%s>\n", name,
88 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
89 return false;
92 if (dump_enabled_p ())
93 dump_printf_loc (MSG_NOTE, vect_location,
94 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
95 GET_MODE_NAME (mode));
97 return true;
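/* Illustrative sketch, not part of this file: the kind of interleaved
   access a load-lanes optab is queried for above.  With COUNT == 3, one
   structure load can fill three vectors, one per field (e.g. an AArch64
   LD3).  The struct and function names below are made up.  */

struct rgb { unsigned char r, g, b; };

void
scale_channels (struct rgb *pix, int n, int k)
{
  for (int i = 0; i < n; i++)
    {
      /* Three interleaved element streams of equal type; a target with
	 load/store-lanes instructions can handle the whole group at once.  */
      pix[i].r = (unsigned char) (pix[i].r * k);
      pix[i].g = (unsigned char) (pix[i].g * k);
      pix[i].b = (unsigned char) (pix[i].b * k);
    }
}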
101 /* Return the smallest scalar part of STMT_INFO.
102 This is used to determine the vectype of the stmt. We generally set the
103 vectype according to the type of the result (lhs). For stmts whose
104 result-type is different than the type of the arguments (e.g., demotion,
105 promotion), vectype will be reset appropriately (later). Note that we have
106 to visit the smallest datatype in this function, because that determines the
107 VF. If the smallest datatype in the loop is present only as the rhs of a
108 promotion operation - we'd miss it.
109 Such a case, where a variable of this datatype does not appear in the lhs
110 anywhere in the loop, can only occur if it's an invariant: e.g.:
111 'int_x = (int) short_inv', which we'd expect to have been optimized away by
112 invariant motion. However, we cannot rely on invariant motion to always
113 take invariants out of the loop, and so in the case of promotion we also
114 have to check the rhs.
115 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
116 types. */
118 tree
119 vect_get_smallest_scalar_type (stmt_vec_info stmt_info,
120 HOST_WIDE_INT *lhs_size_unit,
121 HOST_WIDE_INT *rhs_size_unit)
123 tree scalar_type = gimple_expr_type (stmt_info->stmt);
124 HOST_WIDE_INT lhs, rhs;
126 /* During the analysis phase, this function is called on arbitrary
127 statements that might not have scalar results. */
128 if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
129 return scalar_type;
131 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
133 gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
134 if (assign
135 && (gimple_assign_cast_p (assign)
136 || gimple_assign_rhs_code (assign) == DOT_PROD_EXPR
137 || gimple_assign_rhs_code (assign) == WIDEN_SUM_EXPR
138 || gimple_assign_rhs_code (assign) == WIDEN_MULT_EXPR
139 || gimple_assign_rhs_code (assign) == WIDEN_LSHIFT_EXPR
140 || gimple_assign_rhs_code (assign) == FLOAT_EXPR))
142 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (assign));
144 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
145 if (rhs < lhs)
146 scalar_type = rhs_type;
149 *lhs_size_unit = lhs;
150 *rhs_size_unit = rhs;
151 return scalar_type;
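/* Illustrative sketch, not part of this file: why the smallest scalar
   type matters for the VF.  In the loop below the narrowest type (short)
   appears only on the rhs of a promotion, yet it is what bounds how many
   elements fit in one vector: with 128-bit vectors the VF is 8, even
   though only 4 ints fit per vector.  Names are made up.  */

void
widen_sum (int *out, const short *in, int n)
{
  for (int i = 0; i < n; i++)
    out[i] += (int) in[i];
}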
155 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
156 tested at run-time. Return TRUE if DDR was successfully inserted.
157 Return false if versioning is not supported. */
159 static bool
160 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
162 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
164 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
165 return false;
167 if (!runtime_alias_check_p (ddr, loop,
168 optimize_loop_nest_for_speed_p (loop)))
169 return false;
171 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
172 return true;
175 /* Record that loop LOOP_VINFO needs to check that VALUE is nonzero. */
177 static void
178 vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value)
180 vec<tree> checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo);
181 for (unsigned int i = 0; i < checks.length(); ++i)
182 if (checks[i] == value)
183 return;
185 if (dump_enabled_p ())
187 dump_printf_loc (MSG_NOTE, vect_location, "need run-time check that ");
188 dump_generic_expr (MSG_NOTE, TDF_SLIM, value);
189 dump_printf (MSG_NOTE, " is nonzero\n");
191 LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value);
194 /* Return true if we know that the order of vectorized DR_INFO_A and
195 vectorized DR_INFO_B will be the same as the order of DR_INFO_A and
196 DR_INFO_B. At least one of the accesses is a write. */
198 static bool
199 vect_preserves_scalar_order_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b)
201 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
202 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
204 /* Single statements are always kept in their original order. */
205 if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
206 && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
207 return true;
209 /* STMT_A and STMT_B belong to overlapping groups. All loads in a
210 group are emitted at the position of the first scalar load and all
211 stores in a group are emitted at the position of the last scalar store.
212 Thus writes will happen no earlier than their current position
213 (but could happen later) while reads will happen no later than their
214 current position (but could happen earlier). Reordering is therefore
215 only possible if the first access is a write. */
216 stmtinfo_a = vect_orig_stmt (stmtinfo_a);
217 stmtinfo_b = vect_orig_stmt (stmtinfo_b);
218 stmt_vec_info earlier_stmt_info = get_earlier_stmt (stmtinfo_a, stmtinfo_b);
219 return !DR_IS_WRITE (STMT_VINFO_DATA_REF (earlier_stmt_info));
222 /* A subroutine of vect_analyze_data_ref_dependence. Handle
223 DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
224 distances. These distances are conservatively correct but they don't
225 reflect a guaranteed dependence.
227 Return true if this function does all the work necessary to avoid
228 an alias or false if the caller should use the dependence distances
229 to limit the vectorization factor in the usual way. LOOP_DEPTH is
230 the depth of the loop described by LOOP_VINFO and the other arguments
231 are as for vect_analyze_data_ref_dependence. */
233 static bool
234 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
235 loop_vec_info loop_vinfo,
236 int loop_depth, unsigned int *max_vf)
238 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
239 lambda_vector dist_v;
240 unsigned int i;
241 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
243 int dist = dist_v[loop_depth];
244 if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
246 /* If the user asserted safelen >= DIST consecutive iterations
247 can be executed concurrently, assume independence.
249 ??? An alternative would be to add the alias check even
250 in this case, and vectorize the fallback loop with the
251 maximum VF set to safelen. However, if the user has
252 explicitly given a length, it's less likely that that
253 would be a win. */
254 if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
256 if ((unsigned int) loop->safelen < *max_vf)
257 *max_vf = loop->safelen;
258 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
259 continue;
262 /* For dependence distances of 2 or more, we have the option
263 of limiting VF or checking for an alias at runtime.
264 Prefer to check at runtime if we can, to avoid limiting
265 the VF unnecessarily when the bases are in fact independent.
267 Note that the alias checks will be removed if the VF ends up
268 being small enough. */
269 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
270 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
271 return (!STMT_VINFO_GATHER_SCATTER_P (dr_info_a->stmt)
272 && !STMT_VINFO_GATHER_SCATTER_P (dr_info_b->stmt)
273 && vect_mark_for_runtime_alias_test (ddr, loop_vinfo));
276 return true;
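/* Illustrative sketch, not part of this file: where a nonzero
   loop->safelen typically comes from.  An OpenMP 'simd safelen(N)' clause
   asserts that up to N consecutive iterations may be executed
   concurrently, so the code above can treat dependence distances up to N
   as harmless and cap *max_vf at N instead of emitting a runtime alias
   check.  The function name and the choice of N are made up.  */

void
shift_by_k (float *a, int n, int k)
{
  /* The programmer asserts that running up to 8 iterations concurrently
     is safe, e.g. because K is known to be at least 8.  */
#pragma omp simd safelen(8)
  for (int i = 0; i < n - k; i++)
    a[i] = a[i + k] * 0.5f;
}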
280 /* Function vect_analyze_data_ref_dependence.
282 Return TRUE if there (might) exist a dependence between a memory-reference
283 DRA and a memory-reference DRB. When versioning for alias may check a
284 dependence at run-time, return FALSE. Adjust *MAX_VF according to
285 the data dependence. */
287 static bool
288 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
289 loop_vec_info loop_vinfo,
290 unsigned int *max_vf)
292 unsigned int i;
293 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
294 struct data_reference *dra = DDR_A (ddr);
295 struct data_reference *drb = DDR_B (ddr);
296 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (dra);
297 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (drb);
298 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
299 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
300 lambda_vector dist_v;
301 unsigned int loop_depth;
303 /* In loop analysis all data references should be vectorizable. */
304 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
305 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
306 gcc_unreachable ();
308 /* Independent data accesses. */
309 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
310 return false;
312 if (dra == drb
313 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
314 return false;
316 /* We do not have to consider dependences between accesses that belong
317 to the same group, unless the stride could be smaller than the
318 group size. */
319 if (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
320 && (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
321 == DR_GROUP_FIRST_ELEMENT (stmtinfo_b))
322 && !STMT_VINFO_STRIDED_P (stmtinfo_a))
323 return false;
325 /* Even if we have an anti-dependence then, as the vectorized loop covers at
326 least two scalar iterations, there is always also a true dependence.
327 As the vectorizer does not re-order loads and stores we can ignore
328 the anti-dependence if TBAA can disambiguate both DRs similar to the
329 case with known negative distance anti-dependences (positive
330 distance anti-dependences would violate TBAA constraints). */
331 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
332 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
333 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
334 get_alias_set (DR_REF (drb))))
335 return false;
337 /* Unknown data dependence. */
338 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
340 /* If user asserted safelen consecutive iterations can be
341 executed concurrently, assume independence. */
342 if (loop->safelen >= 2)
344 if ((unsigned int) loop->safelen < *max_vf)
345 *max_vf = loop->safelen;
346 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
347 return false;
350 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
351 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
353 if (dump_enabled_p ())
355 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
356 "versioning for alias not supported for: "
357 "can't determine dependence between ");
358 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
359 DR_REF (dra));
360 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
361 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
362 DR_REF (drb));
363 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
365 return true;
368 if (dump_enabled_p ())
370 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
371 "versioning for alias required: "
372 "can't determine dependence between ");
373 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
374 DR_REF (dra));
375 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
376 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
377 DR_REF (drb));
378 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
381 /* Add to list of ddrs that need to be tested at run-time. */
382 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
385 /* Known data dependence. */
386 if (DDR_NUM_DIST_VECTS (ddr) == 0)
388 /* If user asserted safelen consecutive iterations can be
389 executed concurrently, assume independence. */
390 if (loop->safelen >= 2)
392 if ((unsigned int) loop->safelen < *max_vf)
393 *max_vf = loop->safelen;
394 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
395 return false;
398 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
399 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
401 if (dump_enabled_p ())
403 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
404 "versioning for alias not supported for: "
405 "bad dist vector for ");
406 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
407 DR_REF (dra));
408 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
409 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
410 DR_REF (drb));
411 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
413 return true;
416 if (dump_enabled_p ())
418 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
419 "versioning for alias required: "
420 "bad dist vector for ");
421 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
422 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
423 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
424 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
426 /* Add to list of ddrs that need to be tested at run-time. */
427 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
430 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
432 if (DDR_COULD_BE_INDEPENDENT_P (ddr)
433 && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
434 loop_depth, max_vf))
435 return false;
437 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
439 int dist = dist_v[loop_depth];
441 if (dump_enabled_p ())
442 dump_printf_loc (MSG_NOTE, vect_location,
443 "dependence distance = %d.\n", dist);
445 if (dist == 0)
447 if (dump_enabled_p ())
449 dump_printf_loc (MSG_NOTE, vect_location,
450 "dependence distance == 0 between ");
451 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
452 dump_printf (MSG_NOTE, " and ");
453 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
454 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
457 /* When we perform grouped accesses and perform implicit CSE
458 by detecting equal accesses and doing disambiguation with
459 runtime alias tests like for
460 .. = a[i];
461 .. = a[i+1];
462 a[i] = ..;
463 a[i+1] = ..;
464 *p = ..;
465 .. = a[i];
466 .. = a[i+1];
467 where we will end up loading { a[i], a[i+1] } once, make
468 sure that inserting group loads before the first load and
469 stores after the last store will do the right thing.
470 Similar for groups like
471 a[i] = ...;
472 ... = a[i];
473 a[i+1] = ...;
474 where loads from the group interleave with the store. */
475 if (!vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
477 if (dump_enabled_p ())
478 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
479 "READ_WRITE dependence in interleaving.\n");
480 return true;
483 if (loop->safelen < 2)
485 tree indicator = dr_zero_step_indicator (dra);
486 if (!indicator || integer_zerop (indicator))
488 if (dump_enabled_p ())
489 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
490 "access also has a zero step\n");
491 return true;
493 else if (TREE_CODE (indicator) != INTEGER_CST)
494 vect_check_nonzero_value (loop_vinfo, indicator);
496 continue;
499 if (dist > 0 && DDR_REVERSED_P (ddr))
501 /* If DDR_REVERSED_P the order of the data-refs in DDR was
502 reversed (to make distance vector positive), and the actual
503 distance is negative. */
504 if (dump_enabled_p ())
505 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
506 "dependence distance negative.\n");
507 /* Record a negative dependence distance to later limit the
508 amount of stmt copying / unrolling we can perform.
509 Only need to handle read-after-write dependence. */
510 if (DR_IS_READ (drb)
511 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
512 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
513 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
514 continue;
517 unsigned int abs_dist = abs (dist);
518 if (abs_dist >= 2 && abs_dist < *max_vf)
520 /* The dependence distance requires reduction of the maximal
521 vectorization factor. */
522 *max_vf = abs (dist);
523 if (dump_enabled_p ())
524 dump_printf_loc (MSG_NOTE, vect_location,
525 "adjusting maximal vectorization factor to %i\n",
526 *max_vf);
529 if (abs_dist >= *max_vf)
531 /* Dependence distance does not create dependence, as far as
532 vectorization is concerned, in this case. */
533 if (dump_enabled_p ())
534 dump_printf_loc (MSG_NOTE, vect_location,
535 "dependence distance >= VF.\n");
536 continue;
539 if (dump_enabled_p ())
541 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
542 "not vectorized, possible dependence "
543 "between data-refs ");
544 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
545 dump_printf (MSG_NOTE, " and ");
546 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
547 dump_printf (MSG_NOTE, "\n");
550 return true;
553 return false;
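/* Illustrative sketch, not part of this file: a known dependence distance
   bounding the vectorization factor, as handled above.  Names are made
   up.  */

void
recur3 (int *a, int n)
{
  /* a[i] depends on a[i - 3], a true dependence of distance 3, so *max_vf
     is clamped to 3; a distance greater than or equal to the maximum VF
     would create no vectorization-relevant dependence at all.  */
  for (int i = 3; i < n; i++)
    a[i] = a[i - 3] + 1;
}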
556 /* Function vect_analyze_data_ref_dependences.
558 Examine all the data references in the loop, and make sure there do not
559 exist any data dependences between them. Set *MAX_VF according to
560 the maximum vectorization factor the data dependences allow. */
562 bool
563 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
564 unsigned int *max_vf)
566 unsigned int i;
567 struct data_dependence_relation *ddr;
569 DUMP_VECT_SCOPE ("vect_analyze_data_ref_dependences");
571 if (!LOOP_VINFO_DDRS (loop_vinfo).exists ())
573 LOOP_VINFO_DDRS (loop_vinfo)
574 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
575 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
576 /* We need read-read dependences to compute
577 STMT_VINFO_SAME_ALIGN_REFS. */
578 bool res = compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
579 &LOOP_VINFO_DDRS (loop_vinfo),
580 LOOP_VINFO_LOOP_NEST (loop_vinfo),
581 true);
582 gcc_assert (res);
585 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
587 /* For epilogues we either have no aliases or alias versioning
588 was applied to the original loop. Therefore we may just get max_vf
589 using the VF of the original loop. */
590 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
591 *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
592 else
593 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
594 if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
595 return false;
597 return true;
601 /* Function vect_slp_analyze_data_ref_dependence.
603 Return TRUE if there (might) exist a dependence between a memory-reference
604 DRA and a memory-reference DRB for VINFO. When versioning for alias
605 may check a dependence at run-time, return FALSE. */
608 static bool
609 vect_slp_analyze_data_ref_dependence (vec_info *vinfo,
610 struct data_dependence_relation *ddr)
612 struct data_reference *dra = DDR_A (ddr);
613 struct data_reference *drb = DDR_B (ddr);
614 dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
615 dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
617 /* We need to check dependences of statements marked as unvectorizable
618 as well, they still can prohibit vectorization. */
620 /* Independent data accesses. */
621 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
622 return false;
624 if (dra == drb)
625 return false;
627 /* Read-read is OK. */
628 if (DR_IS_READ (dra) && DR_IS_READ (drb))
629 return false;
631 /* If dra and drb are part of the same interleaving chain consider
632 them independent. */
633 if (STMT_VINFO_GROUPED_ACCESS (dr_info_a->stmt)
634 && (DR_GROUP_FIRST_ELEMENT (dr_info_a->stmt)
635 == DR_GROUP_FIRST_ELEMENT (dr_info_b->stmt)))
636 return false;
638 /* Unknown data dependence. */
639 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
641 if (dump_enabled_p ())
643 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
644 "can't determine dependence between ");
645 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
646 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
647 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
648 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
651 else if (dump_enabled_p ())
653 dump_printf_loc (MSG_NOTE, vect_location,
654 "determined dependence between ");
655 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
656 dump_printf (MSG_NOTE, " and ");
657 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
658 dump_printf (MSG_NOTE, "\n");
661 return true;
665 /* Analyze dependences involved in the transform of SLP NODE. STORES
666 contain the vector of scalar stores of this instance if we are
667 disambiguating the loads. */
669 static bool
670 vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
671 vec<stmt_vec_info> stores,
672 stmt_vec_info last_store_info)
674 /* This walks over all stmts involved in the SLP load/store done
675 in NODE verifying we can sink them up to the last stmt in the
676 group. */
677 stmt_vec_info last_access_info = vect_find_last_scalar_stmt_in_slp (node);
678 vec_info *vinfo = last_access_info->vinfo;
679 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
681 stmt_vec_info access_info = SLP_TREE_SCALAR_STMTS (node)[k];
682 if (access_info == last_access_info)
683 continue;
684 data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
685 ao_ref ref;
686 bool ref_initialized_p = false;
687 for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
688 gsi_stmt (gsi) != last_access_info->stmt; gsi_next (&gsi))
690 gimple *stmt = gsi_stmt (gsi);
691 if (! gimple_vuse (stmt)
692 || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
693 continue;
695 /* If we couldn't record a (single) data reference for this
696 stmt we have to resort to the alias oracle. */
697 stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
698 data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
699 if (!dr_b)
701 /* We are moving a store or sinking a load - this means
702 we cannot use TBAA for disambiguation. */
703 if (!ref_initialized_p)
704 ao_ref_init (&ref, DR_REF (dr_a));
705 if (stmt_may_clobber_ref_p_1 (stmt, &ref, false)
706 || ref_maybe_used_by_stmt_p (stmt, &ref, false))
707 return false;
708 continue;
711 bool dependent = false;
712 /* If we run into a store of this same instance (we've just
713 marked those) then delay dependence checking until we run
714 into the last store because this is where it will have
715 been sunk to (and we verify if we can do that as well). */
716 if (gimple_visited_p (stmt))
718 if (stmt_info != last_store_info)
719 continue;
720 unsigned i;
721 stmt_vec_info store_info;
722 FOR_EACH_VEC_ELT (stores, i, store_info)
724 data_reference *store_dr = STMT_VINFO_DATA_REF (store_info);
725 ddr_p ddr = initialize_data_dependence_relation
726 (dr_a, store_dr, vNULL);
727 dependent
728 = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
729 free_dependence_relation (ddr);
730 if (dependent)
731 break;
734 else
736 ddr_p ddr = initialize_data_dependence_relation (dr_a,
737 dr_b, vNULL);
738 dependent = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
739 free_dependence_relation (ddr);
741 if (dependent)
742 return false;
745 return true;
749 /* Function vect_slp_analyze_instance_dependence.
751 Examine all the data references in the SLP instance, and make sure there
752 do not exist any data dependences between them that would prevent
753 the instance from being vectorized. */
755 bool
756 vect_slp_analyze_instance_dependence (slp_instance instance)
758 DUMP_VECT_SCOPE ("vect_slp_analyze_instance_dependence");
760 /* The stores of this instance are at the root of the SLP tree. */
761 slp_tree store = SLP_INSTANCE_TREE (instance);
762 if (! STMT_VINFO_DATA_REF (SLP_TREE_SCALAR_STMTS (store)[0]))
763 store = NULL;
765 /* Verify we can sink stores to the vectorized stmt insert location. */
766 stmt_vec_info last_store_info = NULL;
767 if (store)
769 if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
770 return false;
772 /* Mark stores in this instance and remember the last one. */
773 last_store_info = vect_find_last_scalar_stmt_in_slp (store);
774 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
775 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, true);
778 bool res = true;
780 /* Verify we can sink loads to the vectorized stmt insert location,
781 special-casing stores of this instance. */
782 slp_tree load;
783 unsigned int i;
784 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
785 if (! vect_slp_analyze_node_dependences (instance, load,
786 store
787 ? SLP_TREE_SCALAR_STMTS (store)
788 : vNULL, last_store_info))
790 res = false;
791 break;
794 /* Unset the visited flag. */
795 if (store)
796 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
797 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, false);
799 return res;
802 /* Record the base alignment guarantee given by DRB, which occurs
803 in STMT_INFO. */
805 static void
806 vect_record_base_alignment (stmt_vec_info stmt_info,
807 innermost_loop_behavior *drb)
809 vec_info *vinfo = stmt_info->vinfo;
810 bool existed;
811 innermost_loop_behavior *&entry
812 = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
813 if (!existed || entry->base_alignment < drb->base_alignment)
815 entry = drb;
816 if (dump_enabled_p ())
818 dump_printf_loc (MSG_NOTE, vect_location,
819 "recording new base alignment for ");
820 dump_generic_expr (MSG_NOTE, TDF_SLIM, drb->base_address);
821 dump_printf (MSG_NOTE, "\n");
822 dump_printf_loc (MSG_NOTE, vect_location,
823 " alignment: %d\n", drb->base_alignment);
824 dump_printf_loc (MSG_NOTE, vect_location,
825 " misalignment: %d\n", drb->base_misalignment);
826 dump_printf_loc (MSG_NOTE, vect_location,
827 " based on: ");
828 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt_info->stmt, 0);
833 /* If the region we're going to vectorize is reached, all unconditional
834 data references occur at least once. We can therefore pool the base
835 alignment guarantees from each unconditional reference. Do this by
836 going through all the data references in VINFO and checking whether
837 the containing statement makes the reference unconditionally. If so,
838 record the alignment of the base address in VINFO so that it can be
839 used for all other references with the same base. */
841 void
842 vect_record_base_alignments (vec_info *vinfo)
844 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
845 struct loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
846 data_reference *dr;
847 unsigned int i;
848 FOR_EACH_VEC_ELT (vinfo->shared->datarefs, i, dr)
850 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
851 stmt_vec_info stmt_info = dr_info->stmt;
852 if (!DR_IS_CONDITIONAL_IN_STMT (dr)
853 && STMT_VINFO_VECTORIZABLE (stmt_info)
854 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
856 vect_record_base_alignment (stmt_info, &DR_INNERMOST (dr));
858 /* If DR is nested in the loop that is being vectorized, we can also
859 record the alignment of the base wrt the outer loop. */
860 if (loop && nested_in_vect_loop_p (loop, stmt_info))
861 vect_record_base_alignment
862 (stmt_info, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
867 /* Return the target alignment for the vectorized form of DR_INFO. */
869 static unsigned int
870 vect_calculate_target_alignment (dr_vec_info *dr_info)
872 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
873 return targetm.vectorize.preferred_vector_alignment (vectype);
876 /* Function vect_compute_data_ref_alignment
878 Compute the misalignment of the data reference DR_INFO.
880 Output:
881 1. DR_MISALIGNMENT (DR_INFO) is defined.
883 FOR NOW: No analysis is actually performed. Misalignment is calculated
884 only for trivial cases. TODO. */
886 static void
887 vect_compute_data_ref_alignment (dr_vec_info *dr_info)
889 stmt_vec_info stmt_info = dr_info->stmt;
890 vec_base_alignments *base_alignments = &stmt_info->vinfo->base_alignments;
891 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
892 struct loop *loop = NULL;
893 tree ref = DR_REF (dr_info->dr);
894 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
896 if (dump_enabled_p ())
897 dump_printf_loc (MSG_NOTE, vect_location,
898 "vect_compute_data_ref_alignment:\n");
900 if (loop_vinfo)
901 loop = LOOP_VINFO_LOOP (loop_vinfo);
903 /* Initialize misalignment to unknown. */
904 SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
906 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
907 return;
909 innermost_loop_behavior *drb = vect_dr_behavior (dr_info);
910 bool step_preserves_misalignment_p;
912 unsigned HOST_WIDE_INT vector_alignment
913 = vect_calculate_target_alignment (dr_info) / BITS_PER_UNIT;
914 DR_TARGET_ALIGNMENT (dr_info) = vector_alignment;
916 /* No step for BB vectorization. */
917 if (!loop)
919 gcc_assert (integer_zerop (drb->step));
920 step_preserves_misalignment_p = true;
923 /* In case the dataref is in an inner-loop of the loop that is being
924 vectorized (LOOP), we use the base and misalignment information
925 relative to the outer-loop (LOOP). This is ok only if the misalignment
926 stays the same throughout the execution of the inner-loop, which is why
927 we have to check that the stride of the dataref in the inner-loop evenly
928 divides by the vector alignment. */
929 else if (nested_in_vect_loop_p (loop, stmt_info))
931 step_preserves_misalignment_p
932 = (DR_STEP_ALIGNMENT (dr_info->dr) % vector_alignment) == 0;
934 if (dump_enabled_p ())
936 if (step_preserves_misalignment_p)
937 dump_printf_loc (MSG_NOTE, vect_location,
938 "inner step divides the vector alignment.\n");
939 else
940 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
941 "inner step doesn't divide the vector"
942 " alignment.\n");
946 /* Similarly we can only use base and misalignment information relative to
947 an innermost loop if the misalignment stays the same throughout the
948 execution of the loop. As above, this is the case if the stride of
949 the dataref evenly divides by the alignment. */
950 else
952 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
953 step_preserves_misalignment_p
954 = multiple_p (DR_STEP_ALIGNMENT (dr_info->dr) * vf, vector_alignment);
956 if (!step_preserves_misalignment_p && dump_enabled_p ())
957 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
958 "step doesn't divide the vector alignment.\n");
961 unsigned int base_alignment = drb->base_alignment;
962 unsigned int base_misalignment = drb->base_misalignment;
964 /* Calculate the maximum of the pooled base address alignment and the
965 alignment that we can compute for DR itself. */
966 innermost_loop_behavior **entry = base_alignments->get (drb->base_address);
967 if (entry && base_alignment < (*entry)->base_alignment)
969 base_alignment = (*entry)->base_alignment;
970 base_misalignment = (*entry)->base_misalignment;
973 if (drb->offset_alignment < vector_alignment
974 || !step_preserves_misalignment_p
975 /* We need to know whether the step wrt the vectorized loop is
976 negative when computing the starting misalignment below. */
977 || TREE_CODE (drb->step) != INTEGER_CST)
979 if (dump_enabled_p ())
981 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
982 "Unknown alignment for access: ");
983 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
984 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
986 return;
989 if (base_alignment < vector_alignment)
991 unsigned int max_alignment;
992 tree base = get_base_for_alignment (drb->base_address, &max_alignment);
993 if (max_alignment < vector_alignment
994 || !vect_can_force_dr_alignment_p (base,
995 vector_alignment * BITS_PER_UNIT))
997 if (dump_enabled_p ())
999 dump_printf_loc (MSG_NOTE, vect_location,
1000 "can't force alignment of ref: ");
1001 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
1002 dump_printf (MSG_NOTE, "\n");
1004 return;
1007 /* Force the alignment of the decl.
1008 NOTE: This is the only change to the code we make during
1009 the analysis phase, before deciding to vectorize the loop. */
1010 if (dump_enabled_p ())
1012 dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
1013 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
1014 dump_printf (MSG_NOTE, "\n");
1017 dr_info->base_decl = base;
1018 dr_info->base_misaligned = true;
1019 base_misalignment = 0;
1021 poly_int64 misalignment
1022 = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();
1024 /* If this is a backward running DR then first access in the larger
1025 vectype actually is N-1 elements before the address in the DR.
1026 Adjust misalign accordingly. */
1027 if (tree_int_cst_sgn (drb->step) < 0)
1028 /* PLUS because STEP is negative. */
1029 misalignment += ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1030 * TREE_INT_CST_LOW (drb->step));
1032 unsigned int const_misalignment;
1033 if (!known_misalignment (misalignment, vector_alignment,
1034 &const_misalignment))
1036 if (dump_enabled_p ())
1038 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1039 "Non-constant misalignment for access: ");
1040 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
1041 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1043 return;
1046 SET_DR_MISALIGNMENT (dr_info, const_misalignment);
1048 if (dump_enabled_p ())
1050 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1051 "misalign = %d bytes of ref ",
1052 DR_MISALIGNMENT (dr_info));
1053 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
1054 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1057 return;
1060 /* Function vect_update_misalignment_for_peel.
1061 Sets DR_INFO's misalignment
1062 - to 0 if it has the same alignment as DR_PEEL_INFO,
1063 - to the misalignment computed using NPEEL if DR_INFO's misalignment is known,
1064 - to -1 (unknown) otherwise.
1066 DR_INFO - the data reference whose misalignment is to be adjusted.
1067 DR_PEEL_INFO - the data reference whose misalignment is being made
1068 zero in the vector loop by the peel.
1069 NPEEL - the number of iterations in the peel loop if the misalignment
1070 of DR_PEEL_INFO is known at compile time. */
1072 static void
1073 vect_update_misalignment_for_peel (dr_vec_info *dr_info,
1074 dr_vec_info *dr_peel_info, int npeel)
1076 unsigned int i;
1077 vec<dr_p> same_aligned_drs;
1078 struct data_reference *current_dr;
1079 int dr_size = vect_get_scalar_dr_size (dr_info);
1080 int dr_peel_size = vect_get_scalar_dr_size (dr_peel_info);
1081 stmt_vec_info stmt_info = dr_info->stmt;
1082 stmt_vec_info peel_stmt_info = dr_peel_info->stmt;
1084 /* For interleaved data accesses the step in the loop must be multiplied by
1085 the size of the interleaving group. */
1086 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1087 dr_size *= DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info));
1088 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
1089 dr_peel_size *= DR_GROUP_SIZE (peel_stmt_info);
1091 /* It can be assumed that the data refs with the same alignment as dr_peel
1092 are aligned in the vector loop. */
1093 same_aligned_drs = STMT_VINFO_SAME_ALIGN_REFS (peel_stmt_info);
1094 FOR_EACH_VEC_ELT (same_aligned_drs, i, current_dr)
1096 if (current_dr != dr_info->dr)
1097 continue;
1098 gcc_assert (!known_alignment_for_access_p (dr_info)
1099 || !known_alignment_for_access_p (dr_peel_info)
1100 || (DR_MISALIGNMENT (dr_info) / dr_size
1101 == DR_MISALIGNMENT (dr_peel_info) / dr_peel_size));
1102 SET_DR_MISALIGNMENT (dr_info, 0);
1103 return;
1106 if (known_alignment_for_access_p (dr_info)
1107 && known_alignment_for_access_p (dr_peel_info))
1109 bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
1110 size_zero_node) < 0;
1111 int misal = DR_MISALIGNMENT (dr_info);
1112 misal += negative ? -npeel * dr_size : npeel * dr_size;
1113 misal &= DR_TARGET_ALIGNMENT (dr_info) - 1;
1114 SET_DR_MISALIGNMENT (dr_info, misal);
1115 return;
1118 if (dump_enabled_p ())
1119 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1120 "to unknown (-1).\n");
1121 SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
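/* Illustrative sketch, not part of this file: the arithmetic used above
   when both misalignments are known at compile time.  Peeling NPEEL
   scalar iterations moves the first vectorized access by NPEEL * DR_SIZE
   bytes, so the misalignment changes by that amount modulo the
   (power-of-two) target alignment.  Function and parameter names are
   made up.  */

static int
misalignment_after_peel (int misal, int npeel, int dr_size,
			 int target_align, int negative_step)
{
  misal += negative_step ? -npeel * dr_size : npeel * dr_size;
  return misal & (target_align - 1);
}

/* For example, misalignment 12, element size 4, target alignment 16:
   peeling one iteration gives (12 + 4) & 15 == 0, i.e. an aligned
   access.  */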
1125 /* Function verify_data_ref_alignment
1127 Return TRUE if DR_INFO can be handled with respect to alignment. */
1129 static bool
1130 verify_data_ref_alignment (dr_vec_info *dr_info)
1132 enum dr_alignment_support supportable_dr_alignment
1133 = vect_supportable_dr_alignment (dr_info, false);
1134 if (!supportable_dr_alignment)
1136 if (dump_enabled_p ())
1138 if (DR_IS_READ (dr_info->dr))
1139 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1140 "not vectorized: unsupported unaligned load.");
1141 else
1142 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1143 "not vectorized: unsupported unaligned "
1144 "store.");
1146 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
1147 DR_REF (dr_info->dr));
1148 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1150 return false;
1153 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
1154 dump_printf_loc (MSG_NOTE, vect_location,
1155 "Vectorizing an unaligned access.\n");
1157 return true;
1160 /* Function vect_verify_datarefs_alignment
1162 Return TRUE if all data references in the loop can be
1163 handled with respect to alignment. */
1165 bool
1166 vect_verify_datarefs_alignment (loop_vec_info vinfo)
1168 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
1169 struct data_reference *dr;
1170 unsigned int i;
1172 FOR_EACH_VEC_ELT (datarefs, i, dr)
1174 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
1175 stmt_vec_info stmt_info = dr_info->stmt;
1177 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1178 continue;
1180 /* For interleaving, only the alignment of the first access matters. */
1181 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1182 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1183 continue;
1185 /* Strided accesses perform only component accesses, alignment is
1186 irrelevant for them. */
1187 if (STMT_VINFO_STRIDED_P (stmt_info)
1188 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1189 continue;
1191 if (! verify_data_ref_alignment (dr_info))
1192 return false;
1195 return true;
1198 /* Given a memory reference EXP return whether its alignment is less
1199 than its size. */
1201 static bool
1202 not_size_aligned (tree exp)
1204 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1205 return true;
1207 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1208 > get_object_alignment (exp));
1211 /* Function vector_alignment_reachable_p
1213 Return true if vector alignment for DR_INFO is reachable by peeling
1214 a few loop iterations. Return false otherwise. */
1216 static bool
1217 vector_alignment_reachable_p (dr_vec_info *dr_info)
1219 stmt_vec_info stmt_info = dr_info->stmt;
1220 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1222 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1224 /* For interleaved access we peel only if number of iterations in
1225 the prolog loop ({VF - misalignment}), is a multiple of the
1226 number of the interleaved accesses. */
1227 int elem_size, mis_in_elements;
1229 /* FORNOW: handle only known alignment. */
1230 if (!known_alignment_for_access_p (dr_info))
1231 return false;
1233 poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1234 poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
1235 elem_size = vector_element_size (vector_size, nelements);
1236 mis_in_elements = DR_MISALIGNMENT (dr_info) / elem_size;
1238 if (!multiple_p (nelements - mis_in_elements, DR_GROUP_SIZE (stmt_info)))
1239 return false;
1242 /* If misalignment is known at the compile time then allow peeling
1243 only if natural alignment is reachable through peeling. */
1244 if (known_alignment_for_access_p (dr_info) && !aligned_access_p (dr_info))
1246 HOST_WIDE_INT elmsize =
1247 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1248 if (dump_enabled_p ())
1250 dump_printf_loc (MSG_NOTE, vect_location,
1251 "data size = %wd. misalignment = %d.\n", elmsize,
1252 DR_MISALIGNMENT (dr_info));
1254 if (DR_MISALIGNMENT (dr_info) % elmsize)
1256 if (dump_enabled_p ())
1257 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1258 "data size does not divide the misalignment.\n");
1259 return false;
1263 if (!known_alignment_for_access_p (dr_info))
1265 tree type = TREE_TYPE (DR_REF (dr_info->dr));
1266 bool is_packed = not_size_aligned (DR_REF (dr_info->dr));
1267 if (dump_enabled_p ())
1268 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1269 "Unknown misalignment, %snaturally aligned\n",
1270 is_packed ? "not " : "");
1271 return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1274 return true;
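/* Illustrative sketch, not part of this file: a known misalignment that
   peeling whole iterations can never fix, which the check above rejects
   ("data size does not divide the misalignment").  The cast is only for
   illustration and assumes a target that tolerates misaligned int
   accesses; names are made up.  */

void
store_off_by_one (char *buf, int n)
{
  int *p = (int *) (buf + 1);
  /* The access is misaligned by 1 byte while each iteration advances by
     4 bytes, so no number of peeled iterations reaches vector
     alignment.  */
  for (int i = 0; i < n; i++)
    p[i] = 0;
}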
1278 /* Calculate the cost of the memory access represented by DR_INFO. */
1280 static void
1281 vect_get_data_access_cost (dr_vec_info *dr_info,
1282 unsigned int *inside_cost,
1283 unsigned int *outside_cost,
1284 stmt_vector_for_cost *body_cost_vec,
1285 stmt_vector_for_cost *prologue_cost_vec)
1287 stmt_vec_info stmt_info = dr_info->stmt;
1288 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1289 int ncopies;
1291 if (PURE_SLP_STMT (stmt_info))
1292 ncopies = 1;
1293 else
1294 ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1296 if (DR_IS_READ (dr_info->dr))
1297 vect_get_load_cost (stmt_info, ncopies, true, inside_cost, outside_cost,
1298 prologue_cost_vec, body_cost_vec, false);
1299 else
1300 vect_get_store_cost (stmt_info, ncopies, inside_cost, body_cost_vec);
1302 if (dump_enabled_p ())
1303 dump_printf_loc (MSG_NOTE, vect_location,
1304 "vect_get_data_access_cost: inside_cost = %d, "
1305 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1309 typedef struct _vect_peel_info
1311 dr_vec_info *dr_info;
1312 int npeel;
1313 unsigned int count;
1314 } *vect_peel_info;
1316 typedef struct _vect_peel_extended_info
1318 struct _vect_peel_info peel_info;
1319 unsigned int inside_cost;
1320 unsigned int outside_cost;
1321 } *vect_peel_extended_info;
1324 /* Peeling hashtable helpers. */
1326 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1328 static inline hashval_t hash (const _vect_peel_info *);
1329 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1332 inline hashval_t
1333 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1335 return (hashval_t) peel_info->npeel;
1338 inline bool
1339 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1341 return (a->npeel == b->npeel);
1345 /* Insert DR_INFO into peeling hash table with NPEEL as key. */
1347 static void
1348 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1349 loop_vec_info loop_vinfo, dr_vec_info *dr_info,
1350 int npeel)
1352 struct _vect_peel_info elem, *slot;
1353 _vect_peel_info **new_slot;
1354 bool supportable_dr_alignment
1355 = vect_supportable_dr_alignment (dr_info, true);
1357 elem.npeel = npeel;
1358 slot = peeling_htab->find (&elem);
1359 if (slot)
1360 slot->count++;
1361 else
1363 slot = XNEW (struct _vect_peel_info);
1364 slot->npeel = npeel;
1365 slot->dr_info = dr_info;
1366 slot->count = 1;
1367 new_slot = peeling_htab->find_slot (slot, INSERT);
1368 *new_slot = slot;
1371 if (!supportable_dr_alignment
1372 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1373 slot->count += VECT_MAX_COST;
1377 /* Traverse peeling hash table to find peeling option that aligns maximum
1378 number of data accesses. */
1381 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1382 _vect_peel_extended_info *max)
1384 vect_peel_info elem = *slot;
1386 if (elem->count > max->peel_info.count
1387 || (elem->count == max->peel_info.count
1388 && max->peel_info.npeel > elem->npeel))
1390 max->peel_info.npeel = elem->npeel;
1391 max->peel_info.count = elem->count;
1392 max->peel_info.dr_info = elem->dr_info;
1395 return 1;
1398 /* Get the costs of peeling NPEEL iterations for LOOP_VINFO, checking
1399 data access costs for all data refs. If UNKNOWN_MISALIGNMENT is true,
1400 we assume DR0_INFO's misalignment will be zero after peeling. */
1402 static void
1403 vect_get_peeling_costs_all_drs (loop_vec_info loop_vinfo,
1404 dr_vec_info *dr0_info,
1405 unsigned int *inside_cost,
1406 unsigned int *outside_cost,
1407 stmt_vector_for_cost *body_cost_vec,
1408 stmt_vector_for_cost *prologue_cost_vec,
1409 unsigned int npeel,
1410 bool unknown_misalignment)
1412 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1413 unsigned i;
1414 data_reference *dr;
1416 FOR_EACH_VEC_ELT (datarefs, i, dr)
1418 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1419 stmt_vec_info stmt_info = dr_info->stmt;
1420 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1421 continue;
1423 /* For interleaving, only the alignment of the first access
1424 matters. */
1425 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1426 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1427 continue;
1429 /* Strided accesses perform only component accesses, alignment is
1430 irrelevant for them. */
1431 if (STMT_VINFO_STRIDED_P (stmt_info)
1432 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1433 continue;
1435 int save_misalignment;
1436 save_misalignment = DR_MISALIGNMENT (dr_info);
1437 if (npeel == 0)
1439 else if (unknown_misalignment && dr_info == dr0_info)
1440 SET_DR_MISALIGNMENT (dr_info, 0);
1441 else
1442 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
1443 vect_get_data_access_cost (dr_info, inside_cost, outside_cost,
1444 body_cost_vec, prologue_cost_vec);
1445 SET_DR_MISALIGNMENT (dr_info, save_misalignment);
1449 /* Traverse peeling hash table and calculate cost for each peeling option.
1450 Find the one with the lowest cost. */
1453 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1454 _vect_peel_extended_info *min)
1456 vect_peel_info elem = *slot;
1457 int dummy;
1458 unsigned int inside_cost = 0, outside_cost = 0;
1459 stmt_vec_info stmt_info = elem->dr_info->stmt;
1460 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1461 stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1462 epilogue_cost_vec;
1464 prologue_cost_vec.create (2);
1465 body_cost_vec.create (2);
1466 epilogue_cost_vec.create (2);
1468 vect_get_peeling_costs_all_drs (loop_vinfo, elem->dr_info, &inside_cost,
1469 &outside_cost, &body_cost_vec,
1470 &prologue_cost_vec, elem->npeel, false);
1472 body_cost_vec.release ();
1474 outside_cost += vect_get_known_peeling_cost
1475 (loop_vinfo, elem->npeel, &dummy,
1476 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1477 &prologue_cost_vec, &epilogue_cost_vec);
1479 /* Prologue and epilogue costs are added to the target model later.
1480 These costs depend only on the scalar iteration cost, the
1481 number of peeling iterations finally chosen, and the number of
1482 misaligned statements. So discard the information found here. */
1483 prologue_cost_vec.release ();
1484 epilogue_cost_vec.release ();
1486 if (inside_cost < min->inside_cost
1487 || (inside_cost == min->inside_cost
1488 && outside_cost < min->outside_cost))
1490 min->inside_cost = inside_cost;
1491 min->outside_cost = outside_cost;
1492 min->peel_info.dr_info = elem->dr_info;
1493 min->peel_info.npeel = elem->npeel;
1494 min->peel_info.count = elem->count;
1497 return 1;
1501 /* Choose best peeling option by traversing peeling hash table and either
1502 choosing an option with the lowest cost (if cost model is enabled) or the
1503 option that aligns as many accesses as possible. */
1505 static struct _vect_peel_extended_info
1506 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1507 loop_vec_info loop_vinfo)
1509 struct _vect_peel_extended_info res;
1511 res.peel_info.dr_info = NULL;
1513 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1515 res.inside_cost = INT_MAX;
1516 res.outside_cost = INT_MAX;
1517 peeling_htab->traverse <_vect_peel_extended_info *,
1518 vect_peeling_hash_get_lowest_cost> (&res);
1520 else
1522 res.peel_info.count = 0;
1523 peeling_htab->traverse <_vect_peel_extended_info *,
1524 vect_peeling_hash_get_most_frequent> (&res);
1525 res.inside_cost = 0;
1526 res.outside_cost = 0;
1529 return res;
1532 /* Return true if the new peeling NPEEL is supported. */
1534 static bool
1535 vect_peeling_supportable (loop_vec_info loop_vinfo, dr_vec_info *dr0_info,
1536 unsigned npeel)
1538 unsigned i;
1539 struct data_reference *dr = NULL;
1540 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1541 enum dr_alignment_support supportable_dr_alignment;
1543 /* Ensure that all data refs can be vectorized after the peel. */
1544 FOR_EACH_VEC_ELT (datarefs, i, dr)
1546 int save_misalignment;
1548 if (dr == dr0_info->dr)
1549 continue;
1551 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1552 stmt_vec_info stmt_info = dr_info->stmt;
1553 /* For interleaving, only the alignment of the first access
1554 matters. */
1555 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1556 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1557 continue;
1559 /* Strided accesses perform only component accesses, alignment is
1560 irrelevant for them. */
1561 if (STMT_VINFO_STRIDED_P (stmt_info)
1562 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1563 continue;
1565 save_misalignment = DR_MISALIGNMENT (dr_info);
1566 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
1567 supportable_dr_alignment
1568 = vect_supportable_dr_alignment (dr_info, false);
1569 SET_DR_MISALIGNMENT (dr_info, save_misalignment);
1571 if (!supportable_dr_alignment)
1572 return false;
1575 return true;
1578 /* Function vect_enhance_data_refs_alignment
1580 This pass will use loop versioning and loop peeling in order to enhance
1581 the alignment of data references in the loop.
1583 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1584 original loop is to be vectorized. Any other loops that are created by
1585 the transformations performed in this pass - are not supposed to be
1586 vectorized. This restriction will be relaxed.
1588 This pass will require a cost model to guide it whether to apply peeling
1589 or versioning or a combination of the two. For example, the scheme that
1590 Intel uses when given a loop with several memory accesses is as follows:
1591 choose one memory access ('p') which alignment you want to force by doing
1592 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1593 other accesses are not necessarily aligned, or (2) use loop versioning to
1594 generate one loop in which all accesses are aligned, and another loop in
1595 which only 'p' is necessarily aligned.
1597 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1598 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1599 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1601 Devising a cost model is the most critical aspect of this work. It will
1602 guide us on which access to peel for, whether to use loop versioning, how
1603 many versions to create, etc. The cost model will probably consist of
1604 generic considerations as well as target specific considerations (on
1605 powerpc for example, misaligned stores are more painful than misaligned
1606 loads).
1608 Here are the general steps involved in alignment enhancements:
1610 -- original loop, before alignment analysis:
1611 for (i=0; i<N; i++){
1612 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1613 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1616 -- After vect_compute_data_refs_alignment:
1617 for (i=0; i<N; i++){
1618 x = q[i]; # DR_MISALIGNMENT(q) = 3
1619 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1622 -- Possibility 1: we do loop versioning:
1623 if (p is aligned) {
1624 for (i=0; i<N; i++){ # loop 1A
1625 x = q[i]; # DR_MISALIGNMENT(q) = 3
1626 p[i] = y; # DR_MISALIGNMENT(p) = 0
1629 else {
1630 for (i=0; i<N; i++){ # loop 1B
1631 x = q[i]; # DR_MISALIGNMENT(q) = 3
1632 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1636 -- Possibility 2: we do loop peeling:
1637 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1638 x = q[i];
1639 p[i] = y;
1641 for (i = 3; i < N; i++){ # loop 2A
1642 x = q[i]; # DR_MISALIGNMENT(q) = 0
1643 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1646 -- Possibility 3: combination of loop peeling and versioning:
1647 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1648 x = q[i];
1649 p[i] = y;
1651 if (p is aligned) {
1652 for (i = 3; i<N; i++){ # loop 3A
1653 x = q[i]; # DR_MISALIGNMENT(q) = 0
1654 p[i] = y; # DR_MISALIGNMENT(p) = 0
1657 else {
1658 for (i = 3; i<N; i++){ # loop 3B
1659 x = q[i]; # DR_MISALIGNMENT(q) = 0
1660 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1664 These loops are later passed to loop_transform to be vectorized. The
1665 vectorizer will use the alignment information to guide the transformation
1666 (whether to generate regular loads/stores, or with special handling for
1667 misalignment). */
1669 bool
1670 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1672 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1673 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1674 enum dr_alignment_support supportable_dr_alignment;
1675 dr_vec_info *first_store = NULL;
1676 dr_vec_info *dr0_info = NULL;
1677 struct data_reference *dr;
1678 unsigned int i, j;
1679 bool do_peeling = false;
1680 bool do_versioning = false;
1681 bool stat;
1682 unsigned int npeel = 0;
1683 bool one_misalignment_known = false;
1684 bool one_misalignment_unknown = false;
1685 bool one_dr_unsupportable = false;
1686 dr_vec_info *unsupportable_dr_info = NULL;
1687 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1688 unsigned possible_npeel_number = 1;
1689 tree vectype;
1690 unsigned int mis, same_align_drs_max = 0;
1691 hash_table<peel_info_hasher> peeling_htab (1);
1693 DUMP_VECT_SCOPE ("vect_enhance_data_refs_alignment");
1695 /* Reset data so we can safely be called multiple times. */
1696 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1697 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1699 /* While cost model enhancements are expected in the future, the high level
1700 view of the code at this time is as follows:
1702 A) If there is a misaligned access then see if peeling to align
1703 this access can make all data references satisfy
1704 vect_supportable_dr_alignment. If so, update data structures
1705 as needed and return true.
1707 B) If peeling wasn't possible and there is a data reference with an
1708 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1709 then see if loop versioning checks can be used to make all data
1710 references satisfy vect_supportable_dr_alignment. If so, update
1711 data structures as needed and return true.
1713 C) If neither peeling nor versioning were successful then return false if
1714 any data reference does not satisfy vect_supportable_dr_alignment.
1716 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1718 Note, Possibility 3 above (which is peeling and versioning together) is not
1719 being done at this time. */
1721 /* (1) Peeling to force alignment. */
1723 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1724 Considerations:
1725 + How many accesses will become aligned due to the peeling
1726 - How many accesses will become unaligned due to the peeling,
1727 and the cost of misaligned accesses.
1728 - The cost of peeling (the extra runtime checks, the increase
1729 in code size). */
1731 FOR_EACH_VEC_ELT (datarefs, i, dr)
1733 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1734 stmt_vec_info stmt_info = dr_info->stmt;
1736 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1737 continue;
1739 /* For interleaving, only the alignment of the first access
1740 matters. */
1741 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1742 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1743 continue;
1745 /* For scatter-gather or invariant accesses there is nothing
1746 to enhance. */
1747 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1748 || integer_zerop (DR_STEP (dr)))
1749 continue;
1751 /* Strided accesses perform only component accesses, alignment is
1752 irrelevant for them. */
1753 if (STMT_VINFO_STRIDED_P (stmt_info)
1754 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1755 continue;
1757 supportable_dr_alignment = vect_supportable_dr_alignment (dr_info, true);
1758 do_peeling = vector_alignment_reachable_p (dr_info);
1759 if (do_peeling)
1761 if (known_alignment_for_access_p (dr_info))
1763 unsigned int npeel_tmp = 0;
1764 bool negative = tree_int_cst_compare (DR_STEP (dr),
1765 size_zero_node) < 0;
1767 vectype = STMT_VINFO_VECTYPE (stmt_info);
1768 unsigned int target_align = DR_TARGET_ALIGNMENT (dr_info);
1769 unsigned int dr_size = vect_get_scalar_dr_size (dr_info);
1770 mis = (negative
1771 ? DR_MISALIGNMENT (dr_info)
1772 : -DR_MISALIGNMENT (dr_info));
1773 if (DR_MISALIGNMENT (dr_info) != 0)
1774 npeel_tmp = (mis & (target_align - 1)) / dr_size;
1776 /* For multiple types, it is possible that the bigger type access
1777 will have more than one peeling option. E.g., a loop with two
1778 types: one of size (vector size / 4), and the other one of
1779 size (vector size / 8). The vectorization factor will be 8. If both
1780 accesses are misaligned by 3, the first one needs one scalar
1781 iteration to be aligned, and the second one needs 5. But the
1782 first one will be aligned also by peeling 5 scalar
1783 iterations, and in that case both accesses will be aligned.
1784 Hence, in addition to the immediate peeling amount, we also want
1785 to try adding a full vector size, as long as we don't exceed the
1786 vectorization factor.
1787 We do this automatically for cost model, since we calculate
1788 cost for every peeling option. */
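/* For example (values purely illustrative): with 2-byte elements
   (dr_size == 2), a 16-byte target_align and DR_MISALIGNMENT == 6 on a
   positive step, mis == -6 and npeel_tmp == (-6 & 15) / 2 == 5, i.e.
   peeling 5 scalar elements (10 bytes) reaches a 16-byte boundary.
   Under the unlimited cost model the loop below then records 5, 13,
   21, ... as candidates, adding target_align / dr_size == 8 elements
   each time, up to possible_npeel_number entries. */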
1789 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1791 poly_uint64 nscalars = (STMT_SLP_TYPE (stmt_info)
1792 ? vf * DR_GROUP_SIZE (stmt_info) : vf);
1793 possible_npeel_number
1794 = vect_get_num_vectors (nscalars, vectype);
1796 /* NPEEL_TMP is 0 when there is no misalignment, but also
1797 allow peeling NELEMENTS. */
1798 if (DR_MISALIGNMENT (dr_info) == 0)
1799 possible_npeel_number++;
1802 /* Save info about DR in the hash table. Also include peeling
1803 amounts according to the explanation above. */
1804 for (j = 0; j < possible_npeel_number; j++)
1806 vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1807 dr_info, npeel_tmp);
1808 npeel_tmp += target_align / dr_size;
1811 one_misalignment_known = true;
1813 else
1815 /* If we don't know any misalignment values, we prefer
1816 peeling for the data-ref that has the maximum number of data-refs
1817 with the same alignment, unless the target prefers to align
1818 stores over loads. */
1819 unsigned same_align_drs
1820 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1821 if (!dr0_info
1822 || same_align_drs_max < same_align_drs)
1824 same_align_drs_max = same_align_drs;
1825 dr0_info = dr_info;
1827 /* For data-refs with the same number of related
1828 accesses prefer the one where the misalign
1829 computation will be invariant in the outermost loop. */
1830 else if (same_align_drs_max == same_align_drs)
1832 struct loop *ivloop0, *ivloop;
1833 ivloop0 = outermost_invariant_loop_for_expr
1834 (loop, DR_BASE_ADDRESS (dr0_info->dr));
1835 ivloop = outermost_invariant_loop_for_expr
1836 (loop, DR_BASE_ADDRESS (dr));
1837 if ((ivloop && !ivloop0)
1838 || (ivloop && ivloop0
1839 && flow_loop_nested_p (ivloop, ivloop0)))
1840 dr0_info = dr_info;
1843 one_misalignment_unknown = true;
1845 /* Check for data refs with unsupportable alignment that
1846 can be peeled. */
1847 if (!supportable_dr_alignment)
1849 one_dr_unsupportable = true;
1850 unsupportable_dr_info = dr_info;
1853 if (!first_store && DR_IS_WRITE (dr))
1854 first_store = dr_info;
1857 else
1859 if (!aligned_access_p (dr_info))
1861 if (dump_enabled_p ())
1862 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1863 "vector alignment may not be reachable\n");
1864 break;
1869 /* Check if we can possibly peel the loop. */
1870 if (!vect_can_advance_ivs_p (loop_vinfo)
1871 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1872 || loop->inner)
1873 do_peeling = false;
1875 struct _vect_peel_extended_info peel_for_known_alignment;
1876 struct _vect_peel_extended_info peel_for_unknown_alignment;
1877 struct _vect_peel_extended_info best_peel;
1879 peel_for_unknown_alignment.inside_cost = INT_MAX;
1880 peel_for_unknown_alignment.outside_cost = INT_MAX;
1881 peel_for_unknown_alignment.peel_info.count = 0;
1883 if (do_peeling
1884 && one_misalignment_unknown)
1886 /* Check if the target requires to prefer stores over loads, i.e., if
1887 misaligned stores are more expensive than misaligned loads (taking
1888 drs with same alignment into account). */
1889 unsigned int load_inside_cost = 0;
1890 unsigned int load_outside_cost = 0;
1891 unsigned int store_inside_cost = 0;
1892 unsigned int store_outside_cost = 0;
1893 unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;
1895 stmt_vector_for_cost dummy;
1896 dummy.create (2);
1897 vect_get_peeling_costs_all_drs (loop_vinfo, dr0_info,
1898 &load_inside_cost,
1899 &load_outside_cost,
1900 &dummy, &dummy, estimated_npeels, true);
1901 dummy.release ();
1903 if (first_store)
1905 dummy.create (2);
1906 vect_get_peeling_costs_all_drs (loop_vinfo, first_store,
1907 &store_inside_cost,
1908 &store_outside_cost,
1909 &dummy, &dummy,
1910 estimated_npeels, true);
1911 dummy.release ();
1913 else
1915 store_inside_cost = INT_MAX;
1916 store_outside_cost = INT_MAX;
1919 if (load_inside_cost > store_inside_cost
1920 || (load_inside_cost == store_inside_cost
1921 && load_outside_cost > store_outside_cost))
1923 dr0_info = first_store;
1924 peel_for_unknown_alignment.inside_cost = store_inside_cost;
1925 peel_for_unknown_alignment.outside_cost = store_outside_cost;
1927 else
1929 peel_for_unknown_alignment.inside_cost = load_inside_cost;
1930 peel_for_unknown_alignment.outside_cost = load_outside_cost;
1933 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1934 prologue_cost_vec.create (2);
1935 epilogue_cost_vec.create (2);
1937 int dummy2;
1938 peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
1939 (loop_vinfo, estimated_npeels, &dummy2,
1940 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1941 &prologue_cost_vec, &epilogue_cost_vec);
1943 prologue_cost_vec.release ();
1944 epilogue_cost_vec.release ();
1946 peel_for_unknown_alignment.peel_info.count = 1
1947 + STMT_VINFO_SAME_ALIGN_REFS (dr0_info->stmt).length ();
1950 peel_for_unknown_alignment.peel_info.npeel = 0;
1951 peel_for_unknown_alignment.peel_info.dr_info = dr0_info;
1953 best_peel = peel_for_unknown_alignment;
1955 peel_for_known_alignment.inside_cost = INT_MAX;
1956 peel_for_known_alignment.outside_cost = INT_MAX;
1957 peel_for_known_alignment.peel_info.count = 0;
1958 peel_for_known_alignment.peel_info.dr_info = NULL;
1960 if (do_peeling && one_misalignment_known)
1962 /* Peeling is possible, but no data access requires alignment in order
1963 to be supported. So we try to choose the best possible peeling from
1964 the hash table. */
1965 peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
1966 (&peeling_htab, loop_vinfo);
1969 /* Compare costs of peeling for known and unknown alignment. */
1970 if (peel_for_known_alignment.peel_info.dr_info != NULL
1971 && peel_for_unknown_alignment.inside_cost
1972 >= peel_for_known_alignment.inside_cost)
1974 best_peel = peel_for_known_alignment;
1976 /* If the best peeling for known alignment has NPEEL == 0, perform no
1977 peeling at all except if there is an unsupportable dr that we can
1978 align. */
1979 if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
1980 do_peeling = false;
1983 /* If there is an unsupportable data ref, prefer this over all choices so far
1984 since we'd have to discard a chosen peeling except when it accidentally
1985 aligned the unsupportable data ref. */
1986 if (one_dr_unsupportable)
1987 dr0_info = unsupportable_dr_info;
1988 else if (do_peeling)
1990 /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
1991 TODO: Use nopeel_outside_cost or get rid of it? */
1992 unsigned nopeel_inside_cost = 0;
1993 unsigned nopeel_outside_cost = 0;
1995 stmt_vector_for_cost dummy;
1996 dummy.create (2);
1997 vect_get_peeling_costs_all_drs (loop_vinfo, NULL, &nopeel_inside_cost,
1998 &nopeel_outside_cost, &dummy, &dummy,
1999 0, false);
2000 dummy.release ();
2002 /* Add epilogue costs. As we do not peel for alignment here, no prologue
2003 costs will be recorded. */
2004 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2005 prologue_cost_vec.create (2);
2006 epilogue_cost_vec.create (2);
2008 int dummy2;
2009 nopeel_outside_cost += vect_get_known_peeling_cost
2010 (loop_vinfo, 0, &dummy2,
2011 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2012 &prologue_cost_vec, &epilogue_cost_vec);
2014 prologue_cost_vec.release ();
2015 epilogue_cost_vec.release ();
2017 npeel = best_peel.peel_info.npeel;
2018 dr0_info = best_peel.peel_info.dr_info;
2020 /* If not peeling at all is no more expensive than the best peeling we
2021 have found so far, don't perform any peeling. */
2022 if (nopeel_inside_cost <= best_peel.inside_cost)
2023 do_peeling = false;
2026 if (do_peeling)
2028 stmt_vec_info stmt_info = dr0_info->stmt;
2029 vectype = STMT_VINFO_VECTYPE (stmt_info);
2031 if (known_alignment_for_access_p (dr0_info))
2033 bool negative = tree_int_cst_compare (DR_STEP (dr0_info->dr),
2034 size_zero_node) < 0;
2035 if (!npeel)
2037 /* Since it's known at compile time, compute the number of
2038 iterations in the peeled loop (the peeling factor) for use in
2039 updating DR_MISALIGNMENT values. The peeling factor is the
2040 vectorization factor minus the misalignment as an element
2041 count. */
2042 mis = (negative
2043 ? DR_MISALIGNMENT (dr0_info)
2044 : -DR_MISALIGNMENT (dr0_info));
2045 unsigned int target_align = DR_TARGET_ALIGNMENT (dr0_info);
2046 npeel = ((mis & (target_align - 1))
2047 / vect_get_scalar_dr_size (dr0_info));
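/* As an illustration (example values only): with 2-byte elements,
   target_align == 16 and DR_MISALIGNMENT == 4 on a positive step,
   mis == -4 and npeel == (-4 & 15) / 2 == 6, i.e. six peeled scalar
   iterations advance the access by 12 bytes to the next 16-byte
   boundary.  If the access is part of a group of size 2, the division
   below halves this to 3, since each loop iteration accesses the
   whole group. */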
2050 /* For interleaved data access every iteration accesses all the
2051 members of the group, therefore we divide the number of iterations
2052 by the group size. */
2053 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2054 npeel /= DR_GROUP_SIZE (stmt_info);
2056 if (dump_enabled_p ())
2057 dump_printf_loc (MSG_NOTE, vect_location,
2058 "Try peeling by %d\n", npeel);
2061 /* Ensure that all datarefs can be vectorized after the peel. */
2062 if (!vect_peeling_supportable (loop_vinfo, dr0_info, npeel))
2063 do_peeling = false;
2065 /* Check if all datarefs are supportable and log. */
2066 if (do_peeling && known_alignment_for_access_p (dr0_info) && npeel == 0)
2068 stat = vect_verify_datarefs_alignment (loop_vinfo);
2069 if (!stat)
2070 do_peeling = false;
2071 else
2072 return stat;
2075 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
2076 if (do_peeling)
2078 unsigned max_allowed_peel
2079 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
2080 if (max_allowed_peel != (unsigned)-1)
2082 unsigned max_peel = npeel;
2083 if (max_peel == 0)
2085 unsigned int target_align = DR_TARGET_ALIGNMENT (dr0_info);
2086 max_peel = (target_align
2087 / vect_get_scalar_dr_size (dr0_info) - 1);
2089 if (max_peel > max_allowed_peel)
2091 do_peeling = false;
2092 if (dump_enabled_p ())
2093 dump_printf_loc (MSG_NOTE, vect_location,
2094 "Disable peeling, max peels reached: %d\n", max_peel);
2099 /* Cost model #2 - if peeling may result in a remaining loop not
2100 iterating enough to be vectorized then do not peel. Since this
2101 is a cost heuristic rather than a correctness decision, use the
2102 most likely runtime value for variable vectorization factors. */
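/* For instance (illustrative numbers): with an assumed VF of 8 and an
   unknown peel amount, max_peel is 7; a loop known to run only 12
   iterations fails the 12 >= 8 + 7 test below, so peeling is
   disabled. */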
2103 if (do_peeling
2104 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2106 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2107 unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
2108 if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
2109 < assumed_vf + max_peel)
2110 do_peeling = false;
2113 if (do_peeling)
2115 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2116 If the misalignment of DR_i is identical to that of dr0 then set
2117 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
2118 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2119 by the peeling factor times the element size of DR_i (MOD the
2120 vectorization factor times the size). Otherwise, the
2121 misalignment of DR_i must be set to unknown. */
2122 FOR_EACH_VEC_ELT (datarefs, i, dr)
2123 if (dr != dr0_info->dr)
2125 /* Strided accesses perform only component accesses, alignment
2126 is irrelevant for them. */
2127 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2128 stmt_info = dr_info->stmt;
2129 if (STMT_VINFO_STRIDED_P (stmt_info)
2130 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2131 continue;
2133 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
2136 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0_info;
2137 if (npeel)
2138 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2139 else
2140 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2141 = DR_MISALIGNMENT (dr0_info);
2142 SET_DR_MISALIGNMENT (dr0_info, 0);
2143 if (dump_enabled_p ())
2145 dump_printf_loc (MSG_NOTE, vect_location,
2146 "Alignment of access forced using peeling.\n");
2147 dump_printf_loc (MSG_NOTE, vect_location,
2148 "Peeling for alignment will be applied.\n");
2151 /* The inside-loop cost will be accounted for in vectorizable_load
2152 and vectorizable_store correctly with adjusted alignments.
2153 Drop the body_cst_vec on the floor here. */
2154 stat = vect_verify_datarefs_alignment (loop_vinfo);
2155 gcc_assert (stat);
2156 return stat;
2160 /* (2) Versioning to force alignment. */
2162 /* Try versioning if:
2163 1) optimize loop for speed
2164 2) there is at least one unsupported misaligned data ref with an unknown
2165 misalignment, and
2166 3) all misaligned data refs with a known misalignment are supported, and
2167 4) the number of runtime alignment checks is within reason. */
2169 do_versioning =
2170 optimize_loop_nest_for_speed_p (loop)
2171 && (!loop->inner); /* FORNOW */
2173 if (do_versioning)
2175 FOR_EACH_VEC_ELT (datarefs, i, dr)
2177 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2178 stmt_vec_info stmt_info = dr_info->stmt;
2180 /* For interleaving, only the alignment of the first access
2181 matters. */
2182 if (aligned_access_p (dr_info)
2183 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2184 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info))
2185 continue;
2187 if (STMT_VINFO_STRIDED_P (stmt_info))
2189 /* Strided loads perform only component accesses, alignment is
2190 irrelevant for them. */
2191 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
2192 continue;
2193 do_versioning = false;
2194 break;
2197 supportable_dr_alignment
2198 = vect_supportable_dr_alignment (dr_info, false);
2200 if (!supportable_dr_alignment)
2202 int mask;
2203 tree vectype;
2205 if (known_alignment_for_access_p (dr_info)
2206 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2207 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
2209 do_versioning = false;
2210 break;
2213 vectype = STMT_VINFO_VECTYPE (stmt_info);
2214 gcc_assert (vectype);
2216 /* At present we don't support versioning for alignment
2217 with variable VF, since there's no guarantee that the
2218 VF is a power of two. We could relax this if we added
2219 a way of enforcing a power-of-two size. */
2220 unsigned HOST_WIDE_INT size;
2221 if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
2223 do_versioning = false;
2224 break;
2227 /* The rightmost bits of an aligned address must be zeros.
2228 Construct the mask needed for this test. For example,
2229 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2230 mask must be 15 = 0xf. */
2231 mask = size - 1;
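/* A minimal sketch of the runtime test this mask feeds (shape assumed
   here for illustration): an address p passes the alignment check iff
   ((uintptr_t) p & mask) == 0, e.g. for V4SI mask == 0xf, so the low
   four address bits must all be zero in the versioned (aligned) copy
   of the loop. */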
2233 /* FORNOW: use the same mask to test all potentially unaligned
2234 references in the loop. The vectorizer currently supports
2235 a single vector size, see the reference to
2236 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
2237 vectorization factor is computed. */
2238 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
2239 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
2240 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2241 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (stmt_info);
2245 /* Versioning requires at least one misaligned data reference. */
2246 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2247 do_versioning = false;
2248 else if (!do_versioning)
2249 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2252 if (do_versioning)
2254 vec<stmt_vec_info> may_misalign_stmts
2255 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2256 stmt_vec_info stmt_info;
2258 /* It can now be assumed that the data references in the statements
2259 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2260 of the loop being vectorized. */
2261 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
2263 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
2264 SET_DR_MISALIGNMENT (dr_info, 0);
2265 if (dump_enabled_p ())
2266 dump_printf_loc (MSG_NOTE, vect_location,
2267 "Alignment of access forced using versioning.\n");
2270 if (dump_enabled_p ())
2271 dump_printf_loc (MSG_NOTE, vect_location,
2272 "Versioning for alignment will be applied.\n");
2274 /* Peeling and versioning can't be done together at this time. */
2275 gcc_assert (! (do_peeling && do_versioning));
2277 stat = vect_verify_datarefs_alignment (loop_vinfo);
2278 gcc_assert (stat);
2279 return stat;
2282 /* This point is reached if neither peeling nor versioning is being done. */
2283 gcc_assert (! (do_peeling || do_versioning));
2285 stat = vect_verify_datarefs_alignment (loop_vinfo);
2286 return stat;
2290 /* Function vect_find_same_alignment_drs.
2292 Update group and alignment relations in VINFO according to the chosen
2293 vectorization factor. */
2295 static void
2296 vect_find_same_alignment_drs (vec_info *vinfo, data_dependence_relation *ddr)
2298 struct data_reference *dra = DDR_A (ddr);
2299 struct data_reference *drb = DDR_B (ddr);
2300 dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
2301 dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
2302 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
2303 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
2305 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
2306 return;
2308 if (dra == drb)
2309 return;
2311 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
2312 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
2313 return;
2315 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)
2316 || !operand_equal_p (DR_OFFSET (dra), DR_OFFSET (drb), 0)
2317 || !operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2318 return;
2320 /* Two references with distance zero have the same alignment. */
2321 poly_offset_int diff = (wi::to_poly_offset (DR_INIT (dra))
2322 - wi::to_poly_offset (DR_INIT (drb)));
2323 if (maybe_ne (diff, 0))
2325 /* Get the wider of the two alignments. */
2326 unsigned int align_a = (vect_calculate_target_alignment (dr_info_a)
2327 / BITS_PER_UNIT);
2328 unsigned int align_b = (vect_calculate_target_alignment (dr_info_b)
2329 / BITS_PER_UNIT);
2330 unsigned int max_align = MAX (align_a, align_b);
2332 /* Require the gap to be a multiple of the larger vector alignment. */
2333 if (!multiple_p (diff, max_align))
2334 return;
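/* Example (illustrative only): for accesses a[i] and a[i + 4] with
   4-byte elements the DR_INIT gap is 16 bytes; if both target
   alignments are 16 the gap is a multiple of max_align, so the two
   references always share the same misalignment and are recorded as
   such below. */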
2337 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2338 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
2339 if (dump_enabled_p ())
2341 dump_printf_loc (MSG_NOTE, vect_location,
2342 "accesses have the same alignment: ");
2343 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2344 dump_printf (MSG_NOTE, " and ");
2345 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2346 dump_printf (MSG_NOTE, "\n");
2351 /* Function vect_analyze_data_refs_alignment
2353 Analyze the alignment of the data-references in the loop.
2354 Return FALSE if a data reference is found that cannot be vectorized. */
2356 bool
2357 vect_analyze_data_refs_alignment (loop_vec_info vinfo)
2359 DUMP_VECT_SCOPE ("vect_analyze_data_refs_alignment");
2361 /* Mark groups of data references with same alignment using
2362 data dependence information. */
2363 vec<ddr_p> ddrs = vinfo->shared->ddrs;
2364 struct data_dependence_relation *ddr;
2365 unsigned int i;
2367 FOR_EACH_VEC_ELT (ddrs, i, ddr)
2368 vect_find_same_alignment_drs (vinfo, ddr);
2370 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
2371 struct data_reference *dr;
2373 vect_record_base_alignments (vinfo);
2374 FOR_EACH_VEC_ELT (datarefs, i, dr)
2376 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
2377 if (STMT_VINFO_VECTORIZABLE (dr_info->stmt))
2378 vect_compute_data_ref_alignment (dr_info);
2381 return true;
2385 /* Analyze alignment of DRs of stmts in NODE. */
2387 static bool
2388 vect_slp_analyze_and_verify_node_alignment (slp_tree node)
2390 /* We vectorize from the first scalar stmt in the node unless
2391 the node is permuted in which case we start from the first
2392 element in the group. */
2393 stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2394 dr_vec_info *first_dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2395 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2396 first_stmt_info = DR_GROUP_FIRST_ELEMENT (first_stmt_info);
2398 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2399 vect_compute_data_ref_alignment (dr_info);
2400 /* For creating the data-ref pointer we need alignment of the
2401 first element anyway. */
2402 if (dr_info != first_dr_info)
2403 vect_compute_data_ref_alignment (first_dr_info);
2404 if (! verify_data_ref_alignment (dr_info))
2406 if (dump_enabled_p ())
2407 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2408 "not vectorized: bad data alignment in basic "
2409 "block.\n");
2410 return false;
2413 return true;
2416 /* Function vect_slp_analyze_instance_alignment
2418 Analyze the alignment of the data-references in the SLP instance.
2419 Return FALSE if a data reference is found that cannot be vectorized. */
2421 bool
2422 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
2424 DUMP_VECT_SCOPE ("vect_slp_analyze_and_verify_instance_alignment");
2426 slp_tree node;
2427 unsigned i;
2428 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2429 if (! vect_slp_analyze_and_verify_node_alignment (node))
2430 return false;
2432 node = SLP_INSTANCE_TREE (instance);
2433 if (STMT_VINFO_DATA_REF (SLP_TREE_SCALAR_STMTS (node)[0])
2434 && ! vect_slp_analyze_and_verify_node_alignment
2435 (SLP_INSTANCE_TREE (instance)))
2436 return false;
2438 return true;
2442 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2443 accesses of legal size, step, etc. Detect gaps, single element
2444 interleaving, and other special cases. Set grouped access info.
2445 Collect groups of strided stores for further use in SLP analysis.
2446 Worker for vect_analyze_group_access. */
2448 static bool
2449 vect_analyze_group_access_1 (dr_vec_info *dr_info)
2451 data_reference *dr = dr_info->dr;
2452 tree step = DR_STEP (dr);
2453 tree scalar_type = TREE_TYPE (DR_REF (dr));
2454 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2455 stmt_vec_info stmt_info = dr_info->stmt;
2456 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2457 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2458 HOST_WIDE_INT dr_step = -1;
2459 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2460 bool slp_impossible = false;
2462 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2463 size of the interleaving group (including gaps). */
2464 if (tree_fits_shwi_p (step))
2466 dr_step = tree_to_shwi (step);
2467 /* Check that STEP is a multiple of type size. Otherwise there is
2468 a non-element-sized gap at the end of the group which we
2469 cannot represent in DR_GROUP_GAP or DR_GROUP_SIZE.
2470 ??? As we can handle non-constant step fine here we should
2471 simply remove uses of DR_GROUP_GAP between the last and first
2472 element and instead rely on DR_STEP. DR_GROUP_SIZE then would
2473 simply not include that gap. */
2474 if ((dr_step % type_size) != 0)
2476 if (dump_enabled_p ())
2478 dump_printf_loc (MSG_NOTE, vect_location,
2479 "Step ");
2480 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2481 dump_printf (MSG_NOTE,
2482 " is not a multiple of the element size for ");
2483 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2484 dump_printf (MSG_NOTE, "\n");
2486 return false;
2488 groupsize = absu_hwi (dr_step) / type_size;
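/* For instance (example only): a loop accessing a[3*i] with 4-byte
   ints has DR_STEP == 12, so groupsize == 12 / 4 == 3 elements per
   iteration, counting any gaps inside the group. */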
2490 else
2491 groupsize = 0;
2493 /* A non-consecutive access is possible only if it is part of an interleaving group. */
2494 if (!DR_GROUP_FIRST_ELEMENT (stmt_info))
2496 /* Check whether this DR is part of an interleaving group, and is a single
2497 element of the group that is accessed in the loop. */
2499 /* Gaps are supported only for loads. STEP must be a multiple of the type
2500 size. */
2501 if (DR_IS_READ (dr)
2502 && (dr_step % type_size) == 0
2503 && groupsize > 0)
2505 DR_GROUP_FIRST_ELEMENT (stmt_info) = stmt_info;
2506 DR_GROUP_SIZE (stmt_info) = groupsize;
2507 DR_GROUP_GAP (stmt_info) = groupsize - 1;
2508 if (dump_enabled_p ())
2510 dump_printf_loc (MSG_NOTE, vect_location,
2511 "Detected single element interleaving ");
2512 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2513 dump_printf (MSG_NOTE, " step ");
2514 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2515 dump_printf (MSG_NOTE, "\n");
2518 return true;
2521 if (dump_enabled_p ())
2523 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2524 "not consecutive access ");
2525 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
2526 stmt_info->stmt, 0);
2529 if (bb_vinfo)
2531 /* Mark the statement as unvectorizable. */
2532 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
2533 return true;
2536 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2537 STMT_VINFO_STRIDED_P (stmt_info) = true;
2538 return true;
2541 if (DR_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info)
2543 /* First stmt in the interleaving chain. Check the chain. */
2544 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2545 struct data_reference *data_ref = dr;
2546 unsigned int count = 1;
2547 tree prev_init = DR_INIT (data_ref);
2548 stmt_vec_info prev = stmt_info;
2549 HOST_WIDE_INT diff, gaps = 0;
2551 /* By construction, all group members have INTEGER_CST DR_INITs. */
2552 while (next)
2554 /* Skip same data-refs. In case that two or more stmts share
2555 data-ref (supported only for loads), we vectorize only the first
2556 stmt, and the rest get their vectorized loads from the first
2557 one. */
2558 if (!tree_int_cst_compare (DR_INIT (data_ref),
2559 DR_INIT (STMT_VINFO_DATA_REF (next))))
2561 if (DR_IS_WRITE (data_ref))
2563 if (dump_enabled_p ())
2564 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2565 "Two store stmts share the same dr.\n");
2566 return false;
2569 if (dump_enabled_p ())
2570 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2571 "Two or more load stmts share the same dr.\n");
2573 /* For load use the same data-ref load. */
2574 DR_GROUP_SAME_DR_STMT (next) = prev;
2576 prev = next;
2577 next = DR_GROUP_NEXT_ELEMENT (next);
2578 continue;
2581 prev = next;
2582 data_ref = STMT_VINFO_DATA_REF (next);
2584 /* All group members have the same STEP by construction. */
2585 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2587 /* Check that the distance between two accesses is equal to the type
2588 size. Otherwise, we have gaps. */
2589 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2590 - TREE_INT_CST_LOW (prev_init)) / type_size;
2591 if (diff != 1)
2593 /* FORNOW: SLP of accesses with gaps is not supported. */
2594 slp_impossible = true;
2595 if (DR_IS_WRITE (data_ref))
2597 if (dump_enabled_p ())
2598 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2599 "interleaved store with gaps\n");
2600 return false;
2603 gaps += diff - 1;
2606 last_accessed_element += diff;
2608 /* Store the gap from the previous member of the group. If there is no
2609 gap in the access, DR_GROUP_GAP is always 1. */
2610 DR_GROUP_GAP (next) = diff;
2612 prev_init = DR_INIT (data_ref);
2613 next = DR_GROUP_NEXT_ELEMENT (next);
2614 /* Count the number of data-refs in the chain. */
2615 count++;
2618 if (groupsize == 0)
2619 groupsize = count + gaps;
2621 /* This could be UINT_MAX but as we are generating code in a very
2622 inefficient way we have to cap earlier. See PR78699 for example. */
2623 if (groupsize > 4096)
2625 if (dump_enabled_p ())
2626 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2627 "group is too large\n");
2628 return false;
2631 /* Check that the size of the interleaving is equal to count for stores,
2632 i.e., that there are no gaps. */
2633 if (groupsize != count
2634 && !DR_IS_READ (dr))
2636 groupsize = count;
2637 STMT_VINFO_STRIDED_P (stmt_info) = true;
2640 /* If there is a gap after the last load in the group it is the
2641 difference between the groupsize and the last accessed
2642 element.
2643 When there is no gap, this difference should be 0. */
2644 DR_GROUP_GAP (stmt_info) = groupsize - last_accessed_element;
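/* Worked example (values illustrative): loads of a[4*i] and a[4*i + 2]
   with 4-byte ints give DR_STEP == 16, groupsize == 4 and a DR_INIT
   difference of 2 elements, so DR_GROUP_GAP of the second member is 2,
   last_accessed_element ends at 3, and the trailing gap recorded above
   is 4 - 3 == 1 element. */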
2646 DR_GROUP_SIZE (stmt_info) = groupsize;
2647 if (dump_enabled_p ())
2649 dump_printf_loc (MSG_NOTE, vect_location,
2650 "Detected interleaving ");
2651 if (DR_IS_READ (dr))
2652 dump_printf (MSG_NOTE, "load ");
2653 else if (STMT_VINFO_STRIDED_P (stmt_info))
2654 dump_printf (MSG_NOTE, "strided store ");
2655 else
2656 dump_printf (MSG_NOTE, "store ");
2657 dump_printf (MSG_NOTE, "of size %u starting with ",
2658 (unsigned)groupsize);
2659 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt_info->stmt, 0);
2660 if (DR_GROUP_GAP (stmt_info) != 0)
2661 dump_printf_loc (MSG_NOTE, vect_location,
2662 "There is a gap of %u elements after the group\n",
2663 DR_GROUP_GAP (stmt_info));
2666 /* SLP: create an SLP data structure for every interleaving group of
2667 stores for further analysis in vect_analyze_slp. */
2668 if (DR_IS_WRITE (dr) && !slp_impossible)
2670 if (loop_vinfo)
2671 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt_info);
2672 if (bb_vinfo)
2673 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
2677 return true;
2680 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2681 accesses of legal size, step, etc. Detect gaps, single element
2682 interleaving, and other special cases. Set grouped access info.
2683 Collect groups of strided stores for further use in SLP analysis. */
2685 static bool
2686 vect_analyze_group_access (dr_vec_info *dr_info)
2688 if (!vect_analyze_group_access_1 (dr_info))
2690 /* Dissolve the group if present. */
2691 stmt_vec_info stmt_info = DR_GROUP_FIRST_ELEMENT (dr_info->stmt);
2692 while (stmt_info)
2694 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2695 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2696 DR_GROUP_NEXT_ELEMENT (stmt_info) = NULL;
2697 stmt_info = next;
2699 return false;
2701 return true;
2704 /* Analyze the access pattern of the data-reference DR_INFO.
2705 In case of non-consecutive accesses call vect_analyze_group_access() to
2706 analyze groups of accesses. */
2708 static bool
2709 vect_analyze_data_ref_access (dr_vec_info *dr_info)
2711 data_reference *dr = dr_info->dr;
2712 tree step = DR_STEP (dr);
2713 tree scalar_type = TREE_TYPE (DR_REF (dr));
2714 stmt_vec_info stmt_info = dr_info->stmt;
2715 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2716 struct loop *loop = NULL;
2718 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
2719 return true;
2721 if (loop_vinfo)
2722 loop = LOOP_VINFO_LOOP (loop_vinfo);
2724 if (loop_vinfo && !step)
2726 if (dump_enabled_p ())
2727 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2728 "bad data-ref access in loop\n");
2729 return false;
2732 /* Allow loads with zero step in inner-loop vectorization. */
2733 if (loop_vinfo && integer_zerop (step))
2735 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2736 if (!nested_in_vect_loop_p (loop, stmt_info))
2737 return DR_IS_READ (dr);
2738 /* Allow references with zero step for outer loops marked
2739 with pragma omp simd only - it guarantees absence of
2740 loop-carried dependencies between inner loop iterations. */
2741 if (loop->safelen < 2)
2743 if (dump_enabled_p ())
2744 dump_printf_loc (MSG_NOTE, vect_location,
2745 "zero step in inner loop of nest\n");
2746 return false;
2750 if (loop && nested_in_vect_loop_p (loop, stmt_info))
2752 /* Interleaved accesses are not yet supported within outer-loop
2753 vectorization for references in the inner-loop. */
2754 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2756 /* For the rest of the analysis we use the outer-loop step. */
2757 step = STMT_VINFO_DR_STEP (stmt_info);
2758 if (integer_zerop (step))
2760 if (dump_enabled_p ())
2761 dump_printf_loc (MSG_NOTE, vect_location,
2762 "zero step in outer loop.\n");
2763 return DR_IS_READ (dr);
2767 /* Consecutive? */
2768 if (TREE_CODE (step) == INTEGER_CST)
2770 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2771 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2772 || (dr_step < 0
2773 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2775 /* Mark that it is not interleaving. */
2776 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2777 return true;
2781 if (loop && nested_in_vect_loop_p (loop, stmt_info))
2783 if (dump_enabled_p ())
2784 dump_printf_loc (MSG_NOTE, vect_location,
2785 "grouped access in outer loop.\n");
2786 return false;
2790 /* Assume this is a DR handled by the non-constant strided load case. */
2791 if (TREE_CODE (step) != INTEGER_CST)
2792 return (STMT_VINFO_STRIDED_P (stmt_info)
2793 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2794 || vect_analyze_group_access (dr_info)));
2796 /* Not a consecutive access - check if it's part of an interleaving group. */
2797 return vect_analyze_group_access (dr_info);
2800 /* Compare two data-references DRA and DRB to sort them into chunks
2801 suitable for grouping. */
2803 static int
2804 dr_group_sort_cmp (const void *dra_, const void *drb_)
2806 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2807 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2808 int cmp;
2810 /* Stabilize sort. */
2811 if (dra == drb)
2812 return 0;
2814 /* DRs in different loops never belong to the same group. */
2815 loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father;
2816 loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father;
2817 if (loopa != loopb)
2818 return loopa->num < loopb->num ? -1 : 1;
2820 /* Ordering of DRs according to base. */
2821 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2822 DR_BASE_ADDRESS (drb));
2823 if (cmp != 0)
2824 return cmp;
2826 /* And according to DR_OFFSET. */
2827 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2828 if (cmp != 0)
2829 return cmp;
2831 /* Put reads before writes. */
2832 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2833 return DR_IS_READ (dra) ? -1 : 1;
2835 /* Then sort after access size. */
2836 cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2837 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2838 if (cmp != 0)
2839 return cmp;
2841 /* And after step. */
2842 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2843 if (cmp != 0)
2844 return cmp;
2846 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
2847 cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
2848 if (cmp == 0)
2849 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2850 return cmp;
2853 /* If OP is the result of a conversion, return the unconverted value,
2854 otherwise return null. */
2856 static tree
2857 strip_conversion (tree op)
2859 if (TREE_CODE (op) != SSA_NAME)
2860 return NULL_TREE;
2861 gimple *stmt = SSA_NAME_DEF_STMT (op);
2862 if (!is_gimple_assign (stmt)
2863 || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt)))
2864 return NULL_TREE;
2865 return gimple_assign_rhs1 (stmt);
2868 /* Return true if vectorizable_* routines can handle statements STMT1_INFO
2869 and STMT2_INFO being in a single group. */
2871 static bool
2872 can_group_stmts_p (stmt_vec_info stmt1_info, stmt_vec_info stmt2_info)
2874 if (gimple_assign_single_p (stmt1_info->stmt))
2875 return gimple_assign_single_p (stmt2_info->stmt);
2877 gcall *call1 = dyn_cast <gcall *> (stmt1_info->stmt);
2878 if (call1 && gimple_call_internal_p (call1))
2880 /* Check for two masked loads or two masked stores. */
2881 gcall *call2 = dyn_cast <gcall *> (stmt2_info->stmt);
2882 if (!call2 || !gimple_call_internal_p (call2))
2883 return false;
2884 internal_fn ifn = gimple_call_internal_fn (call1);
2885 if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE)
2886 return false;
2887 if (ifn != gimple_call_internal_fn (call2))
2888 return false;
2890 /* Check that the masks are the same. Cope with casts of masks,
2891 like those created by build_mask_conversion. */
2892 tree mask1 = gimple_call_arg (call1, 2);
2893 tree mask2 = gimple_call_arg (call2, 2);
2894 if (!operand_equal_p (mask1, mask2, 0))
2896 mask1 = strip_conversion (mask1);
2897 if (!mask1)
2898 return false;
2899 mask2 = strip_conversion (mask2);
2900 if (!mask2)
2901 return false;
2902 if (!operand_equal_p (mask1, mask2, 0))
2903 return false;
2905 return true;
2908 return false;
2911 /* Function vect_analyze_data_ref_accesses.
2913 Analyze the access pattern of all the data references in the loop.
2915 FORNOW: the only access pattern that is considered vectorizable is a
2916 simple step 1 (consecutive) access.
2918 FORNOW: handle only arrays and pointer accesses. */
2920 bool
2921 vect_analyze_data_ref_accesses (vec_info *vinfo)
2923 unsigned int i;
2924 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
2925 struct data_reference *dr;
2927 DUMP_VECT_SCOPE ("vect_analyze_data_ref_accesses");
2929 if (datarefs.is_empty ())
2930 return true;
2932 /* Sort the array of datarefs to make building the interleaving chains
2933 linear. Don't modify the original vector's order, it is needed for
2934 determining what dependencies are reversed. */
2935 vec<data_reference_p> datarefs_copy = datarefs.copy ();
2936 datarefs_copy.qsort (dr_group_sort_cmp);
2938 /* Build the interleaving chains. */
2939 for (i = 0; i < datarefs_copy.length () - 1;)
2941 data_reference_p dra = datarefs_copy[i];
2942 dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
2943 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
2944 stmt_vec_info lastinfo = NULL;
2945 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
2946 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a))
2948 ++i;
2949 continue;
2951 for (i = i + 1; i < datarefs_copy.length (); ++i)
2953 data_reference_p drb = datarefs_copy[i];
2954 dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
2955 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
2956 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b)
2957 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
2958 break;
2960 /* ??? Imperfect sorting (non-compatible types, non-modulo
2961 accesses, same accesses) can lead to a group being artificially
2962 split here as we don't just skip over those. If it really
2963 matters we can push those to a worklist and re-iterate
2964 over them. Then we can just skip ahead to the next DR here. */
2966 /* DRs in a different loop should not be put into the same
2967 interleaving group. */
2968 if (gimple_bb (DR_STMT (dra))->loop_father
2969 != gimple_bb (DR_STMT (drb))->loop_father)
2970 break;
2972 /* Check that the data-refs have same first location (except init)
2973 and they are both either store or load (not load and store,
2974 not masked loads or stores). */
2975 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2976 || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2977 DR_BASE_ADDRESS (drb)) != 0
2978 || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
2979 || !can_group_stmts_p (stmtinfo_a, stmtinfo_b))
2980 break;
2982 /* Check that the data-refs have the same constant size. */
2983 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2984 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2985 if (!tree_fits_uhwi_p (sza)
2986 || !tree_fits_uhwi_p (szb)
2987 || !tree_int_cst_equal (sza, szb))
2988 break;
2990 /* Check that the data-refs have the same step. */
2991 if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
2992 break;
2994 /* Check the types are compatible.
2995 ??? We don't distinguish this during sorting. */
2996 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2997 TREE_TYPE (DR_REF (drb))))
2998 break;
3000 /* Check that the DR_INITs are compile-time constants. */
3001 if (TREE_CODE (DR_INIT (dra)) != INTEGER_CST
3002 || TREE_CODE (DR_INIT (drb)) != INTEGER_CST)
3003 break;
3005 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
3006 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
3007 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
3008 HOST_WIDE_INT init_prev
3009 = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]));
3010 gcc_assert (init_a <= init_b
3011 && init_a <= init_prev
3012 && init_prev <= init_b);
3014 /* Do not place the same access in the interleaving chain twice. */
3015 if (init_b == init_prev)
3017 gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]))
3018 < gimple_uid (DR_STMT (drb)));
3019 /* ??? For now we simply "drop" the later reference which is
3020 otherwise the same rather than finishing off this group.
3021 In the end we'd want to re-process duplicates forming
3022 multiple groups from the refs, likely by just collecting
3023 all candidates (including duplicates and split points
3024 below) in a vector and then process them together. */
3025 continue;
3028 /* If init_b == init_a + the size of the type * k, we have an
3029 interleaving, and DRA is accessed before DRB. */
3030 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
3031 if (type_size_a == 0
3032 || (init_b - init_a) % type_size_a != 0)
3033 break;
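/* E.g. (illustrative): a[2*i] and a[2*i + 1] with 4-byte ints have
   init_a == 0, init_b == 4 and type_size_a == 4, so the difference is
   a multiple of the type size and the step of 8 exceeds it; the two
   references can form an interleaving group with DRA accessed
   first. */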
3035 /* If we have a store, the accesses are adjacent. This splits
3036 groups into chunks we support (we don't support vectorization
3037 of stores with gaps). */
3038 if (!DR_IS_READ (dra) && init_b - init_prev != type_size_a)
3039 break;
3041 /* If the step (if not zero or non-constant) is greater than the
3042 difference between data-refs' inits this splits groups into
3043 suitable sizes. */
3044 if (tree_fits_shwi_p (DR_STEP (dra)))
3046 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
3047 if (step != 0 && step <= (init_b - init_a))
3048 break;
3051 if (dump_enabled_p ())
3053 dump_printf_loc (MSG_NOTE, vect_location,
3054 "Detected interleaving ");
3055 if (DR_IS_READ (dra))
3056 dump_printf (MSG_NOTE, "load ");
3057 else
3058 dump_printf (MSG_NOTE, "store ");
3059 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
3060 dump_printf (MSG_NOTE, " and ");
3061 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
3062 dump_printf (MSG_NOTE, "\n");
3065 /* Link the found element into the group list. */
3066 if (!DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3068 DR_GROUP_FIRST_ELEMENT (stmtinfo_a) = stmtinfo_a;
3069 lastinfo = stmtinfo_a;
3071 DR_GROUP_FIRST_ELEMENT (stmtinfo_b) = stmtinfo_a;
3072 DR_GROUP_NEXT_ELEMENT (lastinfo) = stmtinfo_b;
3073 lastinfo = stmtinfo_b;
3077 FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
3079 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
3080 if (STMT_VINFO_VECTORIZABLE (dr_info->stmt)
3081 && !vect_analyze_data_ref_access (dr_info))
3083 if (dump_enabled_p ())
3084 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3085 "not vectorized: complicated access pattern.\n");
3087 if (is_a <bb_vec_info> (vinfo))
3089 /* Mark the statement as not vectorizable. */
3090 STMT_VINFO_VECTORIZABLE (dr_info->stmt) = false;
3091 continue;
3093 else
3095 datarefs_copy.release ();
3096 return false;
3101 datarefs_copy.release ();
3102 return true;
3105 /* Function vect_vfa_segment_size.
3107 Input:
3108 DR_INFO: The data reference.
3109 LENGTH_FACTOR: segment length to consider.
3111 Return a value suitable for the dr_with_seg_len::seg_len field.
3112 This is the "distance travelled" by the pointer from the first
3113 iteration in the segment to the last. Note that it does not include
3114 the size of the access; in effect it only describes the first byte. */
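/* For example (numbers illustrative only): with DR_STEP == 4 bytes and
   LENGTH_FACTOR equal to a vectorization factor of 8, the segment
   length is (8 - 1) * 4 == 28 bytes; the size of the access itself is
   accounted for separately by vect_vfa_access_size. */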
3116 static tree
3117 vect_vfa_segment_size (dr_vec_info *dr_info, tree length_factor)
3119 length_factor = size_binop (MINUS_EXPR,
3120 fold_convert (sizetype, length_factor),
3121 size_one_node);
3122 return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr_info->dr)),
3123 length_factor);
3126 /* Return a value that, when added to abs (vect_vfa_segment_size (DR_INFO)),
3127 gives the worst-case number of bytes covered by the segment. */
3129 static unsigned HOST_WIDE_INT
3130 vect_vfa_access_size (dr_vec_info *dr_info)
3132 stmt_vec_info stmt_vinfo = dr_info->stmt;
3133 tree ref_type = TREE_TYPE (DR_REF (dr_info->dr));
3134 unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type));
3135 unsigned HOST_WIDE_INT access_size = ref_size;
3136 if (DR_GROUP_FIRST_ELEMENT (stmt_vinfo))
3138 gcc_assert (DR_GROUP_FIRST_ELEMENT (stmt_vinfo) == stmt_vinfo);
3139 access_size *= DR_GROUP_SIZE (stmt_vinfo) - DR_GROUP_GAP (stmt_vinfo);
3141 if (STMT_VINFO_VEC_STMT (stmt_vinfo)
3142 && (vect_supportable_dr_alignment (dr_info, false)
3143 == dr_explicit_realign_optimized))
3145 /* We might access a full vector's worth. */
3146 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
3147 access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size;
3149 return access_size;
3152 /* Get the minimum alignment for all the scalar accesses that DR_INFO
3153 describes. */
3155 static unsigned int
3156 vect_vfa_align (dr_vec_info *dr_info)
3158 return TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr_info->dr)));
3161 /* Function vect_no_alias_p.
3163 Given data references A and B with equal base and offset, see whether
3164 the alias relation can be decided at compilation time. Return 1 if
3165 it can and the references alias, 0 if it can and the references do
3166 not alias, and -1 if we cannot decide at compile time. SEGMENT_LENGTH_A,
3167 SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
3168 of dr_with_seg_len::{seg_len,access_size} for A and B. */
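/* A worked example (assumed values): two 4-byte accesses at DR_INIT 0
   and 32, each with segment length 28 and access size 4, occupy the
   byte ranges [0, 32) and [32, 64); those can never overlap, so the
   function returns 0 and no runtime alias check is needed. */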
3170 static int
3171 vect_compile_time_alias (dr_vec_info *a, dr_vec_info *b,
3172 tree segment_length_a, tree segment_length_b,
3173 unsigned HOST_WIDE_INT access_size_a,
3174 unsigned HOST_WIDE_INT access_size_b)
3176 poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a->dr));
3177 poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b->dr));
3178 poly_uint64 const_length_a;
3179 poly_uint64 const_length_b;
3181 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3182 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3183 [a, a+12) */
3184 if (tree_int_cst_compare (DR_STEP (a->dr), size_zero_node) < 0)
3186 const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
3187 offset_a = (offset_a + access_size_a) - const_length_a;
3189 else
3190 const_length_a = tree_to_poly_uint64 (segment_length_a);
3191 if (tree_int_cst_compare (DR_STEP (b->dr), size_zero_node) < 0)
3193 const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
3194 offset_b = (offset_b + access_size_b) - const_length_b;
3196 else
3197 const_length_b = tree_to_poly_uint64 (segment_length_b);
3199 const_length_a += access_size_a;
3200 const_length_b += access_size_b;
3202 if (ranges_known_overlap_p (offset_a, const_length_a,
3203 offset_b, const_length_b))
3204 return 1;
3206 if (!ranges_maybe_overlap_p (offset_a, const_length_a,
3207 offset_b, const_length_b))
3208 return 0;
3210 return -1;
3213 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3214 in DDR is >= VF. */
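/* E.g. (illustrative): a forward dependence distance of 3 with VF == 4
   means two iterations of the same vector loop could touch the same
   element, so we return false and the runtime alias check is kept; a
   distance of 8 or more with VF == 4 lets the check be dropped. */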
3216 static bool
3217 dependence_distance_ge_vf (data_dependence_relation *ddr,
3218 unsigned int loop_depth, poly_uint64 vf)
3220 if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3221 || DDR_NUM_DIST_VECTS (ddr) == 0)
3222 return false;
3224 /* If the dependence is exact, we should have limited the VF instead. */
3225 gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3227 unsigned int i;
3228 lambda_vector dist_v;
3229 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3231 HOST_WIDE_INT dist = dist_v[loop_depth];
3232 if (dist != 0
3233 && !(dist > 0 && DDR_REVERSED_P (ddr))
3234 && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
3235 return false;
3238 if (dump_enabled_p ())
3240 dump_printf_loc (MSG_NOTE, vect_location,
3241 "dependence distance between ");
3242 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
3243 dump_printf (MSG_NOTE, " and ");
3244 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
3245 dump_printf (MSG_NOTE, " is >= VF\n");
3248 return true;
3251 /* Dump LOWER_BOUND using flags DUMP_KIND. Dumps are known to be enabled. */
3253 static void
3254 dump_lower_bound (dump_flags_t dump_kind, const vec_lower_bound &lower_bound)
3256 dump_printf (dump_kind, "%s (", lower_bound.unsigned_p ? "unsigned" : "abs");
3257 dump_generic_expr (dump_kind, TDF_SLIM, lower_bound.expr);
3258 dump_printf (dump_kind, ") >= ");
3259 dump_dec (dump_kind, lower_bound.min_value);
3262 /* Record that the vectorized loop requires the vec_lower_bound described
3263 by EXPR, UNSIGNED_P and MIN_VALUE. */
3265 static void
3266 vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p,
3267 poly_uint64 min_value)
3269 vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3270 for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3271 if (operand_equal_p (lower_bounds[i].expr, expr, 0))
3273 unsigned_p &= lower_bounds[i].unsigned_p;
3274 min_value = upper_bound (lower_bounds[i].min_value, min_value);
3275 if (lower_bounds[i].unsigned_p != unsigned_p
3276 || maybe_lt (lower_bounds[i].min_value, min_value))
3278 lower_bounds[i].unsigned_p = unsigned_p;
3279 lower_bounds[i].min_value = min_value;
3280 if (dump_enabled_p ())
3282 dump_printf_loc (MSG_NOTE, vect_location,
3283 "updating run-time check to ");
3284 dump_lower_bound (MSG_NOTE, lower_bounds[i]);
3285 dump_printf (MSG_NOTE, "\n");
3288 return;
3291 vec_lower_bound lower_bound (expr, unsigned_p, min_value);
3292 if (dump_enabled_p ())
3294 dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that ");
3295 dump_lower_bound (MSG_NOTE, lower_bound);
3296 dump_printf (MSG_NOTE, "\n");
3298 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (lower_bound);
3301 /* Return true if it's unlikely that the step of the vectorized form of DR_INFO
3302 will span fewer than GAP bytes. */
3304 static bool
3305 vect_small_gap_p (loop_vec_info loop_vinfo, dr_vec_info *dr_info,
3306 poly_int64 gap)
3308 stmt_vec_info stmt_info = dr_info->stmt;
3309 HOST_WIDE_INT count
3310 = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
3311 if (DR_GROUP_FIRST_ELEMENT (stmt_info))
3312 count *= DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info));
3313 return (estimated_poly_value (gap)
3314 <= count * vect_get_scalar_dr_size (dr_info));
3317 /* Return true if we know that there is no alias between DR_INFO_A and
3318 DR_INFO_B when abs (DR_STEP (DR_INFO_A->dr)) >= N for some N.
3319 When returning true, set *LOWER_BOUND_OUT to this N. */
3321 static bool
3322 vectorizable_with_step_bound_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b,
3323 poly_uint64 *lower_bound_out)
3325 /* Check that there is a constant gap of known sign between DR_A
3326 and DR_B. */
3327 data_reference *dr_a = dr_info_a->dr;
3328 data_reference *dr_b = dr_info_b->dr;
3329 poly_int64 init_a, init_b;
3330 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0)
3331 || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0)
3332 || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)
3333 || !poly_int_tree_p (DR_INIT (dr_a), &init_a)
3334 || !poly_int_tree_p (DR_INIT (dr_b), &init_b)
3335 || !ordered_p (init_a, init_b))
3336 return false;
3338 /* Sort DR_A and DR_B by the address they access. */
3339 if (maybe_lt (init_b, init_a))
3341 std::swap (init_a, init_b);
3342 std::swap (dr_info_a, dr_info_b);
3343 std::swap (dr_a, dr_b);
3346 /* If the two accesses could be dependent within a scalar iteration,
3347 make sure that we'd retain their order. */
3348 if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_info_a), init_b)
3349 && !vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
3350 return false;
3352 /* There is no alias if abs (DR_STEP) is greater than or equal to
3353 the bytes spanned by the combination of the two accesses. */
3354 *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_info_b) - init_a;
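/* Sketch (illustrative values): accesses at init_a == 0 and init_b == 4,
   each 4 bytes wide, together span bytes [0, 8) in one scalar iteration,
   so the two references cannot alias once abs (DR_STEP) >= 8. */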
3355 return true;
3358 /* Function vect_prune_runtime_alias_test_list.
3360 Prune a list of ddrs to be tested at run-time by versioning for alias.
3361 Merge several alias checks into one if possible.
3362 Return FALSE if the resulting list of ddrs is longer than allowed by
3363 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
3365 bool
3366 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3368 typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3369 hash_set <tree_pair_hash> compared_objects;
3371 vec<ddr_p> may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3372 vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3373 = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3374 vec<vec_object_pair> &check_unequal_addrs
3375 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3376 poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3377 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3379 ddr_p ddr;
3380 unsigned int i;
3381 tree length_factor;
3383 DUMP_VECT_SCOPE ("vect_prune_runtime_alias_test_list");
3385 /* Step values are irrelevant for aliasing if the number of vector
3386 iterations is equal to the number of scalar iterations (which can
3387 happen for fully-SLP loops). */
3388 bool ignore_step_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U);
3390 if (!ignore_step_p)
3392 /* Convert the checks for nonzero steps into bound tests. */
3393 tree value;
3394 FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value)
3395 vect_check_lower_bound (loop_vinfo, value, true, 1);
3398 if (may_alias_ddrs.is_empty ())
3399 return true;
3401 comp_alias_ddrs.create (may_alias_ddrs.length ());
3403 unsigned int loop_depth
3404 = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3405 LOOP_VINFO_LOOP_NEST (loop_vinfo));
3407 /* First, we collect all data ref pairs for aliasing checks. */
3408 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3410 int comp_res;
3411 poly_uint64 lower_bound;
3412 tree segment_length_a, segment_length_b;
3413 unsigned HOST_WIDE_INT access_size_a, access_size_b;
3414 unsigned int align_a, align_b;
3416 /* Ignore the alias if the VF we chose ended up being no greater
3417 than the dependence distance. */
3418 if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3419 continue;
3421 if (DDR_OBJECT_A (ddr))
3423 vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3424 if (!compared_objects.add (new_pair))
3426 if (dump_enabled_p ())
3428 dump_printf_loc (MSG_NOTE, vect_location, "checking that ");
3429 dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.first);
3430 dump_printf (MSG_NOTE, " and ");
3431 dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.second);
3432 dump_printf (MSG_NOTE, " have different addresses\n");
3434 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3436 continue;
3439 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
3440 stmt_vec_info stmt_info_a = dr_info_a->stmt;
3442 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
3443 stmt_vec_info stmt_info_b = dr_info_b->stmt;
3445 /* Skip the pair if inter-iteration dependencies are irrelevant
3446 and intra-iteration dependencies are guaranteed to be honored. */
3447 if (ignore_step_p
3448 && (vect_preserves_scalar_order_p (dr_info_a, dr_info_b)
3449 || vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3450 &lower_bound)))
3452 if (dump_enabled_p ())
3454 dump_printf_loc (MSG_NOTE, vect_location,
3455 "no need for alias check between ");
3456 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_info_a->dr));
3457 dump_printf (MSG_NOTE, " and ");
3458 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_info_b->dr));
3459 dump_printf (MSG_NOTE, " when VF is 1\n");
3461 continue;
3464 /* See whether we can handle the alias using a bounds check on
3465 the step, and whether that's likely to be the best approach.
3466 (It might not be, for example, if the minimum step is much larger
3467 than the number of bytes handled by one vector iteration.) */
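      /* Hypothetical example: for int accesses p[i*n] and p[i*n + 1] with a
	 runtime step N, the pair spans 8 bytes, so there is no cross-iteration
	 alias as long as |N * 4| >= 8, i.e. as long as N lies outside (-2, 2);
	 vect_check_lower_bound below records exactly that kind of bound.  */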
3468 if (!ignore_step_p
3469 && TREE_CODE (DR_STEP (dr_info_a->dr)) != INTEGER_CST
3470 && vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3471 &lower_bound)
3472 && (vect_small_gap_p (loop_vinfo, dr_info_a, lower_bound)
3473 || vect_small_gap_p (loop_vinfo, dr_info_b, lower_bound)))
3475 bool unsigned_p = dr_known_forward_stride_p (dr_info_a->dr);
3476 if (dump_enabled_p ())
3478 dump_printf_loc (MSG_NOTE, vect_location, "no alias between ");
3479 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_info_a->dr));
3480 dump_printf (MSG_NOTE, " and ");
3481 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_info_b->dr));
3482 dump_printf (MSG_NOTE, " when the step ");
3483 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_STEP (dr_info_a->dr));
3484 dump_printf (MSG_NOTE, " is outside ");
3485 if (unsigned_p)
3486 dump_printf (MSG_NOTE, "[0");
3487 else
3489 dump_printf (MSG_NOTE, "(");
3490 dump_dec (MSG_NOTE, poly_int64 (-lower_bound));
3492 dump_printf (MSG_NOTE, ", ");
3493 dump_dec (MSG_NOTE, lower_bound);
3494 dump_printf (MSG_NOTE, ")\n");
3496 vect_check_lower_bound (loop_vinfo, DR_STEP (dr_info_a->dr),
3497 unsigned_p, lower_bound);
3498 continue;
3501 stmt_vec_info dr_group_first_a = DR_GROUP_FIRST_ELEMENT (stmt_info_a);
3502 if (dr_group_first_a)
3504 stmt_info_a = dr_group_first_a;
3505 dr_info_a = STMT_VINFO_DR_INFO (stmt_info_a);
3508 stmt_vec_info dr_group_first_b = DR_GROUP_FIRST_ELEMENT (stmt_info_b);
3509 if (dr_group_first_b)
3511 stmt_info_b = dr_group_first_b;
3512 dr_info_b = STMT_VINFO_DR_INFO (stmt_info_b);
3515 if (ignore_step_p)
3517 segment_length_a = size_zero_node;
3518 segment_length_b = size_zero_node;
3520 else
3522 if (!operand_equal_p (DR_STEP (dr_info_a->dr),
3523 DR_STEP (dr_info_b->dr), 0))
3524 length_factor = scalar_loop_iters;
3525 else
3526 length_factor = size_int (vect_factor);
3527 segment_length_a = vect_vfa_segment_size (dr_info_a, length_factor);
3528 segment_length_b = vect_vfa_segment_size (dr_info_b, length_factor);
3530 access_size_a = vect_vfa_access_size (dr_info_a);
3531 access_size_b = vect_vfa_access_size (dr_info_b);
3532 align_a = vect_vfa_align (dr_info_a);
3533 align_b = vect_vfa_align (dr_info_b);
3535 comp_res = data_ref_compare_tree (DR_BASE_ADDRESS (dr_info_a->dr),
3536 DR_BASE_ADDRESS (dr_info_b->dr));
3537 if (comp_res == 0)
3538 comp_res = data_ref_compare_tree (DR_OFFSET (dr_info_a->dr),
3539 DR_OFFSET (dr_info_b->dr));
3541 /* See whether the alias is known at compilation time. */
3542 if (comp_res == 0
3543 && TREE_CODE (DR_STEP (dr_info_a->dr)) == INTEGER_CST
3544 && TREE_CODE (DR_STEP (dr_info_b->dr)) == INTEGER_CST
3545 && poly_int_tree_p (segment_length_a)
3546 && poly_int_tree_p (segment_length_b))
3548 int res = vect_compile_time_alias (dr_info_a, dr_info_b,
3549 segment_length_a,
3550 segment_length_b,
3551 access_size_a,
3552 access_size_b);
3553 if (res >= 0 && dump_enabled_p ())
3555 dump_printf_loc (MSG_NOTE, vect_location,
3556 "can tell at compile time that ");
3557 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_info_a->dr));
3558 dump_printf (MSG_NOTE, " and ");
3559 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_info_b->dr));
3560 if (res == 0)
3561 dump_printf (MSG_NOTE, " do not alias\n");
3562 else
3563 dump_printf (MSG_NOTE, " alias\n");
3566 if (res == 0)
3567 continue;
3569 if (res == 1)
3571 if (dump_enabled_p ())
3572 dump_printf_loc (MSG_NOTE, vect_location,
3573 "not vectorized: compilation time alias.\n");
3574 return false;
3578 dr_with_seg_len_pair_t dr_with_seg_len_pair
3579 (dr_with_seg_len (dr_info_a->dr, segment_length_a,
3580 access_size_a, align_a),
3581 dr_with_seg_len (dr_info_b->dr, segment_length_b,
3582 access_size_b, align_b));
3584 /* Canonicalize pairs by sorting the two DR members. */
3585 if (comp_res > 0)
3586 std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
3588 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3591 prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);
3593 unsigned int count = (comp_alias_ddrs.length ()
3594 + check_unequal_addrs.length ());
3596 dump_printf_loc (MSG_NOTE, vect_location,
3597 "improved number of alias checks from %d to %d\n",
3598 may_alias_ddrs.length (), count);
3599 if ((int) count > PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
3601 if (dump_enabled_p ())
3602 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3603 "number of versioning for alias "
3604 "run-time tests exceeds %d "
3605 "(--param vect-max-version-for-alias-checks)\n",
3606 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
3607 return false;
3610 return true;
3613 /* Check whether we can use an internal function for a gather load
3614 or scatter store. READ_P is true for loads and false for stores.
3615 MASKED_P is true if the load or store is conditional. MEMORY_TYPE is
3616 the type of the memory elements being loaded or stored. OFFSET_BITS
3617 is the number of bits in each scalar offset and OFFSET_SIGN is the
3618 sign of the offset. SCALE is the amount by which the offset should
3619 be multiplied *after* it has been converted to address width.
3621 Return true if the function is supported, storing the function
3622 id in *IFN_OUT and the type of a vector element in *ELEMENT_TYPE_OUT. */
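/* For example (a sketch, not tied to any particular target): a masked gather
   of 32-bit ints using 32-bit signed offsets and scale 4 asks whether
   IFN_MASK_GATHER_LOAD is supported for that vector type, memory type,
   SIGNED offset sign and scale; an unmasked scatter store would query
   IFN_SCATTER_STORE instead.  */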
3624 bool
3625 vect_gather_scatter_fn_p (bool read_p, bool masked_p, tree vectype,
3626 tree memory_type, unsigned int offset_bits,
3627 signop offset_sign, int scale,
3628 internal_fn *ifn_out, tree *element_type_out)
3630 unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
3631 unsigned int element_bits = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (vectype)));
3632 if (offset_bits > element_bits)
3633 /* Internal functions require the offset to be the same width as
3634 the vector elements. We can extend narrower offsets, but it isn't
3635 safe to truncate wider offsets. */
3636 return false;
3638 if (element_bits != memory_bits)
3639 /* For now the vector elements must be the same width as the
3640 memory elements. */
3641 return false;
3643 /* Work out which function we need. */
3644 internal_fn ifn;
3645 if (read_p)
3646 ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
3647 else
3648 ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
3650 /* Test whether the target supports this combination. */
3651 if (!internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
3652 offset_sign, scale))
3653 return false;
3655 *ifn_out = ifn;
3656 *element_type_out = TREE_TYPE (vectype);
3657 return true;
3660 /* STMT_INFO is a call to an internal gather load or scatter store function.
3661 Describe the operation in INFO. */
3663 static void
3664 vect_describe_gather_scatter_call (stmt_vec_info stmt_info,
3665 gather_scatter_info *info)
3667 gcall *call = as_a <gcall *> (stmt_info->stmt);
3668 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3669 data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3671 info->ifn = gimple_call_internal_fn (call);
3672 info->decl = NULL_TREE;
3673 info->base = gimple_call_arg (call, 0);
3674 info->offset = gimple_call_arg (call, 1);
3675 info->offset_dt = vect_unknown_def_type;
3676 info->offset_vectype = NULL_TREE;
3677 info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
3678 info->element_type = TREE_TYPE (vectype);
3679 info->memory_type = TREE_TYPE (DR_REF (dr));
3682 /* Return true if a non-affine read or write in STMT_INFO is suitable for a
3683 gather load or scatter store. Describe the operation in *INFO if so. */
3685 bool
3686 vect_check_gather_scatter (stmt_vec_info stmt_info, loop_vec_info loop_vinfo,
3687 gather_scatter_info *info)
3689 HOST_WIDE_INT scale = 1;
3690 poly_int64 pbitpos, pbitsize;
3691 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3692 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3693 tree offtype = NULL_TREE;
3694 tree decl = NULL_TREE, base, off;
3695 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3696 tree memory_type = TREE_TYPE (DR_REF (dr));
3697 machine_mode pmode;
3698 int punsignedp, reversep, pvolatilep = 0;
3699 internal_fn ifn;
3700 tree element_type;
3701 bool masked_p = false;
3703 /* See whether this is already a call to a gather/scatter internal function.
3704 If not, see whether it's a masked load or store. */
3705 gcall *call = dyn_cast <gcall *> (stmt_info->stmt);
3706 if (call && gimple_call_internal_p (call))
3708 ifn = gimple_call_internal_fn (call);
3709 if (internal_gather_scatter_fn_p (ifn))
3711 vect_describe_gather_scatter_call (stmt_info, info);
3712 return true;
3714 masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE);
3717 /* True if we should aim to use internal functions rather than
3718 built-in functions. */
3719 bool use_ifn_p = (DR_IS_READ (dr)
3720 ? supports_vec_gather_load_p ()
3721 : supports_vec_scatter_store_p ());
3723 base = DR_REF (dr);
3724 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3725 see if we can use the def stmt of the address. */
3726 if (masked_p
3727 && TREE_CODE (base) == MEM_REF
3728 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3729 && integer_zerop (TREE_OPERAND (base, 1))
3730 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3732 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3733 if (is_gimple_assign (def_stmt)
3734 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3735 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3738 /* The gather and scatter builtins need address of the form
3739 loop_invariant + vector * {1, 2, 4, 8}, or
3741 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3742 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3743 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3744 multiplications and additions in it. To get a vector, we need
3745 a single SSA_NAME that will be defined in the loop and will
3746 contain everything that is not loop invariant and that can be
3747 vectorized. The following code attempts to find such a preexisting
3748 SSA_NAME OFF and put the loop invariants into a tree BASE
3749 that can be gimplified before the loop. */
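/* As a made-up illustration, in a loop like

     for (i = 0; i < n; i++)
       sum += a[idx[i]];

   the address of a[idx[i]] splits into the loop-invariant base &a[0] plus
   sign_extend (idx[i]) * sizeof (*a); OFF would end up holding the SSA_NAME
   of idx[i] and the multiplier becomes the gather scale.  */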
3750 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3751 &punsignedp, &reversep, &pvolatilep);
3752 if (reversep)
3753 return false;
3755 poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
3757 if (TREE_CODE (base) == MEM_REF)
3759 if (!integer_zerop (TREE_OPERAND (base, 1)))
3761 if (off == NULL_TREE)
3762 off = wide_int_to_tree (sizetype, mem_ref_offset (base));
3763 else
3764 off = size_binop (PLUS_EXPR, off,
3765 fold_convert (sizetype, TREE_OPERAND (base, 1)));
3767 base = TREE_OPERAND (base, 0);
3769 else
3770 base = build_fold_addr_expr (base);
3772 if (off == NULL_TREE)
3773 off = size_zero_node;
3775 /* If base is not loop invariant, then either OFF is 0, in which case we
3776 start with just the constant offset in the loop invariant BASE and
3777 continue with base as OFF, or we give up.
3778 We could handle the latter case by gimplifying the addition of base + off
3779 into some SSA_NAME and using that as off, but for now punt. */
3780 if (!expr_invariant_in_loop_p (loop, base))
3782 if (!integer_zerop (off))
3783 return false;
3784 off = base;
3785 base = size_int (pbytepos);
3787 /* Otherwise put base + constant offset into the loop invariant BASE
3788 and continue with OFF. */
3789 else
3791 base = fold_convert (sizetype, base);
3792 base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
3795 /* OFF at this point may be either a SSA_NAME or some tree expression
3796 from get_inner_reference. Try to peel off loop invariants from it
3797 into BASE as long as possible. */
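/* E.g. if OFF were (c + i) * 4 with C loop-invariant (a made-up example),
   the MULT_EXPR case below would record scale 4 and the PLUS_EXPR case
   would fold c * 4 into BASE, leaving only the loop-varying SSA_NAME I
   in OFF.  */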
3798 STRIP_NOPS (off);
3799 while (offtype == NULL_TREE)
3801 enum tree_code code;
3802 tree op0, op1, add = NULL_TREE;
3804 if (TREE_CODE (off) == SSA_NAME)
3806 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3808 if (expr_invariant_in_loop_p (loop, off))
3809 return false;
3811 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3812 break;
3814 op0 = gimple_assign_rhs1 (def_stmt);
3815 code = gimple_assign_rhs_code (def_stmt);
3816 op1 = gimple_assign_rhs2 (def_stmt);
3818 else
3820 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3821 return false;
3822 code = TREE_CODE (off);
3823 extract_ops_from_tree (off, &code, &op0, &op1);
3825 switch (code)
3827 case POINTER_PLUS_EXPR:
3828 case PLUS_EXPR:
3829 if (expr_invariant_in_loop_p (loop, op0))
3831 add = op0;
3832 off = op1;
3833 do_add:
3834 add = fold_convert (sizetype, add);
3835 if (scale != 1)
3836 add = size_binop (MULT_EXPR, add, size_int (scale));
3837 base = size_binop (PLUS_EXPR, base, add);
3838 continue;
3840 if (expr_invariant_in_loop_p (loop, op1))
3842 add = op1;
3843 off = op0;
3844 goto do_add;
3846 break;
3847 case MINUS_EXPR:
3848 if (expr_invariant_in_loop_p (loop, op1))
3850 add = fold_convert (sizetype, op1);
3851 add = size_binop (MINUS_EXPR, size_zero_node, add);
3852 off = op0;
3853 goto do_add;
3855 break;
3856 case MULT_EXPR:
3857 if (scale == 1 && tree_fits_shwi_p (op1))
3859 int new_scale = tree_to_shwi (op1);
3860 /* Only treat this as a scaling operation if the target
3861 supports it. */
3862 if (use_ifn_p
3863 && !vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p,
3864 vectype, memory_type, 1,
3865 TYPE_SIGN (TREE_TYPE (op0)),
3866 new_scale, &ifn,
3867 &element_type))
3868 break;
3869 scale = new_scale;
3870 off = op0;
3871 continue;
3873 break;
3874 case SSA_NAME:
3875 off = op0;
3876 continue;
3877 CASE_CONVERT:
3878 if (!POINTER_TYPE_P (TREE_TYPE (op0))
3879 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3880 break;
3881 if (TYPE_PRECISION (TREE_TYPE (op0))
3882 == TYPE_PRECISION (TREE_TYPE (off)))
3884 off = op0;
3885 continue;
3888 /* The internal functions need the offset to be the same width
3889 as the elements of VECTYPE. Don't include operations that
3890 cast the offset from that width to a different width. */
3891 if (use_ifn_p
3892 && (int_size_in_bytes (TREE_TYPE (vectype))
3893 == int_size_in_bytes (TREE_TYPE (off))))
3894 break;
3896 if (TYPE_PRECISION (TREE_TYPE (op0))
3897 < TYPE_PRECISION (TREE_TYPE (off)))
3899 off = op0;
3900 offtype = TREE_TYPE (off);
3901 STRIP_NOPS (off);
3902 continue;
3904 break;
3905 default:
3906 break;
3908 break;
3911 /* If at the end OFF still isn't an SSA_NAME or isn't
3912 defined in the loop, punt. */
3913 if (TREE_CODE (off) != SSA_NAME
3914 || expr_invariant_in_loop_p (loop, off))
3915 return false;
3917 if (offtype == NULL_TREE)
3918 offtype = TREE_TYPE (off);
3920 if (use_ifn_p)
3922 if (!vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p, vectype,
3923 memory_type, TYPE_PRECISION (offtype),
3924 TYPE_SIGN (offtype), scale, &ifn,
3925 &element_type))
3926 return false;
3928 else
3930 if (DR_IS_READ (dr))
3932 if (targetm.vectorize.builtin_gather)
3933 decl = targetm.vectorize.builtin_gather (vectype, offtype, scale);
3935 else
3937 if (targetm.vectorize.builtin_scatter)
3938 decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale);
3941 if (!decl)
3942 return false;
3944 ifn = IFN_LAST;
3945 element_type = TREE_TYPE (vectype);
3948 info->ifn = ifn;
3949 info->decl = decl;
3950 info->base = base;
3951 info->offset = off;
3952 info->offset_dt = vect_unknown_def_type;
3953 info->offset_vectype = NULL_TREE;
3954 info->scale = scale;
3955 info->element_type = element_type;
3956 info->memory_type = memory_type;
3957 return true;
3960 /* Find the data references in STMT, analyze them with respect to LOOP and
3961 append them to DATAREFS. Return false if datarefs in this stmt cannot
3962 be handled. */
3964 bool
3965 vect_find_stmt_data_reference (loop_p loop, gimple *stmt,
3966 vec<data_reference_p> *datarefs)
3968 /* We can ignore clobbers for dataref analysis - they are removed during
3969 loop vectorization and BB vectorization checks dependences with a
3970 stmt walk. */
3971 if (gimple_clobber_p (stmt))
3972 return true;
3974 if (gimple_has_volatile_ops (stmt))
3976 if (dump_enabled_p ())
3978 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3979 "not vectorized: volatile type ");
3980 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3982 return false;
3985 if (stmt_can_throw_internal (stmt))
3987 if (dump_enabled_p ())
3989 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3990 "not vectorized: statement can throw an "
3991 "exception ");
3992 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3994 return false;
3997 auto_vec<data_reference_p, 2> refs;
3998 if (!find_data_references_in_stmt (loop, stmt, &refs))
3999 return false;
4001 if (refs.is_empty ())
4002 return true;
4004 if (refs.length () > 1)
4006 if (dump_enabled_p ())
4008 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4009 "not vectorized: more than one data ref "
4010 "in stmt: ");
4011 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4013 return false;
4016 if (gcall *call = dyn_cast <gcall *> (stmt))
4017 if (!gimple_call_internal_p (call)
4018 || (gimple_call_internal_fn (call) != IFN_MASK_LOAD
4019 && gimple_call_internal_fn (call) != IFN_MASK_STORE))
4021 if (dump_enabled_p ())
4023 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4024 "not vectorized: dr in a call ");
4025 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4027 return false;
4030 data_reference_p dr = refs.pop ();
4031 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
4032 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
4034 if (dump_enabled_p ())
4036 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4037 "not vectorized: statement is bitfield "
4038 "access ");
4039 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4041 return false;
4044 if (DR_BASE_ADDRESS (dr)
4045 && TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
4047 if (dump_enabled_p ())
4048 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4049 "not vectorized: base addr of dr is a "
4050 "constant\n");
4051 return false;
4054 /* Check whether this may be a SIMD lane access and adjust the
4055 DR to make it easier for us to handle it. */
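      /* Illustrative shape of such an access: OpenMP lowering of a privatized
	 variable in a "#pragma omp simd" loop yields a reference whose offset
	 is _lane * elt_size, where _lane is defined by a call to
	 .GOMP_SIMD_LANE (loop->simduid); the code below recognizes that
	 pattern and rewrites the DR to offset 0 with elt_size as the step.  */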
4056 if (loop
4057 && loop->simduid
4058 && (!DR_BASE_ADDRESS (dr)
4059 || !DR_OFFSET (dr)
4060 || !DR_INIT (dr)
4061 || !DR_STEP (dr)))
4063 struct data_reference *newdr
4064 = create_data_ref (NULL, loop_containing_stmt (stmt), DR_REF (dr), stmt,
4065 DR_IS_READ (dr), DR_IS_CONDITIONAL_IN_STMT (dr));
4066 if (DR_BASE_ADDRESS (newdr)
4067 && DR_OFFSET (newdr)
4068 && DR_INIT (newdr)
4069 && DR_STEP (newdr)
4070 && integer_zerop (DR_STEP (newdr)))
4072 tree off = DR_OFFSET (newdr);
4073 STRIP_NOPS (off);
4074 if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
4075 && TREE_CODE (off) == MULT_EXPR
4076 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
4078 tree step = TREE_OPERAND (off, 1);
4079 off = TREE_OPERAND (off, 0);
4080 STRIP_NOPS (off);
4081 if (CONVERT_EXPR_P (off)
4082 && (TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off, 0)))
4083 < TYPE_PRECISION (TREE_TYPE (off))))
4084 off = TREE_OPERAND (off, 0);
4085 if (TREE_CODE (off) == SSA_NAME)
4087 gimple *def = SSA_NAME_DEF_STMT (off);
4088 tree reft = TREE_TYPE (DR_REF (newdr));
4089 if (is_gimple_call (def)
4090 && gimple_call_internal_p (def)
4091 && (gimple_call_internal_fn (def) == IFN_GOMP_SIMD_LANE))
4093 tree arg = gimple_call_arg (def, 0);
4094 gcc_assert (TREE_CODE (arg) == SSA_NAME);
4095 arg = SSA_NAME_VAR (arg);
4096 if (arg == loop->simduid
4097 /* For now. */
4098 && tree_int_cst_equal (TYPE_SIZE_UNIT (reft), step))
4100 DR_OFFSET (newdr) = ssize_int (0);
4101 DR_STEP (newdr) = step;
4102 DR_OFFSET_ALIGNMENT (newdr) = BIGGEST_ALIGNMENT;
4103 DR_STEP_ALIGNMENT (newdr)
4104 = highest_pow2_factor (step);
4105 /* Mark as simd-lane access. */
4106 newdr->aux = (void *)-1;
4107 free_data_ref (dr);
4108 datarefs->safe_push (newdr);
4109 return true;
4115 free_data_ref (newdr);
4118 datarefs->safe_push (dr);
4119 return true;
4122 /* Function vect_analyze_data_refs.
4124 Find all the data references in the loop or basic block.
4126 The general structure of the analysis of data refs in the vectorizer is as
4127 follows:
4128 1- vect_analyze_data_refs(loop/bb): call
4129 compute_data_dependences_for_loop/bb to find and analyze all data-refs
4130 in the loop/bb and their dependences.
4131 2- vect_analyze_dependences(): apply dependence testing using ddrs.
4132 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
4133 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
4137 bool
4138 vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf)
4140 struct loop *loop = NULL;
4141 unsigned int i;
4142 struct data_reference *dr;
4143 tree scalar_type;
4145 DUMP_VECT_SCOPE ("vect_analyze_data_refs");
4147 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4148 loop = LOOP_VINFO_LOOP (loop_vinfo);
4150 /* Go through the data-refs, check that the analysis succeeded. Update
4151 pointer from stmt_vec_info struct to DR and vectype. */
4153 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
4154 FOR_EACH_VEC_ELT (datarefs, i, dr)
4156 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
4157 poly_uint64 vf;
4159 gcc_assert (DR_REF (dr));
4160 stmt_vec_info stmt_info = vinfo->lookup_stmt (DR_STMT (dr));
4161 gcc_assert (!stmt_info->dr_aux.dr);
4162 stmt_info->dr_aux.dr = dr;
4163 stmt_info->dr_aux.stmt = stmt_info;
4165 /* Check that analysis of the data-ref succeeded. */
4166 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
4167 || !DR_STEP (dr))
4169 bool maybe_gather
4170 = DR_IS_READ (dr)
4171 && !TREE_THIS_VOLATILE (DR_REF (dr))
4172 && (targetm.vectorize.builtin_gather != NULL
4173 || supports_vec_gather_load_p ());
4174 bool maybe_scatter
4175 = DR_IS_WRITE (dr)
4176 && !TREE_THIS_VOLATILE (DR_REF (dr))
4177 && (targetm.vectorize.builtin_scatter != NULL
4178 || supports_vec_scatter_store_p ());
4180 /* If the target supports vector gather loads or scatter stores,
4181 see whether they can be used here. */
4182 if (is_a <loop_vec_info> (vinfo)
4183 && !nested_in_vect_loop_p (loop, stmt_info))
4185 if (maybe_gather || maybe_scatter)
4187 if (maybe_gather)
4188 gatherscatter = GATHER;
4189 else
4190 gatherscatter = SCATTER;
4194 if (gatherscatter == SG_NONE)
4196 if (dump_enabled_p ())
4198 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4199 "not vectorized: data ref analysis "
4200 "failed ");
4201 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
4202 stmt_info->stmt, 0);
4204 if (is_a <bb_vec_info> (vinfo))
4206 /* In BB vectorization the ref can still participate
4207 in dependence analysis, we just can't vectorize it. */
4208 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4209 continue;
4211 return false;
4215 /* See if this was detected as SIMD lane access. */
4216 if (dr->aux == (void *)-1)
4218 if (nested_in_vect_loop_p (loop, stmt_info))
4220 if (dump_enabled_p ())
4222 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4223 "not vectorized: data ref analysis "
4224 "failed ");
4225 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
4226 stmt_info->stmt, 0);
4228 return false;
4230 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
4233 tree base = get_base_address (DR_REF (dr));
4234 if (base && VAR_P (base) && DECL_NONALIASED (base))
4236 if (dump_enabled_p ())
4238 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4239 "not vectorized: base object not addressable "
4240 "for stmt: ");
4241 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
4242 stmt_info->stmt, 0);
4244 if (is_a <bb_vec_info> (vinfo))
4246 /* In BB vectorization the ref can still participate
4247 in dependence analysis, we just can't vectorize it. */
4248 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4249 continue;
4251 return false;
4254 if (is_a <loop_vec_info> (vinfo)
4255 && DR_STEP (dr)
4256 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
4258 if (nested_in_vect_loop_p (loop, stmt_info))
4260 if (dump_enabled_p ())
4262 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4263 "not vectorized: not suitable for strided "
4264 "load ");
4265 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
4266 stmt_info->stmt, 0);
4268 return false;
4270 STMT_VINFO_STRIDED_P (stmt_info) = true;
4273 /* Update DR field in stmt_vec_info struct. */
4275 /* If the dataref is in an inner-loop of the loop that is considered
4276 for vectorization, we also want to analyze the access relative to
4277 the outer-loop (DR contains information only relative to the
4278 inner-most enclosing loop). We do that by building a reference to the
4279 first location accessed by the inner-loop, and analyze it relative to
4280 the outer-loop. */
4281 if (loop && nested_in_vect_loop_p (loop, stmt_info))
4283 /* Build a reference to the first location accessed by the
4284 inner loop: *(BASE + INIT + OFFSET). By construction,
4285 this address must be invariant in the inner loop, so we
4286 can consider it as being used in the outer loop. */
4287 tree base = unshare_expr (DR_BASE_ADDRESS (dr));
4288 tree offset = unshare_expr (DR_OFFSET (dr));
4289 tree init = unshare_expr (DR_INIT (dr));
4290 tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
4291 init, offset);
4292 tree init_addr = fold_build_pointer_plus (base, init_offset);
4293 tree init_ref = build_fold_indirect_ref (init_addr);
4295 if (dump_enabled_p ())
4297 dump_printf_loc (MSG_NOTE, vect_location,
4298 "analyze in outer loop: ");
4299 dump_generic_expr (MSG_NOTE, TDF_SLIM, init_ref);
4300 dump_printf (MSG_NOTE, "\n");
4303 if (!dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
4304 init_ref, loop))
4305 /* dr_analyze_innermost already explained the failure. */
4306 return false;
4308 if (dump_enabled_p ())
4310 dump_printf_loc (MSG_NOTE, vect_location,
4311 "\touter base_address: ");
4312 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4313 STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
4314 dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
4315 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4316 STMT_VINFO_DR_OFFSET (stmt_info));
4317 dump_printf (MSG_NOTE,
4318 "\n\touter constant offset from base address: ");
4319 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4320 STMT_VINFO_DR_INIT (stmt_info));
4321 dump_printf (MSG_NOTE, "\n\touter step: ");
4322 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4323 STMT_VINFO_DR_STEP (stmt_info));
4324 dump_printf (MSG_NOTE, "\n\touter base alignment: %d\n",
4325 STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info));
4326 dump_printf (MSG_NOTE, "\n\touter base misalignment: %d\n",
4327 STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info));
4328 dump_printf (MSG_NOTE, "\n\touter offset alignment: %d\n",
4329 STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info));
4330 dump_printf (MSG_NOTE, "\n\touter step alignment: %d\n",
4331 STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
4335 /* Set vectype for STMT. */
4336 scalar_type = TREE_TYPE (DR_REF (dr));
4337 STMT_VINFO_VECTYPE (stmt_info)
4338 = get_vectype_for_scalar_type (scalar_type);
4339 if (!STMT_VINFO_VECTYPE (stmt_info))
4341 if (dump_enabled_p ())
4343 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4344 "not vectorized: no vectype for stmt: ");
4345 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
4346 stmt_info->stmt, 0);
4347 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
4348 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
4349 scalar_type);
4350 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
4353 if (is_a <bb_vec_info> (vinfo))
4355 /* No vector type is fine, the ref can still participate
4356 in dependence analysis, we just can't vectorize it. */
4357 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4358 continue;
4360 return false;
4362 else
4364 if (dump_enabled_p ())
4366 dump_printf_loc (MSG_NOTE, vect_location,
4367 "got vectype for stmt: ");
4368 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt_info->stmt, 0);
4369 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4370 STMT_VINFO_VECTYPE (stmt_info));
4371 dump_printf (MSG_NOTE, "\n");
4375 /* Adjust the minimal vectorization factor according to the
4376 vector type. */
4377 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
4378 *min_vf = upper_bound (*min_vf, vf);
4380 if (gatherscatter != SG_NONE)
4382 gather_scatter_info gs_info;
4383 if (!vect_check_gather_scatter (stmt_info,
4384 as_a <loop_vec_info> (vinfo),
4385 &gs_info)
4386 || !get_vectype_for_scalar_type (TREE_TYPE (gs_info.offset)))
4388 if (dump_enabled_p ())
4390 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4391 (gatherscatter == GATHER) ?
4392 "not vectorized: not suitable for gather "
4393 "load " :
4394 "not vectorized: not suitable for scatter "
4395 "store ");
4396 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
4397 stmt_info->stmt, 0);
4399 return false;
4401 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
4405 /* We used to stop processing and prune the list here. Verify we no
4406 longer need to. */
4407 gcc_assert (i == datarefs.length ());
4409 return true;
4413 /* Function vect_get_new_vect_var.
4415 Returns a name for a new variable. The current naming scheme prepends a
4416 prefix such as "vect_", "vectp_", "stmp_" or "mask_" (depending on the
4417 value of VAR_KIND) to NAME if NAME is provided, and otherwise uses the
4418 prefix alone. */
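/* For instance, vect_get_new_vect_var (ptr_type, vect_pointer_var, "a")
   creates a temporary prefixed "vectp_a", while passing a NULL name yields
   just "vectp" (and likewise "vect", "stmp" and "mask" for the other
   variable kinds).  */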
4420 tree
4421 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
4423 const char *prefix;
4424 tree new_vect_var;
4426 switch (var_kind)
4428 case vect_simple_var:
4429 prefix = "vect";
4430 break;
4431 case vect_scalar_var:
4432 prefix = "stmp";
4433 break;
4434 case vect_mask_var:
4435 prefix = "mask";
4436 break;
4437 case vect_pointer_var:
4438 prefix = "vectp";
4439 break;
4440 default:
4441 gcc_unreachable ();
4444 if (name)
4446 char* tmp = concat (prefix, "_", name, NULL);
4447 new_vect_var = create_tmp_reg (type, tmp);
4448 free (tmp);
4450 else
4451 new_vect_var = create_tmp_reg (type, prefix);
4453 return new_vect_var;
4456 /* Like vect_get_new_vect_var but return an SSA name. */
4458 tree
4459 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
4461 const char *prefix;
4462 tree new_vect_var;
4464 switch (var_kind)
4466 case vect_simple_var:
4467 prefix = "vect";
4468 break;
4469 case vect_scalar_var:
4470 prefix = "stmp";
4471 break;
4472 case vect_pointer_var:
4473 prefix = "vectp";
4474 break;
4475 default:
4476 gcc_unreachable ();
4479 if (name)
4481 char* tmp = concat (prefix, "_", name, NULL);
4482 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4483 free (tmp);
4485 else
4486 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4488 return new_vect_var;
4491 /* Duplicate ptr info and set alignment/misalignment on NAME from DR_INFO. */
4493 static void
4494 vect_duplicate_ssa_name_ptr_info (tree name, dr_vec_info *dr_info)
4496 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr_info->dr));
4497 int misalign = DR_MISALIGNMENT (dr_info);
4498 if (misalign == DR_MISALIGNMENT_UNKNOWN)
4499 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4500 else
4501 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name),
4502 DR_TARGET_ALIGNMENT (dr_info), misalign);
4505 /* Function vect_create_addr_base_for_vector_ref.
4507 Create an expression that computes the address of the first memory location
4508 that will be accessed for a data reference.
4510 Input:
4511 STMT_INFO: The statement containing the data reference.
4512 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4513 OFFSET: Optional. If supplied, it is to be added to the initial address.
4514 LOOP: Specify relative to which loop-nest the address should be computed.
4515 For example, when the dataref is in an inner-loop nested in an
4516 outer-loop that is now being vectorized, LOOP can be either the
4517 outer-loop, or the inner-loop. The first memory location accessed
4518 by the following dataref ('in' points to short):
4520 for (i=0; i<N; i++)
4521 for (j=0; j<M; j++)
4522 s += in[i+j]
4524 is as follows:
4525 if LOOP=i_loop: &in (relative to i_loop)
4526 if LOOP=j_loop: &in+i*2B (relative to j_loop)
4527 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
4528 initial address. Unlike OFFSET, which is number of elements to
4529 be added, BYTE_OFFSET is measured in bytes.
4531 Output:
4532 1. Return an SSA_NAME whose value is the address of the memory location of
4533 the first vector of the data reference.
4534 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4535 these statement(s) which define the returned SSA_NAME.
4537 FORNOW: We are only handling array accesses with step 1. */
4539 tree
4540 vect_create_addr_base_for_vector_ref (stmt_vec_info stmt_info,
4541 gimple_seq *new_stmt_list,
4542 tree offset,
4543 tree byte_offset)
4545 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
4546 struct data_reference *dr = dr_info->dr;
4547 const char *base_name;
4548 tree addr_base;
4549 tree dest;
4550 gimple_seq seq = NULL;
4551 tree vect_ptr_type;
4552 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
4553 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4554 innermost_loop_behavior *drb = vect_dr_behavior (dr_info);
4556 tree data_ref_base = unshare_expr (drb->base_address);
4557 tree base_offset = unshare_expr (drb->offset);
4558 tree init = unshare_expr (drb->init);
4560 if (loop_vinfo)
4561 base_name = get_name (data_ref_base);
4562 else
4564 base_offset = ssize_int (0);
4565 init = ssize_int (0);
4566 base_name = get_name (DR_REF (dr));
4569 /* Create base_offset */
4570 base_offset = size_binop (PLUS_EXPR,
4571 fold_convert (sizetype, base_offset),
4572 fold_convert (sizetype, init));
4574 if (offset)
4576 offset = fold_build2 (MULT_EXPR, sizetype,
4577 fold_convert (sizetype, offset), step);
4578 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4579 base_offset, offset);
4581 if (byte_offset)
4583 byte_offset = fold_convert (sizetype, byte_offset);
4584 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4585 base_offset, byte_offset);
4588 /* base + base_offset */
4589 if (loop_vinfo)
4590 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4591 else
4593 addr_base = build1 (ADDR_EXPR,
4594 build_pointer_type (TREE_TYPE (DR_REF (dr))),
4595 unshare_expr (DR_REF (dr)));
4598 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4599 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4600 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4601 gimple_seq_add_seq (new_stmt_list, seq);
4603 if (DR_PTR_INFO (dr)
4604 && TREE_CODE (addr_base) == SSA_NAME
4605 && !SSA_NAME_PTR_INFO (addr_base))
4607 vect_duplicate_ssa_name_ptr_info (addr_base, dr_info);
4608 if (offset || byte_offset)
4609 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4612 if (dump_enabled_p ())
4614 dump_printf_loc (MSG_NOTE, vect_location, "created ");
4615 dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
4616 dump_printf (MSG_NOTE, "\n");
4619 return addr_base;
4623 /* Function vect_create_data_ref_ptr.
4625 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4626 location accessed in the loop by STMT_INFO, along with the def-use update
4627 chain to appropriately advance the pointer through the loop iterations.
4628 Also set aliasing information for the pointer. This pointer is used by
4629 the callers to this function to create a memory reference expression for
4630 vector load/store access.
4632 Input:
4633 1. STMT_INFO: a stmt that references memory. Expected to be of the form
4634 GIMPLE_ASSIGN <name, data-ref> or
4635 GIMPLE_ASSIGN <data-ref, name>.
4636 2. AGGR_TYPE: the type of the reference, which should be either a vector
4637 or an array.
4638 3. AT_LOOP: the loop where the vector memref is to be created.
4639 4. OFFSET (optional): an offset to be added to the initial address accessed
4640 by the data-ref in STMT_INFO.
4641 5. BSI: location where the new stmts are to be placed if there is no loop
4642 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4643 pointing to the initial address.
4644 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4645 to the initial address accessed by the data-ref in STMT_INFO. This is
4646 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4647 in bytes.
4648 8. IV_STEP (optional, defaults to NULL): the amount that should be added
4649 to the IV during each iteration of the loop. NULL says to move
4650 by one copy of AGGR_TYPE up or down, depending on the step of the
4651 data reference.
4653 Output:
4654 1. Declare a new ptr to vector_type, and have it point to the base of the
4655 data reference (the initial address accessed by the data reference).
4656 For example, for vector of type V8HI, the following code is generated:
4658 v8hi *ap;
4659 ap = (v8hi *)initial_address;
4661 if OFFSET is not supplied:
4662 initial_address = &a[init];
4663 if OFFSET is supplied:
4664 initial_address = &a[init + OFFSET];
4665 if BYTE_OFFSET is supplied:
4666 initial_address = &a[init] + BYTE_OFFSET;
4668 Return the initial_address in INITIAL_ADDRESS.
4670 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4671 update the pointer in each iteration of the loop.
4673 Return the increment stmt that updates the pointer in PTR_INCR.
4675 3. Return the pointer. */
4677 tree
4678 vect_create_data_ref_ptr (stmt_vec_info stmt_info, tree aggr_type,
4679 struct loop *at_loop, tree offset,
4680 tree *initial_address, gimple_stmt_iterator *gsi,
4681 gimple **ptr_incr, bool only_init,
4682 tree byte_offset, tree iv_step)
4684 const char *base_name;
4685 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4686 struct loop *loop = NULL;
4687 bool nested_in_vect_loop = false;
4688 struct loop *containing_loop = NULL;
4689 tree aggr_ptr_type;
4690 tree aggr_ptr;
4691 tree new_temp;
4692 gimple_seq new_stmt_list = NULL;
4693 edge pe = NULL;
4694 basic_block new_bb;
4695 tree aggr_ptr_init;
4696 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
4697 struct data_reference *dr = dr_info->dr;
4698 tree aptr;
4699 gimple_stmt_iterator incr_gsi;
4700 bool insert_after;
4701 tree indx_before_incr, indx_after_incr;
4702 gimple *incr;
4703 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4705 gcc_assert (iv_step != NULL_TREE
4706 || TREE_CODE (aggr_type) == ARRAY_TYPE
4707 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4709 if (loop_vinfo)
4711 loop = LOOP_VINFO_LOOP (loop_vinfo);
4712 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
4713 containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
4714 pe = loop_preheader_edge (loop);
4716 else
4718 gcc_assert (bb_vinfo);
4719 only_init = true;
4720 *ptr_incr = NULL;
4723 /* Create an expression for the first address accessed by this load
4724 in LOOP. */
4725 base_name = get_name (DR_BASE_ADDRESS (dr));
4727 if (dump_enabled_p ())
4729 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4730 dump_printf_loc (MSG_NOTE, vect_location,
4731 "create %s-pointer variable to type: ",
4732 get_tree_code_name (TREE_CODE (aggr_type)));
4733 dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4734 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4735 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4736 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4737 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4738 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4739 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4740 else
4741 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4742 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4743 dump_printf (MSG_NOTE, "\n");
4746 /* (1) Create the new aggregate-pointer variable.
4747 Vector and array types inherit the alias set of their component
4748 type by default so we need to use a ref-all pointer if the data
4749 reference does not conflict with the created aggregated data
4750 reference because it is not addressable. */
4751 bool need_ref_all = false;
4752 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4753 get_alias_set (DR_REF (dr))))
4754 need_ref_all = true;
4755 /* Likewise for any of the data references in the stmt group. */
4756 else if (DR_GROUP_SIZE (stmt_info) > 1)
4758 stmt_vec_info sinfo = DR_GROUP_FIRST_ELEMENT (stmt_info);
4761 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4762 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4763 get_alias_set (DR_REF (sdr))))
4765 need_ref_all = true;
4766 break;
4768 sinfo = DR_GROUP_NEXT_ELEMENT (sinfo);
4770 while (sinfo);
4772 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4773 need_ref_all);
4774 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4777 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4778 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4779 def-use update cycles for the pointer: one relative to the outer-loop
4780 (LOOP), which is what steps (3) and (4) below do. The other is relative
4781 to the inner-loop (which is the inner-most loop containing the dataref),
4782 and this is done by step (5) below.
4784 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4785 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4786 redundant. Steps (3),(4) create the following:
4788 vp0 = &base_addr;
4789 LOOP: vp1 = phi(vp0,vp2)
4792 vp2 = vp1 + step
4793 goto LOOP
4795 If there is an inner-loop nested in loop, then step (5) will also be
4796 applied, and an additional update in the inner-loop will be created:
4798 vp0 = &base_addr;
4799 LOOP: vp1 = phi(vp0,vp2)
4801 inner: vp3 = phi(vp1,vp4)
4802 vp4 = vp3 + inner_step
4803 if () goto inner
4805 vp2 = vp1 + step
4806 if () goto LOOP */
4808 /* (2) Calculate the initial address of the aggregate-pointer, and set
4809 the aggregate-pointer to point to it before the loop. */
4811 /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader. */
4813 new_temp = vect_create_addr_base_for_vector_ref (stmt_info, &new_stmt_list,
4814 offset, byte_offset);
4815 if (new_stmt_list)
4817 if (pe)
4819 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4820 gcc_assert (!new_bb);
4822 else
4823 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4826 *initial_address = new_temp;
4827 aggr_ptr_init = new_temp;
4829 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4830 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4831 inner-loop nested in LOOP (during outer-loop vectorization). */
4833 /* No update in loop is required. */
4834 if (only_init && (!loop_vinfo || at_loop == loop))
4835 aptr = aggr_ptr_init;
4836 else
4838 /* Accesses to invariant addresses should be handled specially
4839 by the caller. */
4840 tree step = vect_dr_behavior (dr_info)->step;
4841 gcc_assert (!integer_zerop (step));
4843 if (iv_step == NULL_TREE)
4845 /* The step of the aggregate pointer is the type size,
4846 negated for downward accesses. */
4847 iv_step = TYPE_SIZE_UNIT (aggr_type);
4848 if (tree_int_cst_sgn (step) == -1)
4849 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4852 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4854 create_iv (aggr_ptr_init,
4855 fold_convert (aggr_ptr_type, iv_step),
4856 aggr_ptr, loop, &incr_gsi, insert_after,
4857 &indx_before_incr, &indx_after_incr);
4858 incr = gsi_stmt (incr_gsi);
4859 loop_vinfo->add_stmt (incr);
4861 /* Copy the points-to information if it exists. */
4862 if (DR_PTR_INFO (dr))
4864 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
4865 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
4867 if (ptr_incr)
4868 *ptr_incr = incr;
4870 aptr = indx_before_incr;
4873 if (!nested_in_vect_loop || only_init)
4874 return aptr;
4877 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4878 nested in LOOP, if it exists. */
4880 gcc_assert (nested_in_vect_loop);
4881 if (!only_init)
4883 standard_iv_increment_position (containing_loop, &incr_gsi,
4884 &insert_after);
4885 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4886 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4887 &indx_after_incr);
4888 incr = gsi_stmt (incr_gsi);
4889 loop_vinfo->add_stmt (incr);
4891 /* Copy the points-to information if it exists. */
4892 if (DR_PTR_INFO (dr))
4894 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
4895 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
4897 if (ptr_incr)
4898 *ptr_incr = incr;
4900 return indx_before_incr;
4902 else
4903 gcc_unreachable ();
4907 /* Function bump_vector_ptr
4909 Increment a pointer (to a vector type) by vector-size. If requested,
4910 i.e. if PTR-INCR is given, then also connect the new increment stmt
4911 to the existing def-use update-chain of the pointer, by modifying
4912 the PTR_INCR as illustrated below:
4914 The pointer def-use update-chain before this function:
4915 DATAREF_PTR = phi (p_0, p_2)
4916 ....
4917 PTR_INCR: p_2 = DATAREF_PTR + step
4919 The pointer def-use update-chain after this function:
4920 DATAREF_PTR = phi (p_0, p_2)
4921 ....
4922 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4923 ....
4924 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4926 Input:
4927 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4928 in the loop.
4929 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4930 the loop. The increment amount across iterations is expected
4931 to be vector_size.
4932 BSI - location where the new update stmt is to be placed.
4933 STMT_INFO - the original scalar memory-access stmt that is being vectorized.
4934 BUMP - optional. The offset by which to bump the pointer. If not given,
4935 the offset is assumed to be vector_size.
4937 Output: Return NEW_DATAREF_PTR as illustrated above.
4941 tree
4942 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4943 stmt_vec_info stmt_info, tree bump)
4945 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4946 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4947 tree update = TYPE_SIZE_UNIT (vectype);
4948 gassign *incr_stmt;
4949 ssa_op_iter iter;
4950 use_operand_p use_p;
4951 tree new_dataref_ptr;
4953 if (bump)
4954 update = bump;
4956 if (TREE_CODE (dataref_ptr) == SSA_NAME)
4957 new_dataref_ptr = copy_ssa_name (dataref_ptr);
4958 else
4959 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4960 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4961 dataref_ptr, update);
4962 vect_finish_stmt_generation (stmt_info, incr_stmt, gsi);
4964 /* Copy the points-to information if it exists. */
4965 if (DR_PTR_INFO (dr))
4967 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4968 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4971 if (!ptr_incr)
4972 return new_dataref_ptr;
4974 /* Update the vector-pointer's cross-iteration increment. */
4975 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4977 tree use = USE_FROM_PTR (use_p);
4979 if (use == dataref_ptr)
4980 SET_USE (use_p, new_dataref_ptr);
4981 else
4982 gcc_assert (operand_equal_p (use, update, 0));
4985 return new_dataref_ptr;
4989 /* Copy memory reference info such as base/clique from the SRC reference
4990 to the DEST MEM_REF. */
4992 void
4993 vect_copy_ref_info (tree dest, tree src)
4995 if (TREE_CODE (dest) != MEM_REF)
4996 return;
4998 tree src_base = src;
4999 while (handled_component_p (src_base))
5000 src_base = TREE_OPERAND (src_base, 0);
5001 if (TREE_CODE (src_base) != MEM_REF
5002 && TREE_CODE (src_base) != TARGET_MEM_REF)
5003 return;
5005 MR_DEPENDENCE_CLIQUE (dest) = MR_DEPENDENCE_CLIQUE (src_base);
5006 MR_DEPENDENCE_BASE (dest) = MR_DEPENDENCE_BASE (src_base);
5010 /* Function vect_create_destination_var.
5012 Create a new temporary of type VECTYPE. */
5014 tree
5015 vect_create_destination_var (tree scalar_dest, tree vectype)
5017 tree vec_dest;
5018 const char *name;
5019 char *new_name;
5020 tree type;
5021 enum vect_var_kind kind;
5023 kind = vectype
5024 ? VECTOR_BOOLEAN_TYPE_P (vectype)
5025 ? vect_mask_var
5026 : vect_simple_var
5027 : vect_scalar_var;
5028 type = vectype ? vectype : TREE_TYPE (scalar_dest);
5030 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
5032 name = get_name (scalar_dest);
5033 if (name)
5034 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
5035 else
5036 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
5037 vec_dest = vect_get_new_vect_var (type, kind, new_name);
5038 free (new_name);
5040 return vec_dest;
5043 /* Function vect_grouped_store_supported.
5045 Returns TRUE if interleave high and interleave low permutations
5046 are supported, and FALSE otherwise. */
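/* Hand-worked example for 4-element vectors (see also the comment on
   vect_permute_store_chain below): interleave_high ({0,1,2,3}, {4,5,6,7})
   gives {0,4,1,5} and interleave_low gives {2,6,3,7}; the checks below ask
   whether the target can perform these permutations (or the special
   three-vector shuffles) as constant VEC_PERM_EXPRs.  */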
5048 bool
5049 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
5051 machine_mode mode = TYPE_MODE (vectype);
5053 /* vect_permute_store_chain requires the group size to be equal to 3 or
5054 be a power of two. */
5055 if (count != 3 && exact_log2 (count) == -1)
5057 if (dump_enabled_p ())
5058 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5059 "the size of the group of accesses"
5060 " is not a power of 2 or not eqaul to 3\n");
5061 return false;
5064 /* Check that the permutation is supported. */
5065 if (VECTOR_MODE_P (mode))
5067 unsigned int i;
5068 if (count == 3)
5070 unsigned int j0 = 0, j1 = 0, j2 = 0;
5071 unsigned int i, j;
5073 unsigned int nelt;
5074 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5076 if (dump_enabled_p ())
5077 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5078 "cannot handle groups of 3 stores for"
5079 " variable-length vectors\n");
5080 return false;
5083 vec_perm_builder sel (nelt, nelt, 1);
5084 sel.quick_grow (nelt);
5085 vec_perm_indices indices;
5086 for (j = 0; j < 3; j++)
5088 int nelt0 = ((3 - j) * nelt) % 3;
5089 int nelt1 = ((3 - j) * nelt + 1) % 3;
5090 int nelt2 = ((3 - j) * nelt + 2) % 3;
5091 for (i = 0; i < nelt; i++)
5093 if (3 * i + nelt0 < nelt)
5094 sel[3 * i + nelt0] = j0++;
5095 if (3 * i + nelt1 < nelt)
5096 sel[3 * i + nelt1] = nelt + j1++;
5097 if (3 * i + nelt2 < nelt)
5098 sel[3 * i + nelt2] = 0;
5100 indices.new_vector (sel, 2, nelt);
5101 if (!can_vec_perm_const_p (mode, indices))
5103 if (dump_enabled_p ())
5104 dump_printf (MSG_MISSED_OPTIMIZATION,
5105 "permutation op not supported by target.\n");
5106 return false;
5109 for (i = 0; i < nelt; i++)
5111 if (3 * i + nelt0 < nelt)
5112 sel[3 * i + nelt0] = 3 * i + nelt0;
5113 if (3 * i + nelt1 < nelt)
5114 sel[3 * i + nelt1] = 3 * i + nelt1;
5115 if (3 * i + nelt2 < nelt)
5116 sel[3 * i + nelt2] = nelt + j2++;
5118 indices.new_vector (sel, 2, nelt);
5119 if (!can_vec_perm_const_p (mode, indices))
5121 if (dump_enabled_p ())
5122 dump_printf (MSG_MISSED_OPTIMIZATION,
5123 "permutation op not supported by target.\n");
5124 return false;
5127 return true;
5129 else
5131 /* If length is not equal to 3 then only a power of 2 is supported. */
5132 gcc_assert (pow2p_hwi (count));
5133 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5135 /* The encoding has 2 interleaved stepped patterns. */
5136 vec_perm_builder sel (nelt, 2, 3);
5137 sel.quick_grow (6);
5138 for (i = 0; i < 3; i++)
5140 sel[i * 2] = i;
5141 sel[i * 2 + 1] = i + nelt;
5143 vec_perm_indices indices (sel, 2, nelt);
5144 if (can_vec_perm_const_p (mode, indices))
5146 for (i = 0; i < 6; i++)
5147 sel[i] += exact_div (nelt, 2);
5148 indices.new_vector (sel, 2, nelt);
5149 if (can_vec_perm_const_p (mode, indices))
5150 return true;
5155 if (dump_enabled_p ())
5156 dump_printf (MSG_MISSED_OPTIMIZATION,
5157 "permutation op not supported by target.\n");
5158 return false;
5162 /* Return TRUE if vec_{mask_}store_lanes is available for COUNT vectors of
5163 type VECTYPE. MASKED_P says whether the masked form is needed. */
5165 bool
5166 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5167 bool masked_p)
5169 if (masked_p)
5170 return vect_lanes_optab_supported_p ("vec_mask_store_lanes",
5171 vec_mask_store_lanes_optab,
5172 vectype, count);
5173 else
5174 return vect_lanes_optab_supported_p ("vec_store_lanes",
5175 vec_store_lanes_optab,
5176 vectype, count);
5180 /* Function vect_permute_store_chain.
5182 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
5183 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
5184 the data correctly for the stores. Return the final references for stores
5185 in RESULT_CHAIN.
5187 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5188 The input is 4 vectors each containing 8 elements. We assign a number to
5189 each element, the input sequence is:
5191 1st vec: 0 1 2 3 4 5 6 7
5192 2nd vec: 8 9 10 11 12 13 14 15
5193 3rd vec: 16 17 18 19 20 21 22 23
5194 4th vec: 24 25 26 27 28 29 30 31
5196 The output sequence should be:
5198 1st vec: 0 8 16 24 1 9 17 25
5199 2nd vec: 2 10 18 26 3 11 19 27
5200 3rd vec: 4 12 20 28 5 13 21 29
5201 4th vec: 6 14 22 30 7 15 23 31
5203 i.e., we interleave the contents of the four vectors in their order.
5205 We use interleave_high/low instructions to create such output. The input of
5206 each interleave_high/low operation is two vectors:
5207 1st vec 2nd vec
5208 0 1 2 3 4 5 6 7
5209 the even elements of the result vector are obtained left-to-right from the
5210 high/low elements of the first vector. The odd elements of the result are
5211 obtained left-to-right from the high/low elements of the second vector.
5212 The output of interleave_high will be: 0 4 1 5
5213 and of interleave_low: 2 6 3 7
5216 The permutation is done in log LENGTH stages. In each stage interleave_high
5217 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5218 where the first argument is taken from the first half of DR_CHAIN and the
5219 second argument from its second half.
5220 In our example,
5222 I1: interleave_high (1st vec, 3rd vec)
5223 I2: interleave_low (1st vec, 3rd vec)
5224 I3: interleave_high (2nd vec, 4th vec)
5225 I4: interleave_low (2nd vec, 4th vec)
5227 The output for the first stage is:
5229 I1: 0 16 1 17 2 18 3 19
5230 I2: 4 20 5 21 6 22 7 23
5231 I3: 8 24 9 25 10 26 11 27
5232 I4: 12 28 13 29 14 30 15 31
5234 The output of the second stage, i.e. the final result is:
5236 I1: 0 8 16 24 1 9 17 25
5237 I2: 2 10 18 26 3 11 19 27
5238 I3: 4 12 20 28 5 13 21 29
5239 I4: 6 14 22 30 7 15 23 31. */
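/* A source-level loop that gives rise to such a group of interleaved stores
   (purely illustrative):

     struct pix { short r, g, b, a; } *out;
     for (i = 0; i < n; i++)
       {
         out[i].r = r[i]; out[i].g = g[i];
         out[i].b = b[i]; out[i].a = a[i];
       }

   The four per-field short stores form a store group of length 4, matching
   the LENGTH == 4, VF == 8 example above when vectorizing with V8HI.  */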
5241 void
5242 vect_permute_store_chain (vec<tree> dr_chain,
5243 unsigned int length,
5244 stmt_vec_info stmt_info,
5245 gimple_stmt_iterator *gsi,
5246 vec<tree> *result_chain)
5248 tree vect1, vect2, high, low;
5249 gimple *perm_stmt;
5250 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5251 tree perm_mask_low, perm_mask_high;
5252 tree data_ref;
5253 tree perm3_mask_low, perm3_mask_high;
5254 unsigned int i, j, n, log_length = exact_log2 (length);
5256 result_chain->quick_grow (length);
5257 memcpy (result_chain->address (), dr_chain.address (),
5258 length * sizeof (tree));
5260 if (length == 3)
5262 /* vect_grouped_store_supported ensures that this is constant. */
5263 unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5264 unsigned int j0 = 0, j1 = 0, j2 = 0;
5266 vec_perm_builder sel (nelt, nelt, 1);
5267 sel.quick_grow (nelt);
5268 vec_perm_indices indices;
5269 for (j = 0; j < 3; j++)
5271 int nelt0 = ((3 - j) * nelt) % 3;
5272 int nelt1 = ((3 - j) * nelt + 1) % 3;
5273 int nelt2 = ((3 - j) * nelt + 2) % 3;
5275 for (i = 0; i < nelt; i++)
5277 if (3 * i + nelt0 < nelt)
5278 sel[3 * i + nelt0] = j0++;
5279 if (3 * i + nelt1 < nelt)
5280 sel[3 * i + nelt1] = nelt + j1++;
5281 if (3 * i + nelt2 < nelt)
5282 sel[3 * i + nelt2] = 0;
5284 indices.new_vector (sel, 2, nelt);
5285 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5287 for (i = 0; i < nelt; i++)
5289 if (3 * i + nelt0 < nelt)
5290 sel[3 * i + nelt0] = 3 * i + nelt0;
5291 if (3 * i + nelt1 < nelt)
5292 sel[3 * i + nelt1] = 3 * i + nelt1;
5293 if (3 * i + nelt2 < nelt)
5294 sel[3 * i + nelt2] = nelt + j2++;
5296 indices.new_vector (sel, 2, nelt);
5297 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5299 vect1 = dr_chain[0];
5300 vect2 = dr_chain[1];
5302 /* Create interleaving stmt:
5303 low = VEC_PERM_EXPR <vect1, vect2,
5304 {j, nelt, *, j + 1, nelt + j + 1, *,
5305 j + 2, nelt + j + 2, *, ...}> */
5306 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5307 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5308 vect2, perm3_mask_low);
5309 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5311 vect1 = data_ref;
5312 vect2 = dr_chain[2];
5313 /* Create interleaving stmt:
5314 low = VEC_PERM_EXPR <vect1, vect2,
5315 {0, 1, nelt + j, 3, 4, nelt + j + 1,
5316 6, 7, nelt + j + 2, ...}> */
5317 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5318 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5319 vect2, perm3_mask_high);
5320 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5321 (*result_chain)[j] = data_ref;
5324 else
5326 /* If the length is not 3, only powers of 2 are supported. */
5327 gcc_assert (pow2p_hwi (length));
5329 /* The encoding has 2 interleaved stepped patterns. */
5330 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5331 vec_perm_builder sel (nelt, 2, 3);
5332 sel.quick_grow (6);
5333 for (i = 0; i < 3; i++)
5335 sel[i * 2] = i;
5336 sel[i * 2 + 1] = i + nelt;
5338 vec_perm_indices indices (sel, 2, nelt);
5339 perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5341 for (i = 0; i < 6; i++)
5342 sel[i] += exact_div (nelt, 2);
5343 indices.new_vector (sel, 2, nelt);
5344 perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5346 for (i = 0, n = log_length; i < n; i++)
5348 for (j = 0; j < length/2; j++)
5350 vect1 = dr_chain[j];
5351 vect2 = dr_chain[j+length/2];
5353 /* Create interleaving stmt:
5354 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
5355 ...}> */
5356 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
5357 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
5358 vect2, perm_mask_high);
5359 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5360 (*result_chain)[2*j] = high;
5362 /* Create interleaving stmt:
5363 low = VEC_PERM_EXPR <vect1, vect2,
5364 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
5365 ...}> */
5366 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
5367 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
5368 vect2, perm_mask_low);
5369 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5370 (*result_chain)[2*j+1] = low;
5372 memcpy (dr_chain.address (), result_chain->address (),
5373 length * sizeof (tree));
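/* Illustrative sketch (not part of the vectorizer): a scalar model of the
   interleave_high/low permutation performed above, written with plain flat
   arrays so the element flow of the example in the comment can be replayed
   by hand.  All names here are invented for the example; LENGTH is assumed
   to be a power of two, and LENGTH and NELT are assumed to be at most 16.  */

static void
model_interleave (const unsigned *a, const unsigned *b, unsigned nelt,
		  unsigned *high, unsigned *low)
{
  /* "high" pairs up the first halves of A and B, "low" the second halves,
     mirroring the perm_mask_high/perm_mask_low selectors built above.  */
  for (unsigned i = 0; i < nelt / 2; i++)
    {
      high[2 * i] = a[i];
      high[2 * i + 1] = b[i];
      low[2 * i] = a[nelt / 2 + i];
      low[2 * i + 1] = b[nelt / 2 + i];
    }
}

static void
model_permute_store_chain (unsigned *chain, unsigned length, unsigned nelt)
{
  unsigned tmp[16 * 16];	/* Scratch space for one stage.  */
  for (unsigned stage = 1; stage < length; stage *= 2)
    {
      /* Pair vector J with vector J + LENGTH/2, as the loop above does.  */
      for (unsigned j = 0; j < length / 2; j++)
	model_interleave (chain + j * nelt, chain + (j + length / 2) * nelt,
			  nelt, tmp + (2 * j) * nelt,
			  tmp + (2 * j + 1) * nelt);
      for (unsigned i = 0; i < length * nelt; i++)
	chain[i] = tmp[i];
    }
}

/* Feeding four 8-element vectors holding 0..31 through
   model_permute_store_chain (chain, 4, 8) runs two stages and reproduces
   the interleaved output listed in the comment above.  */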
5378 /* Function vect_setup_realignment
5380 This function is called when vectorizing an unaligned load using
5381 the dr_explicit_realign[_optimized] scheme.
5382 This function generates the following code at the loop prolog:
5384 p = initial_addr;
5385 x msq_init = *(floor(p)); # prolog load
5386 realignment_token = call target_builtin;
5387 loop:
5388 x msq = phi (msq_init, ---)
5390 The stmts marked with x are generated only for the case of
5391 dr_explicit_realign_optimized.
5393 The code above sets up a new (vector) pointer, pointing to the first
5394 location accessed by STMT_INFO, and a "floor-aligned" load using that
5395 pointer. It also generates code to compute the "realignment-token"
5396 (if the relevant target hook was defined), and creates a phi-node at the
5397 loop-header bb whose arguments are the result of the prolog-load (created
5398 by this function) and the result of a load that takes place in the loop
5399 (to be created by the caller to this function).
5401 For the case of dr_explicit_realign_optimized:
5402 The caller to this function uses the phi-result (msq) to create the
5403 realignment code inside the loop, and sets up the missing phi argument,
5404 as follows:
5405 loop:
5406 msq = phi (msq_init, lsq)
5407 lsq = *(floor(p')); # load in loop
5408 result = realign_load (msq, lsq, realignment_token);
5410 For the case of dr_explicit_realign:
5411 loop:
5412 msq = *(floor(p)); # load in loop
5413 p' = p + (VS-1);
5414 lsq = *(floor(p')); # load in loop
5415 result = realign_load (msq, lsq, realignment_token);
5417 Input:
5418 STMT_INFO - (scalar) load stmt to be vectorized. This load accesses
5419 a memory location that may be unaligned.
5420 BSI - place where new code is to be inserted.
5421 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5422 is used.
5424 Output:
5425 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5426 target hook, if defined.
5427 Return value - the result of the loop-header phi node. */
5429 tree
5430 vect_setup_realignment (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
5431 tree *realignment_token,
5432 enum dr_alignment_support alignment_support_scheme,
5433 tree init_addr,
5434 struct loop **at_loop)
5436 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5437 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5438 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
5439 struct data_reference *dr = dr_info->dr;
5440 struct loop *loop = NULL;
5441 edge pe = NULL;
5442 tree scalar_dest = gimple_assign_lhs (stmt_info->stmt);
5443 tree vec_dest;
5444 gimple *inc;
5445 tree ptr;
5446 tree data_ref;
5447 basic_block new_bb;
5448 tree msq_init = NULL_TREE;
5449 tree new_temp;
5450 gphi *phi_stmt;
5451 tree msq = NULL_TREE;
5452 gimple_seq stmts = NULL;
5453 bool compute_in_loop = false;
5454 bool nested_in_vect_loop = false;
5455 struct loop *containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
5456 struct loop *loop_for_initial_load = NULL;
5458 if (loop_vinfo)
5460 loop = LOOP_VINFO_LOOP (loop_vinfo);
5461 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
5464 gcc_assert (alignment_support_scheme == dr_explicit_realign
5465 || alignment_support_scheme == dr_explicit_realign_optimized);
5467 /* We need to generate three things:
5468 1. the misalignment computation
5469 2. the extra vector load (for the optimized realignment scheme).
5470 3. the phi node for the two vectors from which the realignment is
5471 done (for the optimized realignment scheme). */
5473 /* 1. Determine where to generate the misalignment computation.
5475 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5476 calculation will be generated by this function, outside the loop (in the
5477 preheader). Otherwise, INIT_ADDR has already been computed for us by the
5478 caller, inside the loop.
5480 Background: If the misalignment remains fixed throughout the iterations of
5481 the loop, then both realignment schemes are applicable, and also the
5482 misalignment computation can be done outside LOOP. This is because we are
5483 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5484 are a multiple of VS (the Vector Size), and therefore the misalignment in
5485 different vectorized LOOP iterations is always the same.
5486 The problem arises only if the memory access is in an inner-loop nested
5487 inside LOOP, which is now being vectorized using outer-loop vectorization.
5488 This is the only case when the misalignment of the memory access may not
5489 remain fixed throughout the iterations of the inner-loop (as explained in
5490 detail in vect_supportable_dr_alignment). In this case, not only is the
5491 optimized realignment scheme not applicable, but also the misalignment
5492 computation (and generation of the realignment token that is passed to
5493 REALIGN_LOAD) have to be done inside the loop.
5495 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5496 or not, which in turn determines if the misalignment is computed inside
5497 the inner-loop, or outside LOOP. */
5499 if (init_addr != NULL_TREE || !loop_vinfo)
5501 compute_in_loop = true;
5502 gcc_assert (alignment_support_scheme == dr_explicit_realign);
5506 /* 2. Determine where to generate the extra vector load.
5508 For the optimized realignment scheme, instead of generating two vector
5509 loads in each iteration, we generate a single extra vector load in the
5510 preheader of the loop, and in each iteration reuse the result of the
5511 vector load from the previous iteration. In case the memory access is in
5512 an inner-loop nested inside LOOP, which is now being vectorized using
5513 outer-loop vectorization, we need to determine whether this initial vector
5514 load should be generated at the preheader of the inner-loop, or can be
5515 generated at the preheader of LOOP. If the memory access has no evolution
5516 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5517 to be generated inside LOOP (in the preheader of the inner-loop). */
5519 if (nested_in_vect_loop)
5521 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5522 bool invariant_in_outerloop =
5523 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5524 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5526 else
5527 loop_for_initial_load = loop;
5528 if (at_loop)
5529 *at_loop = loop_for_initial_load;
5531 if (loop_for_initial_load)
5532 pe = loop_preheader_edge (loop_for_initial_load);
5534 /* 3. For the case of the optimized realignment, create the first vector
5535 load at the loop preheader. */
5537 if (alignment_support_scheme == dr_explicit_realign_optimized)
5539 /* Create msq_init = *(floor(p1)) in the loop preheader */
5540 gassign *new_stmt;
5542 gcc_assert (!compute_in_loop);
5543 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5544 ptr = vect_create_data_ref_ptr (stmt_info, vectype,
5545 loop_for_initial_load, NULL_TREE,
5546 &init_addr, NULL, &inc, true);
5547 if (TREE_CODE (ptr) == SSA_NAME)
5548 new_temp = copy_ssa_name (ptr);
5549 else
5550 new_temp = make_ssa_name (TREE_TYPE (ptr));
5551 unsigned int align = DR_TARGET_ALIGNMENT (dr_info);
5552 new_stmt = gimple_build_assign
5553 (new_temp, BIT_AND_EXPR, ptr,
5554 build_int_cst (TREE_TYPE (ptr), -(HOST_WIDE_INT) align));
5555 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5556 gcc_assert (!new_bb);
5557 data_ref
5558 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5559 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5560 vect_copy_ref_info (data_ref, DR_REF (dr));
5561 new_stmt = gimple_build_assign (vec_dest, data_ref);
5562 new_temp = make_ssa_name (vec_dest, new_stmt);
5563 gimple_assign_set_lhs (new_stmt, new_temp);
5564 if (pe)
5566 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5567 gcc_assert (!new_bb);
5569 else
5570 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5572 msq_init = gimple_assign_lhs (new_stmt);
5575 /* 4. Create realignment token using a target builtin, if available.
5576 It is done either inside the containing loop, or before LOOP (as
5577 determined above). */
5579 if (targetm.vectorize.builtin_mask_for_load)
5581 gcall *new_stmt;
5582 tree builtin_decl;
5584 /* Compute INIT_ADDR - the initial address accessed by this memref. */
5585 if (!init_addr)
5587 /* Generate the INIT_ADDR computation outside LOOP. */
5588 init_addr = vect_create_addr_base_for_vector_ref (stmt_info, &stmts,
5589 NULL_TREE);
5590 if (loop)
5592 pe = loop_preheader_edge (loop);
5593 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5594 gcc_assert (!new_bb);
5596 else
5597 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5600 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5601 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5602 vec_dest =
5603 vect_create_destination_var (scalar_dest,
5604 gimple_call_return_type (new_stmt));
5605 new_temp = make_ssa_name (vec_dest, new_stmt);
5606 gimple_call_set_lhs (new_stmt, new_temp);
5608 if (compute_in_loop)
5609 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5610 else
5612 /* Generate the misalignment computation outside LOOP. */
5613 pe = loop_preheader_edge (loop);
5614 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5615 gcc_assert (!new_bb);
5618 *realignment_token = gimple_call_lhs (new_stmt);
5620 /* The result of the CALL_EXPR to this builtin is determined from
5621 the value of the parameter and no global variables are touched
5622 which makes the builtin a "const" function. Requiring the
5623 builtin to have the "const" attribute makes it unnecessary
5624 to call mark_call_clobbered. */
5625 gcc_assert (TREE_READONLY (builtin_decl));
5628 if (alignment_support_scheme == dr_explicit_realign)
5629 return msq;
5631 gcc_assert (!compute_in_loop);
5632 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5635 /* 5. Create msq = phi <msq_init, lsq> in loop */
5637 pe = loop_preheader_edge (containing_loop);
5638 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5639 msq = make_ssa_name (vec_dest);
5640 phi_stmt = create_phi_node (msq, containing_loop->header);
5641 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5643 return msq;
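/* Illustrative sketch (not part of the vectorizer): a scalar model of the
   REALIGN_LOAD idiom set up above.  MSQ is the aligned vector containing
   the start of the unaligned access, LSQ the next aligned vector, and the
   realignment token is modelled simply as the element offset of the access
   within MSQ.  The names and the element-offset encoding of the token are
   assumptions made for this example only.  */

static void
model_realign_load (const int *msq, const int *lsq, unsigned nelt,
		    unsigned misalign /* 0 <= misalign < nelt */,
		    int *result)
{
  for (unsigned i = 0; i < nelt; i++)
    result[i] = (misalign + i < nelt
		 ? msq[misalign + i]
		 : lsq[misalign + i - nelt]);
}

/* In the dr_explicit_realign_optimized scheme the LSQ of one iteration
   becomes the MSQ of the next, which is what the phi node created above
   expresses.  */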
5647 /* Function vect_grouped_load_supported.
5649 COUNT is the size of the load group (the number of statements plus the
5650 number of gaps). SINGLE_ELEMENT_P is true if there is actually
5651 only one statement, with a gap of COUNT - 1.
5653 Returns true if a suitable permute exists. */
5655 bool
5656 vect_grouped_load_supported (tree vectype, bool single_element_p,
5657 unsigned HOST_WIDE_INT count)
5659 machine_mode mode = TYPE_MODE (vectype);
5661 /* If this is single-element interleaving with an element distance
5662 that leaves unused vector loads around, punt: we at least create
5663 very sub-optimal code in that case (and blow up memory,
5664 see PR65518). */
5665 if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype)))
5667 if (dump_enabled_p ())
5668 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5669 "single-element interleaving not supported "
5670 "for not adjacent vector loads\n");
5671 return false;
5674 /* vect_permute_load_chain requires the group size to be equal to 3 or
5675 be a power of two. */
5676 if (count != 3 && exact_log2 (count) == -1)
5678 if (dump_enabled_p ())
5679 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5680 "the size of the group of accesses"
5681 " is not a power of 2 or not equal to 3\n");
5682 return false;
5685 /* Check that the permutation is supported. */
5686 if (VECTOR_MODE_P (mode))
5688 unsigned int i, j;
5689 if (count == 3)
5691 unsigned int nelt;
5692 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5694 if (dump_enabled_p ())
5695 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5696 "cannot handle groups of 3 loads for"
5697 " variable-length vectors\n");
5698 return false;
5701 vec_perm_builder sel (nelt, nelt, 1);
5702 sel.quick_grow (nelt);
5703 vec_perm_indices indices;
5704 unsigned int k;
5705 for (k = 0; k < 3; k++)
5707 for (i = 0; i < nelt; i++)
5708 if (3 * i + k < 2 * nelt)
5709 sel[i] = 3 * i + k;
5710 else
5711 sel[i] = 0;
5712 indices.new_vector (sel, 2, nelt);
5713 if (!can_vec_perm_const_p (mode, indices))
5715 if (dump_enabled_p ())
5716 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5717 "shuffle of 3 loads is not supported by"
5718 " target\n");
5719 return false;
5721 for (i = 0, j = 0; i < nelt; i++)
5722 if (3 * i + k < 2 * nelt)
5723 sel[i] = i;
5724 else
5725 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5726 indices.new_vector (sel, 2, nelt);
5727 if (!can_vec_perm_const_p (mode, indices))
5729 if (dump_enabled_p ())
5730 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5731 "shuffle of 3 loads is not supported by"
5732 " target\n");
5733 return false;
5736 return true;
5738 else
5740 /* If the length is not 3, only powers of 2 are supported. */
5741 gcc_assert (pow2p_hwi (count));
5742 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5744 /* The encoding has a single stepped pattern. */
5745 vec_perm_builder sel (nelt, 1, 3);
5746 sel.quick_grow (3);
5747 for (i = 0; i < 3; i++)
5748 sel[i] = i * 2;
5749 vec_perm_indices indices (sel, 2, nelt);
5750 if (can_vec_perm_const_p (mode, indices))
5752 for (i = 0; i < 3; i++)
5753 sel[i] = i * 2 + 1;
5754 indices.new_vector (sel, 2, nelt);
5755 if (can_vec_perm_const_p (mode, indices))
5756 return true;
5761 if (dump_enabled_p ())
5762 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5763 "extract even/odd not supported by target\n");
5764 return false;
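/* Illustrative sketch (not part of the vectorizer): build the pair of
   two-operand permute selectors that the COUNT == 3 path above queries the
   target about, using plain arrays.  For NELT == 8 and K == 0 this yields
   {0 3 6 9 12 15 0 0} (gather every third element from the first two
   vectors) and {0 1 2 3 4 5 10 13} (keep what was gathered and fill the
   tail from the third vector).  Names are invented for the example.  */

static void
model_load3_selectors (unsigned nelt, unsigned k,
		       unsigned *sel_gather, unsigned *sel_fill)
{
  unsigned i, j = 0;
  for (i = 0; i < nelt; i++)
    sel_gather[i] = (3 * i + k < 2 * nelt ? 3 * i + k : 0);
  for (i = 0; i < nelt; i++)
    sel_fill[i] = (3 * i + k < 2 * nelt
		   ? i
		   : nelt + ((nelt + k) % 3) + 3 * j++);
}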
5767 /* Return TRUE if vec_{mask_}load_lanes is available for COUNT vectors of
5768 type VECTYPE. MASKED_P says whether the masked form is needed. */
5770 bool
5771 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5772 bool masked_p)
5774 if (masked_p)
5775 return vect_lanes_optab_supported_p ("vec_mask_load_lanes",
5776 vec_mask_load_lanes_optab,
5777 vectype, count);
5778 else
5779 return vect_lanes_optab_supported_p ("vec_load_lanes",
5780 vec_load_lanes_optab,
5781 vectype, count);
5784 /* Function vect_permute_load_chain.
5786 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5787 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5788 the input data correctly. Return the final references for loads in
5789 RESULT_CHAIN.
5791 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5792 The input is 4 vectors each containing 8 elements. We assign a number to each
5793 element, the input sequence is:
5795 1st vec: 0 1 2 3 4 5 6 7
5796 2nd vec: 8 9 10 11 12 13 14 15
5797 3rd vec: 16 17 18 19 20 21 22 23
5798 4th vec: 24 25 26 27 28 29 30 31
5800 The output sequence should be:
5802 1st vec: 0 4 8 12 16 20 24 28
5803 2nd vec: 1 5 9 13 17 21 25 29
5804 3rd vec: 2 6 10 14 18 22 26 30
5805 4th vec: 3 7 11 15 19 23 27 31
5807 i.e., the first output vector should contain the first elements of each
5808 interleaving group, etc.
5810 We use extract_even/odd instructions to create such output. The input of
5811 each extract_even/odd operation is two vectors
5812 1st vec 2nd vec
5813 0 1 2 3 4 5 6 7
5815 and the output is the vector of extracted even/odd elements. The output of
5816 extract_even will be: 0 2 4 6
5817 and of extract_odd: 1 3 5 7
5820 The permutation is done in log LENGTH stages. In each stage extract_even
5821 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5822 their order. In our example,
5824 E1: extract_even (1st vec, 2nd vec)
5825 E2: extract_odd (1st vec, 2nd vec)
5826 E3: extract_even (3rd vec, 4th vec)
5827 E4: extract_odd (3rd vec, 4th vec)
5829 The output for the first stage will be:
5831 E1: 0 2 4 6 8 10 12 14
5832 E2: 1 3 5 7 9 11 13 15
5833 E3: 16 18 20 22 24 26 28 30
5834 E4: 17 19 21 23 25 27 29 31
5836 In order to proceed and create the correct sequence for the next stage (or
5837 for the correct output, if the second stage is the last one, as in our
5838 example), we first put the output of the extract_even operation and then the
5839 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5840 The input for the second stage is:
5842 1st vec (E1): 0 2 4 6 8 10 12 14
5843 2nd vec (E3): 16 18 20 22 24 26 28 30
5844 3rd vec (E2): 1 3 5 7 9 11 13 15
5845 4th vec (E4): 17 19 21 23 25 27 29 31
5847 The output of the second stage:
5849 E1: 0 4 8 12 16 20 24 28
5850 E2: 2 6 10 14 18 22 26 30
5851 E3: 1 5 9 13 17 21 25 29
5852 E4: 3 7 11 15 19 23 27 31
5854 And RESULT_CHAIN after reordering:
5856 1st vec (E1): 0 4 8 12 16 20 24 28
5857 2nd vec (E3): 1 5 9 13 17 21 25 29
5858 3rd vec (E2): 2 6 10 14 18 22 26 30
5859 4th vec (E4): 3 7 11 15 19 23 27 31. */
5861 static void
5862 vect_permute_load_chain (vec<tree> dr_chain,
5863 unsigned int length,
5864 stmt_vec_info stmt_info,
5865 gimple_stmt_iterator *gsi,
5866 vec<tree> *result_chain)
5868 tree data_ref, first_vect, second_vect;
5869 tree perm_mask_even, perm_mask_odd;
5870 tree perm3_mask_low, perm3_mask_high;
5871 gimple *perm_stmt;
5872 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5873 unsigned int i, j, log_length = exact_log2 (length);
5875 result_chain->quick_grow (length);
5876 memcpy (result_chain->address (), dr_chain.address (),
5877 length * sizeof (tree));
5879 if (length == 3)
5881 /* vect_grouped_load_supported ensures that this is constant. */
5882 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5883 unsigned int k;
5885 vec_perm_builder sel (nelt, nelt, 1);
5886 sel.quick_grow (nelt);
5887 vec_perm_indices indices;
5888 for (k = 0; k < 3; k++)
5890 for (i = 0; i < nelt; i++)
5891 if (3 * i + k < 2 * nelt)
5892 sel[i] = 3 * i + k;
5893 else
5894 sel[i] = 0;
5895 indices.new_vector (sel, 2, nelt);
5896 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5898 for (i = 0, j = 0; i < nelt; i++)
5899 if (3 * i + k < 2 * nelt)
5900 sel[i] = i;
5901 else
5902 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5903 indices.new_vector (sel, 2, nelt);
5904 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5906 first_vect = dr_chain[0];
5907 second_vect = dr_chain[1];
5909 /* Create interleaving stmt (low part of):
5910 low = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5911 ...}> */
5912 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5913 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5914 second_vect, perm3_mask_low);
5915 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5917 /* Create interleaving stmt (high part of):
5918 high = VEC_PERM_EXPR <first_vect, second_vect, {0, 1, 2, ...,
5919 nelt + (nelt + k) % 3, nelt + (nelt + k) % 3 + 3, ...}> */
5920 first_vect = data_ref;
5921 second_vect = dr_chain[2];
5922 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5923 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5924 second_vect, perm3_mask_high);
5925 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5926 (*result_chain)[k] = data_ref;
5929 else
5931 /* If the length is not 3, only powers of 2 are supported. */
5932 gcc_assert (pow2p_hwi (length));
5934 /* The encoding has a single stepped pattern. */
5935 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5936 vec_perm_builder sel (nelt, 1, 3);
5937 sel.quick_grow (3);
5938 for (i = 0; i < 3; ++i)
5939 sel[i] = i * 2;
5940 vec_perm_indices indices (sel, 2, nelt);
5941 perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);
5943 for (i = 0; i < 3; ++i)
5944 sel[i] = i * 2 + 1;
5945 indices.new_vector (sel, 2, nelt);
5946 perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
5948 for (i = 0; i < log_length; i++)
5950 for (j = 0; j < length; j += 2)
5952 first_vect = dr_chain[j];
5953 second_vect = dr_chain[j+1];
5955 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5956 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5957 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5958 first_vect, second_vect,
5959 perm_mask_even);
5960 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5961 (*result_chain)[j/2] = data_ref;
5963 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5964 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5965 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5966 first_vect, second_vect,
5967 perm_mask_odd);
5968 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5969 (*result_chain)[j/2+length/2] = data_ref;
5971 memcpy (dr_chain.address (), result_chain->address (),
5972 length * sizeof (tree));
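/* Illustrative sketch (not part of the vectorizer): a scalar model of the
   extract_even/odd permutation performed above, with flat arrays so the
   reordering of RESULT_CHAIN between stages can be checked against the
   example in the comment.  Names are invented; LENGTH is assumed to be a
   power of two, and LENGTH and NELT are assumed to be at most 16.  */

static void
model_extract_even_odd (const unsigned *a, const unsigned *b, unsigned nelt,
			unsigned *even, unsigned *odd)
{
  for (unsigned i = 0; i < nelt / 2; i++)
    {
      even[i] = a[2 * i];
      even[nelt / 2 + i] = b[2 * i];
      odd[i] = a[2 * i + 1];
      odd[nelt / 2 + i] = b[2 * i + 1];
    }
}

static void
model_permute_load_chain (unsigned *chain, unsigned length, unsigned nelt)
{
  unsigned tmp[16 * 16];	/* Scratch space for one stage.  */
  for (unsigned stage = 1; stage < length; stage *= 2)
    {
      for (unsigned j = 0; j < length; j += 2)
	/* Evens land in the first half of the chain and odds in the second
	   half, mirroring (*result_chain)[j/2] and [j/2 + length/2].  */
	model_extract_even_odd (chain + j * nelt, chain + (j + 1) * nelt,
				nelt, tmp + (j / 2) * nelt,
				tmp + (j / 2 + length / 2) * nelt);
      for (unsigned i = 0; i < length * nelt; i++)
	chain[i] = tmp[i];
    }
}

/* Running model_permute_load_chain on four 8-element vectors holding 0..31
   reproduces the two-stage trace shown in the comment above, ending with
   {0 4 8 ...}, {1 5 9 ...}, {2 6 10 ...}, {3 7 11 ...}.  */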
5977 /* Function vect_shift_permute_load_chain.
5979 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
5980 sequence of stmts to reorder the input data accordingly.
5981 Return the final references for loads in RESULT_CHAIN.
5982 Return true if successful, false otherwise.
5984 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5985 The input is 3 vectors each containing 8 elements. We assign a
5986 number to each element, the input sequence is:
5988 1st vec: 0 1 2 3 4 5 6 7
5989 2nd vec: 8 9 10 11 12 13 14 15
5990 3rd vec: 16 17 18 19 20 21 22 23
5992 The output sequence should be:
5994 1st vec: 0 3 6 9 12 15 18 21
5995 2nd vec: 1 4 7 10 13 16 19 22
5996 3rd vec: 2 5 8 11 14 17 20 23
5998 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
6000 First we shuffle all 3 vectors to get correct elements order:
6002 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
6003 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
6004 3rd vec: (16 19 22) (17 20 23) (18 21)
6006 Next we unite and shift vector 3 times:
6008 1st step:
6009 shift right by 6 the concatenation of:
6010 "1st vec" and "2nd vec"
6011 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
6012 "2nd vec" and "3rd vec"
6013 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
6014 "3rd vec" and "1st vec"
6015 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
6016 | New vectors |
6018 So that now new vectors are:
6020 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
6021 2nd vec: (10 13) (16 19 22) (17 20 23)
6022 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
6024 2nd step:
6025 shift right by 5 the concatenation of:
6026 "1st vec" and "3rd vec"
6027 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
6028 "2nd vec" and "1st vec"
6029 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
6030 "3rd vec" and "2nd vec"
6031 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
6032 | New vectors |
6034 So that now new vectors are:
6036 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
6037 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
6038 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
6040 3rd step:
6041 shift right by 5 the concatenation of:
6042 "1st vec" and "1st vec"
6043 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
6044 shift right by 3 the concatenation of:
6045 "2nd vec" and "2nd vec"
6046 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
6047 | New vectors |
6049 So that now all vectors are READY:
6050 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
6051 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
6052 3rd vec: ( 1 4 7) (10 13) (16 19 22)
6054 This algorithm is faster than the one in vect_permute_load_chain if:
6055 1. "shift of a concatenation" is faster than general permutation.
6056 This is usually so.
6057 2. The TARGET machine can't execute vector instructions in parallel.
6058 This is because each step of the algorithm depends on the previous one.
6059 The algorithm in vect_permute_load_chain is much more parallel.
6061 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
6064 static bool
6065 vect_shift_permute_load_chain (vec<tree> dr_chain,
6066 unsigned int length,
6067 stmt_vec_info stmt_info,
6068 gimple_stmt_iterator *gsi,
6069 vec<tree> *result_chain)
6071 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
6072 tree perm2_mask1, perm2_mask2, perm3_mask;
6073 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
6074 gimple *perm_stmt;
6076 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6077 unsigned int i;
6078 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6080 unsigned HOST_WIDE_INT nelt, vf;
6081 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt)
6082 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
6083 /* Not supported for variable-length vectors. */
6084 return false;
6086 vec_perm_builder sel (nelt, nelt, 1);
6087 sel.quick_grow (nelt);
6089 result_chain->quick_grow (length);
6090 memcpy (result_chain->address (), dr_chain.address (),
6091 length * sizeof (tree));
6093 if (pow2p_hwi (length) && vf > 4)
6095 unsigned int j, log_length = exact_log2 (length);
6096 for (i = 0; i < nelt / 2; ++i)
6097 sel[i] = i * 2;
6098 for (i = 0; i < nelt / 2; ++i)
6099 sel[nelt / 2 + i] = i * 2 + 1;
6100 vec_perm_indices indices (sel, 2, nelt);
6101 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6103 if (dump_enabled_p ())
6104 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6105 "shuffle of 2 fields structure is not \
6106 supported by target\n");
6107 return false;
6109 perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);
6111 for (i = 0; i < nelt / 2; ++i)
6112 sel[i] = i * 2 + 1;
6113 for (i = 0; i < nelt / 2; ++i)
6114 sel[nelt / 2 + i] = i * 2;
6115 indices.new_vector (sel, 2, nelt);
6116 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6118 if (dump_enabled_p ())
6119 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6120 "shuffle of 2 fields structure is not \
6121 supported by target\n");
6122 return false;
6124 perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);
6126 /* Generating permutation constant to shift all elements.
6127 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
6128 for (i = 0; i < nelt; i++)
6129 sel[i] = nelt / 2 + i;
6130 indices.new_vector (sel, 2, nelt);
6131 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6133 if (dump_enabled_p ())
6134 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6135 "shift permutation is not supported by target\n");
6136 return false;
6138 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6140 /* Generating permutation constant to select vector from 2.
6141 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
6142 for (i = 0; i < nelt / 2; i++)
6143 sel[i] = i;
6144 for (i = nelt / 2; i < nelt; i++)
6145 sel[i] = nelt + i;
6146 indices.new_vector (sel, 2, nelt);
6147 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6149 if (dump_enabled_p ())
6150 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6151 "select is not supported by target\n");
6152 return false;
6154 select_mask = vect_gen_perm_mask_checked (vectype, indices);
6156 for (i = 0; i < log_length; i++)
6158 for (j = 0; j < length; j += 2)
6160 first_vect = dr_chain[j];
6161 second_vect = dr_chain[j + 1];
6163 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6164 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6165 first_vect, first_vect,
6166 perm2_mask1);
6167 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6168 vect[0] = data_ref;
6170 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6171 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6172 second_vect, second_vect,
6173 perm2_mask2);
6174 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6175 vect[1] = data_ref;
6177 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
6178 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6179 vect[0], vect[1], shift1_mask);
6180 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6181 (*result_chain)[j/2 + length/2] = data_ref;
6183 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
6184 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6185 vect[0], vect[1], select_mask);
6186 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6187 (*result_chain)[j/2] = data_ref;
6189 memcpy (dr_chain.address (), result_chain->address (),
6190 length * sizeof (tree));
6192 return true;
6194 if (length == 3 && vf > 2)
6196 unsigned int k = 0, l = 0;
6198 /* Generating permutation constant to get all elements in right order.
6199 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
6200 for (i = 0; i < nelt; i++)
6202 if (3 * k + (l % 3) >= nelt)
6204 k = 0;
6205 l += (3 - (nelt % 3));
6207 sel[i] = 3 * k + (l % 3);
6208 k++;
6210 vec_perm_indices indices (sel, 2, nelt);
6211 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6213 if (dump_enabled_p ())
6214 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6215 "shuffle of 3 fields structure is not \
6216 supported by target\n");
6217 return false;
6219 perm3_mask = vect_gen_perm_mask_checked (vectype, indices);
6221 /* Generating permutation constant to shift all elements.
6222 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
6223 for (i = 0; i < nelt; i++)
6224 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
6225 indices.new_vector (sel, 2, nelt);
6226 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6228 if (dump_enabled_p ())
6229 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6230 "shift permutation is not supported by target\n");
6231 return false;
6233 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6235 /* Generating permutation constant to shift all elements.
6236 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6237 for (i = 0; i < nelt; i++)
6238 sel[i] = 2 * (nelt / 3) + 1 + i;
6239 indices.new_vector (sel, 2, nelt);
6240 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6242 if (dump_enabled_p ())
6243 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6244 "shift permutation is not supported by target\n");
6245 return false;
6247 shift2_mask = vect_gen_perm_mask_checked (vectype, indices);
6249 /* Generating permutation constant to shift all elements.
6250 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
6251 for (i = 0; i < nelt; i++)
6252 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
6253 indices.new_vector (sel, 2, nelt);
6254 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6256 if (dump_enabled_p ())
6257 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6258 "shift permutation is not supported by target\n");
6259 return false;
6261 shift3_mask = vect_gen_perm_mask_checked (vectype, indices);
6263 /* Generating permutation constant to shift all elements.
6264 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6265 for (i = 0; i < nelt; i++)
6266 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
6267 indices.new_vector (sel, 2, nelt);
6268 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6270 if (dump_enabled_p ())
6271 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6272 "shift permutation is not supported by target\n");
6273 return false;
6275 shift4_mask = vect_gen_perm_mask_checked (vectype, indices);
6277 for (k = 0; k < 3; k++)
6279 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
6280 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6281 dr_chain[k], dr_chain[k],
6282 perm3_mask);
6283 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6284 vect[k] = data_ref;
6287 for (k = 0; k < 3; k++)
6289 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
6290 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6291 vect[k % 3], vect[(k + 1) % 3],
6292 shift1_mask);
6293 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6294 vect_shift[k] = data_ref;
6297 for (k = 0; k < 3; k++)
6299 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
6300 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6301 vect_shift[(4 - k) % 3],
6302 vect_shift[(3 - k) % 3],
6303 shift2_mask);
6304 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6305 vect[k] = data_ref;
6308 (*result_chain)[3 - (nelt % 3)] = vect[2];
6310 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
6311 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
6312 vect[0], shift3_mask);
6313 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6314 (*result_chain)[nelt % 3] = data_ref;
6316 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
6317 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
6318 vect[1], shift4_mask);
6319 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6320 (*result_chain)[0] = data_ref;
6321 return true;
6323 return false;
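/* Illustrative sketch (not part of the vectorizer): one primitive used
   repeatedly by the function above (the shiftN_mask permutes) is a
   two-operand VEC_PERM_EXPR whose selector is a contiguous run
   {S, S+1, ..., S+NELT-1}, i.e. a shift of the concatenation of two
   vectors.  A scalar model, with invented names:  */

static void
model_shift_concat (const unsigned *a, const unsigned *b, unsigned nelt,
		    unsigned shift /* number of leading elements dropped */,
		    unsigned *out)
{
  for (unsigned i = 0; i < nelt; i++)
    out[i] = (shift + i < nelt ? a[shift + i] : b[shift + i - nelt]);
}

/* For example, shift1_mask {6 7 8 9 10 11 12 13} in the LENGTH == 3 path
   corresponds to model_shift_concat (a, b, 8, 6, out).  */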
6326 /* Function vect_transform_grouped_load.
6328 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
6329 to perform their permutation and ascribe the result vectorized statements to
6330 the scalar statements.
6333 void
6334 vect_transform_grouped_load (stmt_vec_info stmt_info, vec<tree> dr_chain,
6335 int size, gimple_stmt_iterator *gsi)
6337 machine_mode mode;
6338 vec<tree> result_chain = vNULL;
6340 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
6341 RESULT_CHAIN is the output of vect_permute_load_chain; it contains permuted
6342 vectors that are ready for vector computation. */
6343 result_chain.create (size);
6345 /* If the reassociation width for the vector type is 2 or greater, the target
6346 machine can execute 2 or more vector instructions in parallel. Otherwise
6347 try to get the chain for the load group using vect_shift_permute_load_chain. */
6348 mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info));
6349 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
6350 || pow2p_hwi (size)
6351 || !vect_shift_permute_load_chain (dr_chain, size, stmt_info,
6352 gsi, &result_chain))
6353 vect_permute_load_chain (dr_chain, size, stmt_info, gsi, &result_chain);
6354 vect_record_grouped_load_vectors (stmt_info, result_chain);
6355 result_chain.release ();
6358 /* RESULT_CHAIN contains the output of a group of grouped loads that were
6359 generated as part of the vectorization of STMT_INFO. Assign the statement
6360 for each vector to the associated scalar statement. */
6362 void
6363 vect_record_grouped_load_vectors (stmt_vec_info stmt_info,
6364 vec<tree> result_chain)
6366 vec_info *vinfo = stmt_info->vinfo;
6367 stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
6368 unsigned int i, gap_count;
6369 tree tmp_data_ref;
6371 /* Put a permuted data-ref in the VECTORIZED_STMT field.
6372 Since we scan the chain starting from its first node, their order
6373 corresponds to the order of data-refs in RESULT_CHAIN. */
6374 stmt_vec_info next_stmt_info = first_stmt_info;
6375 gap_count = 1;
6376 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
6378 if (!next_stmt_info)
6379 break;
6381 /* Skip the gaps. Loads created for the gaps will be removed by dead
6382 code elimination pass later. No need to check for the first stmt in
6383 the group, since it always exists.
6384 DR_GROUP_GAP is the number of steps in elements from the previous
6385 access (if there is no gap DR_GROUP_GAP is 1). We skip loads that
6386 correspond to the gaps. */
6387 if (next_stmt_info != first_stmt_info
6388 && gap_count < DR_GROUP_GAP (next_stmt_info))
6390 gap_count++;
6391 continue;
6394 while (next_stmt_info)
6396 stmt_vec_info new_stmt_info = vinfo->lookup_def (tmp_data_ref);
6397 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
6398 copies, and we put the new vector statement in the first available
6399 RELATED_STMT. */
6400 if (!STMT_VINFO_VEC_STMT (next_stmt_info))
6401 STMT_VINFO_VEC_STMT (next_stmt_info) = new_stmt_info;
6402 else
6404 if (!DR_GROUP_SAME_DR_STMT (next_stmt_info))
6406 stmt_vec_info prev_stmt_info
6407 = STMT_VINFO_VEC_STMT (next_stmt_info);
6408 stmt_vec_info rel_stmt_info
6409 = STMT_VINFO_RELATED_STMT (prev_stmt_info);
6410 while (rel_stmt_info)
6412 prev_stmt_info = rel_stmt_info;
6413 rel_stmt_info = STMT_VINFO_RELATED_STMT (rel_stmt_info);
6416 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info;
6420 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
6421 gap_count = 1;
6422 /* If NEXT_STMT_INFO accesses the same DR as the previous statement,
6423 put the same TMP_DATA_REF as its vectorized statement; otherwise
6424 get the next data-ref from RESULT_CHAIN. */
6425 if (!next_stmt_info || !DR_GROUP_SAME_DR_STMT (next_stmt_info))
6426 break;
6431 /* Function vect_can_force_dr_alignment_p.
6433 Returns whether DECL can be forced to be aligned on an ALIGNMENT-bit
6434 boundary. */
6436 bool
6437 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
6439 if (!VAR_P (decl))
6440 return false;
6442 if (decl_in_symtab_p (decl)
6443 && !symtab_node::get (decl)->can_increase_alignment_p ())
6444 return false;
6446 if (TREE_STATIC (decl))
6447 return (alignment <= MAX_OFILE_ALIGNMENT);
6448 else
6449 return (alignment <= MAX_STACK_ALIGNMENT);
6453 /* Return whether the data reference DR_INFO is supported with respect to its
6454 alignment.
6455 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
6456 if it is aligned, i.e., check if it is possible to vectorize it with different
6457 alignment. */
6459 enum dr_alignment_support
6460 vect_supportable_dr_alignment (dr_vec_info *dr_info,
6461 bool check_aligned_accesses)
6463 data_reference *dr = dr_info->dr;
6464 stmt_vec_info stmt_info = dr_info->stmt;
6465 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6466 machine_mode mode = TYPE_MODE (vectype);
6467 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6468 struct loop *vect_loop = NULL;
6469 bool nested_in_vect_loop = false;
6471 if (aligned_access_p (dr_info) && !check_aligned_accesses)
6472 return dr_aligned;
6474 /* For now assume all conditional loads/stores support unaligned
6475 access without any special code. */
6476 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
6477 if (gimple_call_internal_p (stmt)
6478 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
6479 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
6480 return dr_unaligned_supported;
6482 if (loop_vinfo)
6484 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
6485 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt_info);
6488 /* Possibly unaligned access. */
6490 /* We can choose between using the implicit realignment scheme (generating
6491 a misaligned_move stmt) and the explicit realignment scheme (generating
6492 aligned loads with a REALIGN_LOAD). There are two variants to the
6493 explicit realignment scheme: optimized, and unoptimized.
6494 We can optimize the realignment only if the step between consecutive
6495 vector loads is equal to the vector size. Since the vector memory
6496 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
6497 is guaranteed that the misalignment amount remains the same throughout the
6498 execution of the vectorized loop. Therefore, we can create the
6499 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
6500 at the loop preheader.
6502 However, in the case of outer-loop vectorization, when vectorizing a
6503 memory access in the inner-loop nested within the LOOP that is now being
6504 vectorized, while it is guaranteed that the misalignment of the
6505 vectorized memory access will remain the same in different outer-loop
6506 iterations, it is *not* guaranteed that it will remain the same throughout
6507 the execution of the inner-loop. This is because the inner-loop advances
6508 with the original scalar step (and not in steps of VS). If the inner-loop
6509 step happens to be a multiple of VS, then the misalignment remains fixed
6510 and we can use the optimized realignment scheme. For example:
6512 for (i=0; i<N; i++)
6513 for (j=0; j<M; j++)
6514 s += a[i+j];
6516 When vectorizing the i-loop in the above example, the step between
6517 consecutive vector loads is 1, and so the misalignment does not remain
6518 fixed across the execution of the inner-loop, and the realignment cannot
6519 be optimized (as illustrated in the following pseudo vectorized loop):
6521 for (i=0; i<N; i+=4)
6522 for (j=0; j<M; j++){
6523 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
6524 // when j is {0,1,2,3,4,5,6,7,...} respectively.
6525 // (assuming that we start from an aligned address).
6528 We therefore have to use the unoptimized realignment scheme:
6530 for (i=0; i<N; i+=4)
6531 for (j=k; j<M; j+=4)
6532 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
6533 // that the misalignment of the initial address is
6534 // 0).
6536 The loop can then be vectorized as follows:
6538 for (k=0; k<4; k++){
6539 rt = get_realignment_token (&vp[k]);
6540 for (i=0; i<N; i+=4){
6541 v1 = vp[i+k];
6542 for (j=k; j<M; j+=4){
6543 v2 = vp[i+j+VS-1];
6544 va = REALIGN_LOAD <v1,v2,rt>;
6545 vs += va;
6546 v1 = v2;
6549 } */
6551 if (DR_IS_READ (dr))
6553 bool is_packed = false;
6554 tree type = (TREE_TYPE (DR_REF (dr)));
6556 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
6557 && (!targetm.vectorize.builtin_mask_for_load
6558 || targetm.vectorize.builtin_mask_for_load ()))
6560 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6562 /* If we are doing SLP then the accesses need not have the
6563 same alignment, instead it depends on the SLP group size. */
6564 if (loop_vinfo
6565 && STMT_SLP_TYPE (stmt_info)
6566 && !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6567 * (DR_GROUP_SIZE
6568 (DR_GROUP_FIRST_ELEMENT (stmt_info))),
6569 TYPE_VECTOR_SUBPARTS (vectype)))
6571 else if (!loop_vinfo
6572 || (nested_in_vect_loop
6573 && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
6574 GET_MODE_SIZE (TYPE_MODE (vectype)))))
6575 return dr_explicit_realign;
6576 else
6577 return dr_explicit_realign_optimized;
6579 if (!known_alignment_for_access_p (dr_info))
6580 is_packed = not_size_aligned (DR_REF (dr));
6582 if (targetm.vectorize.support_vector_misalignment
6583 (mode, type, DR_MISALIGNMENT (dr_info), is_packed))
6584 /* Can't software pipeline the loads, but can at least do them. */
6585 return dr_unaligned_supported;
6587 else
6589 bool is_packed = false;
6590 tree type = (TREE_TYPE (DR_REF (dr)));
6592 if (!known_alignment_for_access_p (dr_info))
6593 is_packed = not_size_aligned (DR_REF (dr));
6595 if (targetm.vectorize.support_vector_misalignment
6596 (mode, type, DR_MISALIGNMENT (dr_info), is_packed))
6597 return dr_unaligned_supported;
6600 /* Unsupported. */
6601 return dr_unaligned_unsupported;