[19/46] Make vect_dr_stmt return a stmt_vec_info
[official-gcc.git] / gcc / tree-vect-data-refs.c
blob 72465fd081773f24d1cd08ba3a79cacf5c25989f
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2018 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "memmodel.h"
32 #include "tm_p.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "cgraph.h"
36 #include "dumpfile.h"
37 #include "alias.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "tree-eh.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop.h"
47 #include "cfgloop.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
50 #include "expr.h"
51 #include "builtins.h"
52 #include "params.h"
53 #include "tree-cfg.h"
54 #include "tree-hash-traits.h"
55 #include "vec-perm-indices.h"
56 #include "internal-fn.h"
58 /* Return true if load- or store-lanes optab OPTAB is implemented for
59 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
61 static bool
62 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
63 tree vectype, unsigned HOST_WIDE_INT count)
65 machine_mode mode, array_mode;
66 bool limit_p;
68 mode = TYPE_MODE (vectype);
69 if (!targetm.array_mode (mode, count).exists (&array_mode))
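/* The target has no dedicated array mode for this vector mode and count;
   fall back to a plain integer mode of COUNT * GET_MODE_BITSIZE (mode)
   bits.  If the target does not claim support for array modes of this
   shape, restrict the search to modes no wider than MAX_FIXED_MODE_SIZE.  */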
71 poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
72 limit_p = !targetm.array_mode_supported_p (mode, count);
73 if (!int_mode_for_size (bits, limit_p).exists (&array_mode))
75 if (dump_enabled_p ())
76 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
77 "no array mode for %s["
78 HOST_WIDE_INT_PRINT_DEC "]\n",
79 GET_MODE_NAME (mode), count);
80 return false;
84 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
86 if (dump_enabled_p ())
87 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
88 "cannot use %s<%s><%s>\n", name,
89 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
90 return false;
93 if (dump_enabled_p ())
94 dump_printf_loc (MSG_NOTE, vect_location,
95 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
96 GET_MODE_NAME (mode));
98 return true;
102 /* Return the smallest scalar part of STMT.
103 This is used to determine the vectype of the stmt. We generally set the
104 vectype according to the type of the result (lhs). For stmts whose
105 result-type is different than the type of the arguments (e.g., demotion,
106 promotion), vectype will be reset appropriately (later). Note that we have
107 to visit the smallest datatype in this function, because that determines the
108 VF. If the smallest datatype in the loop is present only as the rhs of a
109 promotion operation - we'd miss it.
110 Such a case, where a variable of this datatype does not appear in the lhs
111 anywhere in the loop, can only occur if it's an invariant: e.g.:
112 'int_x = (int) short_inv', which we'd expect to have been optimized away by
113 invariant motion. However, we cannot rely on invariant motion to always
114 take invariants out of the loop, and so in the case of promotion we also
115 have to check the rhs.
116 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
117 types. */
119 tree
120 vect_get_smallest_scalar_type (gimple *stmt, HOST_WIDE_INT *lhs_size_unit,
121 HOST_WIDE_INT *rhs_size_unit)
123 tree scalar_type = gimple_expr_type (stmt);
124 HOST_WIDE_INT lhs, rhs;
126 /* During the analysis phase, this function is called on arbitrary
127 statements that might not have scalar results. */
128 if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
129 return scalar_type;
131 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
133 if (is_gimple_assign (stmt)
134 && (gimple_assign_cast_p (stmt)
135 || gimple_assign_rhs_code (stmt) == DOT_PROD_EXPR
136 || gimple_assign_rhs_code (stmt) == WIDEN_SUM_EXPR
137 || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
138 || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
139 || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
141 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
143 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
144 if (rhs < lhs)
145 scalar_type = rhs_type;
148 *lhs_size_unit = lhs;
149 *rhs_size_unit = rhs;
150 return scalar_type;
154 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
155 tested at run-time. Return TRUE if DDR was successfully inserted.
156 Return false if versioning is not supported. */
158 static bool
159 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
161 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
163 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
164 return false;
166 if (!runtime_alias_check_p (ddr, loop,
167 optimize_loop_nest_for_speed_p (loop)))
168 return false;
170 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
171 return true;
174 /* Record that loop LOOP_VINFO needs to check that VALUE is nonzero. */
176 static void
177 vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value)
179 vec<tree> checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo);
180 for (unsigned int i = 0; i < checks.length (); ++i)
181 if (checks[i] == value)
182 return;
184 if (dump_enabled_p ())
186 dump_printf_loc (MSG_NOTE, vect_location, "need run-time check that ");
187 dump_generic_expr (MSG_NOTE, TDF_SLIM, value);
188 dump_printf (MSG_NOTE, " is nonzero\n");
190 LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value);
193 /* Return true if we know that the order of vectorized STMT_A and
194 vectorized STMT_B will be the same as the order of STMT_A and STMT_B.
195 At least one of the statements is a write. */
197 static bool
198 vect_preserves_scalar_order_p (gimple *stmt_a, gimple *stmt_b)
200 stmt_vec_info stmtinfo_a = vinfo_for_stmt (stmt_a);
201 stmt_vec_info stmtinfo_b = vinfo_for_stmt (stmt_b);
203 /* Single statements are always kept in their original order. */
204 if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
205 && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
206 return true;
208 /* STMT_A and STMT_B belong to overlapping groups. All loads in a
209 group are emitted at the position of the first scalar load and all
210 stores in a group are emitted at the position of the last scalar store.
211 Thus writes will happen no earlier than their current position
212 (but could happen later) while reads will happen no later than their
213 current position (but could happen earlier). Reordering is therefore
214 only possible if the first access is a write. */
215 if (is_pattern_stmt_p (stmtinfo_a))
216 stmtinfo_a = STMT_VINFO_RELATED_STMT (stmtinfo_a);
217 if (is_pattern_stmt_p (stmtinfo_b))
218 stmtinfo_b = STMT_VINFO_RELATED_STMT (stmtinfo_b);
219 gimple *earlier_stmt = get_earlier_stmt (stmtinfo_a, stmtinfo_b);
220 return !DR_IS_WRITE (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt)));
223 /* A subroutine of vect_analyze_data_ref_dependence. Handle
224 DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
225 distances. These distances are conservatively correct but they don't
226 reflect a guaranteed dependence.
228 Return true if this function does all the work necessary to avoid
229 an alias or false if the caller should use the dependence distances
230 to limit the vectorization factor in the usual way. LOOP_DEPTH is
231 the depth of the loop described by LOOP_VINFO and the other arguments
232 are as for vect_analyze_data_ref_dependence. */
234 static bool
235 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
236 loop_vec_info loop_vinfo,
237 int loop_depth, unsigned int *max_vf)
239 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
240 lambda_vector dist_v;
241 unsigned int i;
242 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
244 int dist = dist_v[loop_depth];
245 if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
247 /* If the user asserted safelen >= DIST consecutive iterations
248 can be executed concurrently, assume independence.
250 ??? An alternative would be to add the alias check even
251 in this case, and vectorize the fallback loop with the
252 maximum VF set to safelen. However, if the user has
253 explicitly given a length, it's less likely that that
254 would be a win. */
255 if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
257 if ((unsigned int) loop->safelen < *max_vf)
258 *max_vf = loop->safelen;
259 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
260 continue;
263 /* For dependence distances of 2 or more, we have the option
264 of limiting VF or checking for an alias at runtime.
265 Prefer to check at runtime if we can, to avoid limiting
266 the VF unnecessarily when the bases are in fact independent.
268 Note that the alias checks will be removed if the VF ends up
269 being small enough. */
270 return (!STMT_VINFO_GATHER_SCATTER_P
271 (vinfo_for_stmt (DR_STMT (DDR_A (ddr))))
272 && !STMT_VINFO_GATHER_SCATTER_P
273 (vinfo_for_stmt (DR_STMT (DDR_B (ddr))))
274 && vect_mark_for_runtime_alias_test (ddr, loop_vinfo));
277 return true;
281 /* Function vect_analyze_data_ref_dependence.
283 Return TRUE if there (might) exist a dependence between a memory-reference
284 DRA and a memory-reference DRB. Return FALSE if the dependence can instead
285 be checked at run time by versioning for alias. Adjust *MAX_VF according to
286 the data dependence. */
288 static bool
289 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
290 loop_vec_info loop_vinfo,
291 unsigned int *max_vf)
293 unsigned int i;
294 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
295 struct data_reference *dra = DDR_A (ddr);
296 struct data_reference *drb = DDR_B (ddr);
297 stmt_vec_info stmtinfo_a = vect_dr_stmt (dra);
298 stmt_vec_info stmtinfo_b = vect_dr_stmt (drb);
299 lambda_vector dist_v;
300 unsigned int loop_depth;
302 /* In loop analysis all data references should be vectorizable. */
303 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
304 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
305 gcc_unreachable ();
307 /* Independent data accesses. */
308 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
309 return false;
311 if (dra == drb
312 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
313 return false;
315 /* We do not have to consider dependences between accesses that belong
316 to the same group, unless the stride could be smaller than the
317 group size. */
318 if (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
319 && (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
320 == DR_GROUP_FIRST_ELEMENT (stmtinfo_b))
321 && !STMT_VINFO_STRIDED_P (stmtinfo_a))
322 return false;
324 /* Even if we have an anti-dependence then, as the vectorized loop covers at
325 least two scalar iterations, there is always also a true dependence.
326 As the vectorizer does not re-order loads and stores we can ignore
327 the anti-dependence if TBAA can disambiguate both DRs similar to the
328 case with known negative distance anti-dependences (positive
329 distance anti-dependences would violate TBAA constraints). */
330 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
331 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
332 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
333 get_alias_set (DR_REF (drb))))
334 return false;
336 /* Unknown data dependence. */
337 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
339 /* If user asserted safelen consecutive iterations can be
340 executed concurrently, assume independence. */
341 if (loop->safelen >= 2)
343 if ((unsigned int) loop->safelen < *max_vf)
344 *max_vf = loop->safelen;
345 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
346 return false;
349 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
350 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
352 if (dump_enabled_p ())
354 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
355 "versioning for alias not supported for: "
356 "can't determine dependence between ");
357 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
358 DR_REF (dra));
359 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
360 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
361 DR_REF (drb));
362 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
364 return true;
367 if (dump_enabled_p ())
369 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
370 "versioning for alias required: "
371 "can't determine dependence between ");
372 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
373 DR_REF (dra));
374 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
375 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
376 DR_REF (drb));
377 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
380 /* Add to list of ddrs that need to be tested at run-time. */
381 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
384 /* Known data dependence. */
385 if (DDR_NUM_DIST_VECTS (ddr) == 0)
387 /* If user asserted safelen consecutive iterations can be
388 executed concurrently, assume independence. */
389 if (loop->safelen >= 2)
391 if ((unsigned int) loop->safelen < *max_vf)
392 *max_vf = loop->safelen;
393 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
394 return false;
397 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
398 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
400 if (dump_enabled_p ())
402 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
403 "versioning for alias not supported for: "
404 "bad dist vector for ");
405 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
406 DR_REF (dra));
407 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
408 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
409 DR_REF (drb));
410 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
412 return true;
415 if (dump_enabled_p ())
417 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
418 "versioning for alias required: "
419 "bad dist vector for ");
420 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
421 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
422 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
423 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
425 /* Add to list of ddrs that need to be tested at run-time. */
426 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
429 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
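/* First see whether the DDR_COULD_BE_INDEPENDENT_P handling can prove
   independence or arrange a runtime alias check; if it does, the distance
   vectors below do not need to limit *MAX_VF.  */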
431 if (DDR_COULD_BE_INDEPENDENT_P (ddr)
432 && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
433 loop_depth, max_vf))
434 return false;
436 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
438 int dist = dist_v[loop_depth];
440 if (dump_enabled_p ())
441 dump_printf_loc (MSG_NOTE, vect_location,
442 "dependence distance = %d.\n", dist);
444 if (dist == 0)
446 if (dump_enabled_p ())
448 dump_printf_loc (MSG_NOTE, vect_location,
449 "dependence distance == 0 between ");
450 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
451 dump_printf (MSG_NOTE, " and ");
452 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
453 dump_printf (MSG_NOTE, "\n");
456 /* When we perform grouped accesses and perform implicit CSE
457 by detecting equal accesses and doing disambiguation with
458 runtime alias tests like for
459 .. = a[i];
460 .. = a[i+1];
461 a[i] = ..;
462 a[i+1] = ..;
463 *p = ..;
464 .. = a[i];
465 .. = a[i+1];
466 where we will end up loading { a[i], a[i+1] } once, make
467 sure that inserting group loads before the first load and
468 stores after the last store will do the right thing.
469 Similar for groups like
470 a[i] = ...;
471 ... = a[i];
472 a[i+1] = ...;
473 where loads from the group interleave with the store. */
474 if (!vect_preserves_scalar_order_p (vect_dr_stmt (dra),
475 vect_dr_stmt (drb)))
477 if (dump_enabled_p ())
478 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
479 "READ_WRITE dependence in interleaving.\n");
480 return true;
483 if (loop->safelen < 2)
485 tree indicator = dr_zero_step_indicator (dra);
486 if (!indicator || integer_zerop (indicator))
488 if (dump_enabled_p ())
489 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
490 "access also has a zero step\n");
491 return true;
493 else if (TREE_CODE (indicator) != INTEGER_CST)
494 vect_check_nonzero_value (loop_vinfo, indicator);
496 continue;
499 if (dist > 0 && DDR_REVERSED_P (ddr))
501 /* If DDR_REVERSED_P the order of the data-refs in DDR was
502 reversed (to make distance vector positive), and the actual
503 distance is negative. */
504 if (dump_enabled_p ())
505 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
506 "dependence distance negative.\n");
507 /* Record a negative dependence distance to later limit the
508 amount of stmt copying / unrolling we can perform.
509 Only need to handle read-after-write dependence. */
510 if (DR_IS_READ (drb)
511 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
512 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
513 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
514 continue;
517 unsigned int abs_dist = abs (dist);
518 if (abs_dist >= 2 && abs_dist < *max_vf)
520 /* The dependence distance requires reduction of the maximal
521 vectorization factor. */
522 *max_vf = abs (dist);
523 if (dump_enabled_p ())
524 dump_printf_loc (MSG_NOTE, vect_location,
525 "adjusting maximal vectorization factor to %i\n",
526 *max_vf);
529 if (abs_dist >= *max_vf)
531 /* Dependence distance does not create dependence, as far as
532 vectorization is concerned, in this case. */
533 if (dump_enabled_p ())
534 dump_printf_loc (MSG_NOTE, vect_location,
535 "dependence distance >= VF.\n");
536 continue;
539 if (dump_enabled_p ())
541 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
542 "not vectorized, possible dependence "
543 "between data-refs ");
544 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
545 dump_printf (MSG_NOTE, " and ");
546 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
547 dump_printf (MSG_NOTE, "\n");
550 return true;
553 return false;
556 /* Function vect_analyze_data_ref_dependences.
558 Examine all the data references in the loop, and make sure there do not
559 exist any data dependences between them. Set *MAX_VF according to
560 the maximum vectorization factor the data dependences allow. */
562 bool
563 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
564 unsigned int *max_vf)
566 unsigned int i;
567 struct data_dependence_relation *ddr;
569 DUMP_VECT_SCOPE ("vect_analyze_data_ref_dependences");
571 if (!LOOP_VINFO_DDRS (loop_vinfo).exists ())
573 LOOP_VINFO_DDRS (loop_vinfo)
574 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
575 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
576 /* We need read-read dependences to compute
577 STMT_VINFO_SAME_ALIGN_REFS. */
578 bool res = compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
579 &LOOP_VINFO_DDRS (loop_vinfo),
580 LOOP_VINFO_LOOP_NEST (loop_vinfo),
581 true);
582 gcc_assert (res);
585 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
587 /* For epilogues we either have no aliases or alias versioning
588 was applied to the original loop. Therefore we may just take max_vf
589 from the VF of the original loop. */
590 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
591 *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
592 else
593 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
594 if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
595 return false;
597 return true;
601 /* Function vect_slp_analyze_data_ref_dependence.
603 Return TRUE if there (might) exist a dependence between a memory-reference
604 DRA and a memory-reference DRB. Unlike the loop variant, this function does
605 not version for aliasing at run time and has no *MAX_VF to adjust; an
606 unknown dependence is simply treated as a dependence. */
608 static bool
609 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
611 struct data_reference *dra = DDR_A (ddr);
612 struct data_reference *drb = DDR_B (ddr);
614 /* We need to check dependences of statements marked as unvectorizable
615 as well; they can still prohibit vectorization. */
617 /* Independent data accesses. */
618 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
619 return false;
621 if (dra == drb)
622 return false;
624 /* Read-read is OK. */
625 if (DR_IS_READ (dra) && DR_IS_READ (drb))
626 return false;
628 /* If dra and drb are part of the same interleaving chain consider
629 them independent. */
630 if (STMT_VINFO_GROUPED_ACCESS (vect_dr_stmt (dra))
631 && (DR_GROUP_FIRST_ELEMENT (vect_dr_stmt (dra))
632 == DR_GROUP_FIRST_ELEMENT (vect_dr_stmt (drb))))
633 return false;
635 /* Unknown data dependence. */
636 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
638 if (dump_enabled_p ())
640 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
641 "can't determine dependence between ");
642 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
643 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
644 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
645 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
648 else if (dump_enabled_p ())
650 dump_printf_loc (MSG_NOTE, vect_location,
651 "determined dependence between ");
652 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
653 dump_printf (MSG_NOTE, " and ");
654 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
655 dump_printf (MSG_NOTE, "\n");
658 return true;
662 /* Analyze dependences involved in the transform of SLP NODE. STORES
663 contains the scalar stores of this instance if we are
664 disambiguating the loads. */
666 static bool
667 vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
668 vec<stmt_vec_info> stores,
669 gimple *last_store)
671 /* This walks over all stmts involved in the SLP load/store done
672 in NODE verifying we can sink them up to the last stmt in the
673 group. */
674 gimple *last_access = vect_find_last_scalar_stmt_in_slp (node);
675 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
677 stmt_vec_info access_info = SLP_TREE_SCALAR_STMTS (node)[k];
678 if (access_info == last_access)
679 continue;
680 data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
681 ao_ref ref;
682 bool ref_initialized_p = false;
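/* Walk the statements between this group access and the last scalar access
   of NODE; every intervening statement that touches memory is a potential
   dependence that could prevent sinking.  */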
683 for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
684 gsi_stmt (gsi) != last_access; gsi_next (&gsi))
686 gimple *stmt = gsi_stmt (gsi);
687 if (! gimple_vuse (stmt)
688 || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
689 continue;
691 /* If we couldn't record a (single) data reference for this
692 stmt we have to resort to the alias oracle. */
693 data_reference *dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
694 if (!dr_b)
696 /* We are moving a store or sinking a load - this means
697 we cannot use TBAA for disambiguation. */
698 if (!ref_initialized_p)
699 ao_ref_init (&ref, DR_REF (dr_a));
700 if (stmt_may_clobber_ref_p_1 (stmt, &ref, false)
701 || ref_maybe_used_by_stmt_p (stmt, &ref, false))
702 return false;
703 continue;
706 bool dependent = false;
707 /* If we run into a store of this same instance (we've just
708 marked those) then delay dependence checking until we run
709 into the last store because this is where it will have
710 been sunk to (and we verify if we can do that as well). */
711 if (gimple_visited_p (stmt))
713 if (stmt != last_store)
714 continue;
715 unsigned i;
716 stmt_vec_info store_info;
717 FOR_EACH_VEC_ELT (stores, i, store_info)
719 data_reference *store_dr = STMT_VINFO_DATA_REF (store_info);
720 ddr_p ddr = initialize_data_dependence_relation
721 (dr_a, store_dr, vNULL);
722 dependent = vect_slp_analyze_data_ref_dependence (ddr);
723 free_dependence_relation (ddr);
724 if (dependent)
725 break;
728 else
730 ddr_p ddr = initialize_data_dependence_relation (dr_a,
731 dr_b, vNULL);
732 dependent = vect_slp_analyze_data_ref_dependence (ddr);
733 free_dependence_relation (ddr);
735 if (dependent)
736 return false;
739 return true;
743 /* Function vect_slp_analyze_instance_dependence.
745 Examine all the data references in the SLP instance, and make sure there
746 do not exist any data dependences that would prevent sinking its grouped
747 stores and loads to the vectorized stmt insert location. */
749 bool
750 vect_slp_analyze_instance_dependence (slp_instance instance)
752 DUMP_VECT_SCOPE ("vect_slp_analyze_instance_dependence");
754 /* The stores of this instance are at the root of the SLP tree. */
755 slp_tree store = SLP_INSTANCE_TREE (instance);
756 if (! STMT_VINFO_DATA_REF (SLP_TREE_SCALAR_STMTS (store)[0]))
757 store = NULL;
759 /* Verify we can sink stores to the vectorized stmt insert location. */
760 gimple *last_store = NULL;
761 if (store)
763 if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
764 return false;
766 /* Mark stores in this instance and remember the last one. */
767 last_store = vect_find_last_scalar_stmt_in_slp (store);
768 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
769 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, true);
772 bool res = true;
774 /* Verify we can sink loads to the vectorized stmt insert location,
775 special-casing stores of this instance. */
776 slp_tree load;
777 unsigned int i;
778 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
779 if (! vect_slp_analyze_node_dependences (instance, load,
780 store
781 ? SLP_TREE_SCALAR_STMTS (store)
782 : vNULL, last_store))
784 res = false;
785 break;
788 /* Unset the visited flag. */
789 if (store)
790 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
791 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, false);
793 return res;
796 /* Record in VINFO the base alignment guarantee given by DRB. STMT is
797 the statement that contains DRB, which is useful for recording in the
798 dump file. */
800 static void
801 vect_record_base_alignment (vec_info *vinfo, gimple *stmt,
802 innermost_loop_behavior *drb)
804 bool existed;
805 innermost_loop_behavior *&entry
806 = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
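/* Only record DRB if it is the first guarantee seen for this base address
   or if it guarantees a larger base alignment than the one already
   recorded.  */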
807 if (!existed || entry->base_alignment < drb->base_alignment)
809 entry = drb;
810 if (dump_enabled_p ())
812 dump_printf_loc (MSG_NOTE, vect_location,
813 "recording new base alignment for ");
814 dump_generic_expr (MSG_NOTE, TDF_SLIM, drb->base_address);
815 dump_printf (MSG_NOTE, "\n");
816 dump_printf_loc (MSG_NOTE, vect_location,
817 " alignment: %d\n", drb->base_alignment);
818 dump_printf_loc (MSG_NOTE, vect_location,
819 " misalignment: %d\n", drb->base_misalignment);
820 dump_printf_loc (MSG_NOTE, vect_location,
821 " based on: ");
822 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
827 /* If the region we're going to vectorize is reached, all unconditional
828 data references occur at least once. We can therefore pool the base
829 alignment guarantees from each unconditional reference. Do this by
830 going through all the data references in VINFO and checking whether
831 the containing statement makes the reference unconditionally. If so,
832 record the alignment of the base address in VINFO so that it can be
833 used for all other references with the same base. */
835 void
836 vect_record_base_alignments (vec_info *vinfo)
838 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
839 struct loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
840 data_reference *dr;
841 unsigned int i;
842 FOR_EACH_VEC_ELT (vinfo->shared->datarefs, i, dr)
844 stmt_vec_info stmt_info = vect_dr_stmt (dr);
845 if (!DR_IS_CONDITIONAL_IN_STMT (dr)
846 && STMT_VINFO_VECTORIZABLE (stmt_info)
847 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
849 vect_record_base_alignment (vinfo, stmt_info, &DR_INNERMOST (dr));
851 /* If DR is nested in the loop that is being vectorized, we can also
852 record the alignment of the base wrt the outer loop. */
853 if (loop && nested_in_vect_loop_p (loop, stmt_info))
854 vect_record_base_alignment
855 (vinfo, stmt_info, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
860 /* Return the target alignment for the vectorized form of DR. */
862 static unsigned int
863 vect_calculate_target_alignment (struct data_reference *dr)
865 stmt_vec_info stmt_info = vect_dr_stmt (dr);
866 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
867 return targetm.vectorize.preferred_vector_alignment (vectype);
870 /* Function vect_compute_data_ref_alignment
872 Compute the misalignment of the data reference DR.
874 Output:
875 1. DR_MISALIGNMENT (DR) is defined.
877 FOR NOW: No analysis is actually performed. Misalignment is calculated
878 only for trivial cases. TODO. */
880 static void
881 vect_compute_data_ref_alignment (struct data_reference *dr)
883 stmt_vec_info stmt_info = vect_dr_stmt (dr);
884 vec_base_alignments *base_alignments = &stmt_info->vinfo->base_alignments;
885 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
886 struct loop *loop = NULL;
887 tree ref = DR_REF (dr);
888 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
890 if (dump_enabled_p ())
891 dump_printf_loc (MSG_NOTE, vect_location,
892 "vect_compute_data_ref_alignment:\n");
894 if (loop_vinfo)
895 loop = LOOP_VINFO_LOOP (loop_vinfo);
897 /* Initialize misalignment to unknown. */
898 SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
900 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
901 return;
903 innermost_loop_behavior *drb = vect_dr_behavior (dr);
904 bool step_preserves_misalignment_p;
906 unsigned HOST_WIDE_INT vector_alignment
907 = vect_calculate_target_alignment (dr) / BITS_PER_UNIT;
908 DR_TARGET_ALIGNMENT (dr) = vector_alignment;
910 /* No step for BB vectorization. */
911 if (!loop)
913 gcc_assert (integer_zerop (drb->step));
914 step_preserves_misalignment_p = true;
917 /* In case the dataref is in an inner-loop of the loop that is being
918 vectorized (LOOP), we use the base and misalignment information
919 relative to the outer-loop (LOOP). This is ok only if the misalignment
920 stays the same throughout the execution of the inner-loop, which is why
921 we have to check that the stride of the dataref in the inner-loop is
922 evenly divisible by the vector alignment. */
923 else if (nested_in_vect_loop_p (loop, stmt_info))
925 step_preserves_misalignment_p
926 = (DR_STEP_ALIGNMENT (dr) % vector_alignment) == 0;
928 if (dump_enabled_p ())
930 if (step_preserves_misalignment_p)
931 dump_printf_loc (MSG_NOTE, vect_location,
932 "inner step divides the vector alignment.\n");
933 else
934 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
935 "inner step doesn't divide the vector"
936 " alignment.\n");
940 /* Similarly we can only use base and misalignment information relative to
941 an innermost loop if the misalignment stays the same throughout the
942 execution of the loop. As above, this is the case if the stride of
943 the dataref is evenly divisible by the alignment. */
944 else
946 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
947 step_preserves_misalignment_p
948 = multiple_p (DR_STEP_ALIGNMENT (dr) * vf, vector_alignment);
950 if (!step_preserves_misalignment_p && dump_enabled_p ())
951 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
952 "step doesn't divide the vector alignment.\n");
955 unsigned int base_alignment = drb->base_alignment;
956 unsigned int base_misalignment = drb->base_misalignment;
958 /* Calculate the maximum of the pooled base address alignment and the
959 alignment that we can compute for DR itself. */
960 innermost_loop_behavior **entry = base_alignments->get (drb->base_address);
961 if (entry && base_alignment < (*entry)->base_alignment)
963 base_alignment = (*entry)->base_alignment;
964 base_misalignment = (*entry)->base_misalignment;
967 if (drb->offset_alignment < vector_alignment
968 || !step_preserves_misalignment_p
969 /* We need to know whether the step wrt the vectorized loop is
970 negative when computing the starting misalignment below. */
971 || TREE_CODE (drb->step) != INTEGER_CST)
973 if (dump_enabled_p ())
975 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
976 "Unknown alignment for access: ");
977 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
978 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
980 return;
983 if (base_alignment < vector_alignment)
985 unsigned int max_alignment;
986 tree base = get_base_for_alignment (drb->base_address, &max_alignment);
987 if (max_alignment < vector_alignment
988 || !vect_can_force_dr_alignment_p (base,
989 vector_alignment * BITS_PER_UNIT))
991 if (dump_enabled_p ())
993 dump_printf_loc (MSG_NOTE, vect_location,
994 "can't force alignment of ref: ");
995 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
996 dump_printf (MSG_NOTE, "\n");
998 return;
1001 /* Force the alignment of the decl.
1002 NOTE: This is the only change to the code we make during
1003 the analysis phase, before deciding to vectorize the loop. */
1004 if (dump_enabled_p ())
1006 dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
1007 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
1008 dump_printf (MSG_NOTE, "\n");
1011 DR_VECT_AUX (dr)->base_decl = base;
1012 DR_VECT_AUX (dr)->base_misaligned = true;
1013 base_misalignment = 0;
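/* The misalignment of the first access is the base misalignment plus the
   constant INIT offset; DRB->offset can be ignored here because its
   alignment was already checked against the vector alignment above.  */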
1015 poly_int64 misalignment
1016 = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();
1018 /* If this is a backward running DR then the first access in the larger
1019 vectype actually is N-1 elements before the address in the DR.
1020 Adjust misalign accordingly. */
1021 if (tree_int_cst_sgn (drb->step) < 0)
1022 /* PLUS because STEP is negative. */
1023 misalignment += ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1024 * TREE_INT_CST_LOW (drb->step));
1026 unsigned int const_misalignment;
1027 if (!known_misalignment (misalignment, vector_alignment,
1028 &const_misalignment))
1030 if (dump_enabled_p ())
1032 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1033 "Non-constant misalignment for access: ");
1034 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
1035 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1037 return;
1040 SET_DR_MISALIGNMENT (dr, const_misalignment);
1042 if (dump_enabled_p ())
1044 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1045 "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
1046 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
1047 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1050 return;
1053 /* Function vect_update_misalignment_for_peel.
1054 Sets DR's misalignment
1055 - to 0 if it has the same alignment as DR_PEEL,
1056 - to the misalignment computed using NPEEL if DR's alignment is known,
1057 - to -1 (unknown) otherwise.
1059 DR - the data reference whose misalignment is to be adjusted.
1060 DR_PEEL - the data reference whose misalignment is being made
1061 zero in the vector loop by the peel.
1062 NPEEL - the number of iterations in the peel loop if the misalignment
1063 of DR_PEEL is known at compile time. */
1065 static void
1066 vect_update_misalignment_for_peel (struct data_reference *dr,
1067 struct data_reference *dr_peel, int npeel)
1069 unsigned int i;
1070 vec<dr_p> same_aligned_drs;
1071 struct data_reference *current_dr;
1072 int dr_size = vect_get_scalar_dr_size (dr);
1073 int dr_peel_size = vect_get_scalar_dr_size (dr_peel);
1074 stmt_vec_info stmt_info = vect_dr_stmt (dr);
1075 stmt_vec_info peel_stmt_info = vect_dr_stmt (dr_peel);
1077 /* For interleaved data accesses the step in the loop must be multiplied by
1078 the size of the interleaving group. */
1079 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1080 dr_size *= DR_GROUP_SIZE (vinfo_for_stmt (DR_GROUP_FIRST_ELEMENT (stmt_info)));
1081 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
1082 dr_peel_size *= DR_GROUP_SIZE (peel_stmt_info);
1084 /* It can be assumed that the data refs with the same alignment as dr_peel
1085 are aligned in the vector loop. */
1086 same_aligned_drs = STMT_VINFO_SAME_ALIGN_REFS (vect_dr_stmt (dr_peel));
1087 FOR_EACH_VEC_ELT (same_aligned_drs, i, current_dr)
1089 if (current_dr != dr)
1090 continue;
1091 gcc_assert (!known_alignment_for_access_p (dr)
1092 || !known_alignment_for_access_p (dr_peel)
1093 || (DR_MISALIGNMENT (dr) / dr_size
1094 == DR_MISALIGNMENT (dr_peel) / dr_peel_size));
1095 SET_DR_MISALIGNMENT (dr, 0);
1096 return;
1099 if (known_alignment_for_access_p (dr)
1100 && known_alignment_for_access_p (dr_peel))
1102 bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
1103 int misal = DR_MISALIGNMENT (dr);
1104 misal += negative ? -npeel * dr_size : npeel * dr_size;
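/* DR_TARGET_ALIGNMENT is a power of two, so masking with it minus one
   reduces the adjusted misalignment modulo the target alignment.  */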
1105 misal &= DR_TARGET_ALIGNMENT (dr) - 1;
1106 SET_DR_MISALIGNMENT (dr, misal);
1107 return;
1110 if (dump_enabled_p ())
1111 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1112 "to unknown (-1).\n");
1113 SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
1117 /* Function verify_data_ref_alignment
1119 Return TRUE if DR can be handled with respect to alignment. */
1121 static bool
1122 verify_data_ref_alignment (data_reference_p dr)
1124 enum dr_alignment_support supportable_dr_alignment
1125 = vect_supportable_dr_alignment (dr, false);
1126 if (!supportable_dr_alignment)
1128 if (dump_enabled_p ())
1130 if (DR_IS_READ (dr))
1131 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1132 "not vectorized: unsupported unaligned load.");
1133 else
1134 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1135 "not vectorized: unsupported unaligned "
1136 "store.");
1138 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
1139 DR_REF (dr));
1140 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1142 return false;
1145 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
1146 dump_printf_loc (MSG_NOTE, vect_location,
1147 "Vectorizing an unaligned access.\n");
1149 return true;
1152 /* Function vect_verify_datarefs_alignment
1154 Return TRUE if all data references in the loop can be
1155 handled with respect to alignment. */
1157 bool
1158 vect_verify_datarefs_alignment (loop_vec_info vinfo)
1160 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
1161 struct data_reference *dr;
1162 unsigned int i;
1164 FOR_EACH_VEC_ELT (datarefs, i, dr)
1166 stmt_vec_info stmt_info = vect_dr_stmt (dr);
1168 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1169 continue;
1171 /* For interleaving, only the alignment of the first access matters. */
1172 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1173 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1174 continue;
1176 /* Strided accesses perform only component accesses, alignment is
1177 irrelevant for them. */
1178 if (STMT_VINFO_STRIDED_P (stmt_info)
1179 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1180 continue;
1182 if (! verify_data_ref_alignment (dr))
1183 return false;
1186 return true;
1189 /* Given a memory reference EXP return whether its alignment is less
1190 than its size. */
1192 static bool
1193 not_size_aligned (tree exp)
1195 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1196 return true;
1198 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1199 > get_object_alignment (exp));
1202 /* Function vector_alignment_reachable_p
1204 Return true if vector alignment for DR is reachable by peeling
1205 a few loop iterations. Return false otherwise. */
1207 static bool
1208 vector_alignment_reachable_p (struct data_reference *dr)
1210 stmt_vec_info stmt_info = vect_dr_stmt (dr);
1211 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1213 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1215 /* For interleaved access we peel only if the number of iterations in
1216 the prolog loop ({VF - misalignment}) is a multiple of the
1217 number of the interleaved accesses. */
1218 int elem_size, mis_in_elements;
1220 /* FORNOW: handle only known alignment. */
1221 if (!known_alignment_for_access_p (dr))
1222 return false;
1224 poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1225 poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
1226 elem_size = vector_element_size (vector_size, nelements);
1227 mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
1229 if (!multiple_p (nelements - mis_in_elements, DR_GROUP_SIZE (stmt_info)))
1230 return false;
1233 /* If misalignment is known at compile time then allow peeling
1234 only if natural alignment is reachable through peeling. */
1235 if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
1237 HOST_WIDE_INT elmsize =
1238 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1239 if (dump_enabled_p ())
1241 dump_printf_loc (MSG_NOTE, vect_location,
1242 "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1243 dump_printf (MSG_NOTE,
1244 ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
1246 if (DR_MISALIGNMENT (dr) % elmsize)
1248 if (dump_enabled_p ())
1249 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1250 "data size does not divide the misalignment.\n");
1251 return false;
1255 if (!known_alignment_for_access_p (dr))
1257 tree type = TREE_TYPE (DR_REF (dr));
1258 bool is_packed = not_size_aligned (DR_REF (dr));
1259 if (dump_enabled_p ())
1260 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1261 "Unknown misalignment, %snaturally aligned\n",
1262 is_packed ? "not " : "");
1263 return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1266 return true;
1270 /* Calculate the cost of the memory access represented by DR. */
1272 static void
1273 vect_get_data_access_cost (struct data_reference *dr,
1274 unsigned int *inside_cost,
1275 unsigned int *outside_cost,
1276 stmt_vector_for_cost *body_cost_vec,
1277 stmt_vector_for_cost *prologue_cost_vec)
1279 stmt_vec_info stmt_info = vect_dr_stmt (dr);
1280 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1281 int ncopies;
1283 if (PURE_SLP_STMT (stmt_info))
1284 ncopies = 1;
1285 else
1286 ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1288 if (DR_IS_READ (dr))
1289 vect_get_load_cost (stmt_info, ncopies, true, inside_cost, outside_cost,
1290 prologue_cost_vec, body_cost_vec, false);
1291 else
1292 vect_get_store_cost (stmt_info, ncopies, inside_cost, body_cost_vec);
1294 if (dump_enabled_p ())
1295 dump_printf_loc (MSG_NOTE, vect_location,
1296 "vect_get_data_access_cost: inside_cost = %d, "
1297 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1301 typedef struct _vect_peel_info
1303 struct data_reference *dr;
1304 int npeel;
1305 unsigned int count;
1306 } *vect_peel_info;
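/* Peeling options are keyed purely on NPEEL: all data references that
   would become aligned by peeling the same number of iterations share one
   entry, with COUNT recording how many they are and DR remembering one
   representative reference.  */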
1308 typedef struct _vect_peel_extended_info
1310 struct _vect_peel_info peel_info;
1311 unsigned int inside_cost;
1312 unsigned int outside_cost;
1313 } *vect_peel_extended_info;
1316 /* Peeling hashtable helpers. */
1318 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1320 static inline hashval_t hash (const _vect_peel_info *);
1321 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1324 inline hashval_t
1325 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1327 return (hashval_t) peel_info->npeel;
1330 inline bool
1331 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1333 return (a->npeel == b->npeel);
1337 /* Insert DR into peeling hash table with NPEEL as key. */
1339 static void
1340 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1341 loop_vec_info loop_vinfo, struct data_reference *dr,
1342 int npeel)
1344 struct _vect_peel_info elem, *slot;
1345 _vect_peel_info **new_slot;
1346 bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1348 elem.npeel = npeel;
1349 slot = peeling_htab->find (&elem);
1350 if (slot)
1351 slot->count++;
1352 else
1354 slot = XNEW (struct _vect_peel_info);
1355 slot->npeel = npeel;
1356 slot->dr = dr;
1357 slot->count = 1;
1358 new_slot = peeling_htab->find_slot (slot, INSERT);
1359 *new_slot = slot;
1362 if (!supportable_dr_alignment
1363 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1364 slot->count += VECT_MAX_COST;
1368 /* Traverse the peeling hash table to find the peeling option that aligns
1369 the maximum number of data accesses. */
1372 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1373 _vect_peel_extended_info *max)
1375 vect_peel_info elem = *slot;
1377 if (elem->count > max->peel_info.count
1378 || (elem->count == max->peel_info.count
1379 && max->peel_info.npeel > elem->npeel))
1381 max->peel_info.npeel = elem->npeel;
1382 max->peel_info.count = elem->count;
1383 max->peel_info.dr = elem->dr;
1386 return 1;
1389 /* Get the costs of peeling NPEEL iterations checking data access costs
1390 for all data refs. If UNKNOWN_MISALIGNMENT is true, we assume DR0's
1391 misalignment will be zero after peeling. */
1393 static void
1394 vect_get_peeling_costs_all_drs (vec<data_reference_p> datarefs,
1395 struct data_reference *dr0,
1396 unsigned int *inside_cost,
1397 unsigned int *outside_cost,
1398 stmt_vector_for_cost *body_cost_vec,
1399 stmt_vector_for_cost *prologue_cost_vec,
1400 unsigned int npeel,
1401 bool unknown_misalignment)
1403 unsigned i;
1404 data_reference *dr;
1406 FOR_EACH_VEC_ELT (datarefs, i, dr)
1408 stmt_vec_info stmt_info = vect_dr_stmt (dr);
1409 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1410 continue;
1412 /* For interleaving, only the alignment of the first access
1413 matters. */
1414 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1415 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1416 continue;
1418 /* Strided accesses perform only component accesses, alignment is
1419 irrelevant for them. */
1420 if (STMT_VINFO_STRIDED_P (stmt_info)
1421 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1422 continue;
1424 int save_misalignment;
1425 save_misalignment = DR_MISALIGNMENT (dr);
1426 if (npeel == 0)
1428 else if (unknown_misalignment && dr == dr0)
1429 SET_DR_MISALIGNMENT (dr, 0);
1430 else
1431 vect_update_misalignment_for_peel (dr, dr0, npeel);
1432 vect_get_data_access_cost (dr, inside_cost, outside_cost,
1433 body_cost_vec, prologue_cost_vec);
1434 SET_DR_MISALIGNMENT (dr, save_misalignment);
1438 /* Traverse peeling hash table and calculate cost for each peeling option.
1439 Find the one with the lowest cost. */
1442 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1443 _vect_peel_extended_info *min)
1445 vect_peel_info elem = *slot;
1446 int dummy;
1447 unsigned int inside_cost = 0, outside_cost = 0;
1448 stmt_vec_info stmt_info = vect_dr_stmt (elem->dr);
1449 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1450 stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1451 epilogue_cost_vec;
1453 prologue_cost_vec.create (2);
1454 body_cost_vec.create (2);
1455 epilogue_cost_vec.create (2);
1457 vect_get_peeling_costs_all_drs (LOOP_VINFO_DATAREFS (loop_vinfo),
1458 elem->dr, &inside_cost, &outside_cost,
1459 &body_cost_vec, &prologue_cost_vec,
1460 elem->npeel, false);
1462 body_cost_vec.release ();
1464 outside_cost += vect_get_known_peeling_cost
1465 (loop_vinfo, elem->npeel, &dummy,
1466 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1467 &prologue_cost_vec, &epilogue_cost_vec);
1469 /* Prologue and epilogue costs are added to the target model later.
1470 These costs depend only on the scalar iteration cost, the
1471 number of peeling iterations finally chosen, and the number of
1472 misaligned statements. So discard the information found here. */
1473 prologue_cost_vec.release ();
1474 epilogue_cost_vec.release ();
1476 if (inside_cost < min->inside_cost
1477 || (inside_cost == min->inside_cost
1478 && outside_cost < min->outside_cost))
1480 min->inside_cost = inside_cost;
1481 min->outside_cost = outside_cost;
1482 min->peel_info.dr = elem->dr;
1483 min->peel_info.npeel = elem->npeel;
1484 min->peel_info.count = elem->count;
1487 return 1;
1491 /* Choose best peeling option by traversing peeling hash table and either
1492 choosing an option with the lowest cost (if cost model is enabled) or the
1493 option that aligns as many accesses as possible. */
1495 static struct _vect_peel_extended_info
1496 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1497 loop_vec_info loop_vinfo)
1499 struct _vect_peel_extended_info res;
1501 res.peel_info.dr = NULL;
1503 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1505 res.inside_cost = INT_MAX;
1506 res.outside_cost = INT_MAX;
1507 peeling_htab->traverse <_vect_peel_extended_info *,
1508 vect_peeling_hash_get_lowest_cost> (&res);
1510 else
1512 res.peel_info.count = 0;
1513 peeling_htab->traverse <_vect_peel_extended_info *,
1514 vect_peeling_hash_get_most_frequent> (&res);
1515 res.inside_cost = 0;
1516 res.outside_cost = 0;
1519 return res;
1522 /* Return true if the new peeling NPEEL is supported. */
1524 static bool
1525 vect_peeling_supportable (loop_vec_info loop_vinfo, struct data_reference *dr0,
1526 unsigned npeel)
1528 unsigned i;
1529 struct data_reference *dr = NULL;
1530 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1531 enum dr_alignment_support supportable_dr_alignment;
1533 /* Ensure that all data refs can be vectorized after the peel. */
1534 FOR_EACH_VEC_ELT (datarefs, i, dr)
1536 int save_misalignment;
1538 if (dr == dr0)
1539 continue;
1541 stmt_vec_info stmt_info = vect_dr_stmt (dr);
1542 /* For interleaving, only the alignment of the first access
1543 matters. */
1544 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1545 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1546 continue;
1548 /* Strided accesses perform only component accesses, alignment is
1549 irrelevant for them. */
1550 if (STMT_VINFO_STRIDED_P (stmt_info)
1551 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1552 continue;
1554 save_misalignment = DR_MISALIGNMENT (dr);
1555 vect_update_misalignment_for_peel (dr, dr0, npeel);
1556 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1557 SET_DR_MISALIGNMENT (dr, save_misalignment);
1559 if (!supportable_dr_alignment)
1560 return false;
1563 return true;
1566 /* Function vect_enhance_data_refs_alignment
1568 This pass will use loop versioning and loop peeling in order to enhance
1569 the alignment of data references in the loop.
1571 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1572 original loop is to be vectorized. Any other loops that are created by
1573 the transformations performed in this pass - are not supposed to be
1574 vectorized. This restriction will be relaxed.
1576 This pass will require a cost model to guide it whether to apply peeling
1577 or versioning or a combination of the two. For example, the scheme that
1578 Intel uses when given a loop with several memory accesses is as follows:
1579 choose one memory access ('p') whose alignment you want to force by doing
1580 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1581 other accesses are not necessarily aligned, or (2) use loop versioning to
1582 generate one loop in which all accesses are aligned, and another loop in
1583 which only 'p' is necessarily aligned.
1585 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1586 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1587 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1589 Devising a cost model is the most critical aspect of this work. It will
1590 guide us on which access to peel for, whether to use loop versioning, how
1591 many versions to create, etc. The cost model will probably consist of
1592 generic considerations as well as target specific considerations (on
1593 powerpc for example, misaligned stores are more painful than misaligned
1594 loads).
1596 Here are the general steps involved in alignment enhancements:
1598 -- original loop, before alignment analysis:
1599 for (i=0; i<N; i++){
1600 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1601 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1604 -- After vect_compute_data_refs_alignment:
1605 for (i=0; i<N; i++){
1606 x = q[i]; # DR_MISALIGNMENT(q) = 3
1607 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1610 -- Possibility 1: we do loop versioning:
1611 if (p is aligned) {
1612 for (i=0; i<N; i++){ # loop 1A
1613 x = q[i]; # DR_MISALIGNMENT(q) = 3
1614 p[i] = y; # DR_MISALIGNMENT(p) = 0
1617 else {
1618 for (i=0; i<N; i++){ # loop 1B
1619 x = q[i]; # DR_MISALIGNMENT(q) = 3
1620 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1624 -- Possibility 2: we do loop peeling:
1625 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1626 x = q[i];
1627 p[i] = y;
1629 for (i = 3; i < N; i++){ # loop 2A
1630 x = q[i]; # DR_MISALIGNMENT(q) = 0
1631 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1634 -- Possibility 3: combination of loop peeling and versioning:
1635 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1636 x = q[i];
1637 p[i] = y;
1639 if (p is aligned) {
1640 for (i = 3; i<N; i++){ # loop 3A
1641 x = q[i]; # DR_MISALIGNMENT(q) = 0
1642 p[i] = y; # DR_MISALIGNMENT(p) = 0
1645 else {
1646 for (i = 3; i<N; i++){ # loop 3B
1647 x = q[i]; # DR_MISALIGNMENT(q) = 0
1648 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1652 These loops are later passed to loop_transform to be vectorized. The
1653 vectorizer will use the alignment information to guide the transformation
1654 (whether to generate regular loads/stores, or with special handling for
1655 misalignment). */
1657 bool
1658 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1660 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1661 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1662 enum dr_alignment_support supportable_dr_alignment;
1663 struct data_reference *dr0 = NULL, *first_store = NULL;
1664 struct data_reference *dr;
1665 unsigned int i, j;
1666 bool do_peeling = false;
1667 bool do_versioning = false;
1668 bool stat;
1669 unsigned int npeel = 0;
1670 bool one_misalignment_known = false;
1671 bool one_misalignment_unknown = false;
1672 bool one_dr_unsupportable = false;
1673 struct data_reference *unsupportable_dr = NULL;
1674 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1675 unsigned possible_npeel_number = 1;
1676 tree vectype;
1677 unsigned int mis, same_align_drs_max = 0;
1678 hash_table<peel_info_hasher> peeling_htab (1);
1680 DUMP_VECT_SCOPE ("vect_enhance_data_refs_alignment");
1682 /* Reset data so we can safely be called multiple times. */
1683 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1684 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1686 /* While cost model enhancements are expected in the future, the high level
1687 view of the code at this time is as follows:
1689 A) If there is a misaligned access then see if peeling to align
1690 this access can make all data references satisfy
1691 vect_supportable_dr_alignment. If so, update data structures
1692 as needed and return true.
1694 B) If peeling wasn't possible and there is a data reference with an
1695 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1696 then see if loop versioning checks can be used to make all data
1697 references satisfy vect_supportable_dr_alignment. If so, update
1698 data structures as needed and return true.
1700 C) If neither peeling nor versioning were successful then return false if
1701 any data reference does not satisfy vect_supportable_dr_alignment.
1703 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1705 Note, Possibility 3 above (which is peeling and versioning together) is not
1706 being done at this time. */
1708 /* (1) Peeling to force alignment. */
1710 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1711 Considerations:
1712 + How many accesses will become aligned due to the peeling
1713 - How many accesses will become unaligned due to the peeling,
1714 and the cost of misaligned accesses.
1715 - The cost of peeling (the extra runtime checks, the increase
1716 in code size). */
1718 FOR_EACH_VEC_ELT (datarefs, i, dr)
1720 stmt_vec_info stmt_info = vect_dr_stmt (dr);
1722 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1723 continue;
1725 /* For interleaving, only the alignment of the first access
1726 matters. */
1727 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1728 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1729 continue;
1731 /* For scatter-gather or invariant accesses there is nothing
1732 to enhance. */
1733 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1734 || integer_zerop (DR_STEP (dr)))
1735 continue;
1737 /* Strided accesses perform only component accesses, alignment is
1738 irrelevant for them. */
1739 if (STMT_VINFO_STRIDED_P (stmt_info)
1740 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1741 continue;
1743 supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1744 do_peeling = vector_alignment_reachable_p (dr);
1745 if (do_peeling)
1747 if (known_alignment_for_access_p (dr))
1749 unsigned int npeel_tmp = 0;
1750 bool negative = tree_int_cst_compare (DR_STEP (dr),
1751 size_zero_node) < 0;
1753 vectype = STMT_VINFO_VECTYPE (stmt_info);
1754 unsigned int target_align = DR_TARGET_ALIGNMENT (dr);
1755 unsigned int dr_size = vect_get_scalar_dr_size (dr);
1756 mis = (negative ? DR_MISALIGNMENT (dr) : -DR_MISALIGNMENT (dr));
1757 if (DR_MISALIGNMENT (dr) != 0)
1758 npeel_tmp = (mis & (target_align - 1)) / dr_size;
1760 /* For multiple types, it is possible that the bigger type access
1761 will have more than one peeling option. E.g., a loop with two
1762 types: one of size (vector size / 4), and the other one of
1763 size (vector size / 8). The vectorization factor will be 8. If both
1764 accesses are misaligned by 3, the first one needs one scalar
1765 iteration to be aligned, and the second one needs 5. But the
1766 first one will be aligned also by peeling 5 scalar
1767 iterations, and in that case both accesses will be aligned.
1768 Hence, except for the immediate peeling amount, we also want
1769 to try to add full vector size, while we don't exceed
1770 vectorization factor.
1771 We do this automatically for cost model, since we calculate
1772 cost for every peeling option. */
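	      /* Continuing the example above (a sketch assuming the larger
		 type has four elements per vector, so target_align / dr_size
		 == 4): the candidate peel amounts recorded for the access
		 that needs one scalar iteration are 1 and 1 + 4 == 5, one
		 entry per vector per vectorized iteration, so the amount 5
		 that also aligns the smaller-type access is considered.  */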
1773 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1775 poly_uint64 nscalars = (STMT_SLP_TYPE (stmt_info)
1776 ? vf * DR_GROUP_SIZE (stmt_info) : vf);
1777 possible_npeel_number
1778 = vect_get_num_vectors (nscalars, vectype);
1780 /* NPEEL_TMP is 0 when there is no misalignment, but also
1781 allow peeling NELEMENTS. */
1782 if (DR_MISALIGNMENT (dr) == 0)
1783 possible_npeel_number++;
1786 /* Save info about DR in the hash table. Also include peeling
1787 amounts according to the explanation above. */
1788 for (j = 0; j < possible_npeel_number; j++)
1790 vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1791 dr, npeel_tmp);
1792 npeel_tmp += target_align / dr_size;
1795 one_misalignment_known = true;
1797 else
1799 /* If we don't know any misalignment values, we prefer
1800 peeling for the data-ref that has the maximum number of data-refs
1801 with the same alignment, unless the target prefers to align
1802 stores over loads. */
1803 unsigned same_align_drs
1804 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1805 if (!dr0
1806 || same_align_drs_max < same_align_drs)
1808 same_align_drs_max = same_align_drs;
1809 dr0 = dr;
1811 /* For data-refs with the same number of related
1812 accesses prefer the one where the misalign
1813 computation will be invariant in the outermost loop. */
1814 else if (same_align_drs_max == same_align_drs)
1816 struct loop *ivloop0, *ivloop;
1817 ivloop0 = outermost_invariant_loop_for_expr
1818 (loop, DR_BASE_ADDRESS (dr0));
1819 ivloop = outermost_invariant_loop_for_expr
1820 (loop, DR_BASE_ADDRESS (dr));
1821 if ((ivloop && !ivloop0)
1822 || (ivloop && ivloop0
1823 && flow_loop_nested_p (ivloop, ivloop0)))
1824 dr0 = dr;
1827 one_misalignment_unknown = true;
1829 /* Check for data refs with unsupportable alignment that
1830 can be peeled. */
1831 if (!supportable_dr_alignment)
1833 one_dr_unsupportable = true;
1834 unsupportable_dr = dr;
1837 if (!first_store && DR_IS_WRITE (dr))
1838 first_store = dr;
1841 else
1843 if (!aligned_access_p (dr))
1845 if (dump_enabled_p ())
1846 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1847 "vector alignment may not be reachable\n");
1848 break;
1853 /* Check if we can possibly peel the loop. */
1854 if (!vect_can_advance_ivs_p (loop_vinfo)
1855 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1856 || loop->inner)
1857 do_peeling = false;
1859 struct _vect_peel_extended_info peel_for_known_alignment;
1860 struct _vect_peel_extended_info peel_for_unknown_alignment;
1861 struct _vect_peel_extended_info best_peel;
1863 peel_for_unknown_alignment.inside_cost = INT_MAX;
1864 peel_for_unknown_alignment.outside_cost = INT_MAX;
1865 peel_for_unknown_alignment.peel_info.count = 0;
1867 if (do_peeling
1868 && one_misalignment_unknown)
1870 /* Check if the target requires to prefer stores over loads, i.e., if
1871 misaligned stores are more expensive than misaligned loads (taking
1872 drs with same alignment into account). */
1873 unsigned int load_inside_cost = 0;
1874 unsigned int load_outside_cost = 0;
1875 unsigned int store_inside_cost = 0;
1876 unsigned int store_outside_cost = 0;
1877 unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;
1879 stmt_vector_for_cost dummy;
1880 dummy.create (2);
1881 vect_get_peeling_costs_all_drs (datarefs, dr0,
1882 &load_inside_cost,
1883 &load_outside_cost,
1884 &dummy, &dummy, estimated_npeels, true);
1885 dummy.release ();
1887 if (first_store)
1889 dummy.create (2);
1890 vect_get_peeling_costs_all_drs (datarefs, first_store,
1891 &store_inside_cost,
1892 &store_outside_cost,
1893 &dummy, &dummy,
1894 estimated_npeels, true);
1895 dummy.release ();
1897 else
1899 store_inside_cost = INT_MAX;
1900 store_outside_cost = INT_MAX;
1903 if (load_inside_cost > store_inside_cost
1904 || (load_inside_cost == store_inside_cost
1905 && load_outside_cost > store_outside_cost))
1907 dr0 = first_store;
1908 peel_for_unknown_alignment.inside_cost = store_inside_cost;
1909 peel_for_unknown_alignment.outside_cost = store_outside_cost;
1911 else
1913 peel_for_unknown_alignment.inside_cost = load_inside_cost;
1914 peel_for_unknown_alignment.outside_cost = load_outside_cost;
1917 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1918 prologue_cost_vec.create (2);
1919 epilogue_cost_vec.create (2);
1921 int dummy2;
1922 peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
1923 (loop_vinfo, estimated_npeels, &dummy2,
1924 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1925 &prologue_cost_vec, &epilogue_cost_vec);
1927 prologue_cost_vec.release ();
1928 epilogue_cost_vec.release ();
1930 peel_for_unknown_alignment.peel_info.count = 1
1931 + STMT_VINFO_SAME_ALIGN_REFS (vect_dr_stmt (dr0)).length ();
1934 peel_for_unknown_alignment.peel_info.npeel = 0;
1935 peel_for_unknown_alignment.peel_info.dr = dr0;
1937 best_peel = peel_for_unknown_alignment;
1939 peel_for_known_alignment.inside_cost = INT_MAX;
1940 peel_for_known_alignment.outside_cost = INT_MAX;
1941 peel_for_known_alignment.peel_info.count = 0;
1942 peel_for_known_alignment.peel_info.dr = NULL;
1944 if (do_peeling && one_misalignment_known)
1946 /* Peeling is possible, and every data access can be handled even
1947 when misaligned. So we try to choose the best possible peeling from
1948 the hash table. */
1949 peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
1950 (&peeling_htab, loop_vinfo);
1953 /* Compare costs of peeling for known and unknown alignment. */
1954 if (peel_for_known_alignment.peel_info.dr != NULL
1955 && peel_for_unknown_alignment.inside_cost
1956 >= peel_for_known_alignment.inside_cost)
1958 best_peel = peel_for_known_alignment;
1960 /* If the best peeling for known alignment has NPEEL == 0, perform no
1961 peeling at all except if there is an unsupportable dr that we can
1962 align. */
1963 if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
1964 do_peeling = false;
1967 /* If there is an unsupportable data ref, prefer this over all choices so far
1968 since we'd have to discard a chosen peeling except when it accidentally
1969 aligned the unsupportable data ref. */
1970 if (one_dr_unsupportable)
1971 dr0 = unsupportable_dr;
1972 else if (do_peeling)
1974 /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
1975 TODO: Use nopeel_outside_cost or get rid of it? */
1976 unsigned nopeel_inside_cost = 0;
1977 unsigned nopeel_outside_cost = 0;
1979 stmt_vector_for_cost dummy;
1980 dummy.create (2);
1981 vect_get_peeling_costs_all_drs (datarefs, NULL, &nopeel_inside_cost,
1982 &nopeel_outside_cost, &dummy, &dummy,
1983 0, false);
1984 dummy.release ();
1986 /* Add epilogue costs. As we do not peel for alignment here, no prologue
1987 costs will be recorded. */
1988 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1989 prologue_cost_vec.create (2);
1990 epilogue_cost_vec.create (2);
1992 int dummy2;
1993 nopeel_outside_cost += vect_get_known_peeling_cost
1994 (loop_vinfo, 0, &dummy2,
1995 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1996 &prologue_cost_vec, &epilogue_cost_vec);
1998 prologue_cost_vec.release ();
1999 epilogue_cost_vec.release ();
2001 npeel = best_peel.peel_info.npeel;
2002 dr0 = best_peel.peel_info.dr;
2004 /* If not peeling is no more expensive than the best peeling we
2005 have found so far, don't perform any peeling. */
2006 if (nopeel_inside_cost <= best_peel.inside_cost)
2007 do_peeling = false;
2010 if (do_peeling)
2012 stmt_vec_info stmt_info = vect_dr_stmt (dr0);
2013 vectype = STMT_VINFO_VECTYPE (stmt_info);
2015 if (known_alignment_for_access_p (dr0))
2017 bool negative = tree_int_cst_compare (DR_STEP (dr0),
2018 size_zero_node) < 0;
2019 if (!npeel)
2021 /* Since it's known at compile time, compute the number of
2022 iterations in the peeled loop (the peeling factor) for use in
2023 updating DR_MISALIGNMENT values. The peeling factor is the
2024 vectorization factor minus the misalignment as an element
2025 count. */
2026 mis = negative ? DR_MISALIGNMENT (dr0) : -DR_MISALIGNMENT (dr0);
2027 unsigned int target_align = DR_TARGET_ALIGNMENT (dr0);
2028 npeel = ((mis & (target_align - 1))
2029 / vect_get_scalar_dr_size (dr0));
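	      /* A worked instance of the formula above, assuming a positive
		 step, 4-byte elements and DR_TARGET_ALIGNMENT == 16: with
		 DR_MISALIGNMENT (dr0) == 4 we get mis == -4, (-4 & 15) == 12
		 and npeel == 12 / 4 == 3, i.e. three peeled scalar iterations
		 move the access from offset 4 to offset 16, now aligned.  */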
2032 /* For interleaved data access every iteration accesses all the
2033 members of the group, therefore we divide the number of iterations
2034 by the group size. */
2035 stmt_info = vect_dr_stmt (dr0);
2036 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2037 npeel /= DR_GROUP_SIZE (stmt_info);
2039 if (dump_enabled_p ())
2040 dump_printf_loc (MSG_NOTE, vect_location,
2041 "Try peeling by %d\n", npeel);
2044 /* Ensure that all datarefs can be vectorized after the peel. */
2045 if (!vect_peeling_supportable (loop_vinfo, dr0, npeel))
2046 do_peeling = false;
2048 /* Check if all datarefs are supportable and log. */
2049 if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
2051 stat = vect_verify_datarefs_alignment (loop_vinfo);
2052 if (!stat)
2053 do_peeling = false;
2054 else
2055 return stat;
2058 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
2059 if (do_peeling)
2061 unsigned max_allowed_peel
2062 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
2063 if (max_allowed_peel != (unsigned)-1)
2065 unsigned max_peel = npeel;
2066 if (max_peel == 0)
2068 unsigned int target_align = DR_TARGET_ALIGNMENT (dr0);
2069 max_peel = target_align / vect_get_scalar_dr_size (dr0) - 1;
2071 if (max_peel > max_allowed_peel)
2073 do_peeling = false;
2074 if (dump_enabled_p ())
2075 dump_printf_loc (MSG_NOTE, vect_location,
2076 "Disable peeling, max peels reached: %d\n", max_peel);
2081 /* Cost model #2 - if peeling may result in a remaining loop not
2082 iterating enough to be vectorized then do not peel. Since this
2083 is a cost heuristic rather than a correctness decision, use the
2084 most likely runtime value for variable vectorization factors. */
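  /* For instance, with an estimated VF of 4 and npeel == 0 (a runtime
     peel amount of at most VF - 1 == 3), a loop known to run fewer than
     4 + 3 == 7 iterations is not peeled, since the vector loop might not
     execute at all afterwards (an illustration only).  */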
2085 if (do_peeling
2086 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2088 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2089 unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
2090 if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
2091 < assumed_vf + max_peel)
2092 do_peeling = false;
2095 if (do_peeling)
2097 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2098 If the misalignment of DR_i is identical to that of dr0 then set
2099 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
2100 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2101 by the peeling factor times the element size of DR_i (MOD the
2102 vectorization factor times the size). Otherwise, the
2103 misalignment of DR_i must be set to unknown. */
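      /* E.g., assuming 4-byte elements, a positive step and a
	 vectorization factor of 4 (so the modulus is 16): with npeel == 3,
	 a DR_i whose known misalignment was 8 ends up with misalignment
	 (8 + 3 * 4) % 16 == 4 in the peeled loop.  */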
2104 FOR_EACH_VEC_ELT (datarefs, i, dr)
2105 if (dr != dr0)
2107 /* Strided accesses perform only component accesses, alignment
2108 is irrelevant for them. */
2109 stmt_info = vect_dr_stmt (dr);
2110 if (STMT_VINFO_STRIDED_P (stmt_info)
2111 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2112 continue;
2114 vect_update_misalignment_for_peel (dr, dr0, npeel);
2117 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
2118 if (npeel)
2119 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2120 else
2121 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2122 = DR_MISALIGNMENT (dr0);
2123 SET_DR_MISALIGNMENT (dr0, 0);
2124 if (dump_enabled_p ())
2126 dump_printf_loc (MSG_NOTE, vect_location,
2127 "Alignment of access forced using peeling.\n");
2128 dump_printf_loc (MSG_NOTE, vect_location,
2129 "Peeling for alignment will be applied.\n");
2132 /* The inside-loop cost will be accounted for in vectorizable_load
2133 and vectorizable_store correctly with adjusted alignments.
2134 Drop the body_cst_vec on the floor here. */
2135 stat = vect_verify_datarefs_alignment (loop_vinfo);
2136 gcc_assert (stat);
2137 return stat;
2141 /* (2) Versioning to force alignment. */
2143 /* Try versioning if:
2144 1) the loop is optimized for speed
2145 2) there is at least one unsupported misaligned data ref with an unknown
2146 misalignment, and
2147 3) all misaligned data refs with a known misalignment are supported, and
2148 4) the number of runtime alignment checks is within reason. */
2150 do_versioning =
2151 optimize_loop_nest_for_speed_p (loop)
2152 && (!loop->inner); /* FORNOW */
2154 if (do_versioning)
2156 FOR_EACH_VEC_ELT (datarefs, i, dr)
2158 stmt_vec_info stmt_info = vect_dr_stmt (dr);
2160 /* For interleaving, only the alignment of the first access
2161 matters. */
2162 if (aligned_access_p (dr)
2163 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2164 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info))
2165 continue;
2167 if (STMT_VINFO_STRIDED_P (stmt_info))
2169 /* Strided loads perform only component accesses, alignment is
2170 irrelevant for them. */
2171 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
2172 continue;
2173 do_versioning = false;
2174 break;
2177 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
2179 if (!supportable_dr_alignment)
2181 int mask;
2182 tree vectype;
2184 if (known_alignment_for_access_p (dr)
2185 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2186 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
2188 do_versioning = false;
2189 break;
2192 stmt_info = vect_dr_stmt (dr);
2193 vectype = STMT_VINFO_VECTYPE (stmt_info);
2194 gcc_assert (vectype);
2196 /* At present we don't support versioning for alignment
2197 with variable VF, since there's no guarantee that the
2198 VF is a power of two. We could relax this if we added
2199 a way of enforcing a power-of-two size. */
2200 unsigned HOST_WIDE_INT size;
2201 if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
2203 do_versioning = false;
2204 break;
2207 /* The rightmost bits of an aligned address must be zeros.
2208 Construct the mask needed for this test. For example,
2209 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2210 mask must be 15 = 0xf. */
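	      /* Conceptually the versioned loop is guarded by a runtime
		 test of the form ((uintptr_t) addr & mask) == 0 for the
		 address of every data reference whose statement is pushed
		 onto LOOP_VINFO_MAY_MISALIGN_STMTS below; the guard itself
		 is emitted later when the loop is versioned (this sketches
		 the intent, not the literal code that is built).  */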
2211 mask = size - 1;
2213 /* FORNOW: use the same mask to test all potentially unaligned
2214 references in the loop. The vectorizer currently supports
2215 a single vector size, see the reference to
2216 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
2217 vectorization factor is computed. */
2218 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
2219 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
2220 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2221 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (stmt_info);
2225 /* Versioning requires at least one misaligned data reference. */
2226 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2227 do_versioning = false;
2228 else if (!do_versioning)
2229 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2232 if (do_versioning)
2234 vec<gimple *> may_misalign_stmts
2235 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2236 gimple *stmt;
2238 /* It can now be assumed that the data references in the statements
2239 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2240 of the loop being vectorized. */
2241 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
2243 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2244 dr = STMT_VINFO_DATA_REF (stmt_info);
2245 SET_DR_MISALIGNMENT (dr, 0);
2246 if (dump_enabled_p ())
2247 dump_printf_loc (MSG_NOTE, vect_location,
2248 "Alignment of access forced using versioning.\n");
2251 if (dump_enabled_p ())
2252 dump_printf_loc (MSG_NOTE, vect_location,
2253 "Versioning for alignment will be applied.\n");
2255 /* Peeling and versioning can't be done together at this time. */
2256 gcc_assert (! (do_peeling && do_versioning));
2258 stat = vect_verify_datarefs_alignment (loop_vinfo);
2259 gcc_assert (stat);
2260 return stat;
2263 /* This point is reached if neither peeling nor versioning is being done. */
2264 gcc_assert (! (do_peeling || do_versioning));
2266 stat = vect_verify_datarefs_alignment (loop_vinfo);
2267 return stat;
2271 /* Function vect_find_same_alignment_drs.
2273 Update group and alignment relations according to the chosen
2274 vectorization factor. */
2276 static void
2277 vect_find_same_alignment_drs (struct data_dependence_relation *ddr)
2279 struct data_reference *dra = DDR_A (ddr);
2280 struct data_reference *drb = DDR_B (ddr);
2281 stmt_vec_info stmtinfo_a = vect_dr_stmt (dra);
2282 stmt_vec_info stmtinfo_b = vect_dr_stmt (drb);
2284 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
2285 return;
2287 if (dra == drb)
2288 return;
2290 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
2291 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
2292 return;
2294 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)
2295 || !operand_equal_p (DR_OFFSET (dra), DR_OFFSET (drb), 0)
2296 || !operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2297 return;
2299 /* Two references with distance zero have the same alignment. */
2300 poly_offset_int diff = (wi::to_poly_offset (DR_INIT (dra))
2301 - wi::to_poly_offset (DR_INIT (drb)));
2302 if (maybe_ne (diff, 0))
2304 /* Get the wider of the two alignments. */
2305 unsigned int align_a = (vect_calculate_target_alignment (dra)
2306 / BITS_PER_UNIT);
2307 unsigned int align_b = (vect_calculate_target_alignment (drb)
2308 / BITS_PER_UNIT);
2309 unsigned int max_align = MAX (align_a, align_b);
2311 /* Require the gap to be a multiple of the larger vector alignment. */
2312 if (!multiple_p (diff, max_align))
2313 return;
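  /* For example, assuming 4-byte elements and a 16-byte vector alignment
     for both references: p[i] and p[i + 4] are 16 bytes apart, a multiple
     of the larger alignment, so they are recorded as having the same
     alignment below; p[i] and p[i + 1] (4 bytes apart) are not.  */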
2316 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2317 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
2318 if (dump_enabled_p ())
2320 dump_printf_loc (MSG_NOTE, vect_location,
2321 "accesses have the same alignment: ");
2322 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2323 dump_printf (MSG_NOTE, " and ");
2324 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2325 dump_printf (MSG_NOTE, "\n");
2330 /* Function vect_analyze_data_refs_alignment
2332 Analyze the alignment of the data-references in the loop.
2333 Return FALSE if a data reference is found that cannot be vectorized. */
2335 bool
2336 vect_analyze_data_refs_alignment (loop_vec_info vinfo)
2338 DUMP_VECT_SCOPE ("vect_analyze_data_refs_alignment");
2340 /* Mark groups of data references with same alignment using
2341 data dependence information. */
2342 vec<ddr_p> ddrs = vinfo->shared->ddrs;
2343 struct data_dependence_relation *ddr;
2344 unsigned int i;
2346 FOR_EACH_VEC_ELT (ddrs, i, ddr)
2347 vect_find_same_alignment_drs (ddr);
2349 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
2350 struct data_reference *dr;
2352 vect_record_base_alignments (vinfo);
2353 FOR_EACH_VEC_ELT (datarefs, i, dr)
2355 stmt_vec_info stmt_info = vect_dr_stmt (dr);
2356 if (STMT_VINFO_VECTORIZABLE (stmt_info))
2357 vect_compute_data_ref_alignment (dr);
2360 return true;
2364 /* Analyze alignment of DRs of stmts in NODE. */
2366 static bool
2367 vect_slp_analyze_and_verify_node_alignment (slp_tree node)
2369 /* We vectorize from the first scalar stmt in the node unless
2370 the node is permuted in which case we start from the first
2371 element in the group. */
2372 stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2373 gimple *first_stmt = first_stmt_info->stmt;
2374 data_reference_p first_dr = STMT_VINFO_DATA_REF (first_stmt_info);
2375 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2376 first_stmt = DR_GROUP_FIRST_ELEMENT (first_stmt_info);
2378 data_reference_p dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2379 vect_compute_data_ref_alignment (dr);
2380 /* For creating the data-ref pointer we need alignment of the
2381 first element anyway. */
2382 if (dr != first_dr)
2383 vect_compute_data_ref_alignment (first_dr);
2384 if (! verify_data_ref_alignment (dr))
2386 if (dump_enabled_p ())
2387 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2388 "not vectorized: bad data alignment in basic "
2389 "block.\n");
2390 return false;
2393 return true;
2396 /* Function vect_slp_analyze_and_verify_instance_alignment
2398 Analyze the alignment of the data-references in the SLP instance.
2399 Return FALSE if a data reference is found that cannot be vectorized. */
2401 bool
2402 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
2404 DUMP_VECT_SCOPE ("vect_slp_analyze_and_verify_instance_alignment");
2406 slp_tree node;
2407 unsigned i;
2408 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2409 if (! vect_slp_analyze_and_verify_node_alignment (node))
2410 return false;
2412 node = SLP_INSTANCE_TREE (instance);
2413 if (STMT_VINFO_DATA_REF (SLP_TREE_SCALAR_STMTS (node)[0])
2414 && ! vect_slp_analyze_and_verify_node_alignment
2415 (SLP_INSTANCE_TREE (instance)))
2416 return false;
2418 return true;
2422 /* Analyze groups of accesses: check that DR belongs to a group of
2423 accesses of legal size, step, etc. Detect gaps, single element
2424 interleaving, and other special cases. Set grouped access info.
2425 Collect groups of strided stores for further use in SLP analysis.
2426 Worker for vect_analyze_group_access. */
2428 static bool
2429 vect_analyze_group_access_1 (struct data_reference *dr)
2431 tree step = DR_STEP (dr);
2432 tree scalar_type = TREE_TYPE (DR_REF (dr));
2433 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2434 stmt_vec_info stmt_info = vect_dr_stmt (dr);
2435 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2436 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2437 HOST_WIDE_INT dr_step = -1;
2438 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2439 bool slp_impossible = false;
2441 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2442 size of the interleaving group (including gaps). */
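  /* A small example of the bookkeeping below, assuming 4-byte elements:
     for loads a[4*i] and a[4*i + 1] the step is 16 bytes, so GROUPSIZE is
     4 while only two members are present; the second member gets
     DR_GROUP_GAP == 1 (its distance in elements from the previous member)
     and the group leader records the trailing gap of two elements.  */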
2443 if (tree_fits_shwi_p (step))
2445 dr_step = tree_to_shwi (step);
2446 /* Check that STEP is a multiple of type size. Otherwise there is
2447 a non-element-sized gap at the end of the group which we
2448 cannot represent in DR_GROUP_GAP or DR_GROUP_SIZE.
2449 ??? As we can handle non-constant step fine here we should
2450 simply remove uses of DR_GROUP_GAP between the last and first
2451 element and instead rely on DR_STEP. DR_GROUP_SIZE then would
2452 simply not include that gap. */
2453 if ((dr_step % type_size) != 0)
2455 if (dump_enabled_p ())
2457 dump_printf_loc (MSG_NOTE, vect_location,
2458 "Step ");
2459 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2460 dump_printf (MSG_NOTE,
2461 " is not a multiple of the element size for ");
2462 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2463 dump_printf (MSG_NOTE, "\n");
2465 return false;
2467 groupsize = absu_hwi (dr_step) / type_size;
2469 else
2470 groupsize = 0;
2472 /* A non-consecutive access is possible only if it is part of an interleaving group. */
2473 if (!DR_GROUP_FIRST_ELEMENT (stmt_info))
2475 /* Check if this DR is a part of interleaving, and is a single
2476 element of the group that is accessed in the loop. */
2478 /* Gaps are supported only for loads. STEP must be a multiple of the type
2479 size. */
2480 if (DR_IS_READ (dr)
2481 && (dr_step % type_size) == 0
2482 && groupsize > 0)
2484 DR_GROUP_FIRST_ELEMENT (stmt_info) = stmt_info;
2485 DR_GROUP_SIZE (stmt_info) = groupsize;
2486 DR_GROUP_GAP (stmt_info) = groupsize - 1;
2487 if (dump_enabled_p ())
2489 dump_printf_loc (MSG_NOTE, vect_location,
2490 "Detected single element interleaving ");
2491 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2492 dump_printf (MSG_NOTE, " step ");
2493 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2494 dump_printf (MSG_NOTE, "\n");
2497 return true;
2500 if (dump_enabled_p ())
2502 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2503 "not consecutive access ");
2504 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
2505 stmt_info->stmt, 0);
2508 if (bb_vinfo)
2510 /* Mark the statement as unvectorizable. */
2511 STMT_VINFO_VECTORIZABLE (vect_dr_stmt (dr)) = false;
2512 return true;
2515 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2516 STMT_VINFO_STRIDED_P (stmt_info) = true;
2517 return true;
2520 if (DR_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info)
2522 /* First stmt in the interleaving chain. Check the chain. */
2523 gimple *next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2524 struct data_reference *data_ref = dr;
2525 unsigned int count = 1;
2526 tree prev_init = DR_INIT (data_ref);
2527 gimple *prev = stmt_info;
2528 HOST_WIDE_INT diff, gaps = 0;
2530 /* By construction, all group members have INTEGER_CST DR_INITs. */
2531 while (next)
2533 /* Skip same data-refs. If two or more stmts share a
2534 data-ref (supported only for loads), we vectorize only the first
2535 stmt, and the rest get their vectorized loads from the first
2536 one. */
2537 if (!tree_int_cst_compare (DR_INIT (data_ref),
2538 DR_INIT (STMT_VINFO_DATA_REF (
2539 vinfo_for_stmt (next)))))
2541 if (DR_IS_WRITE (data_ref))
2543 if (dump_enabled_p ())
2544 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2545 "Two store stmts share the same dr.\n");
2546 return false;
2549 if (dump_enabled_p ())
2550 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2551 "Two or more load stmts share the same dr.\n");
2553 /* For load use the same data-ref load. */
2554 DR_GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2556 prev = next;
2557 next = DR_GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2558 continue;
2561 prev = next;
2562 data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2564 /* All group members have the same STEP by construction. */
2565 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2567 /* Check that the distance between two accesses is equal to the type
2568 size. Otherwise, we have gaps. */
2569 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2570 - TREE_INT_CST_LOW (prev_init)) / type_size;
2571 if (diff != 1)
2573 /* FORNOW: SLP of accesses with gaps is not supported. */
2574 slp_impossible = true;
2575 if (DR_IS_WRITE (data_ref))
2577 if (dump_enabled_p ())
2578 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2579 "interleaved store with gaps\n");
2580 return false;
2583 gaps += diff - 1;
2586 last_accessed_element += diff;
2588 /* Store the gap from the previous member of the group. If there is no
2589 gap in the access, DR_GROUP_GAP is always 1. */
2590 DR_GROUP_GAP (vinfo_for_stmt (next)) = diff;
2592 prev_init = DR_INIT (data_ref);
2593 next = DR_GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2594 /* Count the number of data-refs in the chain. */
2595 count++;
2598 if (groupsize == 0)
2599 groupsize = count + gaps;
2601 /* This could be UINT_MAX but as we are generating code in a very
2602 inefficient way we have to cap earlier. See PR78699 for example. */
2603 if (groupsize > 4096)
2605 if (dump_enabled_p ())
2606 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2607 "group is too large\n");
2608 return false;
2611 /* Check that the size of the interleaving is equal to count for stores,
2612 i.e., that there are no gaps. */
2613 if (groupsize != count
2614 && !DR_IS_READ (dr))
2616 if (dump_enabled_p ())
2617 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2618 "interleaved store with gaps\n");
2619 return false;
2622 /* If there is a gap after the last load in the group it is the
2623 difference between the groupsize and the last accessed
2624 element.
2625 When there is no gap, this difference should be 0. */
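  /* E.g. a group of size 4 whose last accessed element is 2 gets a
     trailing DR_GROUP_GAP of 4 - 2 == 2 (an illustration of the
     subtraction below).  */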
2626 DR_GROUP_GAP (stmt_info) = groupsize - last_accessed_element;
2628 DR_GROUP_SIZE (stmt_info) = groupsize;
2629 if (dump_enabled_p ())
2631 dump_printf_loc (MSG_NOTE, vect_location,
2632 "Detected interleaving ");
2633 if (DR_IS_READ (dr))
2634 dump_printf (MSG_NOTE, "load ");
2635 else
2636 dump_printf (MSG_NOTE, "store ");
2637 dump_printf (MSG_NOTE, "of size %u starting with ",
2638 (unsigned)groupsize);
2639 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt_info->stmt, 0);
2640 if (DR_GROUP_GAP (stmt_info) != 0)
2641 dump_printf_loc (MSG_NOTE, vect_location,
2642 "There is a gap of %u elements after the group\n",
2643 DR_GROUP_GAP (stmt_info));
2646 /* SLP: create an SLP data structure for every interleaving group of
2647 stores for further analysis in vect_analyse_slp. */
2648 if (DR_IS_WRITE (dr) && !slp_impossible)
2650 if (loop_vinfo)
2651 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt_info);
2652 if (bb_vinfo)
2653 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
2657 return true;
2660 /* Analyze groups of accesses: check that DR belongs to a group of
2661 accesses of legal size, step, etc. Detect gaps, single element
2662 interleaving, and other special cases. Set grouped access info.
2663 Collect groups of strided stores for further use in SLP analysis. */
2665 static bool
2666 vect_analyze_group_access (struct data_reference *dr)
2668 if (!vect_analyze_group_access_1 (dr))
2670 /* Dissolve the group if present. */
2671 gimple *next;
2672 gimple *stmt = DR_GROUP_FIRST_ELEMENT (vect_dr_stmt (dr));
2673 while (stmt)
2675 stmt_vec_info vinfo = vinfo_for_stmt (stmt);
2676 next = DR_GROUP_NEXT_ELEMENT (vinfo);
2677 DR_GROUP_FIRST_ELEMENT (vinfo) = NULL;
2678 DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2679 stmt = next;
2681 return false;
2683 return true;
2686 /* Analyze the access pattern of the data-reference DR.
2687 In case of non-consecutive accesses call vect_analyze_group_access() to
2688 analyze groups of accesses. */
2690 static bool
2691 vect_analyze_data_ref_access (struct data_reference *dr)
2693 tree step = DR_STEP (dr);
2694 tree scalar_type = TREE_TYPE (DR_REF (dr));
2695 stmt_vec_info stmt_info = vect_dr_stmt (dr);
2696 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2697 struct loop *loop = NULL;
2699 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
2700 return true;
2702 if (loop_vinfo)
2703 loop = LOOP_VINFO_LOOP (loop_vinfo);
2705 if (loop_vinfo && !step)
2707 if (dump_enabled_p ())
2708 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2709 "bad data-ref access in loop\n");
2710 return false;
2713 /* Allow loads with zero step in inner-loop vectorization. */
2714 if (loop_vinfo && integer_zerop (step))
2716 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2717 if (!nested_in_vect_loop_p (loop, stmt_info))
2718 return DR_IS_READ (dr);
2719 /* Allow references with zero step for outer loops marked
2720 with pragma omp simd only - it guarantees absence of
2721 loop-carried dependencies between inner loop iterations. */
2722 if (loop->safelen < 2)
2724 if (dump_enabled_p ())
2725 dump_printf_loc (MSG_NOTE, vect_location,
2726 "zero step in inner loop of nest\n");
2727 return false;
2731 if (loop && nested_in_vect_loop_p (loop, stmt_info))
2733 /* Interleaved accesses are not yet supported within outer-loop
2734 vectorization for references in the inner-loop. */
2735 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2737 /* For the rest of the analysis we use the outer-loop step. */
2738 step = STMT_VINFO_DR_STEP (stmt_info);
2739 if (integer_zerop (step))
2741 if (dump_enabled_p ())
2742 dump_printf_loc (MSG_NOTE, vect_location,
2743 "zero step in outer loop.\n");
2744 return DR_IS_READ (dr);
2748 /* Consecutive? */
2749 if (TREE_CODE (step) == INTEGER_CST)
2751 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2752 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2753 || (dr_step < 0
2754 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2756 /* Mark that it is not interleaving. */
2757 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2758 return true;
2762 if (loop && nested_in_vect_loop_p (loop, stmt_info))
2764 if (dump_enabled_p ())
2765 dump_printf_loc (MSG_NOTE, vect_location,
2766 "grouped access in outer loop.\n");
2767 return false;
2771 /* Assume this is a DR handled by non-constant strided load case. */
2772 if (TREE_CODE (step) != INTEGER_CST)
2773 return (STMT_VINFO_STRIDED_P (stmt_info)
2774 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2775 || vect_analyze_group_access (dr)));
2777 /* Not a consecutive access - check if it's part of an interleaving group. */
2778 return vect_analyze_group_access (dr);
2781 /* Compare two data-references DRA and DRB to group them into chunks
2782 suitable for grouping. */
2784 static int
2785 dr_group_sort_cmp (const void *dra_, const void *drb_)
2787 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2788 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2789 int cmp;
2791 /* Stabilize sort. */
2792 if (dra == drb)
2793 return 0;
2795 /* DRs in different loops never belong to the same group. */
2796 loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father;
2797 loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father;
2798 if (loopa != loopb)
2799 return loopa->num < loopb->num ? -1 : 1;
2801 /* Ordering of DRs according to base. */
2802 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2803 DR_BASE_ADDRESS (drb));
2804 if (cmp != 0)
2805 return cmp;
2807 /* And according to DR_OFFSET. */
2808 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2809 if (cmp != 0)
2810 return cmp;
2812 /* Put reads before writes. */
2813 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2814 return DR_IS_READ (dra) ? -1 : 1;
2816 /* Then sort by access size. */
2817 cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2818 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2819 if (cmp != 0)
2820 return cmp;
2822 /* And by step. */
2823 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2824 if (cmp != 0)
2825 return cmp;
2827 /* Then sort by DR_INIT. In case of identical DRs sort by stmt UID. */
2828 cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
2829 if (cmp == 0)
2830 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2831 return cmp;
2834 /* If OP is the result of a conversion, return the unconverted value,
2835 otherwise return null. */
2837 static tree
2838 strip_conversion (tree op)
2840 if (TREE_CODE (op) != SSA_NAME)
2841 return NULL_TREE;
2842 gimple *stmt = SSA_NAME_DEF_STMT (op);
2843 if (!is_gimple_assign (stmt)
2844 || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt)))
2845 return NULL_TREE;
2846 return gimple_assign_rhs1 (stmt);
2849 /* Return true if vectorizable_* routines can handle statements STMT1
2850 and STMT2 being in a single group. */
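/* For instance, two internal-function calls such as

     _1 = .MASK_LOAD (p_1, align, mask_1);
     _2 = .MASK_LOAD (p_2, align, mask_2);

   (a sketch with made-up SSA names) can be grouped when both use
   IFN_MASK_LOAD and their masks are equal, possibly modulo a conversion
   of one mask; argument 2 of the call is the mask, as used below.  */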
2852 static bool
2853 can_group_stmts_p (gimple *stmt1, gimple *stmt2)
2855 if (gimple_assign_single_p (stmt1))
2856 return gimple_assign_single_p (stmt2);
2858 if (is_gimple_call (stmt1) && gimple_call_internal_p (stmt1))
2860 /* Check for two masked loads or two masked stores. */
2861 if (!is_gimple_call (stmt2) || !gimple_call_internal_p (stmt2))
2862 return false;
2863 internal_fn ifn = gimple_call_internal_fn (stmt1);
2864 if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE)
2865 return false;
2866 if (ifn != gimple_call_internal_fn (stmt2))
2867 return false;
2869 /* Check that the masks are the same. Cope with casts of masks,
2870 like those created by build_mask_conversion. */
2871 tree mask1 = gimple_call_arg (stmt1, 2);
2872 tree mask2 = gimple_call_arg (stmt2, 2);
2873 if (!operand_equal_p (mask1, mask2, 0))
2875 mask1 = strip_conversion (mask1);
2876 if (!mask1)
2877 return false;
2878 mask2 = strip_conversion (mask2);
2879 if (!mask2)
2880 return false;
2881 if (!operand_equal_p (mask1, mask2, 0))
2882 return false;
2884 return true;
2887 return false;
2890 /* Function vect_analyze_data_ref_accesses.
2892 Analyze the access pattern of all the data references in the loop.
2894 FORNOW: the only access pattern that is considered vectorizable is a
2895 simple step 1 (consecutive) access.
2897 FORNOW: handle only arrays and pointer accesses. */
2899 bool
2900 vect_analyze_data_ref_accesses (vec_info *vinfo)
2902 unsigned int i;
2903 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
2904 struct data_reference *dr;
2906 DUMP_VECT_SCOPE ("vect_analyze_data_ref_accesses");
2908 if (datarefs.is_empty ())
2909 return true;
2911 /* Sort the array of datarefs to make building the interleaving chains
2912 linear. Don't modify the original vector's order, it is needed for
2913 determining what dependencies are reversed. */
2914 vec<data_reference_p> datarefs_copy = datarefs.copy ();
2915 datarefs_copy.qsort (dr_group_sort_cmp);
2917 /* Build the interleaving chains. */
2918 for (i = 0; i < datarefs_copy.length () - 1;)
2920 data_reference_p dra = datarefs_copy[i];
2921 stmt_vec_info stmtinfo_a = vect_dr_stmt (dra);
2922 stmt_vec_info lastinfo = NULL;
2923 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
2924 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a))
2926 ++i;
2927 continue;
2929 for (i = i + 1; i < datarefs_copy.length (); ++i)
2931 data_reference_p drb = datarefs_copy[i];
2932 stmt_vec_info stmtinfo_b = vect_dr_stmt (drb);
2933 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b)
2934 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
2935 break;
2937 /* ??? Imperfect sorting (non-compatible types, non-modulo
2938 accesses, same accesses) can lead to a group being artificially
2939 split here as we don't just skip over those. If it really
2940 matters we can push those to a worklist and re-iterate
2941 over them. Then we can just skip ahead to the next DR here. */
2943 /* DRs in a different loop should not be put into the same
2944 interleaving group. */
2945 if (gimple_bb (DR_STMT (dra))->loop_father
2946 != gimple_bb (DR_STMT (drb))->loop_father)
2947 break;
2949 /* Check that the data-refs have same first location (except init)
2950 and they are both either store or load (not load and store,
2951 not masked loads or stores). */
2952 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2953 || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2954 DR_BASE_ADDRESS (drb)) != 0
2955 || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
2956 || !can_group_stmts_p (vect_dr_stmt (dra), vect_dr_stmt (drb)))
2957 break;
2959 /* Check that the data-refs have the same constant size. */
2960 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2961 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2962 if (!tree_fits_uhwi_p (sza)
2963 || !tree_fits_uhwi_p (szb)
2964 || !tree_int_cst_equal (sza, szb))
2965 break;
2967 /* Check that the data-refs have the same step. */
2968 if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
2969 break;
2971 /* Check the types are compatible.
2972 ??? We don't distinguish this during sorting. */
2973 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2974 TREE_TYPE (DR_REF (drb))))
2975 break;
2977 /* Check that the DR_INITs are compile-time constants. */
2978 if (TREE_CODE (DR_INIT (dra)) != INTEGER_CST
2979 || TREE_CODE (DR_INIT (drb)) != INTEGER_CST)
2980 break;
2982 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
2983 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2984 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2985 HOST_WIDE_INT init_prev
2986 = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]));
2987 gcc_assert (init_a <= init_b
2988 && init_a <= init_prev
2989 && init_prev <= init_b);
2991 /* Do not place the same access in the interleaving chain twice. */
2992 if (init_b == init_prev)
2994 gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]))
2995 < gimple_uid (DR_STMT (drb)));
2996 /* ??? For now we simply "drop" the later reference which is
2997 otherwise the same rather than finishing off this group.
2998 In the end we'd want to re-process duplicates forming
2999 multiple groups from the refs, likely by just collecting
3000 all candidates (including duplicates and split points
3001 below) in a vector and then process them together. */
3002 continue;
3005 /* If init_b == init_a + the size of the type * k, we have an
3006 interleaving, and DRA is accessed before DRB. */
3007 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
3008 if (type_size_a == 0
3009 || (init_b - init_a) % type_size_a != 0)
3010 break;
3012 /* For stores the accesses must be adjacent. This splits
3013 groups into chunks we support (we don't support vectorization
3014 of stores with gaps). */
3015 if (!DR_IS_READ (dra) && init_b - init_prev != type_size_a)
3016 break;
3018 /* If the step (when constant and nonzero) is greater than the
3019 difference between data-refs' inits this splits groups into
3020 suitable sizes. */
3021 if (tree_fits_shwi_p (DR_STEP (dra)))
3023 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
3024 if (step != 0 && step <= (init_b - init_a))
3025 break;
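	  /* To illustrate the checks above, assuming 4-byte int accesses
	     and the usual data-ref decomposition: loads a[2*i] and
	     a[2*i + 1] share the base, offset and step (8 bytes); their
	     DR_INITs differ by 4, a multiple of the type size and smaller
	     than the step, so they are linked into one interleaving group
	     below.  */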
3028 if (dump_enabled_p ())
3030 dump_printf_loc (MSG_NOTE, vect_location,
3031 "Detected interleaving ");
3032 if (DR_IS_READ (dra))
3033 dump_printf (MSG_NOTE, "load ");
3034 else
3035 dump_printf (MSG_NOTE, "store ");
3036 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
3037 dump_printf (MSG_NOTE, " and ");
3038 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
3039 dump_printf (MSG_NOTE, "\n");
3042 /* Link the found element into the group list. */
3043 if (!DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3045 DR_GROUP_FIRST_ELEMENT (stmtinfo_a) = vect_dr_stmt (dra);
3046 lastinfo = stmtinfo_a;
3048 DR_GROUP_FIRST_ELEMENT (stmtinfo_b) = vect_dr_stmt (dra);
3049 DR_GROUP_NEXT_ELEMENT (lastinfo) = vect_dr_stmt (drb);
3050 lastinfo = stmtinfo_b;
3054 FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
3055 if (STMT_VINFO_VECTORIZABLE (vect_dr_stmt (dr))
3056 && !vect_analyze_data_ref_access (dr))
3058 if (dump_enabled_p ())
3059 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3060 "not vectorized: complicated access pattern.\n");
3062 if (is_a <bb_vec_info> (vinfo))
3064 /* Mark the statement as not vectorizable. */
3065 STMT_VINFO_VECTORIZABLE (vect_dr_stmt (dr)) = false;
3066 continue;
3068 else
3070 datarefs_copy.release ();
3071 return false;
3075 datarefs_copy.release ();
3076 return true;
3079 /* Function vect_vfa_segment_size.
3081 Input:
3082 DR: The data reference.
3083 LENGTH_FACTOR: segment length to consider.
3085 Return a value suitable for the dr_with_seg_len::seg_len field.
3086 This is the "distance travelled" by the pointer from the first
3087 iteration in the segment to the last. Note that it does not include
3088 the size of the access; in effect it only describes the first byte. */
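/* For example, with DR_STEP == 4 and LENGTH_FACTOR == 8 (say, the
   vectorization factor) the segment length is 4 * (8 - 1) == 28 bytes:
   the distance from the first of the eight accesses to the last, not
   counting the size of the final access itself.  */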
3090 static tree
3091 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
3093 length_factor = size_binop (MINUS_EXPR,
3094 fold_convert (sizetype, length_factor),
3095 size_one_node);
3096 return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr)),
3097 length_factor);
3100 /* Return a value that, when added to abs (vect_vfa_segment_size (dr)),
3101 gives the worst-case number of bytes covered by the segment. */
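/* E.g. a 4-byte scalar access that leads a group of size 4 with a
   trailing gap of 1 yields an access size of 4 * (4 - 1) == 12 bytes;
   the explicit-realign-optimized case below further adds the difference
   between the vector size and the scalar size (a sketch of the
   computation that follows).  */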
3103 static unsigned HOST_WIDE_INT
3104 vect_vfa_access_size (data_reference *dr)
3106 stmt_vec_info stmt_vinfo = vect_dr_stmt (dr);
3107 tree ref_type = TREE_TYPE (DR_REF (dr));
3108 unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type));
3109 unsigned HOST_WIDE_INT access_size = ref_size;
3110 if (DR_GROUP_FIRST_ELEMENT (stmt_vinfo))
3112 gcc_assert (DR_GROUP_FIRST_ELEMENT (stmt_vinfo) == vect_dr_stmt (dr));
3113 access_size *= DR_GROUP_SIZE (stmt_vinfo) - DR_GROUP_GAP (stmt_vinfo);
3115 if (STMT_VINFO_VEC_STMT (stmt_vinfo)
3116 && (vect_supportable_dr_alignment (dr, false)
3117 == dr_explicit_realign_optimized))
3119 /* We might access a full vector's worth. */
3120 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
3121 access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size;
3123 return access_size;
3126 /* Get the minimum alignment for all the scalar accesses that DR describes. */
3128 static unsigned int
3129 vect_vfa_align (const data_reference *dr)
3131 return TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr)));
3134 /* Function vect_no_alias_p.
3136 Given data references A and B with equal base and offset, see whether
3137 the alias relation can be decided at compilation time. Return 1 if
3138 it can and the references alias, 0 if it can and the references do
3139 not alias, and -1 if we cannot decide at compile time. SEGMENT_LENGTH_A,
3140 SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
3141 of dr_with_seg_len::{seg_len,access_size} for A and B. */
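/* A small illustration, assuming positive steps and 4-byte accesses:
   with DR_INITs 0 and 8, segment length 12 and access size 4 for both,
   the byte ranges are [0, 16) and [8, 24), which are known to overlap,
   so the function returns 1; if the second DR_INIT were 16 the ranges
   would be disjoint and it would return 0.  */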
3143 static int
3144 vect_compile_time_alias (struct data_reference *a, struct data_reference *b,
3145 tree segment_length_a, tree segment_length_b,
3146 unsigned HOST_WIDE_INT access_size_a,
3147 unsigned HOST_WIDE_INT access_size_b)
3149 poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a));
3150 poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b));
3151 poly_uint64 const_length_a;
3152 poly_uint64 const_length_b;
3154 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3155 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3156 [a, a+12) */
3157 if (tree_int_cst_compare (DR_STEP (a), size_zero_node) < 0)
3159 const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
3160 offset_a = (offset_a + access_size_a) - const_length_a;
3162 else
3163 const_length_a = tree_to_poly_uint64 (segment_length_a);
3164 if (tree_int_cst_compare (DR_STEP (b), size_zero_node) < 0)
3166 const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
3167 offset_b = (offset_b + access_size_b) - const_length_b;
3169 else
3170 const_length_b = tree_to_poly_uint64 (segment_length_b);
3172 const_length_a += access_size_a;
3173 const_length_b += access_size_b;
3175 if (ranges_known_overlap_p (offset_a, const_length_a,
3176 offset_b, const_length_b))
3177 return 1;
3179 if (!ranges_maybe_overlap_p (offset_a, const_length_a,
3180 offset_b, const_length_b))
3181 return 0;
3183 return -1;
3186 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3187 in DDR is >= VF. */
3189 static bool
3190 dependence_distance_ge_vf (data_dependence_relation *ddr,
3191 unsigned int loop_depth, poly_uint64 vf)
3193 if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3194 || DDR_NUM_DIST_VECTS (ddr) == 0)
3195 return false;
3197 /* If the dependence is exact, we should have limited the VF instead. */
3198 gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3200 unsigned int i;
3201 lambda_vector dist_v;
3202 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3204 HOST_WIDE_INT dist = dist_v[loop_depth];
3205 if (dist != 0
3206 && !(dist > 0 && DDR_REVERSED_P (ddr))
3207 && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
3208 return false;
3211 if (dump_enabled_p ())
3213 dump_printf_loc (MSG_NOTE, vect_location,
3214 "dependence distance between ");
3215 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
3216 dump_printf (MSG_NOTE, " and ");
3217 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
3218 dump_printf (MSG_NOTE, " is >= VF\n");
3221 return true;
3224 /* Dump LOWER_BOUND using flags DUMP_KIND. Dumps are known to be enabled. */
3226 static void
3227 dump_lower_bound (dump_flags_t dump_kind, const vec_lower_bound &lower_bound)
3229 dump_printf (dump_kind, "%s (", lower_bound.unsigned_p ? "unsigned" : "abs");
3230 dump_generic_expr (dump_kind, TDF_SLIM, lower_bound.expr);
3231 dump_printf (dump_kind, ") >= ");
3232 dump_dec (dump_kind, lower_bound.min_value);
3235 /* Record that the vectorized loop requires the vec_lower_bound described
3236 by EXPR, UNSIGNED_P and MIN_VALUE. */
3238 static void
3239 vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p,
3240 poly_uint64 min_value)
3242 vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3243 for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3244 if (operand_equal_p (lower_bounds[i].expr, expr, 0))
3246 unsigned_p &= lower_bounds[i].unsigned_p;
3247 min_value = upper_bound (lower_bounds[i].min_value, min_value);
3248 if (lower_bounds[i].unsigned_p != unsigned_p
3249 || maybe_lt (lower_bounds[i].min_value, min_value))
3251 lower_bounds[i].unsigned_p = unsigned_p;
3252 lower_bounds[i].min_value = min_value;
3253 if (dump_enabled_p ())
3255 dump_printf_loc (MSG_NOTE, vect_location,
3256 "updating run-time check to ");
3257 dump_lower_bound (MSG_NOTE, lower_bounds[i]);
3258 dump_printf (MSG_NOTE, "\n");
3261 return;
3264 vec_lower_bound lower_bound (expr, unsigned_p, min_value);
3265 if (dump_enabled_p ())
3267 dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that ");
3268 dump_lower_bound (MSG_NOTE, lower_bound);
3269 dump_printf (MSG_NOTE, "\n");
3271 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (lower_bound);
3274 /* Return true if it's unlikely that the step of the vectorized form of DR
3275 will span fewer than GAP bytes. */
3277 static bool
3278 vect_small_gap_p (loop_vec_info loop_vinfo, data_reference *dr, poly_int64 gap)
3280 stmt_vec_info stmt_info = vect_dr_stmt (dr);
3281 HOST_WIDE_INT count
3282 = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
3283 if (DR_GROUP_FIRST_ELEMENT (stmt_info))
3284 count *= DR_GROUP_SIZE (vinfo_for_stmt (DR_GROUP_FIRST_ELEMENT (stmt_info)));
3285 return estimated_poly_value (gap) <= count * vect_get_scalar_dr_size (dr);
3288 /* Return true if we know that there is no alias between DR_A and DR_B
3289 when abs (DR_STEP (DR_A)) >= N for some N. When returning true, set
3290 *LOWER_BOUND_OUT to this N. */
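/* For instance, two 4-byte references at constant offsets 0 and 12 from
   the same base together span bytes [0, 16) within one scalar iteration,
   so the lower bound is 12 + 4 - 0 == 16: once abs (DR_STEP) >= 16 each
   iteration accesses a disjoint 16-byte window and the references cannot
   alias.  */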
3292 static bool
3293 vectorizable_with_step_bound_p (data_reference *dr_a, data_reference *dr_b,
3294 poly_uint64 *lower_bound_out)
3296 /* Check that there is a constant gap of known sign between DR_A
3297 and DR_B. */
3298 poly_int64 init_a, init_b;
3299 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0)
3300 || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0)
3301 || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)
3302 || !poly_int_tree_p (DR_INIT (dr_a), &init_a)
3303 || !poly_int_tree_p (DR_INIT (dr_b), &init_b)
3304 || !ordered_p (init_a, init_b))
3305 return false;
3307 /* Sort DR_A and DR_B by the address they access. */
3308 if (maybe_lt (init_b, init_a))
3310 std::swap (init_a, init_b);
3311 std::swap (dr_a, dr_b);
3314 /* If the two accesses could be dependent within a scalar iteration,
3315 make sure that we'd retain their order. */
3316 if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_a), init_b)
3317 && !vect_preserves_scalar_order_p (vect_dr_stmt (dr_a),
3318 vect_dr_stmt (dr_b)))
3319 return false;
3321 /* There is no alias if abs (DR_STEP) is greater than or equal to
3322 the bytes spanned by the combination of the two accesses. */
3323 *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_b) - init_a;
3324 return true;
3327 /* Function vect_prune_runtime_alias_test_list.
3329 Prune a list of ddrs to be tested at run-time by versioning for alias.
3330 Merge several alias checks into one if possible.
3331 Return FALSE if the resulting list of ddrs is longer than allowed by
3332 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
3334 bool
3335 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3337 typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3338 hash_set <tree_pair_hash> compared_objects;
3340 vec<ddr_p> may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3341 vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3342 = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3343 vec<vec_object_pair> &check_unequal_addrs
3344 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3345 poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3346 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3348 ddr_p ddr;
3349 unsigned int i;
3350 tree length_factor;
3352 DUMP_VECT_SCOPE ("vect_prune_runtime_alias_test_list");
3354 /* Step values are irrelevant for aliasing if the number of vector
3355 iterations is equal to the number of scalar iterations (which can
3356 happen for fully-SLP loops). */
3357 bool ignore_step_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U);
3359 if (!ignore_step_p)
3361 /* Convert the checks for nonzero steps into bound tests. */
3362 tree value;
3363 FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value)
3364 vect_check_lower_bound (loop_vinfo, value, true, 1);
3367 if (may_alias_ddrs.is_empty ())
3368 return true;
3370 comp_alias_ddrs.create (may_alias_ddrs.length ());
3372 unsigned int loop_depth
3373 = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3374 LOOP_VINFO_LOOP_NEST (loop_vinfo));
3376 /* First, we collect all data ref pairs for aliasing checks. */
3377 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3379 int comp_res;
3380 poly_uint64 lower_bound;
3381 struct data_reference *dr_a, *dr_b;
3382 gimple *dr_group_first_a, *dr_group_first_b;
3383 tree segment_length_a, segment_length_b;
3384 unsigned HOST_WIDE_INT access_size_a, access_size_b;
3385 unsigned int align_a, align_b;
3386 gimple *stmt_a, *stmt_b;
3388 /* Ignore the alias if the VF we chose ended up being no greater
3389 than the dependence distance. */
3390 if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3391 continue;
3393 if (DDR_OBJECT_A (ddr))
3395 vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3396 if (!compared_objects.add (new_pair))
3398 if (dump_enabled_p ())
3400 dump_printf_loc (MSG_NOTE, vect_location, "checking that ");
3401 dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.first);
3402 dump_printf (MSG_NOTE, " and ");
3403 dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.second);
3404 dump_printf (MSG_NOTE, " have different addresses\n");
3406 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3408 continue;
3411 dr_a = DDR_A (ddr);
3412 stmt_a = vect_dr_stmt (DDR_A (ddr));
3414 dr_b = DDR_B (ddr);
3415 stmt_b = vect_dr_stmt (DDR_B (ddr));
3417 /* Skip the pair if inter-iteration dependencies are irrelevant
3418 and intra-iteration dependencies are guaranteed to be honored. */
3419 if (ignore_step_p
3420 && (vect_preserves_scalar_order_p (stmt_a, stmt_b)
3421 || vectorizable_with_step_bound_p (dr_a, dr_b, &lower_bound)))
3423 if (dump_enabled_p ())
3425 dump_printf_loc (MSG_NOTE, vect_location,
3426 "no need for alias check between ");
3427 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
3428 dump_printf (MSG_NOTE, " and ");
3429 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
3430 dump_printf (MSG_NOTE, " when VF is 1\n");
3432 continue;
3435 /* See whether we can handle the alias using a bounds check on
3436 the step, and whether that's likely to be the best approach.
3437 (It might not be, for example, if the minimum step is much larger
3438 than the number of bytes handled by one vector iteration.) */
3439 if (!ignore_step_p
3440 && TREE_CODE (DR_STEP (dr_a)) != INTEGER_CST
3441 && vectorizable_with_step_bound_p (dr_a, dr_b, &lower_bound)
3442 && (vect_small_gap_p (loop_vinfo, dr_a, lower_bound)
3443 || vect_small_gap_p (loop_vinfo, dr_b, lower_bound)))
3445 bool unsigned_p = dr_known_forward_stride_p (dr_a);
3446 if (dump_enabled_p ())
3448 dump_printf_loc (MSG_NOTE, vect_location, "no alias between ");
3449 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
3450 dump_printf (MSG_NOTE, " and ");
3451 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
3452 dump_printf (MSG_NOTE, " when the step ");
3453 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_STEP (dr_a));
3454 dump_printf (MSG_NOTE, " is outside ");
3455 if (unsigned_p)
3456 dump_printf (MSG_NOTE, "[0");
3457 else
3459 dump_printf (MSG_NOTE, "(");
3460 dump_dec (MSG_NOTE, poly_int64 (-lower_bound));
3462 dump_printf (MSG_NOTE, ", ");
3463 dump_dec (MSG_NOTE, lower_bound);
3464 dump_printf (MSG_NOTE, ")\n");
3466 vect_check_lower_bound (loop_vinfo, DR_STEP (dr_a), unsigned_p,
3467 lower_bound);
3468 continue;
3471 dr_group_first_a = DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
3472 if (dr_group_first_a)
3474 stmt_a = dr_group_first_a;
3475 dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
3478 dr_group_first_b = DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
3479 if (dr_group_first_b)
3481 stmt_b = dr_group_first_b;
3482 dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
3485 if (ignore_step_p)
3487 segment_length_a = size_zero_node;
3488 segment_length_b = size_zero_node;
3490 else
3492 if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
3493 length_factor = scalar_loop_iters;
3494 else
3495 length_factor = size_int (vect_factor);
3496 segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
3497 segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
3499 access_size_a = vect_vfa_access_size (dr_a);
3500 access_size_b = vect_vfa_access_size (dr_b);
3501 align_a = vect_vfa_align (dr_a);
3502 align_b = vect_vfa_align (dr_b);
3504 comp_res = data_ref_compare_tree (DR_BASE_ADDRESS (dr_a),
3505 DR_BASE_ADDRESS (dr_b));
3506 if (comp_res == 0)
3507 comp_res = data_ref_compare_tree (DR_OFFSET (dr_a),
3508 DR_OFFSET (dr_b));
3510 /* See whether the alias is known at compilation time. */
3511 if (comp_res == 0
3512 && TREE_CODE (DR_STEP (dr_a)) == INTEGER_CST
3513 && TREE_CODE (DR_STEP (dr_b)) == INTEGER_CST
3514 && poly_int_tree_p (segment_length_a)
3515 && poly_int_tree_p (segment_length_b))
3517 int res = vect_compile_time_alias (dr_a, dr_b,
3518 segment_length_a,
3519 segment_length_b,
3520 access_size_a,
3521 access_size_b);
3522 if (res >= 0 && dump_enabled_p ())
3524 dump_printf_loc (MSG_NOTE, vect_location,
3525 "can tell at compile time that ");
3526 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
3527 dump_printf (MSG_NOTE, " and ");
3528 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
3529 if (res == 0)
3530 dump_printf (MSG_NOTE, " do not alias\n");
3531 else
3532 dump_printf (MSG_NOTE, " alias\n");
3535 if (res == 0)
3536 continue;
3538 if (res == 1)
3540 if (dump_enabled_p ())
3541 dump_printf_loc (MSG_NOTE, vect_location,
3542 "not vectorized: compilation time alias.\n");
3543 return false;
3547 dr_with_seg_len_pair_t dr_with_seg_len_pair
3548 (dr_with_seg_len (dr_a, segment_length_a, access_size_a, align_a),
3549 dr_with_seg_len (dr_b, segment_length_b, access_size_b, align_b));
3551 /* Canonicalize pairs by sorting the two DR members. */
3552 if (comp_res > 0)
3553 std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
3555 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3558 prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);
3560 unsigned int count = (comp_alias_ddrs.length ()
3561 + check_unequal_addrs.length ());
3563 dump_printf_loc (MSG_NOTE, vect_location,
3564 "improved number of alias checks from %d to %d\n",
3565 may_alias_ddrs.length (), count);
3566 if ((int) count > PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
3568 if (dump_enabled_p ())
3569 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3570 "number of versioning for alias "
3571 "run-time tests exceeds %d "
3572 "(--param vect-max-version-for-alias-checks)\n",
3573 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
3574 return false;
3577 return true;
3580 /* Check whether we can use an internal function for a gather load
3581 or scatter store. READ_P is true for loads and false for stores.
3582 MASKED_P is true if the load or store is conditional. MEMORY_TYPE is
3583 the type of the memory elements being loaded or stored. OFFSET_BITS
3584 is the number of bits in each scalar offset and OFFSET_SIGN is the
3585 sign of the offset. SCALE is the amount by which the offset should
3586 be multiplied *after* it has been converted to address width.
3588 Return true if the function is supported, storing the function
3589 id in *IFN_OUT and the type of a vector element in *ELEMENT_TYPE_OUT. */
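/* As an illustrative (hypothetical) example: an unmasked gather of V4SF
   elements from 32-bit float memory with 32-bit signed offsets would map to
   IFN_GATHER_LOAD and a masked scatter to IFN_MASK_SCATTER_STORE; whether a
   given SCALE is usable is decided by the
   internal_gather_scatter_fn_supported_p check below.  */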
3591 bool
3592 vect_gather_scatter_fn_p (bool read_p, bool masked_p, tree vectype,
3593 tree memory_type, unsigned int offset_bits,
3594 signop offset_sign, int scale,
3595 internal_fn *ifn_out, tree *element_type_out)
3597 unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
3598 unsigned int element_bits = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (vectype)));
3599 if (offset_bits > element_bits)
3600 /* Internal functions require the offset to be the same width as
3601 the vector elements. We can extend narrower offsets, but it isn't
3602 safe to truncate wider offsets. */
3603 return false;
3605 if (element_bits != memory_bits)
3606 /* For now the vector elements must be the same width as the
3607 memory elements. */
3608 return false;
3610 /* Work out which function we need. */
3611 internal_fn ifn;
3612 if (read_p)
3613 ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
3614 else
3615 ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
3617 /* Test whether the target supports this combination. */
3618 if (!internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
3619 offset_sign, scale))
3620 return false;
3622 *ifn_out = ifn;
3623 *element_type_out = TREE_TYPE (vectype);
3624 return true;
3627 /* CALL is a call to an internal gather load or scatter store function.
3628 Describe the operation in INFO. */
3630 static void
3631 vect_describe_gather_scatter_call (gcall *call, gather_scatter_info *info)
3633 stmt_vec_info stmt_info = vinfo_for_stmt (call);
3634 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3635 data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3637 info->ifn = gimple_call_internal_fn (call);
3638 info->decl = NULL_TREE;
3639 info->base = gimple_call_arg (call, 0);
3640 info->offset = gimple_call_arg (call, 1);
3641 info->offset_dt = vect_unknown_def_type;
3642 info->offset_vectype = NULL_TREE;
3643 info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
3644 info->element_type = TREE_TYPE (vectype);
3645 info->memory_type = TREE_TYPE (DR_REF (dr));
3648 /* Return true if a non-affine read or write in STMT is suitable for a
3649 gather load or scatter store. Describe the operation in *INFO if so. */
3651 bool
3652 vect_check_gather_scatter (gimple *stmt, loop_vec_info loop_vinfo,
3653 gather_scatter_info *info)
3655 HOST_WIDE_INT scale = 1;
3656 poly_int64 pbitpos, pbitsize;
3657 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3658 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3659 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3660 tree offtype = NULL_TREE;
3661 tree decl = NULL_TREE, base, off;
3662 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3663 tree memory_type = TREE_TYPE (DR_REF (dr));
3664 machine_mode pmode;
3665 int punsignedp, reversep, pvolatilep = 0;
3666 internal_fn ifn;
3667 tree element_type;
3668 bool masked_p = false;
3670 /* See whether this is already a call to a gather/scatter internal function.
3671 If not, see whether it's a masked load or store. */
3672 gcall *call = dyn_cast <gcall *> (stmt);
3673 if (call && gimple_call_internal_p (call))
3675 ifn = gimple_call_internal_fn (stmt);
3676 if (internal_gather_scatter_fn_p (ifn))
3678 vect_describe_gather_scatter_call (call, info);
3679 return true;
3681 masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE);
3684 /* True if we should aim to use internal functions rather than
3685 built-in functions. */
3686 bool use_ifn_p = (DR_IS_READ (dr)
3687 ? supports_vec_gather_load_p ()
3688 : supports_vec_scatter_store_p ());
3690 base = DR_REF (dr);
3691 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3692 see if we can use the def stmt of the address. */
3693 if (masked_p
3694 && TREE_CODE (base) == MEM_REF
3695 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3696 && integer_zerop (TREE_OPERAND (base, 1))
3697 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3699 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3700 if (is_gimple_assign (def_stmt)
3701 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3702 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3705 /* The gather and scatter builtins need address of the form
3706 loop_invariant + vector * {1, 2, 4, 8}
3708 or loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3709 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3710 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3711 multiplications and additions in it. To get a vector, we need
3712 a single SSA_NAME that will be defined in the loop and will
3713 contain everything that is not loop invariant and that can be
3714 vectorized. The following code attempts to find such a preexisting
3715 SSA_NAME OFF and put the loop invariants into a tree BASE
3716 that can be gimplified before the loop. */
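/* For instance, for an access like ptr[k] with loop-invariant PTR and a
   loop-defined SSA_NAME K, the code below aims to end up with OFF = k
   (the multiplication by the element size peeled into SCALE, when the
   target supports that scale) and BASE holding the loop-invariant part of
   the address.  This sketches the common case only; the exact decomposition
   depends on what DR_BASE_ADDRESS/DR_OFFSET look like.  */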
3717 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3718 &punsignedp, &reversep, &pvolatilep);
3719 if (reversep)
3720 return false;
3722 poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
3724 if (TREE_CODE (base) == MEM_REF)
3726 if (!integer_zerop (TREE_OPERAND (base, 1)))
3728 if (off == NULL_TREE)
3729 off = wide_int_to_tree (sizetype, mem_ref_offset (base));
3730 else
3731 off = size_binop (PLUS_EXPR, off,
3732 fold_convert (sizetype, TREE_OPERAND (base, 1)));
3734 base = TREE_OPERAND (base, 0);
3736 else
3737 base = build_fold_addr_expr (base);
3739 if (off == NULL_TREE)
3740 off = size_zero_node;
3742 /* If base is not loop invariant, then if off is 0 we start with just
3743 the constant offset in the loop invariant BASE and continue with base
3744 as OFF; otherwise we give up.
3745 We could handle that case by gimplifying the addition of base + off
3746 into some SSA_NAME and using that as off, but for now punt. */
3747 if (!expr_invariant_in_loop_p (loop, base))
3749 if (!integer_zerop (off))
3750 return false;
3751 off = base;
3752 base = size_int (pbytepos);
3754 /* Otherwise put base + constant offset into the loop invariant BASE
3755 and continue with OFF. */
3756 else
3758 base = fold_convert (sizetype, base);
3759 base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
3762 /* OFF at this point may be either a SSA_NAME or some tree expression
3763 from get_inner_reference. Try to peel off loop invariants from it
3764 into BASE as long as possible. */
3765 STRIP_NOPS (off);
3766 while (offtype == NULL_TREE)
3768 enum tree_code code;
3769 tree op0, op1, add = NULL_TREE;
3771 if (TREE_CODE (off) == SSA_NAME)
3773 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3775 if (expr_invariant_in_loop_p (loop, off))
3776 return false;
3778 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3779 break;
3781 op0 = gimple_assign_rhs1 (def_stmt);
3782 code = gimple_assign_rhs_code (def_stmt);
3783 op1 = gimple_assign_rhs2 (def_stmt);
3785 else
3787 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3788 return false;
3789 code = TREE_CODE (off);
3790 extract_ops_from_tree (off, &code, &op0, &op1);
3792 switch (code)
3794 case POINTER_PLUS_EXPR:
3795 case PLUS_EXPR:
3796 if (expr_invariant_in_loop_p (loop, op0))
3798 add = op0;
3799 off = op1;
3800 do_add:
3801 add = fold_convert (sizetype, add);
3802 if (scale != 1)
3803 add = size_binop (MULT_EXPR, add, size_int (scale));
3804 base = size_binop (PLUS_EXPR, base, add);
3805 continue;
3807 if (expr_invariant_in_loop_p (loop, op1))
3809 add = op1;
3810 off = op0;
3811 goto do_add;
3813 break;
3814 case MINUS_EXPR:
3815 if (expr_invariant_in_loop_p (loop, op1))
3817 add = fold_convert (sizetype, op1);
3818 add = size_binop (MINUS_EXPR, size_zero_node, add);
3819 off = op0;
3820 goto do_add;
3822 break;
3823 case MULT_EXPR:
3824 if (scale == 1 && tree_fits_shwi_p (op1))
3826 int new_scale = tree_to_shwi (op1);
3827 /* Only treat this as a scaling operation if the target
3828 supports it. */
3829 if (use_ifn_p
3830 && !vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p,
3831 vectype, memory_type, 1,
3832 TYPE_SIGN (TREE_TYPE (op0)),
3833 new_scale, &ifn,
3834 &element_type))
3835 break;
3836 scale = new_scale;
3837 off = op0;
3838 continue;
3840 break;
3841 case SSA_NAME:
3842 off = op0;
3843 continue;
3844 CASE_CONVERT:
3845 if (!POINTER_TYPE_P (TREE_TYPE (op0))
3846 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3847 break;
3848 if (TYPE_PRECISION (TREE_TYPE (op0))
3849 == TYPE_PRECISION (TREE_TYPE (off)))
3851 off = op0;
3852 continue;
3855 /* The internal functions need the offset to be the same width
3856 as the elements of VECTYPE. Don't include operations that
3857 cast the offset from that width to a different width. */
3858 if (use_ifn_p
3859 && (int_size_in_bytes (TREE_TYPE (vectype))
3860 == int_size_in_bytes (TREE_TYPE (off))))
3861 break;
3863 if (TYPE_PRECISION (TREE_TYPE (op0))
3864 < TYPE_PRECISION (TREE_TYPE (off)))
3866 off = op0;
3867 offtype = TREE_TYPE (off);
3868 STRIP_NOPS (off);
3869 continue;
3871 break;
3872 default:
3873 break;
3875 break;
3878 /* If at the end OFF still isn't a SSA_NAME or isn't
3879 defined in the loop, punt. */
3880 if (TREE_CODE (off) != SSA_NAME
3881 || expr_invariant_in_loop_p (loop, off))
3882 return false;
3884 if (offtype == NULL_TREE)
3885 offtype = TREE_TYPE (off);
3887 if (use_ifn_p)
3889 if (!vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p, vectype,
3890 memory_type, TYPE_PRECISION (offtype),
3891 TYPE_SIGN (offtype), scale, &ifn,
3892 &element_type))
3893 return false;
3895 else
3897 if (DR_IS_READ (dr))
3899 if (targetm.vectorize.builtin_gather)
3900 decl = targetm.vectorize.builtin_gather (vectype, offtype, scale);
3902 else
3904 if (targetm.vectorize.builtin_scatter)
3905 decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale);
3908 if (!decl)
3909 return false;
3911 ifn = IFN_LAST;
3912 element_type = TREE_TYPE (vectype);
3915 info->ifn = ifn;
3916 info->decl = decl;
3917 info->base = base;
3918 info->offset = off;
3919 info->offset_dt = vect_unknown_def_type;
3920 info->offset_vectype = NULL_TREE;
3921 info->scale = scale;
3922 info->element_type = element_type;
3923 info->memory_type = memory_type;
3924 return true;
3927 /* Find the data references in STMT, analyze them with respect to LOOP and
3928 append them to DATAREFS. Return false if datarefs in this stmt cannot
3929 be handled. */
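/* For example, a plain load x_1 = a[i_2] yields exactly one data reference,
   which is analyzed and pushed to DATAREFS; volatile accesses, statements
   that may throw, bit-field accesses and calls other than
   IFN_MASK_LOAD/IFN_MASK_STORE are rejected below.  */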
3931 bool
3932 vect_find_stmt_data_reference (loop_p loop, gimple *stmt,
3933 vec<data_reference_p> *datarefs)
3935 /* We can ignore clobbers for dataref analysis - they are removed during
3936 loop vectorization and BB vectorization checks dependences with a
3937 stmt walk. */
3938 if (gimple_clobber_p (stmt))
3939 return true;
3941 if (gimple_has_volatile_ops (stmt))
3943 if (dump_enabled_p ())
3945 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3946 "not vectorized: volatile type ");
3947 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3949 return false;
3952 if (stmt_can_throw_internal (stmt))
3954 if (dump_enabled_p ())
3956 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3957 "not vectorized: statement can throw an "
3958 "exception ");
3959 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3961 return false;
3964 auto_vec<data_reference_p, 2> refs;
3965 if (!find_data_references_in_stmt (loop, stmt, &refs))
3966 return false;
3968 if (refs.is_empty ())
3969 return true;
3971 if (refs.length () > 1)
3973 if (dump_enabled_p ())
3975 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3976 "not vectorized: more than one data ref "
3977 "in stmt: ");
3978 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3980 return false;
3983 if (gcall *call = dyn_cast <gcall *> (stmt))
3984 if (!gimple_call_internal_p (call)
3985 || (gimple_call_internal_fn (call) != IFN_MASK_LOAD
3986 && gimple_call_internal_fn (call) != IFN_MASK_STORE))
3988 if (dump_enabled_p ())
3990 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3991 "not vectorized: dr in a call ");
3992 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3994 return false;
3997 data_reference_p dr = refs.pop ();
3998 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
3999 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
4001 if (dump_enabled_p ())
4003 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4004 "not vectorized: statement is bitfield "
4005 "access ");
4006 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4008 return false;
4011 if (DR_BASE_ADDRESS (dr)
4012 && TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
4014 if (dump_enabled_p ())
4015 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4016 "not vectorized: base addr of dr is a "
4017 "constant\n");
4018 return false;
4021 /* Check whether this may be a SIMD lane access and adjust the
4022 DR to make it easier for us to handle it. */
4023 if (loop
4024 && loop->simduid
4025 && (!DR_BASE_ADDRESS (dr)
4026 || !DR_OFFSET (dr)
4027 || !DR_INIT (dr)
4028 || !DR_STEP (dr)))
4030 struct data_reference *newdr
4031 = create_data_ref (NULL, loop_containing_stmt (stmt), DR_REF (dr), stmt,
4032 DR_IS_READ (dr), DR_IS_CONDITIONAL_IN_STMT (dr));
4033 if (DR_BASE_ADDRESS (newdr)
4034 && DR_OFFSET (newdr)
4035 && DR_INIT (newdr)
4036 && DR_STEP (newdr)
4037 && integer_zerop (DR_STEP (newdr)))
4039 tree off = DR_OFFSET (newdr);
4040 STRIP_NOPS (off);
4041 if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
4042 && TREE_CODE (off) == MULT_EXPR
4043 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
4045 tree step = TREE_OPERAND (off, 1);
4046 off = TREE_OPERAND (off, 0);
4047 STRIP_NOPS (off);
4048 if (CONVERT_EXPR_P (off)
4049 && (TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off, 0)))
4050 < TYPE_PRECISION (TREE_TYPE (off))))
4051 off = TREE_OPERAND (off, 0);
4052 if (TREE_CODE (off) == SSA_NAME)
4054 gimple *def = SSA_NAME_DEF_STMT (off);
4055 tree reft = TREE_TYPE (DR_REF (newdr));
4056 if (is_gimple_call (def)
4057 && gimple_call_internal_p (def)
4058 && (gimple_call_internal_fn (def) == IFN_GOMP_SIMD_LANE))
4060 tree arg = gimple_call_arg (def, 0);
4061 gcc_assert (TREE_CODE (arg) == SSA_NAME);
4062 arg = SSA_NAME_VAR (arg);
4063 if (arg == loop->simduid
4064 /* For now. */
4065 && tree_int_cst_equal (TYPE_SIZE_UNIT (reft), step))
4067 DR_OFFSET (newdr) = ssize_int (0);
4068 DR_STEP (newdr) = step;
4069 DR_OFFSET_ALIGNMENT (newdr) = BIGGEST_ALIGNMENT;
4070 DR_STEP_ALIGNMENT (newdr)
4071 = highest_pow2_factor (step);
4072 /* Mark as simd-lane access. */
4073 newdr->aux = (void *)-1;
4074 free_data_ref (dr);
4075 datarefs->safe_push (newdr);
4076 return true;
4082 free_data_ref (newdr);
4085 datarefs->safe_push (dr);
4086 return true;
4089 /* Function vect_analyze_data_refs.
4091 Find all the data references in the loop or basic block.
4093 The general structure of the analysis of data refs in the vectorizer is as
4094 follows:
4095 1- vect_analyze_data_refs(loop/bb): call
4096 compute_data_dependences_for_loop/bb to find and analyze all data-refs
4097 in the loop/bb and their dependences.
4098 2- vect_analyze_dependences(): apply dependence testing using ddrs.
4099 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
4100 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
4104 bool
4105 vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf)
4107 struct loop *loop = NULL;
4108 unsigned int i;
4109 struct data_reference *dr;
4110 tree scalar_type;
4112 DUMP_VECT_SCOPE ("vect_analyze_data_refs");
4114 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4115 loop = LOOP_VINFO_LOOP (loop_vinfo);
4117 /* Go through the data-refs, check that the analysis succeeded. Update
4118 pointer from stmt_vec_info struct to DR and vectype. */
4120 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
4121 FOR_EACH_VEC_ELT (datarefs, i, dr)
4123 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
4124 poly_uint64 vf;
4126 gcc_assert (DR_REF (dr));
4127 stmt_vec_info stmt_info = vect_dr_stmt (dr);
4129 /* Check that analysis of the data-ref succeeded. */
4130 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
4131 || !DR_STEP (dr))
4133 bool maybe_gather
4134 = DR_IS_READ (dr)
4135 && !TREE_THIS_VOLATILE (DR_REF (dr))
4136 && (targetm.vectorize.builtin_gather != NULL
4137 || supports_vec_gather_load_p ());
4138 bool maybe_scatter
4139 = DR_IS_WRITE (dr)
4140 && !TREE_THIS_VOLATILE (DR_REF (dr))
4141 && (targetm.vectorize.builtin_scatter != NULL
4142 || supports_vec_scatter_store_p ());
4144 /* If target supports vector gather loads or scatter stores,
4145 see if they can be used. */
4146 if (is_a <loop_vec_info> (vinfo)
4147 && !nested_in_vect_loop_p (loop, stmt_info))
4149 if (maybe_gather || maybe_scatter)
4151 if (maybe_gather)
4152 gatherscatter = GATHER;
4153 else
4154 gatherscatter = SCATTER;
4158 if (gatherscatter == SG_NONE)
4160 if (dump_enabled_p ())
4162 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4163 "not vectorized: data ref analysis "
4164 "failed ");
4165 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
4166 stmt_info->stmt, 0);
4168 if (is_a <bb_vec_info> (vinfo))
4170 /* In BB vectorization the ref can still participate
4171 in dependence analysis, we just can't vectorize it. */
4172 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4173 continue;
4175 return false;
4179 /* See if this was detected as a SIMD lane access. */
4180 if (dr->aux == (void *)-1)
4182 if (nested_in_vect_loop_p (loop, stmt_info))
4184 if (dump_enabled_p ())
4186 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4187 "not vectorized: data ref analysis "
4188 "failed ");
4189 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
4190 stmt_info->stmt, 0);
4192 return false;
4194 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
4197 tree base = get_base_address (DR_REF (dr));
4198 if (base && VAR_P (base) && DECL_NONALIASED (base))
4200 if (dump_enabled_p ())
4202 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4203 "not vectorized: base object not addressable "
4204 "for stmt: ");
4205 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
4206 stmt_info->stmt, 0);
4208 if (is_a <bb_vec_info> (vinfo))
4210 /* In BB vectorization the ref can still participate
4211 in dependence analysis, we just can't vectorize it. */
4212 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4213 continue;
4215 return false;
4218 if (is_a <loop_vec_info> (vinfo)
4219 && DR_STEP (dr)
4220 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
4222 if (nested_in_vect_loop_p (loop, stmt_info))
4224 if (dump_enabled_p ())
4226 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4227 "not vectorized: not suitable for strided "
4228 "load ");
4229 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
4230 stmt_info->stmt, 0);
4232 return false;
4234 STMT_VINFO_STRIDED_P (stmt_info) = true;
4237 /* Update DR field in stmt_vec_info struct. */
4239 /* If the dataref is in an inner-loop of the loop that is considered for
4240 vectorization, we also want to analyze the access relative to
4241 the outer-loop (DR contains information only relative to the
4242 inner-most enclosing loop). We do that by building a reference to the
4243 first location accessed by the inner-loop, and analyze it relative to
4244 the outer-loop. */
4245 if (loop && nested_in_vect_loop_p (loop, stmt_info))
4247 /* Build a reference to the first location accessed by the
4248 inner loop: *(BASE + INIT + OFFSET). By construction,
4249 this address must be invariant in the inner loop, so we
4250 can consider it as being used in the outer loop. */
4251 tree base = unshare_expr (DR_BASE_ADDRESS (dr));
4252 tree offset = unshare_expr (DR_OFFSET (dr));
4253 tree init = unshare_expr (DR_INIT (dr));
4254 tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
4255 init, offset);
4256 tree init_addr = fold_build_pointer_plus (base, init_offset);
4257 tree init_ref = build_fold_indirect_ref (init_addr);
4259 if (dump_enabled_p ())
4261 dump_printf_loc (MSG_NOTE, vect_location,
4262 "analyze in outer loop: ");
4263 dump_generic_expr (MSG_NOTE, TDF_SLIM, init_ref);
4264 dump_printf (MSG_NOTE, "\n");
4267 if (!dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
4268 init_ref, loop))
4269 /* dr_analyze_innermost already explained the failure. */
4270 return false;
4272 if (dump_enabled_p ())
4274 dump_printf_loc (MSG_NOTE, vect_location,
4275 "\touter base_address: ");
4276 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4277 STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
4278 dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
4279 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4280 STMT_VINFO_DR_OFFSET (stmt_info));
4281 dump_printf (MSG_NOTE,
4282 "\n\touter constant offset from base address: ");
4283 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4284 STMT_VINFO_DR_INIT (stmt_info));
4285 dump_printf (MSG_NOTE, "\n\touter step: ");
4286 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4287 STMT_VINFO_DR_STEP (stmt_info));
4288 dump_printf (MSG_NOTE, "\n\touter base alignment: %d\n",
4289 STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info));
4290 dump_printf (MSG_NOTE, "\n\touter base misalignment: %d\n",
4291 STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info));
4292 dump_printf (MSG_NOTE, "\n\touter offset alignment: %d\n",
4293 STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info));
4294 dump_printf (MSG_NOTE, "\n\touter step alignment: %d\n",
4295 STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
4299 gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
4300 STMT_VINFO_DATA_REF (stmt_info) = dr;
4302 /* Set vectype for STMT. */
4303 scalar_type = TREE_TYPE (DR_REF (dr));
4304 STMT_VINFO_VECTYPE (stmt_info)
4305 = get_vectype_for_scalar_type (scalar_type);
4306 if (!STMT_VINFO_VECTYPE (stmt_info))
4308 if (dump_enabled_p ())
4310 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4311 "not vectorized: no vectype for stmt: ");
4312 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
4313 stmt_info->stmt, 0);
4314 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
4315 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
4316 scalar_type);
4317 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
4320 if (is_a <bb_vec_info> (vinfo))
4322 /* No vector type is fine, the ref can still participate
4323 in dependence analysis, we just can't vectorize it. */
4324 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4325 continue;
4327 return false;
4329 else
4331 if (dump_enabled_p ())
4333 dump_printf_loc (MSG_NOTE, vect_location,
4334 "got vectype for stmt: ");
4335 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt_info->stmt, 0);
4336 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4337 STMT_VINFO_VECTYPE (stmt_info));
4338 dump_printf (MSG_NOTE, "\n");
4342 /* Adjust the minimal vectorization factor according to the
4343 vector type. */
4344 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
4345 *min_vf = upper_bound (*min_vf, vf);
4347 if (gatherscatter != SG_NONE)
4349 gather_scatter_info gs_info;
4350 if (!vect_check_gather_scatter (stmt_info,
4351 as_a <loop_vec_info> (vinfo),
4352 &gs_info)
4353 || !get_vectype_for_scalar_type (TREE_TYPE (gs_info.offset)))
4355 if (dump_enabled_p ())
4357 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4358 (gatherscatter == GATHER) ?
4359 "not vectorized: not suitable for gather "
4360 "load " :
4361 "not vectorized: not suitable for scatter "
4362 "store ");
4363 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
4364 stmt_info->stmt, 0);
4366 return false;
4368 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
4372 /* We used to stop processing and prune the list here. Verify we no
4373 longer need to. */
4374 gcc_assert (i == datarefs.length ());
4376 return true;
4380 /* Function vect_get_new_vect_var.
4382 Returns a name for a new variable. The current naming scheme prepends the
4383 prefix "vect_" or "vectp_" (depending on the value of VAR_KIND) to the
4384 names of vectorizer generated variables, with NAME appended after the
4385 prefix if it is provided. */
4387 tree
4388 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
4390 const char *prefix;
4391 tree new_vect_var;
4393 switch (var_kind)
4395 case vect_simple_var:
4396 prefix = "vect";
4397 break;
4398 case vect_scalar_var:
4399 prefix = "stmp";
4400 break;
4401 case vect_mask_var:
4402 prefix = "mask";
4403 break;
4404 case vect_pointer_var:
4405 prefix = "vectp";
4406 break;
4407 default:
4408 gcc_unreachable ();
4411 if (name)
4413 char* tmp = concat (prefix, "_", name, NULL);
4414 new_vect_var = create_tmp_reg (type, tmp);
4415 free (tmp);
4417 else
4418 new_vect_var = create_tmp_reg (type, prefix);
4420 return new_vect_var;
4423 /* Like vect_get_new_vect_var but return an SSA name. */
4425 tree
4426 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
4428 const char *prefix;
4429 tree new_vect_var;
4431 switch (var_kind)
4433 case vect_simple_var:
4434 prefix = "vect";
4435 break;
4436 case vect_scalar_var:
4437 prefix = "stmp";
4438 break;
4439 case vect_pointer_var:
4440 prefix = "vectp";
4441 break;
4442 default:
4443 gcc_unreachable ();
4446 if (name)
4448 char* tmp = concat (prefix, "_", name, NULL);
4449 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4450 free (tmp);
4452 else
4453 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4455 return new_vect_var;
4458 /* Duplicate ptr info and set alignment/misalignment on NAME from DR. */
4460 static void
4461 vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr)
4463 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr));
4464 int misalign = DR_MISALIGNMENT (dr);
4465 if (misalign == DR_MISALIGNMENT_UNKNOWN)
4466 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4467 else
4468 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name),
4469 DR_TARGET_ALIGNMENT (dr), misalign);
4472 /* Function vect_create_addr_base_for_vector_ref.
4474 Create an expression that computes the address of the first memory location
4475 that will be accessed for a data reference.
4477 Input:
4478 STMT: The statement containing the data reference.
4479 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4480 OFFSET: Optional. If supplied, it is added to the initial address.
4481 LOOP: Specify relative to which loop-nest the address should be computed.
4482 For example, when the dataref is in an inner-loop nested in an
4483 outer-loop that is now being vectorized, LOOP can be either the
4484 outer-loop, or the inner-loop. The first memory location accessed
4485 by the following dataref ('in' points to short):
4487 for (i=0; i<N; i++)
4488 for (j=0; j<M; j++)
4489 s += in[i+j]
4491 is as follows:
4492 if LOOP=i_loop: &in (relative to i_loop)
4493 if LOOP=j_loop: &in+i*2B (relative to j_loop)
4494 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
4495 initial address. Unlike OFFSET, which is the number of elements to
4496 be added, BYTE_OFFSET is measured in bytes.
4498 Output:
4499 1. Return an SSA_NAME whose value is the address of the memory location of
4500 the first vector of the data reference.
4501 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4502 these statement(s) which define the returned SSA_NAME.
4504 FORNOW: We are only handling array accesses with step 1. */
4506 tree
4507 vect_create_addr_base_for_vector_ref (gimple *stmt,
4508 gimple_seq *new_stmt_list,
4509 tree offset,
4510 tree byte_offset)
4512 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4513 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4514 const char *base_name;
4515 tree addr_base;
4516 tree dest;
4517 gimple_seq seq = NULL;
4518 tree vect_ptr_type;
4519 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
4520 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4521 innermost_loop_behavior *drb = vect_dr_behavior (dr);
4523 tree data_ref_base = unshare_expr (drb->base_address);
4524 tree base_offset = unshare_expr (drb->offset);
4525 tree init = unshare_expr (drb->init);
4527 if (loop_vinfo)
4528 base_name = get_name (data_ref_base);
4529 else
4531 base_offset = ssize_int (0);
4532 init = ssize_int (0);
4533 base_name = get_name (DR_REF (dr));
4536 /* Create base_offset */
4537 base_offset = size_binop (PLUS_EXPR,
4538 fold_convert (sizetype, base_offset),
4539 fold_convert (sizetype, init));
4541 if (offset)
4543 offset = fold_build2 (MULT_EXPR, sizetype,
4544 fold_convert (sizetype, offset), step);
4545 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4546 base_offset, offset);
4548 if (byte_offset)
4550 byte_offset = fold_convert (sizetype, byte_offset);
4551 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4552 base_offset, byte_offset);
4555 /* base + base_offset */
4556 if (loop_vinfo)
4557 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4558 else
4560 addr_base = build1 (ADDR_EXPR,
4561 build_pointer_type (TREE_TYPE (DR_REF (dr))),
4562 unshare_expr (DR_REF (dr)));
4565 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4566 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4567 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4568 gimple_seq_add_seq (new_stmt_list, seq);
4570 if (DR_PTR_INFO (dr)
4571 && TREE_CODE (addr_base) == SSA_NAME
4572 && !SSA_NAME_PTR_INFO (addr_base))
4574 vect_duplicate_ssa_name_ptr_info (addr_base, dr);
4575 if (offset || byte_offset)
4576 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4579 if (dump_enabled_p ())
4581 dump_printf_loc (MSG_NOTE, vect_location, "created ");
4582 dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
4583 dump_printf (MSG_NOTE, "\n");
4586 return addr_base;
4590 /* Function vect_create_data_ref_ptr.
4592 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4593 location accessed in the loop by STMT, along with the def-use update
4594 chain to appropriately advance the pointer through the loop iterations.
4595 Also set aliasing information for the pointer. This pointer is used by
4596 the callers to this function to create a memory reference expression for
4597 vector load/store access.
4599 Input:
4600 1. STMT: a stmt that references memory. Expected to be of the form
4601 GIMPLE_ASSIGN <name, data-ref> or
4602 GIMPLE_ASSIGN <data-ref, name>.
4603 2. AGGR_TYPE: the type of the reference, which should be either a vector
4604 or an array.
4605 3. AT_LOOP: the loop where the vector memref is to be created.
4606 4. OFFSET (optional): an offset to be added to the initial address accessed
4607 by the data-ref in STMT.
4608 5. BSI: location where the new stmts are to be placed if there is no loop
4609 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4610 pointing to the initial address.
4611 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4612 to the initial address accessed by the data-ref in STMT. This is
4613 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4614 in bytes.
4615 8. IV_STEP (optional, defaults to NULL): the amount that should be added
4616 to the IV during each iteration of the loop. NULL says to move
4617 by one copy of AGGR_TYPE up or down, depending on the step of the
4618 data reference.
4620 Output:
4621 1. Declare a new ptr to vector_type, and have it point to the base of the
4622 data reference (initial address accessed by the data reference).
4623 For example, for vector of type V8HI, the following code is generated:
4625 v8hi *ap;
4626 ap = (v8hi *)initial_address;
4628 if OFFSET is not supplied:
4629 initial_address = &a[init];
4630 if OFFSET is supplied:
4631 initial_address = &a[init + OFFSET];
4632 if BYTE_OFFSET is supplied:
4633 initial_address = &a[init] + BYTE_OFFSET;
4635 Return the initial_address in INITIAL_ADDRESS.
4637 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4638 update the pointer in each iteration of the loop.
4640 Return the increment stmt that updates the pointer in PTR_INCR.
4642 3. Set INV_P to true if the access pattern of the data reference in the
4643 vectorized loop is invariant. Set it to false otherwise.
4645 4. Return the pointer. */
4647 tree
4648 vect_create_data_ref_ptr (gimple *stmt, tree aggr_type, struct loop *at_loop,
4649 tree offset, tree *initial_address,
4650 gimple_stmt_iterator *gsi, gimple **ptr_incr,
4651 bool only_init, bool *inv_p, tree byte_offset,
4652 tree iv_step)
4654 const char *base_name;
4655 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4656 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4657 struct loop *loop = NULL;
4658 bool nested_in_vect_loop = false;
4659 struct loop *containing_loop = NULL;
4660 tree aggr_ptr_type;
4661 tree aggr_ptr;
4662 tree new_temp;
4663 gimple_seq new_stmt_list = NULL;
4664 edge pe = NULL;
4665 basic_block new_bb;
4666 tree aggr_ptr_init;
4667 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4668 tree aptr;
4669 gimple_stmt_iterator incr_gsi;
4670 bool insert_after;
4671 tree indx_before_incr, indx_after_incr;
4672 gimple *incr;
4673 tree step;
4674 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4676 gcc_assert (iv_step != NULL_TREE
4677 || TREE_CODE (aggr_type) == ARRAY_TYPE
4678 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4680 if (loop_vinfo)
4682 loop = LOOP_VINFO_LOOP (loop_vinfo);
4683 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4684 containing_loop = (gimple_bb (stmt))->loop_father;
4685 pe = loop_preheader_edge (loop);
4687 else
4689 gcc_assert (bb_vinfo);
4690 only_init = true;
4691 *ptr_incr = NULL;
4694 /* Check the step (evolution) of the load in LOOP, and record
4695 whether it's invariant. */
4696 step = vect_dr_behavior (dr)->step;
4697 if (integer_zerop (step))
4698 *inv_p = true;
4699 else
4700 *inv_p = false;
4702 /* Create an expression for the first address accessed by this load
4703 in LOOP. */
4704 base_name = get_name (DR_BASE_ADDRESS (dr));
4706 if (dump_enabled_p ())
4708 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4709 dump_printf_loc (MSG_NOTE, vect_location,
4710 "create %s-pointer variable to type: ",
4711 get_tree_code_name (TREE_CODE (aggr_type)));
4712 dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4713 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4714 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4715 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4716 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4717 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4718 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4719 else
4720 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4721 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4722 dump_printf (MSG_NOTE, "\n");
4725 /* (1) Create the new aggregate-pointer variable.
4726 Vector and array types inherit the alias set of their component
4727 type by default so we need to use a ref-all pointer if the data
4728 reference does not conflict with the created aggregated data
4729 reference because it is not addressable. */
4730 bool need_ref_all = false;
4731 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4732 get_alias_set (DR_REF (dr))))
4733 need_ref_all = true;
4734 /* Likewise for any of the data references in the stmt group. */
4735 else if (DR_GROUP_SIZE (stmt_info) > 1)
4737 gimple *orig_stmt = DR_GROUP_FIRST_ELEMENT (stmt_info);
4740 stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
4741 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4742 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4743 get_alias_set (DR_REF (sdr))))
4745 need_ref_all = true;
4746 break;
4748 orig_stmt = DR_GROUP_NEXT_ELEMENT (sinfo);
4750 while (orig_stmt);
4752 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4753 need_ref_all);
4754 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4757 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4758 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4759 def-use update cycles for the pointer: one relative to the outer-loop
4760 (LOOP), which is what steps (3) and (4) below do. The other is relative
4761 to the inner-loop (which is the inner-most loop containing the dataref),
4762 and this is done by step (5) below.
4764 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4765 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4766 redundant. Steps (3),(4) create the following:
4768 vp0 = &base_addr;
4769 LOOP: vp1 = phi(vp0,vp2)
4772 vp2 = vp1 + step
4773 goto LOOP
4775 If there is an inner-loop nested in loop, then step (5) will also be
4776 applied, and an additional update in the inner-loop will be created:
4778 vp0 = &base_addr;
4779 LOOP: vp1 = phi(vp0,vp2)
4781 inner: vp3 = phi(vp1,vp4)
4782 vp4 = vp3 + inner_step
4783 if () goto inner
4785 vp2 = vp1 + step
4786 if () goto LOOP */
4788 /* (2) Calculate the initial address of the aggregate-pointer, and set
4789 the aggregate-pointer to point to it before the loop. */
4791 /* Create &(base[init_val + offset]) + byte_offset in the loop preheader. */
4793 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
4794 offset, byte_offset);
4795 if (new_stmt_list)
4797 if (pe)
4799 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4800 gcc_assert (!new_bb);
4802 else
4803 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4806 *initial_address = new_temp;
4807 aggr_ptr_init = new_temp;
4809 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4810 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4811 inner-loop nested in LOOP (during outer-loop vectorization). */
4813 /* No update in loop is required. */
4814 if (only_init && (!loop_vinfo || at_loop == loop))
4815 aptr = aggr_ptr_init;
4816 else
4818 if (iv_step == NULL_TREE)
4820 /* The step of the aggregate pointer is the type size. */
4821 iv_step = TYPE_SIZE_UNIT (aggr_type);
4822 /* One exception to the above is when the scalar step of the load in
4823 LOOP is zero. In this case the step here is also zero. */
4824 if (*inv_p)
4825 iv_step = size_zero_node;
4826 else if (tree_int_cst_sgn (step) == -1)
4827 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4830 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4832 create_iv (aggr_ptr_init,
4833 fold_convert (aggr_ptr_type, iv_step),
4834 aggr_ptr, loop, &incr_gsi, insert_after,
4835 &indx_before_incr, &indx_after_incr);
4836 incr = gsi_stmt (incr_gsi);
4837 loop_vinfo->add_stmt (incr);
4839 /* Copy the points-to information if it exists. */
4840 if (DR_PTR_INFO (dr))
4842 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr);
4843 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr);
4845 if (ptr_incr)
4846 *ptr_incr = incr;
4848 aptr = indx_before_incr;
4851 if (!nested_in_vect_loop || only_init)
4852 return aptr;
4855 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4856 nested in LOOP, if it exists. */
4858 gcc_assert (nested_in_vect_loop);
4859 if (!only_init)
4861 standard_iv_increment_position (containing_loop, &incr_gsi,
4862 &insert_after);
4863 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4864 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4865 &indx_after_incr);
4866 incr = gsi_stmt (incr_gsi);
4867 loop_vinfo->add_stmt (incr);
4869 /* Copy the points-to information if it exists. */
4870 if (DR_PTR_INFO (dr))
4872 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr);
4873 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr);
4875 if (ptr_incr)
4876 *ptr_incr = incr;
4878 return indx_before_incr;
4880 else
4881 gcc_unreachable ();
4885 /* Function bump_vector_ptr
4887 Increment a pointer (to a vector type) by vector-size. If requested,
4888 i.e. if PTR-INCR is given, then also connect the new increment stmt
4889 to the existing def-use update-chain of the pointer, by modifying
4890 the PTR_INCR as illustrated below:
4892 The pointer def-use update-chain before this function:
4893 DATAREF_PTR = phi (p_0, p_2)
4894 ....
4895 PTR_INCR: p_2 = DATAREF_PTR + step
4897 The pointer def-use update-chain after this function:
4898 DATAREF_PTR = phi (p_0, p_2)
4899 ....
4900 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4901 ....
4902 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4904 Input:
4905 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4906 in the loop.
4907 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4908 the loop. The increment amount across iterations is expected
4909 to be vector_size.
4910 BSI - location where the new update stmt is to be placed.
4911 STMT - the original scalar memory-access stmt that is being vectorized.
4912 BUMP - optional. The offset by which to bump the pointer. If not given,
4913 the offset is assumed to be vector_size.
4915 Output: Return NEW_DATAREF_PTR as illustrated above.
4919 tree
4920 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4921 gimple *stmt, tree bump)
4923 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4924 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4925 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4926 tree update = TYPE_SIZE_UNIT (vectype);
4927 gassign *incr_stmt;
4928 ssa_op_iter iter;
4929 use_operand_p use_p;
4930 tree new_dataref_ptr;
4932 if (bump)
4933 update = bump;
4935 if (TREE_CODE (dataref_ptr) == SSA_NAME)
4936 new_dataref_ptr = copy_ssa_name (dataref_ptr);
4937 else
4938 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4939 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4940 dataref_ptr, update);
4941 vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4943 /* Copy the points-to information if it exists. */
4944 if (DR_PTR_INFO (dr))
4946 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4947 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4950 if (!ptr_incr)
4951 return new_dataref_ptr;
4953 /* Update the vector-pointer's cross-iteration increment. */
4954 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4956 tree use = USE_FROM_PTR (use_p);
4958 if (use == dataref_ptr)
4959 SET_USE (use_p, new_dataref_ptr);
4960 else
4961 gcc_assert (operand_equal_p (use, update, 0));
4964 return new_dataref_ptr;
4968 /* Copy memory reference info such as base/clique from the SRC reference
4969 to the DEST MEM_REF. */
4971 void
4972 vect_copy_ref_info (tree dest, tree src)
4974 if (TREE_CODE (dest) != MEM_REF)
4975 return;
4977 tree src_base = src;
4978 while (handled_component_p (src_base))
4979 src_base = TREE_OPERAND (src_base, 0);
4980 if (TREE_CODE (src_base) != MEM_REF
4981 && TREE_CODE (src_base) != TARGET_MEM_REF)
4982 return;
4984 MR_DEPENDENCE_CLIQUE (dest) = MR_DEPENDENCE_CLIQUE (src_base);
4985 MR_DEPENDENCE_BASE (dest) = MR_DEPENDENCE_BASE (src_base);
4989 /* Function vect_create_destination_var.
4991 Create a new temporary of type VECTYPE. */
4993 tree
4994 vect_create_destination_var (tree scalar_dest, tree vectype)
4996 tree vec_dest;
4997 const char *name;
4998 char *new_name;
4999 tree type;
5000 enum vect_var_kind kind;
5002 kind = vectype
5003 ? VECTOR_BOOLEAN_TYPE_P (vectype)
5004 ? vect_mask_var
5005 : vect_simple_var
5006 : vect_scalar_var;
5007 type = vectype ? vectype : TREE_TYPE (scalar_dest);
5009 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
5011 name = get_name (scalar_dest);
5012 if (name)
5013 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
5014 else
5015 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
5016 vec_dest = vect_get_new_vect_var (type, kind, new_name);
5017 free (new_name);
5019 return vec_dest;
5022 /* Function vect_grouped_store_supported.
5024 Returns TRUE if interleave high and interleave low permutations
5025 are supported, and FALSE otherwise. */
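/* As an illustration, for V8HI vectors the power-of-two case below checks
   the interleave-high selector {0, 8, 1, 9, 2, 10, 3, 11} and the
   interleave-low selector {4, 12, 5, 13, 6, 14, 7, 15}.  */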
5027 bool
5028 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
5030 machine_mode mode = TYPE_MODE (vectype);
5032 /* vect_permute_store_chain requires the group size to be equal to 3 or
5033 be a power of two. */
5034 if (count != 3 && exact_log2 (count) == -1)
5036 if (dump_enabled_p ())
5037 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5038 "the size of the group of accesses"
5039 " is not a power of 2 or not eqaul to 3\n");
5040 return false;
5043 /* Check that the permutation is supported. */
5044 if (VECTOR_MODE_P (mode))
5046 unsigned int i;
5047 if (count == 3)
5049 unsigned int j0 = 0, j1 = 0, j2 = 0;
5050 unsigned int i, j;
5052 unsigned int nelt;
5053 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5055 if (dump_enabled_p ())
5056 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5057 "cannot handle groups of 3 stores for"
5058 " variable-length vectors\n");
5059 return false;
5062 vec_perm_builder sel (nelt, nelt, 1);
5063 sel.quick_grow (nelt);
5064 vec_perm_indices indices;
5065 for (j = 0; j < 3; j++)
5067 int nelt0 = ((3 - j) * nelt) % 3;
5068 int nelt1 = ((3 - j) * nelt + 1) % 3;
5069 int nelt2 = ((3 - j) * nelt + 2) % 3;
5070 for (i = 0; i < nelt; i++)
5072 if (3 * i + nelt0 < nelt)
5073 sel[3 * i + nelt0] = j0++;
5074 if (3 * i + nelt1 < nelt)
5075 sel[3 * i + nelt1] = nelt + j1++;
5076 if (3 * i + nelt2 < nelt)
5077 sel[3 * i + nelt2] = 0;
5079 indices.new_vector (sel, 2, nelt);
5080 if (!can_vec_perm_const_p (mode, indices))
5082 if (dump_enabled_p ())
5083 dump_printf (MSG_MISSED_OPTIMIZATION,
5084 "permutation op not supported by target.\n");
5085 return false;
5088 for (i = 0; i < nelt; i++)
5090 if (3 * i + nelt0 < nelt)
5091 sel[3 * i + nelt0] = 3 * i + nelt0;
5092 if (3 * i + nelt1 < nelt)
5093 sel[3 * i + nelt1] = 3 * i + nelt1;
5094 if (3 * i + nelt2 < nelt)
5095 sel[3 * i + nelt2] = nelt + j2++;
5097 indices.new_vector (sel, 2, nelt);
5098 if (!can_vec_perm_const_p (mode, indices))
5100 if (dump_enabled_p ())
5101 dump_printf (MSG_MISSED_OPTIMIZATION,
5102 "permutation op not supported by target.\n");
5103 return false;
5106 return true;
5108 else
5110 /* If length is not equal to 3 then only a power of 2 is supported. */
5111 gcc_assert (pow2p_hwi (count));
5112 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5114 /* The encoding has 2 interleaved stepped patterns. */
5115 vec_perm_builder sel (nelt, 2, 3);
5116 sel.quick_grow (6);
5117 for (i = 0; i < 3; i++)
5119 sel[i * 2] = i;
5120 sel[i * 2 + 1] = i + nelt;
5122 vec_perm_indices indices (sel, 2, nelt);
5123 if (can_vec_perm_const_p (mode, indices))
5125 for (i = 0; i < 6; i++)
5126 sel[i] += exact_div (nelt, 2);
5127 indices.new_vector (sel, 2, nelt);
5128 if (can_vec_perm_const_p (mode, indices))
5129 return true;
5134 if (dump_enabled_p ())
5135 dump_printf (MSG_MISSED_OPTIMIZATION,
5136 "permutaion op not supported by target.\n");
5137 return false;
5141 /* Return TRUE if vec_{mask_}store_lanes is available for COUNT vectors of
5142 type VECTYPE. MASKED_P says whether the masked form is needed. */
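/* On targets with multi-register structure stores the vec_store_lanes
   optabs are expected to map onto those instructions (e.g. AArch64
   ST2/ST3/ST4); this is only an illustrative note, not a requirement.  */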
5144 bool
5145 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5146 bool masked_p)
5148 if (masked_p)
5149 return vect_lanes_optab_supported_p ("vec_mask_store_lanes",
5150 vec_mask_store_lanes_optab,
5151 vectype, count);
5152 else
5153 return vect_lanes_optab_supported_p ("vec_store_lanes",
5154 vec_store_lanes_optab,
5155 vectype, count);
5159 /* Function vect_permute_store_chain.
5161 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
5162 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
5163 the data correctly for the stores. Return the final references for stores
5164 in RESULT_CHAIN.
5166 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5167 The input is 4 vectors each containing 8 elements. We assign a number to
5168 each element, the input sequence is:
5170 1st vec: 0 1 2 3 4 5 6 7
5171 2nd vec: 8 9 10 11 12 13 14 15
5172 3rd vec: 16 17 18 19 20 21 22 23
5173 4th vec: 24 25 26 27 28 29 30 31
5175 The output sequence should be:
5177 1st vec: 0 8 16 24 1 9 17 25
5178 2nd vec: 2 10 18 26 3 11 19 27
5179 3rd vec: 4 12 20 28 5 13 21 29
5180 4th vec: 6 14 22 30 7 15 23 31
5182 i.e., we interleave the contents of the four vectors in their order.
5184 We use interleave_high/low instructions to create such output. The input of
5185 each interleave_high/low operation is two vectors:
5186 1st vec 2nd vec
5187 0 1 2 3 4 5 6 7
5188 the even elements of the result vector are obtained left-to-right from the
5189 high/low elements of the first vector. The odd elements of the result are
5190 obtained left-to-right from the high/low elements of the second vector.
5191 The output of interleave_high will be: 0 4 1 5
5192 and of interleave_low: 2 6 3 7
5195 The permutation is done in log LENGTH stages. In each stage interleave_high
5196 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5197 where the first argument is taken from the first half of DR_CHAIN and the
5198 second argument from its second half.
5199 In our example,
5201 I1: interleave_high (1st vec, 3rd vec)
5202 I2: interleave_low (1st vec, 3rd vec)
5203 I3: interleave_high (2nd vec, 4th vec)
5204 I4: interleave_low (2nd vec, 4th vec)
5206 The output for the first stage is:
5208 I1: 0 16 1 17 2 18 3 19
5209 I2: 4 20 5 21 6 22 7 23
5210 I3: 8 24 9 25 10 26 11 27
5211 I4: 12 28 13 29 14 30 15 31
5213 The output of the second stage, i.e. the final result is:
5215 I1: 0 8 16 24 1 9 17 25
5216 I2: 2 10 18 26 3 11 19 27
5217 I3: 4 12 20 28 5 13 21 29
5218 I4: 6 14 22 30 7 15 23 31. */
5220 void
5221 vect_permute_store_chain (vec<tree> dr_chain,
5222 unsigned int length,
5223 gimple *stmt,
5224 gimple_stmt_iterator *gsi,
5225 vec<tree> *result_chain)
5227 tree vect1, vect2, high, low;
5228 gimple *perm_stmt;
5229 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5230 tree perm_mask_low, perm_mask_high;
5231 tree data_ref;
5232 tree perm3_mask_low, perm3_mask_high;
5233 unsigned int i, j, n, log_length = exact_log2 (length);
5235 result_chain->quick_grow (length);
5236 memcpy (result_chain->address (), dr_chain.address (),
5237 length * sizeof (tree));
5239 if (length == 3)
5241 /* vect_grouped_store_supported ensures that this is constant. */
5242 unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5243 unsigned int j0 = 0, j1 = 0, j2 = 0;
5245 vec_perm_builder sel (nelt, nelt, 1);
5246 sel.quick_grow (nelt);
5247 vec_perm_indices indices;
5248 for (j = 0; j < 3; j++)
5250 int nelt0 = ((3 - j) * nelt) % 3;
5251 int nelt1 = ((3 - j) * nelt + 1) % 3;
5252 int nelt2 = ((3 - j) * nelt + 2) % 3;
5254 for (i = 0; i < nelt; i++)
5256 if (3 * i + nelt0 < nelt)
5257 sel[3 * i + nelt0] = j0++;
5258 if (3 * i + nelt1 < nelt)
5259 sel[3 * i + nelt1] = nelt + j1++;
5260 if (3 * i + nelt2 < nelt)
5261 sel[3 * i + nelt2] = 0;
5263 indices.new_vector (sel, 2, nelt);
5264 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5266 for (i = 0; i < nelt; i++)
5268 if (3 * i + nelt0 < nelt)
5269 sel[3 * i + nelt0] = 3 * i + nelt0;
5270 if (3 * i + nelt1 < nelt)
5271 sel[3 * i + nelt1] = 3 * i + nelt1;
5272 if (3 * i + nelt2 < nelt)
5273 sel[3 * i + nelt2] = nelt + j2++;
5275 indices.new_vector (sel, 2, nelt);
5276 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5278 vect1 = dr_chain[0];
5279 vect2 = dr_chain[1];
5281 /* Create interleaving stmt:
5282 low = VEC_PERM_EXPR <vect1, vect2,
5283 {j, nelt, *, j + 1, nelt + j + 1, *,
5284 j + 2, nelt + j + 2, *, ...}> */
5285 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5286 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5287 vect2, perm3_mask_low);
5288 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5290 vect1 = data_ref;
5291 vect2 = dr_chain[2];
5292 /* Create interleaving stmt:
5293 low = VEC_PERM_EXPR <vect1, vect2,
5294 {0, 1, nelt + j, 3, 4, nelt + j + 1,
5295 6, 7, nelt + j + 2, ...}> */
5296 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5297 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5298 vect2, perm3_mask_high);
5299 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5300 (*result_chain)[j] = data_ref;
5303 else
5305 /* If length is not equal to 3 then only a power of 2 is supported. */
5306 gcc_assert (pow2p_hwi (length));
5308 /* The encoding has 2 interleaved stepped patterns. */
5309 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5310 vec_perm_builder sel (nelt, 2, 3);
5311 sel.quick_grow (6);
5312 for (i = 0; i < 3; i++)
5314 sel[i * 2] = i;
5315 sel[i * 2 + 1] = i + nelt;
5317 vec_perm_indices indices (sel, 2, nelt);
5318 perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5320 for (i = 0; i < 6; i++)
5321 sel[i] += exact_div (nelt, 2);
5322 indices.new_vector (sel, 2, nelt);
5323 perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5325 for (i = 0, n = log_length; i < n; i++)
5327 for (j = 0; j < length/2; j++)
5329 vect1 = dr_chain[j];
5330 vect2 = dr_chain[j+length/2];
5332 /* Create interleaving stmt:
5333 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
5334 ...}> */
5335 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
5336 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
5337 vect2, perm_mask_high);
5338 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5339 (*result_chain)[2*j] = high;
5341 /* Create interleaving stmt:
5342 low = VEC_PERM_EXPR <vect1, vect2,
5343 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
5344 ...}> */
5345 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
5346 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
5347 vect2, perm_mask_low);
5348 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5349 (*result_chain)[2*j+1] = low;
5351 memcpy (dr_chain.address (), result_chain->address (),
5352 length * sizeof (tree));
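/* Illustrative sketch, not part of GCC: a scalar model of the log2 (LENGTH)
   interleave stages implemented above, operating on plain element indices
   instead of SSA names.  CHAIN holds LENGTH vectors of NELT indices laid out
   contiguously, LENGTH is a power of two, and SCRATCH has the same size as
   CHAIN.  For LENGTH == 4 and NELT == 8 the final contents of CHAIN are the
   interleaved order documented in the comment (0 8 16 24 1 9 17 25, ...).
   The helper name is hypothetical.  */

static void
model_store_interleave (unsigned int *chain, unsigned int length,
                        unsigned int nelt, unsigned int *scratch)
{
  for (unsigned int stage = 1; stage < length; stage *= 2)
    {
      for (unsigned int j = 0; j < length / 2; j++)
        {
          const unsigned int *v1 = chain + j * nelt;
          const unsigned int *v2 = chain + (j + length / 2) * nelt;
          unsigned int *high = scratch + (2 * j) * nelt;
          unsigned int *low = scratch + (2 * j + 1) * nelt;
          for (unsigned int i = 0; i < nelt / 2; i++)
            {
              /* perm_mask_high: {0, nelt, 1, nelt + 1, ...}.  */
              high[2 * i] = v1[i];
              high[2 * i + 1] = v2[i];
              /* perm_mask_low: {nelt/2, nelt*3/2, nelt/2 + 1, ...}.  */
              low[2 * i] = v1[nelt / 2 + i];
              low[2 * i + 1] = v2[nelt / 2 + i];
            }
        }
      /* Mirror the memcpy back into DR_CHAIN at the end of each stage.  */
      for (unsigned int i = 0; i < length * nelt; i++)
        chain[i] = scratch[i];
    }
}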
5357 /* Function vect_setup_realignment
5359 This function is called when vectorizing an unaligned load using
5360 the dr_explicit_realign[_optimized] scheme.
5361 This function generates the following code at the loop prolog:
5363 p = initial_addr;
5364 x msq_init = *(floor(p)); # prolog load
5365 realignment_token = call target_builtin;
5366 loop:
5367 x msq = phi (msq_init, ---)
5369 The stmts marked with x are generated only for the case of
5370 dr_explicit_realign_optimized.
5372 The code above sets up a new (vector) pointer, pointing to the first
5373 location accessed by STMT, and a "floor-aligned" load using that pointer.
5374 It also generates code to compute the "realignment-token" (if the relevant
5375 target hook was defined), and creates a phi-node at the loop-header bb
5376 whose arguments are the result of the prolog-load (created by this
5377 function) and the result of a load that takes place in the loop (to be
5378 created by the caller to this function).
5380 For the case of dr_explicit_realign_optimized:
5381 The caller to this function uses the phi-result (msq) to create the
5382 realignment code inside the loop, and sets up the missing phi argument,
5383 as follows:
5384 loop:
5385 msq = phi (msq_init, lsq)
5386 lsq = *(floor(p')); # load in loop
5387 result = realign_load (msq, lsq, realignment_token);
5389 For the case of dr_explicit_realign:
5390 loop:
5391 msq = *(floor(p)); # load in loop
5392 p' = p + (VS-1);
5393 lsq = *(floor(p')); # load in loop
5394 result = realign_load (msq, lsq, realignment_token);
5396 Input:
5397 STMT - (scalar) load stmt to be vectorized. This load accesses
5398 a memory location that may be unaligned.
5399 BSI - place where new code is to be inserted.
5400 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5401 is used.
5403 Output:
5404 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5405 target hook, if defined.
5406 Return value - the result of the loop-header phi node. */
5408 tree
5409 vect_setup_realignment (gimple *stmt, gimple_stmt_iterator *gsi,
5410 tree *realignment_token,
5411 enum dr_alignment_support alignment_support_scheme,
5412 tree init_addr,
5413 struct loop **at_loop)
5415 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5416 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5417 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5418 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
5419 struct loop *loop = NULL;
5420 edge pe = NULL;
5421 tree scalar_dest = gimple_assign_lhs (stmt);
5422 tree vec_dest;
5423 gimple *inc;
5424 tree ptr;
5425 tree data_ref;
5426 basic_block new_bb;
5427 tree msq_init = NULL_TREE;
5428 tree new_temp;
5429 gphi *phi_stmt;
5430 tree msq = NULL_TREE;
5431 gimple_seq stmts = NULL;
5432 bool inv_p;
5433 bool compute_in_loop = false;
5434 bool nested_in_vect_loop = false;
5435 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
5436 struct loop *loop_for_initial_load = NULL;
5438 if (loop_vinfo)
5440 loop = LOOP_VINFO_LOOP (loop_vinfo);
5441 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
5444 gcc_assert (alignment_support_scheme == dr_explicit_realign
5445 || alignment_support_scheme == dr_explicit_realign_optimized);
5447 /* We need to generate three things:
5448 1. the misalignment computation
5449 2. the extra vector load (for the optimized realignment scheme).
5450 3. the phi node for the two vectors from which the realignment is
5451 done (for the optimized realignment scheme). */
5453 /* 1. Determine where to generate the misalignment computation.
5455 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5456 calculation will be generated by this function, outside the loop (in the
5457 preheader). Otherwise, INIT_ADDR had already been computed for us by the
5458 caller, inside the loop.
5460 Background: If the misalignment remains fixed throughout the iterations of
5461 the loop, then both realignment schemes are applicable, and also the
5462 misalignment computation can be done outside LOOP. This is because we are
5463 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5464 are a multiple of VS (the Vector Size), and therefore the misalignment in
5465 different vectorized LOOP iterations is always the same.
5466 The problem arises only if the memory access is in an inner-loop nested
5467 inside LOOP, which is now being vectorized using outer-loop vectorization.
5468 This is the only case when the misalignment of the memory access may not
5469 remain fixed throughout the iterations of the inner-loop (as explained in
5470 detail in vect_supportable_dr_alignment). In this case, not only is the
5471 optimized realignment scheme not applicable, but also the misalignment
5472 computation (and generation of the realignment token that is passed to
5473 REALIGN_LOAD) have to be done inside the loop.
5475 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5476 or not, which in turn determines if the misalignment is computed inside
5477 the inner-loop, or outside LOOP. */
5479 if (init_addr != NULL_TREE || !loop_vinfo)
5481 compute_in_loop = true;
5482 gcc_assert (alignment_support_scheme == dr_explicit_realign);
5486 /* 2. Determine where to generate the extra vector load.
5488 For the optimized realignment scheme, instead of generating two vector
5489 loads in each iteration, we generate a single extra vector load in the
5490 preheader of the loop, and in each iteration reuse the result of the
5491 vector load from the previous iteration. In case the memory access is in
5492 an inner-loop nested inside LOOP, which is now being vectorized using
5493 outer-loop vectorization, we need to determine whether this initial vector
5494 load should be generated at the preheader of the inner-loop, or can be
5495 generated at the preheader of LOOP. If the memory access has no evolution
5496 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5497 to be generated inside LOOP (in the preheader of the inner-loop). */
5499 if (nested_in_vect_loop)
5501 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5502 bool invariant_in_outerloop =
5503 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5504 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5506 else
5507 loop_for_initial_load = loop;
5508 if (at_loop)
5509 *at_loop = loop_for_initial_load;
5511 if (loop_for_initial_load)
5512 pe = loop_preheader_edge (loop_for_initial_load);
5514 /* 3. For the case of the optimized realignment, create the first vector
5515 load at the loop preheader. */
5517 if (alignment_support_scheme == dr_explicit_realign_optimized)
5519 /* Create msq_init = *(floor(p1)) in the loop preheader */
5520 gassign *new_stmt;
5522 gcc_assert (!compute_in_loop);
5523 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5524 ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
5525 NULL_TREE, &init_addr, NULL, &inc,
5526 true, &inv_p);
5527 if (TREE_CODE (ptr) == SSA_NAME)
5528 new_temp = copy_ssa_name (ptr);
5529 else
5530 new_temp = make_ssa_name (TREE_TYPE (ptr));
5531 unsigned int align = DR_TARGET_ALIGNMENT (dr);
5532 new_stmt = gimple_build_assign
5533 (new_temp, BIT_AND_EXPR, ptr,
5534 build_int_cst (TREE_TYPE (ptr), -(HOST_WIDE_INT) align));
5535 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5536 gcc_assert (!new_bb);
5537 data_ref
5538 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5539 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5540 vect_copy_ref_info (data_ref, DR_REF (dr));
5541 new_stmt = gimple_build_assign (vec_dest, data_ref);
5542 new_temp = make_ssa_name (vec_dest, new_stmt);
5543 gimple_assign_set_lhs (new_stmt, new_temp);
5544 if (pe)
5546 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5547 gcc_assert (!new_bb);
5549 else
5550 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5552 msq_init = gimple_assign_lhs (new_stmt);
5555 /* 4. Create realignment token using a target builtin, if available.
5556 It is done either inside the containing loop, or before LOOP (as
5557 determined above). */
5559 if (targetm.vectorize.builtin_mask_for_load)
5561 gcall *new_stmt;
5562 tree builtin_decl;
5564 /* Compute INIT_ADDR - the initial address accessed by this memref. */
5565 if (!init_addr)
5567 /* Generate the INIT_ADDR computation outside LOOP. */
5568 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
5569 NULL_TREE);
5570 if (loop)
5572 pe = loop_preheader_edge (loop);
5573 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5574 gcc_assert (!new_bb);
5576 else
5577 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5580 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5581 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5582 vec_dest =
5583 vect_create_destination_var (scalar_dest,
5584 gimple_call_return_type (new_stmt));
5585 new_temp = make_ssa_name (vec_dest, new_stmt);
5586 gimple_call_set_lhs (new_stmt, new_temp);
5588 if (compute_in_loop)
5589 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5590 else
5592 /* Generate the misalignment computation outside LOOP. */
5593 pe = loop_preheader_edge (loop);
5594 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5595 gcc_assert (!new_bb);
5598 *realignment_token = gimple_call_lhs (new_stmt);
5600 /* The result of the CALL_EXPR to this builtin is determined from
5601 the value of the parameter and no global variables are touched,
5602 which makes the builtin a "const" function. Requiring the
5603 builtin to have the "const" attribute makes it unnecessary
5604 to call mark_call_clobbered. */
5605 gcc_assert (TREE_READONLY (builtin_decl));
5608 if (alignment_support_scheme == dr_explicit_realign)
5609 return msq;
5611 gcc_assert (!compute_in_loop);
5612 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5615 /* 5. Create msq = phi <msq_init, lsq> in loop */
5617 pe = loop_preheader_edge (containing_loop);
5618 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5619 msq = make_ssa_name (vec_dest);
5620 phi_stmt = create_phi_node (msq, containing_loop->header);
5621 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5623 return msq;
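/* Illustrative sketch, not part of GCC: a byte-level model of the
   dr_explicit_realign scheme set up above.  MSQ and LSQ are the two
   floor-aligned loads that surround the unaligned address, and the
   realignment token is modelled here simply as the byte misalignment; real
   targets encode it as a permute mask returned by builtin_mask_for_load.
   The helper name is hypothetical.  */

static void
model_realign_load (const unsigned char *msq, const unsigned char *lsq,
                    unsigned int misalign, unsigned int vecsize,
                    unsigned char *result)
{
  /* result = (msq ++ lsq)[misalign .. misalign + vecsize - 1].  */
  for (unsigned int i = 0; i < vecsize; i++)
    result[i] = (misalign + i < vecsize
                 ? msq[misalign + i]
                 : lsq[misalign + i - vecsize]);
}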
5627 /* Function vect_grouped_load_supported.
5629 COUNT is the size of the load group (the number of statements plus the
5630 number of gaps). SINGLE_ELEMENT_P is true if there is actually
5631 only one statement, with a gap of COUNT - 1.
5633 Returns true if a suitable permute exists. */
5635 bool
5636 vect_grouped_load_supported (tree vectype, bool single_element_p,
5637 unsigned HOST_WIDE_INT count)
5639 machine_mode mode = TYPE_MODE (vectype);
5641 /* If this is single-element interleaving with an element distance
5642 that leaves unused vector loads around, punt - we would at least create
5643 very sub-optimal code in that case (and blow up memory,
5644 see PR65518). */
5645 if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype)))
5647 if (dump_enabled_p ())
5648 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5649 "single-element interleaving not supported "
5650 "for not adjacent vector loads\n");
5651 return false;
5654 /* vect_permute_load_chain requires the group size to be equal to 3 or
5655 be a power of two. */
5656 if (count != 3 && exact_log2 (count) == -1)
5658 if (dump_enabled_p ())
5659 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5660 "the size of the group of accesses"
5661 " is not a power of 2 or not equal to 3\n");
5662 return false;
5665 /* Check that the permutation is supported. */
5666 if (VECTOR_MODE_P (mode))
5668 unsigned int i, j;
5669 if (count == 3)
5671 unsigned int nelt;
5672 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5674 if (dump_enabled_p ())
5675 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5676 "cannot handle groups of 3 loads for"
5677 " variable-length vectors\n");
5678 return false;
5681 vec_perm_builder sel (nelt, nelt, 1);
5682 sel.quick_grow (nelt);
5683 vec_perm_indices indices;
5684 unsigned int k;
5685 for (k = 0; k < 3; k++)
5687 for (i = 0; i < nelt; i++)
5688 if (3 * i + k < 2 * nelt)
5689 sel[i] = 3 * i + k;
5690 else
5691 sel[i] = 0;
5692 indices.new_vector (sel, 2, nelt);
5693 if (!can_vec_perm_const_p (mode, indices))
5695 if (dump_enabled_p ())
5696 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5697 "shuffle of 3 loads is not supported by"
5698 " target\n");
5699 return false;
5701 for (i = 0, j = 0; i < nelt; i++)
5702 if (3 * i + k < 2 * nelt)
5703 sel[i] = i;
5704 else
5705 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5706 indices.new_vector (sel, 2, nelt);
5707 if (!can_vec_perm_const_p (mode, indices))
5709 if (dump_enabled_p ())
5710 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5711 "shuffle of 3 loads is not supported by"
5712 " target\n");
5713 return false;
5716 return true;
5718 else
5720 /* If length is not equal to 3 then only power of 2 is supported. */
5721 gcc_assert (pow2p_hwi (count));
5722 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5724 /* The encoding has a single stepped pattern. */
5725 vec_perm_builder sel (nelt, 1, 3);
5726 sel.quick_grow (3);
5727 for (i = 0; i < 3; i++)
5728 sel[i] = i * 2;
5729 vec_perm_indices indices (sel, 2, nelt);
5730 if (can_vec_perm_const_p (mode, indices))
5732 for (i = 0; i < 3; i++)
5733 sel[i] = i * 2 + 1;
5734 indices.new_vector (sel, 2, nelt);
5735 if (can_vec_perm_const_p (mode, indices))
5736 return true;
5741 if (dump_enabled_p ())
5742 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5743 "extract even/odd not supported by target\n");
5744 return false;
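/* Illustrative sketch, not part of GCC: the full permute selectors that the
   stepped encodings above (3 starting elements, step 2) expand to for a
   fixed NELT.  These are the two even/odd selections the power-of-two case
   asks the target about; elements are numbered 0 .. 2*NELT-1 across the
   concatenation of the two input vectors.  The helper name is
   hypothetical.  */

static void
model_even_odd_selectors (unsigned int nelt,
                          unsigned int *sel_even, unsigned int *sel_odd)
{
  for (unsigned int i = 0; i < nelt; i++)
    {
      sel_even[i] = 2 * i;      /* 0, 2, 4, ...  */
      sel_odd[i] = 2 * i + 1;   /* 1, 3, 5, ...  */
    }
}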
5747 /* Return TRUE if vec_{masked_}load_lanes is available for COUNT vectors of
5748 type VECTYPE. MASKED_P says whether the masked form is needed. */
5750 bool
5751 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5752 bool masked_p)
5754 if (masked_p)
5755 return vect_lanes_optab_supported_p ("vec_mask_load_lanes",
5756 vec_mask_load_lanes_optab,
5757 vectype, count);
5758 else
5759 return vect_lanes_optab_supported_p ("vec_load_lanes",
5760 vec_load_lanes_optab,
5761 vectype, count);
5764 /* Function vect_permute_load_chain.
5766 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5767 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5768 the input data correctly. Return the final references for loads in
5769 RESULT_CHAIN.
5771 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5772 The input is 4 vectors each containing 8 elements. We assign a number to each
5773 element, the input sequence is:
5775 1st vec: 0 1 2 3 4 5 6 7
5776 2nd vec: 8 9 10 11 12 13 14 15
5777 3rd vec: 16 17 18 19 20 21 22 23
5778 4th vec: 24 25 26 27 28 29 30 31
5780 The output sequence should be:
5782 1st vec: 0 4 8 12 16 20 24 28
5783 2nd vec: 1 5 9 13 17 21 25 29
5784 3rd vec: 2 6 10 14 18 22 26 30
5785 4th vec: 3 7 11 15 19 23 27 31
5787 i.e., the first output vector should contain the first elements of each
5788 interleaving group, etc.
5790 We use extract_even/odd instructions to create such output. The input of
5791 each extract_even/odd operation is two vectors
5792 1st vec 2nd vec
5793 0 1 2 3 4 5 6 7
5795 and the output is the vector of extracted even/odd elements. The output of
5796 extract_even will be: 0 2 4 6
5797 and of extract_odd: 1 3 5 7
5800 The permutation is done in log LENGTH stages. In each stage extract_even
5801 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5802 their order. In our example,
5804 E1: extract_even (1st vec, 2nd vec)
5805 E2: extract_odd (1st vec, 2nd vec)
5806 E3: extract_even (3rd vec, 4th vec)
5807 E4: extract_odd (3rd vec, 4th vec)
5809 The output for the first stage will be:
5811 E1: 0 2 4 6 8 10 12 14
5812 E2: 1 3 5 7 9 11 13 15
5813 E3: 16 18 20 22 24 26 28 30
5814 E4: 17 19 21 23 25 27 29 31
5816 In order to proceed and create the correct sequence for the next stage (or
5817 for the correct output, if the second stage is the last one, as in our
5818 example), we first put the output of extract_even operation and then the
5819 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5820 The input for the second stage is:
5822 1st vec (E1): 0 2 4 6 8 10 12 14
5823 2nd vec (E3): 16 18 20 22 24 26 28 30
5824 3rd vec (E2): 1 3 5 7 9 11 13 15
5825 4th vec (E4): 17 19 21 23 25 27 29 31
5827 The output of the second stage:
5829 E1: 0 4 8 12 16 20 24 28
5830 E2: 2 6 10 14 18 22 26 30
5831 E3: 1 5 9 13 17 21 25 29
5832 E4: 3 7 11 15 19 23 27 31
5834 And RESULT_CHAIN after reordering:
5836 1st vec (E1): 0 4 8 12 16 20 24 28
5837 2nd vec (E3): 1 5 9 13 17 21 25 29
5838 3rd vec (E2): 2 6 10 14 18 22 26 30
5839 4th vec (E4): 3 7 11 15 19 23 27 31. */
5841 static void
5842 vect_permute_load_chain (vec<tree> dr_chain,
5843 unsigned int length,
5844 gimple *stmt,
5845 gimple_stmt_iterator *gsi,
5846 vec<tree> *result_chain)
5848 tree data_ref, first_vect, second_vect;
5849 tree perm_mask_even, perm_mask_odd;
5850 tree perm3_mask_low, perm3_mask_high;
5851 gimple *perm_stmt;
5852 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5853 unsigned int i, j, log_length = exact_log2 (length);
5855 result_chain->quick_grow (length);
5856 memcpy (result_chain->address (), dr_chain.address (),
5857 length * sizeof (tree));
5859 if (length == 3)
5861 /* vect_grouped_load_supported ensures that this is constant. */
5862 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5863 unsigned int k;
5865 vec_perm_builder sel (nelt, nelt, 1);
5866 sel.quick_grow (nelt);
5867 vec_perm_indices indices;
5868 for (k = 0; k < 3; k++)
5870 for (i = 0; i < nelt; i++)
5871 if (3 * i + k < 2 * nelt)
5872 sel[i] = 3 * i + k;
5873 else
5874 sel[i] = 0;
5875 indices.new_vector (sel, 2, nelt);
5876 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5878 for (i = 0, j = 0; i < nelt; i++)
5879 if (3 * i + k < 2 * nelt)
5880 sel[i] = i;
5881 else
5882 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5883 indices.new_vector (sel, 2, nelt);
5884 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5886 first_vect = dr_chain[0];
5887 second_vect = dr_chain[1];
5889 /* Create interleaving stmt (low part of):
5890 low = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5891 ...}> */
5892 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5893 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5894 second_vect, perm3_mask_low);
5895 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5897 /* Create interleaving stmt (high part of):
5898 high = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5899 ...}> */
5900 first_vect = data_ref;
5901 second_vect = dr_chain[2];
5902 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5903 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5904 second_vect, perm3_mask_high);
5905 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5906 (*result_chain)[k] = data_ref;
5909 else
5911 /* If length is not equal to 3 then only power of 2 is supported. */
5912 gcc_assert (pow2p_hwi (length));
5914 /* The encoding has a single stepped pattern. */
5915 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5916 vec_perm_builder sel (nelt, 1, 3);
5917 sel.quick_grow (3);
5918 for (i = 0; i < 3; ++i)
5919 sel[i] = i * 2;
5920 vec_perm_indices indices (sel, 2, nelt);
5921 perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);
5923 for (i = 0; i < 3; ++i)
5924 sel[i] = i * 2 + 1;
5925 indices.new_vector (sel, 2, nelt);
5926 perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
5928 for (i = 0; i < log_length; i++)
5930 for (j = 0; j < length; j += 2)
5932 first_vect = dr_chain[j];
5933 second_vect = dr_chain[j+1];
5935 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5936 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5937 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5938 first_vect, second_vect,
5939 perm_mask_even);
5940 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5941 (*result_chain)[j/2] = data_ref;
5943 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5944 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5945 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5946 first_vect, second_vect,
5947 perm_mask_odd);
5948 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5949 (*result_chain)[j/2+length/2] = data_ref;
5951 memcpy (dr_chain.address (), result_chain->address (),
5952 length * sizeof (tree));
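/* Illustrative sketch, not part of GCC: a scalar model of the log2 (LENGTH)
   extract_even/odd stages implemented above, applied to plain element
   indices.  CHAIN holds LENGTH vectors of NELT indices laid out contiguously,
   LENGTH is a power of two, and SCRATCH has the same size as CHAIN.  For
   LENGTH == 4 and NELT == 8 the final contents of CHAIN are the
   de-interleaved order documented in the comment (0 4 8 12 ..., 1 5 9 13 ...,
   ...).  The helper name is hypothetical.  */

static void
model_load_deinterleave (unsigned int *chain, unsigned int length,
                         unsigned int nelt, unsigned int *scratch)
{
  for (unsigned int stage = 1; stage < length; stage *= 2)
    {
      for (unsigned int j = 0; j < length; j += 2)
        {
          const unsigned int *v1 = chain + j * nelt;
          const unsigned int *v2 = chain + (j + 1) * nelt;
          unsigned int *even = scratch + (j / 2) * nelt;
          unsigned int *odd = scratch + (j / 2 + length / 2) * nelt;
          for (unsigned int i = 0; i < nelt; i++)
            {
              /* perm_mask_even: {0, 2, 4, ...} over the concatenation.  */
              even[i] = (2 * i < nelt ? v1[2 * i] : v2[2 * i - nelt]);
              /* perm_mask_odd: {1, 3, 5, ...} over the concatenation.  */
              odd[i] = (2 * i + 1 < nelt
                        ? v1[2 * i + 1] : v2[2 * i + 1 - nelt]);
            }
        }
      /* Mirror the memcpy back into DR_CHAIN at the end of each stage.  */
      for (unsigned int i = 0; i < length * nelt; i++)
        chain[i] = scratch[i];
    }
}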
5957 /* Function vect_shift_permute_load_chain.
5959 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
5960 sequence of stmts to reorder the input data accordingly.
5961 Return the final references for loads in RESULT_CHAIN.
5962 Return true if successful, false otherwise.
5964 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5965 The input is 3 vectors each containing 8 elements. We assign a
5966 number to each element, the input sequence is:
5968 1st vec: 0 1 2 3 4 5 6 7
5969 2nd vec: 8 9 10 11 12 13 14 15
5970 3rd vec: 16 17 18 19 20 21 22 23
5972 The output sequence should be:
5974 1st vec: 0 3 6 9 12 15 18 21
5975 2nd vec: 1 4 7 10 13 16 19 22
5976 3rd vec: 2 5 8 11 14 17 20 23
5978 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
5980 First we shuffle all 3 vectors to get correct elements order:
5982 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
5983 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
5984 3rd vec: (16 19 22) (17 20 23) (18 21)
5986 Next we unite and shift vector 3 times:
5988 1st step:
5989 shift right by 6 the concatenation of:
5990 "1st vec" and "2nd vec"
5991 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
5992 "2nd vec" and "3rd vec"
5993 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
5994 "3rd vec" and "1st vec"
5995 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
5996 | New vectors |
5998 So that now new vectors are:
6000 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
6001 2nd vec: (10 13) (16 19 22) (17 20 23)
6002 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
6004 2nd step:
6005 shift right by 5 the concatenation of:
6006 "1st vec" and "3rd vec"
6007 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
6008 "2nd vec" and "1st vec"
6009 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
6010 "3rd vec" and "2nd vec"
6011 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
6012 | New vectors |
6014 So that now new vectors are:
6016 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
6017 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
6018 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
6020 3rd step:
6021 shift right by 5 the concatenation of:
6022 "1st vec" and "1st vec"
6023 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
6024 shift right by 3 the concatenation of:
6025 "2nd vec" and "2nd vec"
6026 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
6027 | New vectors |
6029 So that now all vectors are READY:
6030 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
6031 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
6032 3rd vec: ( 1 4 7) (10 13) (16 19 22)
6034 This algorithm is faster than the one in vect_permute_load_chain if:
6035 1. "shift of a concatenation" is faster than general permutation.
6036 This is usually so.
6037 2. The TARGET machine can't execute vector instructions in parallel.
6038 This is because each step of the algorithm depends on the previous one.
6039 The algorithm in vect_permute_load_chain is much more parallel.
6041 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
6044 static bool
6045 vect_shift_permute_load_chain (vec<tree> dr_chain,
6046 unsigned int length,
6047 gimple *stmt,
6048 gimple_stmt_iterator *gsi,
6049 vec<tree> *result_chain)
6051 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
6052 tree perm2_mask1, perm2_mask2, perm3_mask;
6053 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
6054 gimple *perm_stmt;
6056 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
6057 unsigned int i;
6058 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6059 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6061 unsigned HOST_WIDE_INT nelt, vf;
6062 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt)
6063 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
6064 /* Not supported for variable-length vectors. */
6065 return false;
6067 vec_perm_builder sel (nelt, nelt, 1);
6068 sel.quick_grow (nelt);
6070 result_chain->quick_grow (length);
6071 memcpy (result_chain->address (), dr_chain.address (),
6072 length * sizeof (tree));
6074 if (pow2p_hwi (length) && vf > 4)
6076 unsigned int j, log_length = exact_log2 (length);
6077 for (i = 0; i < nelt / 2; ++i)
6078 sel[i] = i * 2;
6079 for (i = 0; i < nelt / 2; ++i)
6080 sel[nelt / 2 + i] = i * 2 + 1;
6081 vec_perm_indices indices (sel, 2, nelt);
6082 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6084 if (dump_enabled_p ())
6085 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6086 "shuffle of 2 fields structure is not \
6087 supported by target\n");
6088 return false;
6090 perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);
6092 for (i = 0; i < nelt / 2; ++i)
6093 sel[i] = i * 2 + 1;
6094 for (i = 0; i < nelt / 2; ++i)
6095 sel[nelt / 2 + i] = i * 2;
6096 indices.new_vector (sel, 2, nelt);
6097 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6099 if (dump_enabled_p ())
6100 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6101 "shuffle of 2 fields structure is not \
6102 supported by target\n");
6103 return false;
6105 perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);
6107 /* Generating permutation constant to shift all elements.
6108 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
6109 for (i = 0; i < nelt; i++)
6110 sel[i] = nelt / 2 + i;
6111 indices.new_vector (sel, 2, nelt);
6112 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6114 if (dump_enabled_p ())
6115 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6116 "shift permutation is not supported by target\n");
6117 return false;
6119 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6121 /* Generating permutation constant to select vector from 2.
6122 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
6123 for (i = 0; i < nelt / 2; i++)
6124 sel[i] = i;
6125 for (i = nelt / 2; i < nelt; i++)
6126 sel[i] = nelt + i;
6127 indices.new_vector (sel, 2, nelt);
6128 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6130 if (dump_enabled_p ())
6131 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6132 "select is not supported by target\n");
6133 return false;
6135 select_mask = vect_gen_perm_mask_checked (vectype, indices);
6137 for (i = 0; i < log_length; i++)
6139 for (j = 0; j < length; j += 2)
6141 first_vect = dr_chain[j];
6142 second_vect = dr_chain[j + 1];
6144 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6145 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6146 first_vect, first_vect,
6147 perm2_mask1);
6148 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6149 vect[0] = data_ref;
6151 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6152 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6153 second_vect, second_vect,
6154 perm2_mask2);
6155 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6156 vect[1] = data_ref;
6158 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
6159 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6160 vect[0], vect[1], shift1_mask);
6161 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6162 (*result_chain)[j/2 + length/2] = data_ref;
6164 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
6165 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6166 vect[0], vect[1], select_mask);
6167 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6168 (*result_chain)[j/2] = data_ref;
6170 memcpy (dr_chain.address (), result_chain->address (),
6171 length * sizeof (tree));
6173 return true;
6175 if (length == 3 && vf > 2)
6177 unsigned int k = 0, l = 0;
6179 /* Generating permutation constant to get all elements in the right order.
6180 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
6181 for (i = 0; i < nelt; i++)
6183 if (3 * k + (l % 3) >= nelt)
6185 k = 0;
6186 l += (3 - (nelt % 3));
6188 sel[i] = 3 * k + (l % 3);
6189 k++;
6191 vec_perm_indices indices (sel, 2, nelt);
6192 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6194 if (dump_enabled_p ())
6195 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6196 "shuffle of 3 fields structure is not \
6197 supported by target\n");
6198 return false;
6200 perm3_mask = vect_gen_perm_mask_checked (vectype, indices);
6202 /* Generating permutation constant to shift all elements.
6203 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
6204 for (i = 0; i < nelt; i++)
6205 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
6206 indices.new_vector (sel, 2, nelt);
6207 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6209 if (dump_enabled_p ())
6210 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6211 "shift permutation is not supported by target\n");
6212 return false;
6214 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6216 /* Generating permutation constant to shift all elements.
6217 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6218 for (i = 0; i < nelt; i++)
6219 sel[i] = 2 * (nelt / 3) + 1 + i;
6220 indices.new_vector (sel, 2, nelt);
6221 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6223 if (dump_enabled_p ())
6224 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6225 "shift permutation is not supported by target\n");
6226 return false;
6228 shift2_mask = vect_gen_perm_mask_checked (vectype, indices);
6230 /* Generating permutation constant to shift all elements.
6231 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
6232 for (i = 0; i < nelt; i++)
6233 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
6234 indices.new_vector (sel, 2, nelt);
6235 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6237 if (dump_enabled_p ())
6238 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6239 "shift permutation is not supported by target\n");
6240 return false;
6242 shift3_mask = vect_gen_perm_mask_checked (vectype, indices);
6244 /* Generating permutation constant to shift all elements.
6245 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6246 for (i = 0; i < nelt; i++)
6247 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
6248 indices.new_vector (sel, 2, nelt);
6249 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6251 if (dump_enabled_p ())
6252 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6253 "shift permutation is not supported by target\n");
6254 return false;
6256 shift4_mask = vect_gen_perm_mask_checked (vectype, indices);
6258 for (k = 0; k < 3; k++)
6260 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
6261 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6262 dr_chain[k], dr_chain[k],
6263 perm3_mask);
6264 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6265 vect[k] = data_ref;
6268 for (k = 0; k < 3; k++)
6270 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
6271 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6272 vect[k % 3], vect[(k + 1) % 3],
6273 shift1_mask);
6274 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6275 vect_shift[k] = data_ref;
6278 for (k = 0; k < 3; k++)
6280 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
6281 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6282 vect_shift[(4 - k) % 3],
6283 vect_shift[(3 - k) % 3],
6284 shift2_mask);
6285 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6286 vect[k] = data_ref;
6289 (*result_chain)[3 - (nelt % 3)] = vect[2];
6291 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
6292 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
6293 vect[0], shift3_mask);
6294 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6295 (*result_chain)[nelt % 3] = data_ref;
6297 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
6298 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
6299 vect[1], shift4_mask);
6300 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6301 (*result_chain)[0] = data_ref;
6302 return true;
6304 return false;
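/* Illustrative sketch, not part of GCC: a scalar model of one step of the
   power-of-two path above.  Each input vector is first shuffled so that its
   even elements occupy one half and its odd elements the other
   (perm2_mask1/perm2_mask2); a single cross-vector "shift" (shift1_mask) and
   a "select" (select_mask) then yield the same even/odd split that
   vect_permute_load_chain computes with two general two-operand permutes.
   The helper name is hypothetical.  */

static void
model_shift_permute_pair (const unsigned int *first,
                          const unsigned int *second, unsigned int nelt,
                          unsigned int *out_even, unsigned int *out_odd)
{
  for (unsigned int i = 0; i < nelt / 2; i++)
    {
      /* perm2_mask1 applied to FIRST: evens in the low half, odds in the
         high half.  perm2_mask2 applied to SECOND: odds in the low half,
         evens in the high half.  */
      unsigned int v0_lo = first[2 * i];
      unsigned int v0_hi = first[2 * i + 1];
      unsigned int v1_lo = second[2 * i + 1];
      unsigned int v1_hi = second[2 * i];

      /* select_mask {0 .. nelt/2-1, nelt + nelt/2 .. 2*nelt-1}: low half of
         vect[0] followed by high half of vect[1] -> the even elements.  */
      out_even[i] = v0_lo;
      out_even[nelt / 2 + i] = v1_hi;

      /* shift1_mask {nelt/2 .. nelt/2 + nelt - 1}: high half of vect[0]
         followed by low half of vect[1] -> the odd elements.  */
      out_odd[i] = v0_hi;
      out_odd[nelt / 2 + i] = v1_lo;
    }
}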
6307 /* Function vect_transform_grouped_load.
6309 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
6310 to perform their permutation and ascribe the result vectorized statements to
6311 the scalar statements.
6314 void
6315 vect_transform_grouped_load (gimple *stmt, vec<tree> dr_chain, int size,
6316 gimple_stmt_iterator *gsi)
6318 machine_mode mode;
6319 vec<tree> result_chain = vNULL;
6321 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
6322 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
6323 vectors, that are ready for vector computation. */
6324 result_chain.create (size);
6326 /* If the reassociation width for the vector type is 2 or greater, the target
6327 machine can execute 2 or more vector instructions in parallel. Otherwise try
6328 to get a chain for the load group using vect_shift_permute_load_chain. */
6329 mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
6330 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
6331 || pow2p_hwi (size)
6332 || !vect_shift_permute_load_chain (dr_chain, size, stmt,
6333 gsi, &result_chain))
6334 vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
6335 vect_record_grouped_load_vectors (stmt, result_chain);
6336 result_chain.release ();
6339 /* RESULT_CHAIN contains the output of a group of grouped loads that were
6340 generated as part of the vectorization of STMT. Assign the statement
6341 for each vector to the associated scalar statement. */
6343 void
6344 vect_record_grouped_load_vectors (gimple *stmt, vec<tree> result_chain)
6346 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6347 vec_info *vinfo = stmt_info->vinfo;
6348 gimple *first_stmt = DR_GROUP_FIRST_ELEMENT (stmt_info);
6349 gimple *next_stmt;
6350 unsigned int i, gap_count;
6351 tree tmp_data_ref;
6353 /* Put a permuted data-ref in the VECTORIZED_STMT field.
6354 Since we scan the chain starting from its first node, their order
6355 corresponds to the order of data-refs in RESULT_CHAIN.
6356 next_stmt = first_stmt;
6357 gap_count = 1;
6358 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
6360 if (!next_stmt)
6361 break;
6363 /* Skip the gaps. Loads created for the gaps will be removed by the dead
6364 code elimination pass later. No need to check for the first stmt in
6365 the group, since it always exists.
6366 DR_GROUP_GAP is the number of steps in elements from the previous
6367 access (if there is no gap DR_GROUP_GAP is 1). We skip loads that
6368 correspond to the gaps. */
6369 if (next_stmt != first_stmt
6370 && gap_count < DR_GROUP_GAP (vinfo_for_stmt (next_stmt)))
6372 gap_count++;
6373 continue;
6376 while (next_stmt)
6378 stmt_vec_info new_stmt_info = vinfo->lookup_def (tmp_data_ref);
6379 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
6380 copies, and we put the new vector statement in the first available
6381 RELATED_STMT. */
6382 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
6383 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt_info;
6384 else
6386 if (!DR_GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
6388 stmt_vec_info prev_stmt_info
6389 = STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
6390 stmt_vec_info rel_stmt_info
6391 = STMT_VINFO_RELATED_STMT (prev_stmt_info);
6392 while (rel_stmt_info)
6394 prev_stmt_info = rel_stmt_info;
6395 rel_stmt_info = STMT_VINFO_RELATED_STMT (rel_stmt_info);
6398 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info;
6402 next_stmt = DR_GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
6403 gap_count = 1;
6404 /* If NEXT_STMT accesses the same DR as the previous statement,
6405 put the same TMP_DATA_REF as its vectorized statement; otherwise
6406 get the next data-ref from RESULT_CHAIN. */
6407 if (!next_stmt || !DR_GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
6408 break;
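/* Illustrative sketch, not part of GCC: the gap handling above, reduced to
   indices.  Each group member carries a gap count (DR_GROUP_GAP, 1 when
   there is no gap); a permuted vector from RESULT_CHAIN is assigned only
   once GAP_COUNT has caught up with that distance, so vectors generated for
   the gaps are simply skipped.  The multiple-copy and same-DR handling of
   the real code is omitted; the type and helper names are hypothetical.  */

struct group_member_model
{
  unsigned int gap;             /* Modelled DR_GROUP_GAP.  */
  int assigned_vector;          /* Index into RESULT_CHAIN, or -1.  */
};

static void
model_record_group_vectors (struct group_member_model *members,
                            unsigned int n_members, unsigned int n_results)
{
  unsigned int member = 0, gap_count = 1;
  for (unsigned int r = 0; r < n_results && member < n_members; r++)
    {
      /* Skip vectors that correspond to gaps (never for the first member,
         which always exists).  */
      if (member != 0 && gap_count < members[member].gap)
        {
          gap_count++;
          continue;
        }
      members[member].assigned_vector = (int) r;
      member++;
      gap_count = 1;
    }
}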
6413 /* Function vect_can_force_dr_alignment_p.
6415 Returns whether the alignment of a DECL can be forced to be aligned
6416 on an ALIGNMENT-bit boundary. */
6418 bool
6419 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
6421 if (!VAR_P (decl))
6422 return false;
6424 if (decl_in_symtab_p (decl)
6425 && !symtab_node::get (decl)->can_increase_alignment_p ())
6426 return false;
6428 if (TREE_STATIC (decl))
6429 return (alignment <= MAX_OFILE_ALIGNMENT);
6430 else
6431 return (alignment <= MAX_STACK_ALIGNMENT);
6435 /* Return whether the data reference DR is supported with respect to its
6436 alignment.
6437 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even if
6438 it is aligned, i.e., check if it is possible to vectorize it with different
6439 alignment. */
6441 enum dr_alignment_support
6442 vect_supportable_dr_alignment (struct data_reference *dr,
6443 bool check_aligned_accesses)
6445 stmt_vec_info stmt_info = vect_dr_stmt (dr);
6446 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6447 machine_mode mode = TYPE_MODE (vectype);
6448 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6449 struct loop *vect_loop = NULL;
6450 bool nested_in_vect_loop = false;
6452 if (aligned_access_p (dr) && !check_aligned_accesses)
6453 return dr_aligned;
6455 /* For now assume all conditional loads/stores support unaligned
6456 access without any special code. */
6457 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
6458 if (gimple_call_internal_p (stmt)
6459 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
6460 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
6461 return dr_unaligned_supported;
6463 if (loop_vinfo)
6465 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
6466 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt_info);
6469 /* Possibly unaligned access. */
6471 /* We can choose between using the implicit realignment scheme (generating
6472 a misaligned_move stmt) and the explicit realignment scheme (generating
6473 aligned loads with a REALIGN_LOAD). There are two variants to the
6474 explicit realignment scheme: optimized, and unoptimized.
6475 We can optimize the realignment only if the step between consecutive
6476 vector loads is equal to the vector size. Since the vector memory
6477 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
6478 is guaranteed that the misalignment amount remains the same throughout the
6479 execution of the vectorized loop. Therefore, we can create the
6480 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
6481 at the loop preheader.
6483 However, in the case of outer-loop vectorization, when vectorizing a
6484 memory access in the inner-loop nested within the LOOP that is now being
6485 vectorized, while it is guaranteed that the misalignment of the
6486 vectorized memory access will remain the same in different outer-loop
6487 iterations, it is *not* guaranteed that it will remain the same throughout
6488 the execution of the inner-loop. This is because the inner-loop advances
6489 with the original scalar step (and not in steps of VS). If the inner-loop
6490 step happens to be a multiple of VS, then the misalignment remains fixed
6491 and we can use the optimized realignment scheme. For example:
6493 for (i=0; i<N; i++)
6494 for (j=0; j<M; j++)
6495 s += a[i+j];
6497 When vectorizing the i-loop in the above example, the step between
6498 consecutive vector loads is 1, and so the misalignment does not remain
6499 fixed across the execution of the inner-loop, and the realignment cannot
6500 be optimized (as illustrated in the following pseudo vectorized loop):
6502 for (i=0; i<N; i+=4)
6503 for (j=0; j<M; j++){
6504 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
6505 // when j is {0,1,2,3,4,5,6,7,...} respectively.
6506 // (assuming that we start from an aligned address).
6509 We therefore have to use the unoptimized realignment scheme:
6511 for (i=0; i<N; i+=4)
6512 for (j=k; j<M; j+=4)
6513 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
6514 // that the misalignment of the initial address is
6515 // 0).
6517 The loop can then be vectorized as follows:
6519 for (k=0; k<4; k++){
6520 rt = get_realignment_token (&vp[k]);
6521 for (i=0; i<N; i+=4){
6522 v1 = vp[i+k];
6523 for (j=k; j<M; j+=4){
6524 v2 = vp[i+j+VS-1];
6525 va = REALIGN_LOAD <v1,v2,rt>;
6526 vs += va;
6527 v1 = v2;
6530 } */
6532 if (DR_IS_READ (dr))
6534 bool is_packed = false;
6535 tree type = (TREE_TYPE (DR_REF (dr)));
6537 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
6538 && (!targetm.vectorize.builtin_mask_for_load
6539 || targetm.vectorize.builtin_mask_for_load ()))
6541 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6543 /* If we are doing SLP then the accesses need not have the
6544 same alignment, instead it depends on the SLP group size. */
6545 if (loop_vinfo
6546 && STMT_SLP_TYPE (stmt_info)
6547 && !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6548 * DR_GROUP_SIZE (vinfo_for_stmt
6549 (DR_GROUP_FIRST_ELEMENT (stmt_info))),
6550 TYPE_VECTOR_SUBPARTS (vectype)))
6552 else if (!loop_vinfo
6553 || (nested_in_vect_loop
6554 && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
6555 GET_MODE_SIZE (TYPE_MODE (vectype)))))
6556 return dr_explicit_realign;
6557 else
6558 return dr_explicit_realign_optimized;
6560 if (!known_alignment_for_access_p (dr))
6561 is_packed = not_size_aligned (DR_REF (dr));
6563 if (targetm.vectorize.support_vector_misalignment
6564 (mode, type, DR_MISALIGNMENT (dr), is_packed))
6565 /* Can't software pipeline the loads, but can at least do them. */
6566 return dr_unaligned_supported;
6568 else
6570 bool is_packed = false;
6571 tree type = (TREE_TYPE (DR_REF (dr)));
6573 if (!known_alignment_for_access_p (dr))
6574 is_packed = not_size_aligned (DR_REF (dr));
6576 if (targetm.vectorize.support_vector_misalignment
6577 (mode, type, DR_MISALIGNMENT (dr), is_packed))
6578 return dr_unaligned_supported;
6581 /* Unsupported. */
6582 return dr_unaligned_unsupported;
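/* Illustrative sketch, not part of GCC: the misalignment pattern from the
   outer-loop example in the comment of vect_supportable_dr_alignment.  With
   an inner-loop step of one element and VS elements per vector, the element
   misalignment of &vp[i + j] cycles through 0, 1, ..., VS-1 as j advances
   (assuming the initial address is aligned), which is why the optimized
   realignment scheme, whose token is computed once in the preheader, cannot
   be used there; the misalignment stays fixed only if the inner step is a
   multiple of VS.  The helper name is hypothetical.  */

static unsigned int
model_inner_loop_misalignment (unsigned int j, unsigned int vs)
{
  return j % vs;
}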