* lto.c (do_stream_out): Add PART parameter; open dump file.
[official-gcc.git] / gcc / tree-vect-data-refs.c
blob63429a34bf2e16e99f9c5253e3ee66284b2d9fe0
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2018 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "memmodel.h"
32 #include "tm_p.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "cgraph.h"
36 #include "dumpfile.h"
37 #include "alias.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "tree-eh.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop.h"
47 #include "cfgloop.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
50 #include "expr.h"
51 #include "builtins.h"
52 #include "params.h"
53 #include "tree-cfg.h"
54 #include "tree-hash-traits.h"
55 #include "vec-perm-indices.h"
56 #include "internal-fn.h"
58 /* Return true if load- or store-lanes optab OPTAB is implemented for
59 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
61 static bool
62 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
63 tree vectype, unsigned HOST_WIDE_INT count)
65 machine_mode mode, array_mode;
66 bool limit_p;
68 mode = TYPE_MODE (vectype);
69 if (!targetm.array_mode (mode, count).exists (&array_mode))
71 poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
72 limit_p = !targetm.array_mode_supported_p (mode, count);
73 if (!int_mode_for_size (bits, limit_p).exists (&array_mode))
75 if (dump_enabled_p ())
76 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
77 "no array mode for %s["
78 HOST_WIDE_INT_PRINT_DEC "]\n",
79 GET_MODE_NAME (mode), count);
80 return false;
84 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
86 if (dump_enabled_p ())
87 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
88 "cannot use %s<%s><%s>\n", name,
89 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
90 return false;
93 if (dump_enabled_p ())
94 dump_printf_loc (MSG_NOTE, vect_location,
95 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
96 GET_MODE_NAME (mode));
98 return true;
102 /* Return the smallest scalar part of STMT.
103 This is used to determine the vectype of the stmt. We generally set the
104 vectype according to the type of the result (lhs). For stmts whose
105 result-type is different than the type of the arguments (e.g., demotion,
106 promotion), vectype will be reset appropriately (later). Note that we have
107 to visit the smallest datatype in this function, because that determines the
108 VF. If the smallest datatype in the loop is present only as the rhs of a
109 promotion operation - we'd miss it.
110 Such a case, where a variable of this datatype does not appear in the lhs
111 anywhere in the loop, can only occur if it's an invariant: e.g.:
112 'int_x = (int) short_inv', which we'd expect to have been optimized away by
113 invariant motion. However, we cannot rely on invariant motion to always
114 take invariants out of the loop, and so in the case of promotion we also
115 have to check the rhs.
116 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
117 types. */
119 tree
120 vect_get_smallest_scalar_type (gimple *stmt, HOST_WIDE_INT *lhs_size_unit,
121 HOST_WIDE_INT *rhs_size_unit)
123 tree scalar_type = gimple_expr_type (stmt);
124 HOST_WIDE_INT lhs, rhs;
126 /* During the analysis phase, this function is called on arbitrary
127 statements that might not have scalar results. */
128 if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
129 return scalar_type;
131 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
133 if (is_gimple_assign (stmt)
134 && (gimple_assign_cast_p (stmt)
135 || gimple_assign_rhs_code (stmt) == DOT_PROD_EXPR
136 || gimple_assign_rhs_code (stmt) == WIDEN_SUM_EXPR
137 || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
138 || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
139 || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
141 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
143 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
144 if (rhs < lhs)
145 scalar_type = rhs_type;
148 *lhs_size_unit = lhs;
149 *rhs_size_unit = rhs;
150 return scalar_type;
154 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
155 tested at run-time. Return TRUE if DDR was successfully inserted.
156 Return false if versioning is not supported. */
158 static bool
159 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
161 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
163 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
164 return false;
166 if (!runtime_alias_check_p (ddr, loop,
167 optimize_loop_nest_for_speed_p (loop)))
168 return false;
170 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
171 return true;
174 /* Record that loop LOOP_VINFO needs to check that VALUE is nonzero. */
176 static void
177 vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value)
179 vec<tree> checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo);
180 for (unsigned int i = 0; i < checks.length(); ++i)
181 if (checks[i] == value)
182 return;
184 if (dump_enabled_p ())
186 dump_printf_loc (MSG_NOTE, vect_location, "need run-time check that ");
187 dump_generic_expr (MSG_NOTE, TDF_SLIM, value);
188 dump_printf (MSG_NOTE, " is nonzero\n");
190 LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value);
193 /* Return true if we know that the order of vectorized STMT_A and
194 vectorized STMT_B will be the same as the order of STMT_A and STMT_B.
195 At least one of the statements is a write. */
197 static bool
198 vect_preserves_scalar_order_p (gimple *stmt_a, gimple *stmt_b)
200 stmt_vec_info stmtinfo_a = vinfo_for_stmt (stmt_a);
201 stmt_vec_info stmtinfo_b = vinfo_for_stmt (stmt_b);
203 /* Single statements are always kept in their original order. */
204 if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
205 && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
206 return true;
208 /* STMT_A and STMT_B belong to overlapping groups. All loads in a
209 group are emitted at the position of the first scalar load and all
210 stores in a group are emitted at the position of the last scalar store.
211 Thus writes will happen no earlier than their current position
212 (but could happen later) while reads will happen no later than their
213 current position (but could happen earlier). Reordering is therefore
214 only possible if the first access is a write. */
215 if (is_pattern_stmt_p (stmtinfo_a))
216 stmt_a = STMT_VINFO_RELATED_STMT (stmtinfo_a);
217 if (is_pattern_stmt_p (stmtinfo_b))
218 stmt_b = STMT_VINFO_RELATED_STMT (stmtinfo_b);
219 gimple *earlier_stmt = get_earlier_stmt (stmt_a, stmt_b);
220 return !DR_IS_WRITE (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt)));
223 /* A subroutine of vect_analyze_data_ref_dependence. Handle
224 DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
225 distances. These distances are conservatively correct but they don't
226 reflect a guaranteed dependence.
228 Return true if this function does all the work necessary to avoid
229 an alias or false if the caller should use the dependence distances
230 to limit the vectorization factor in the usual way. LOOP_DEPTH is
231 the depth of the loop described by LOOP_VINFO and the other arguments
232 are as for vect_analyze_data_ref_dependence. */
234 static bool
235 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
236 loop_vec_info loop_vinfo,
237 int loop_depth, unsigned int *max_vf)
239 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
240 lambda_vector dist_v;
241 unsigned int i;
242 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
244 int dist = dist_v[loop_depth];
245 if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
247 /* If the user asserted safelen >= DIST consecutive iterations
248 can be executed concurrently, assume independence.
250 ??? An alternative would be to add the alias check even
251 in this case, and vectorize the fallback loop with the
252 maximum VF set to safelen. However, if the user has
253 explicitly given a length, it's less likely that that
254 would be a win. */
255 if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
257 if ((unsigned int) loop->safelen < *max_vf)
258 *max_vf = loop->safelen;
259 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
260 continue;
263 /* For dependence distances of 2 or more, we have the option
264 of limiting VF or checking for an alias at runtime.
265 Prefer to check at runtime if we can, to avoid limiting
266 the VF unnecessarily when the bases are in fact independent.
268 Note that the alias checks will be removed if the VF ends up
269 being small enough. */
270 return (!STMT_VINFO_GATHER_SCATTER_P
271 (vinfo_for_stmt (DR_STMT (DDR_A (ddr))))
272 && !STMT_VINFO_GATHER_SCATTER_P
273 (vinfo_for_stmt (DR_STMT (DDR_B (ddr))))
274 && vect_mark_for_runtime_alias_test (ddr, loop_vinfo));
277 return true;
281 /* Function vect_analyze_data_ref_dependence.
283 Return TRUE if there (might) exist a dependence between a memory-reference
284 DRA and a memory-reference DRB. When versioning for alias may check a
285 dependence at run-time, return FALSE. Adjust *MAX_VF according to
286 the data dependence. */
288 static bool
289 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
290 loop_vec_info loop_vinfo,
291 unsigned int *max_vf)
293 unsigned int i;
294 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
295 struct data_reference *dra = DDR_A (ddr);
296 struct data_reference *drb = DDR_B (ddr);
297 stmt_vec_info stmtinfo_a = vinfo_for_stmt (vect_dr_stmt (dra));
298 stmt_vec_info stmtinfo_b = vinfo_for_stmt (vect_dr_stmt (drb));
299 lambda_vector dist_v;
300 unsigned int loop_depth;
302 /* In loop analysis all data references should be vectorizable. */
303 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
304 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
305 gcc_unreachable ();
307 /* Independent data accesses. */
308 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
309 return false;
311 if (dra == drb
312 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
313 return false;
315 /* We do not have to consider dependences between accesses that belong
316 to the same group, unless the stride could be smaller than the
317 group size. */
318 if (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
319 && (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
320 == DR_GROUP_FIRST_ELEMENT (stmtinfo_b))
321 && !STMT_VINFO_STRIDED_P (stmtinfo_a))
322 return false;
324 /* Even if we have an anti-dependence then, as the vectorized loop covers at
325 least two scalar iterations, there is always also a true dependence.
326 As the vectorizer does not re-order loads and stores we can ignore
327 the anti-dependence if TBAA can disambiguate both DRs similar to the
328 case with known negative distance anti-dependences (positive
329 distance anti-dependences would violate TBAA constraints). */
330 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
331 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
332 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
333 get_alias_set (DR_REF (drb))))
334 return false;
336 /* Unknown data dependence. */
337 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
339 /* If user asserted safelen consecutive iterations can be
340 executed concurrently, assume independence. */
341 if (loop->safelen >= 2)
343 if ((unsigned int) loop->safelen < *max_vf)
344 *max_vf = loop->safelen;
345 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
346 return false;
349 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
350 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
352 if (dump_enabled_p ())
354 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
355 "versioning for alias not supported for: "
356 "can't determine dependence between ");
357 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
358 DR_REF (dra));
359 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
360 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
361 DR_REF (drb));
362 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
364 return true;
367 if (dump_enabled_p ())
369 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
370 "versioning for alias required: "
371 "can't determine dependence between ");
372 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
373 DR_REF (dra));
374 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
375 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
376 DR_REF (drb));
377 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
380 /* Add to list of ddrs that need to be tested at run-time. */
381 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
384 /* Known data dependence. */
385 if (DDR_NUM_DIST_VECTS (ddr) == 0)
387 /* If user asserted safelen consecutive iterations can be
388 executed concurrently, assume independence. */
389 if (loop->safelen >= 2)
391 if ((unsigned int) loop->safelen < *max_vf)
392 *max_vf = loop->safelen;
393 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
394 return false;
397 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
398 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
400 if (dump_enabled_p ())
402 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
403 "versioning for alias not supported for: "
404 "bad dist vector for ");
405 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
406 DR_REF (dra));
407 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
408 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
409 DR_REF (drb));
410 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
412 return true;
415 if (dump_enabled_p ())
417 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
418 "versioning for alias required: "
419 "bad dist vector for ");
420 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
421 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
422 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
423 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
425 /* Add to list of ddrs that need to be tested at run-time. */
426 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
429 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
431 if (DDR_COULD_BE_INDEPENDENT_P (ddr)
432 && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
433 loop_depth, max_vf))
434 return false;
436 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
438 int dist = dist_v[loop_depth];
440 if (dump_enabled_p ())
441 dump_printf_loc (MSG_NOTE, vect_location,
442 "dependence distance = %d.\n", dist);
444 if (dist == 0)
446 if (dump_enabled_p ())
448 dump_printf_loc (MSG_NOTE, vect_location,
449 "dependence distance == 0 between ");
450 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
451 dump_printf (MSG_NOTE, " and ");
452 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
453 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
456 /* When we perform grouped accesses and perform implicit CSE
457 by detecting equal accesses and doing disambiguation with
458 runtime alias tests like for
459 .. = a[i];
460 .. = a[i+1];
461 a[i] = ..;
462 a[i+1] = ..;
463 *p = ..;
464 .. = a[i];
465 .. = a[i+1];
466 where we will end up loading { a[i], a[i+1] } once, make
467 sure that inserting group loads before the first load and
468 stores after the last store will do the right thing.
469 Similar for groups like
470 a[i] = ...;
471 ... = a[i];
472 a[i+1] = ...;
473 where loads from the group interleave with the store. */
474 if (!vect_preserves_scalar_order_p (vect_dr_stmt(dra),
475 vect_dr_stmt (drb)))
477 if (dump_enabled_p ())
478 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
479 "READ_WRITE dependence in interleaving.\n");
480 return true;
483 if (loop->safelen < 2)
485 tree indicator = dr_zero_step_indicator (dra);
486 if (!indicator || integer_zerop (indicator))
488 if (dump_enabled_p ())
489 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
490 "access also has a zero step\n");
491 return true;
493 else if (TREE_CODE (indicator) != INTEGER_CST)
494 vect_check_nonzero_value (loop_vinfo, indicator);
496 continue;
499 if (dist > 0 && DDR_REVERSED_P (ddr))
501 /* If DDR_REVERSED_P the order of the data-refs in DDR was
502 reversed (to make distance vector positive), and the actual
503 distance is negative. */
504 if (dump_enabled_p ())
505 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
506 "dependence distance negative.\n");
507 /* Record a negative dependence distance to later limit the
508 amount of stmt copying / unrolling we can perform.
509 Only need to handle read-after-write dependence. */
510 if (DR_IS_READ (drb)
511 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
512 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
513 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
514 continue;
517 unsigned int abs_dist = abs (dist);
518 if (abs_dist >= 2 && abs_dist < *max_vf)
520 /* The dependence distance requires reduction of the maximal
521 vectorization factor. */
522 *max_vf = abs (dist);
523 if (dump_enabled_p ())
524 dump_printf_loc (MSG_NOTE, vect_location,
525 "adjusting maximal vectorization factor to %i\n",
526 *max_vf);
529 if (abs_dist >= *max_vf)
531 /* Dependence distance does not create dependence, as far as
532 vectorization is concerned, in this case. */
533 if (dump_enabled_p ())
534 dump_printf_loc (MSG_NOTE, vect_location,
535 "dependence distance >= VF.\n");
536 continue;
539 if (dump_enabled_p ())
541 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
542 "not vectorized, possible dependence "
543 "between data-refs ");
544 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
545 dump_printf (MSG_NOTE, " and ");
546 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
547 dump_printf (MSG_NOTE, "\n");
550 return true;
553 return false;
556 /* Function vect_analyze_data_ref_dependences.
558 Examine all the data references in the loop, and make sure there do not
559 exist any data dependences between them. Set *MAX_VF according to
560 the maximum vectorization factor the data dependences allow. */
562 bool
563 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
564 unsigned int *max_vf)
566 unsigned int i;
567 struct data_dependence_relation *ddr;
569 DUMP_VECT_SCOPE ("vect_analyze_data_ref_dependences");
571 if (!LOOP_VINFO_DDRS (loop_vinfo).exists ())
573 LOOP_VINFO_DDRS (loop_vinfo)
574 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
575 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
576 /* We need read-read dependences to compute
577 STMT_VINFO_SAME_ALIGN_REFS. */
578 bool res = compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
579 &LOOP_VINFO_DDRS (loop_vinfo),
580 LOOP_VINFO_LOOP_NEST (loop_vinfo),
581 true);
582 gcc_assert (res);
585 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
587 /* For epilogues we either have no aliases or alias versioning
588 was applied to original loop. Therefore we may just get max_vf
589 using VF of original loop. */
590 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
591 *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
592 else
593 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
594 if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
595 return false;
597 return true;
601 /* Function vect_slp_analyze_data_ref_dependence.
603 Return TRUE if there (might) exist a dependence between a memory-reference
604 DRA and a memory-reference DRB. When versioning for alias may check a
605 dependence at run-time, return FALSE. Adjust *MAX_VF according to
606 the data dependence. */
608 static bool
609 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
611 struct data_reference *dra = DDR_A (ddr);
612 struct data_reference *drb = DDR_B (ddr);
614 /* We need to check dependences of statements marked as unvectorizable
615 as well, they still can prohibit vectorization. */
617 /* Independent data accesses. */
618 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
619 return false;
621 if (dra == drb)
622 return false;
624 /* Read-read is OK. */
625 if (DR_IS_READ (dra) && DR_IS_READ (drb))
626 return false;
628 /* If dra and drb are part of the same interleaving chain consider
629 them independent. */
630 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (vect_dr_stmt (dra)))
631 && (DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (vect_dr_stmt (dra)))
632 == DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (vect_dr_stmt (drb)))))
633 return false;
635 /* Unknown data dependence. */
636 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
638 if (dump_enabled_p ())
640 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
641 "can't determine dependence between ");
642 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
643 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
644 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
645 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
648 else if (dump_enabled_p ())
650 dump_printf_loc (MSG_NOTE, vect_location,
651 "determined dependence between ");
652 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
653 dump_printf (MSG_NOTE, " and ");
654 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
655 dump_printf (MSG_NOTE, "\n");
658 return true;
662 /* Analyze dependences involved in the transform of SLP NODE. STORES
663 contain the vector of scalar stores of this instance if we are
664 disambiguating the loads. */
666 static bool
667 vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
668 vec<gimple *> stores, gimple *last_store)
670 /* This walks over all stmts involved in the SLP load/store done
671 in NODE verifying we can sink them up to the last stmt in the
672 group. */
673 gimple *last_access = vect_find_last_scalar_stmt_in_slp (node);
674 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
676 gimple *access = SLP_TREE_SCALAR_STMTS (node)[k];
677 if (access == last_access)
678 continue;
679 data_reference *dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (access));
680 ao_ref ref;
681 bool ref_initialized_p = false;
682 for (gimple_stmt_iterator gsi = gsi_for_stmt (access);
683 gsi_stmt (gsi) != last_access; gsi_next (&gsi))
685 gimple *stmt = gsi_stmt (gsi);
686 if (! gimple_vuse (stmt)
687 || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
688 continue;
690 /* If we couldn't record a (single) data reference for this
691 stmt we have to resort to the alias oracle. */
692 data_reference *dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
693 if (!dr_b)
695 /* We are moving a store or sinking a load - this means
696 we cannot use TBAA for disambiguation. */
697 if (!ref_initialized_p)
698 ao_ref_init (&ref, DR_REF (dr_a));
699 if (stmt_may_clobber_ref_p_1 (stmt, &ref, false)
700 || ref_maybe_used_by_stmt_p (stmt, &ref, false))
701 return false;
702 continue;
705 bool dependent = false;
706 /* If we run into a store of this same instance (we've just
707 marked those) then delay dependence checking until we run
708 into the last store because this is where it will have
709 been sunk to (and we verify if we can do that as well). */
710 if (gimple_visited_p (stmt))
712 if (stmt != last_store)
713 continue;
714 unsigned i;
715 gimple *store;
716 FOR_EACH_VEC_ELT (stores, i, store)
718 data_reference *store_dr
719 = STMT_VINFO_DATA_REF (vinfo_for_stmt (store));
720 ddr_p ddr = initialize_data_dependence_relation
721 (dr_a, store_dr, vNULL);
722 dependent = vect_slp_analyze_data_ref_dependence (ddr);
723 free_dependence_relation (ddr);
724 if (dependent)
725 break;
728 else
730 ddr_p ddr = initialize_data_dependence_relation (dr_a,
731 dr_b, vNULL);
732 dependent = vect_slp_analyze_data_ref_dependence (ddr);
733 free_dependence_relation (ddr);
735 if (dependent)
736 return false;
739 return true;
743 /* Function vect_analyze_data_ref_dependences.
745 Examine all the data references in the basic-block, and make sure there
746 do not exist any data dependences between them. Set *MAX_VF according to
747 the maximum vectorization factor the data dependences allow. */
749 bool
750 vect_slp_analyze_instance_dependence (slp_instance instance)
752 DUMP_VECT_SCOPE ("vect_slp_analyze_instance_dependence");
754 /* The stores of this instance are at the root of the SLP tree. */
755 slp_tree store = SLP_INSTANCE_TREE (instance);
756 if (! STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (store)[0])))
757 store = NULL;
759 /* Verify we can sink stores to the vectorized stmt insert location. */
760 gimple *last_store = NULL;
761 if (store)
763 if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
764 return false;
766 /* Mark stores in this instance and remember the last one. */
767 last_store = vect_find_last_scalar_stmt_in_slp (store);
768 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
769 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], true);
772 bool res = true;
774 /* Verify we can sink loads to the vectorized stmt insert location,
775 special-casing stores of this instance. */
776 slp_tree load;
777 unsigned int i;
778 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
779 if (! vect_slp_analyze_node_dependences (instance, load,
780 store
781 ? SLP_TREE_SCALAR_STMTS (store)
782 : vNULL, last_store))
784 res = false;
785 break;
788 /* Unset the visited flag. */
789 if (store)
790 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
791 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], false);
793 return res;
796 /* Record in VINFO the base alignment guarantee given by DRB. STMT is
797 the statement that contains DRB, which is useful for recording in the
798 dump file. */
800 static void
801 vect_record_base_alignment (vec_info *vinfo, gimple *stmt,
802 innermost_loop_behavior *drb)
804 bool existed;
805 innermost_loop_behavior *&entry
806 = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
807 if (!existed || entry->base_alignment < drb->base_alignment)
809 entry = drb;
810 if (dump_enabled_p ())
812 dump_printf_loc (MSG_NOTE, vect_location,
813 "recording new base alignment for ");
814 dump_generic_expr (MSG_NOTE, TDF_SLIM, drb->base_address);
815 dump_printf (MSG_NOTE, "\n");
816 dump_printf_loc (MSG_NOTE, vect_location,
817 " alignment: %d\n", drb->base_alignment);
818 dump_printf_loc (MSG_NOTE, vect_location,
819 " misalignment: %d\n", drb->base_misalignment);
820 dump_printf_loc (MSG_NOTE, vect_location,
821 " based on: ");
822 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
827 /* If the region we're going to vectorize is reached, all unconditional
828 data references occur at least once. We can therefore pool the base
829 alignment guarantees from each unconditional reference. Do this by
830 going through all the data references in VINFO and checking whether
831 the containing statement makes the reference unconditionally. If so,
832 record the alignment of the base address in VINFO so that it can be
833 used for all other references with the same base. */
835 void
836 vect_record_base_alignments (vec_info *vinfo)
838 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
839 struct loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
840 data_reference *dr;
841 unsigned int i;
842 FOR_EACH_VEC_ELT (vinfo->shared->datarefs, i, dr)
844 gimple *stmt = vect_dr_stmt (dr);
845 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
846 if (!DR_IS_CONDITIONAL_IN_STMT (dr)
847 && STMT_VINFO_VECTORIZABLE (stmt_info)
848 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
850 vect_record_base_alignment (vinfo, stmt, &DR_INNERMOST (dr));
852 /* If DR is nested in the loop that is being vectorized, we can also
853 record the alignment of the base wrt the outer loop. */
854 if (loop && nested_in_vect_loop_p (loop, stmt))
855 vect_record_base_alignment
856 (vinfo, stmt, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
861 /* Return the target alignment for the vectorized form of DR. */
863 static unsigned int
864 vect_calculate_target_alignment (struct data_reference *dr)
866 gimple *stmt = vect_dr_stmt (dr);
867 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
868 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
869 return targetm.vectorize.preferred_vector_alignment (vectype);
872 /* Function vect_compute_data_ref_alignment
874 Compute the misalignment of the data reference DR.
876 Output:
877 1. DR_MISALIGNMENT (DR) is defined.
879 FOR NOW: No analysis is actually performed. Misalignment is calculated
880 only for trivial cases. TODO. */
882 static void
883 vect_compute_data_ref_alignment (struct data_reference *dr)
885 gimple *stmt = vect_dr_stmt (dr);
886 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
887 vec_base_alignments *base_alignments = &stmt_info->vinfo->base_alignments;
888 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
889 struct loop *loop = NULL;
890 tree ref = DR_REF (dr);
891 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
893 if (dump_enabled_p ())
894 dump_printf_loc (MSG_NOTE, vect_location,
895 "vect_compute_data_ref_alignment:\n");
897 if (loop_vinfo)
898 loop = LOOP_VINFO_LOOP (loop_vinfo);
900 /* Initialize misalignment to unknown. */
901 SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
903 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
904 return;
906 innermost_loop_behavior *drb = vect_dr_behavior (dr);
907 bool step_preserves_misalignment_p;
909 unsigned HOST_WIDE_INT vector_alignment
910 = vect_calculate_target_alignment (dr) / BITS_PER_UNIT;
911 DR_TARGET_ALIGNMENT (dr) = vector_alignment;
913 /* No step for BB vectorization. */
914 if (!loop)
916 gcc_assert (integer_zerop (drb->step));
917 step_preserves_misalignment_p = true;
920 /* In case the dataref is in an inner-loop of the loop that is being
921 vectorized (LOOP), we use the base and misalignment information
922 relative to the outer-loop (LOOP). This is ok only if the misalignment
923 stays the same throughout the execution of the inner-loop, which is why
924 we have to check that the stride of the dataref in the inner-loop evenly
925 divides by the vector alignment. */
926 else if (nested_in_vect_loop_p (loop, stmt))
928 step_preserves_misalignment_p
929 = (DR_STEP_ALIGNMENT (dr) % vector_alignment) == 0;
931 if (dump_enabled_p ())
933 if (step_preserves_misalignment_p)
934 dump_printf_loc (MSG_NOTE, vect_location,
935 "inner step divides the vector alignment.\n");
936 else
937 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
938 "inner step doesn't divide the vector"
939 " alignment.\n");
943 /* Similarly we can only use base and misalignment information relative to
944 an innermost loop if the misalignment stays the same throughout the
945 execution of the loop. As above, this is the case if the stride of
946 the dataref evenly divides by the alignment. */
947 else
949 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
950 step_preserves_misalignment_p
951 = multiple_p (DR_STEP_ALIGNMENT (dr) * vf, vector_alignment);
953 if (!step_preserves_misalignment_p && dump_enabled_p ())
954 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
955 "step doesn't divide the vector alignment.\n");
958 unsigned int base_alignment = drb->base_alignment;
959 unsigned int base_misalignment = drb->base_misalignment;
961 /* Calculate the maximum of the pooled base address alignment and the
962 alignment that we can compute for DR itself. */
963 innermost_loop_behavior **entry = base_alignments->get (drb->base_address);
964 if (entry && base_alignment < (*entry)->base_alignment)
966 base_alignment = (*entry)->base_alignment;
967 base_misalignment = (*entry)->base_misalignment;
970 if (drb->offset_alignment < vector_alignment
971 || !step_preserves_misalignment_p
972 /* We need to know whether the step wrt the vectorized loop is
973 negative when computing the starting misalignment below. */
974 || TREE_CODE (drb->step) != INTEGER_CST)
976 if (dump_enabled_p ())
978 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
979 "Unknown alignment for access: ");
980 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
981 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
983 return;
986 if (base_alignment < vector_alignment)
988 unsigned int max_alignment;
989 tree base = get_base_for_alignment (drb->base_address, &max_alignment);
990 if (max_alignment < vector_alignment
991 || !vect_can_force_dr_alignment_p (base,
992 vector_alignment * BITS_PER_UNIT))
994 if (dump_enabled_p ())
996 dump_printf_loc (MSG_NOTE, vect_location,
997 "can't force alignment of ref: ");
998 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
999 dump_printf (MSG_NOTE, "\n");
1001 return;
1004 /* Force the alignment of the decl.
1005 NOTE: This is the only change to the code we make during
1006 the analysis phase, before deciding to vectorize the loop. */
1007 if (dump_enabled_p ())
1009 dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
1010 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
1011 dump_printf (MSG_NOTE, "\n");
1014 DR_VECT_AUX (dr)->base_decl = base;
1015 DR_VECT_AUX (dr)->base_misaligned = true;
1016 base_misalignment = 0;
1018 poly_int64 misalignment
1019 = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();
1021 /* If this is a backward running DR then first access in the larger
1022 vectype actually is N-1 elements before the address in the DR.
1023 Adjust misalign accordingly. */
1024 if (tree_int_cst_sgn (drb->step) < 0)
1025 /* PLUS because STEP is negative. */
1026 misalignment += ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1027 * TREE_INT_CST_LOW (drb->step));
1029 unsigned int const_misalignment;
1030 if (!known_misalignment (misalignment, vector_alignment,
1031 &const_misalignment))
1033 if (dump_enabled_p ())
1035 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1036 "Non-constant misalignment for access: ");
1037 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
1038 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1040 return;
1043 SET_DR_MISALIGNMENT (dr, const_misalignment);
1045 if (dump_enabled_p ())
1047 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1048 "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
1049 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
1050 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1053 return;
1056 /* Function vect_update_misalignment_for_peel.
1057 Sets DR's misalignment
1058 - to 0 if it has the same alignment as DR_PEEL,
1059 - to the misalignment computed using NPEEL if DR's salignment is known,
1060 - to -1 (unknown) otherwise.
1062 DR - the data reference whose misalignment is to be adjusted.
1063 DR_PEEL - the data reference whose misalignment is being made
1064 zero in the vector loop by the peel.
1065 NPEEL - the number of iterations in the peel loop if the misalignment
1066 of DR_PEEL is known at compile time. */
1068 static void
1069 vect_update_misalignment_for_peel (struct data_reference *dr,
1070 struct data_reference *dr_peel, int npeel)
1072 unsigned int i;
1073 vec<dr_p> same_aligned_drs;
1074 struct data_reference *current_dr;
1075 int dr_size = vect_get_scalar_dr_size (dr);
1076 int dr_peel_size = vect_get_scalar_dr_size (dr_peel);
1077 stmt_vec_info stmt_info = vinfo_for_stmt (vect_dr_stmt (dr));
1078 stmt_vec_info peel_stmt_info = vinfo_for_stmt (vect_dr_stmt (dr_peel));
1080 /* For interleaved data accesses the step in the loop must be multiplied by
1081 the size of the interleaving group. */
1082 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1083 dr_size *= DR_GROUP_SIZE (vinfo_for_stmt (DR_GROUP_FIRST_ELEMENT (stmt_info)));
1084 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
1085 dr_peel_size *= DR_GROUP_SIZE (peel_stmt_info);
1087 /* It can be assumed that the data refs with the same alignment as dr_peel
1088 are aligned in the vector loop. */
1089 same_aligned_drs
1090 = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (vect_dr_stmt (dr_peel)));
1091 FOR_EACH_VEC_ELT (same_aligned_drs, i, current_dr)
1093 if (current_dr != dr)
1094 continue;
1095 gcc_assert (!known_alignment_for_access_p (dr)
1096 || !known_alignment_for_access_p (dr_peel)
1097 || (DR_MISALIGNMENT (dr) / dr_size
1098 == DR_MISALIGNMENT (dr_peel) / dr_peel_size));
1099 SET_DR_MISALIGNMENT (dr, 0);
1100 return;
1103 if (known_alignment_for_access_p (dr)
1104 && known_alignment_for_access_p (dr_peel))
1106 bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
1107 int misal = DR_MISALIGNMENT (dr);
1108 misal += negative ? -npeel * dr_size : npeel * dr_size;
1109 misal &= DR_TARGET_ALIGNMENT (dr) - 1;
1110 SET_DR_MISALIGNMENT (dr, misal);
1111 return;
1114 if (dump_enabled_p ())
1115 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1116 "to unknown (-1).\n");
1117 SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
1121 /* Function verify_data_ref_alignment
1123 Return TRUE if DR can be handled with respect to alignment. */
1125 static bool
1126 verify_data_ref_alignment (data_reference_p dr)
1128 enum dr_alignment_support supportable_dr_alignment
1129 = vect_supportable_dr_alignment (dr, false);
1130 if (!supportable_dr_alignment)
1132 if (dump_enabled_p ())
1134 if (DR_IS_READ (dr))
1135 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1136 "not vectorized: unsupported unaligned load.");
1137 else
1138 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1139 "not vectorized: unsupported unaligned "
1140 "store.");
1142 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
1143 DR_REF (dr));
1144 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1146 return false;
1149 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
1150 dump_printf_loc (MSG_NOTE, vect_location,
1151 "Vectorizing an unaligned access.\n");
1153 return true;
1156 /* Function vect_verify_datarefs_alignment
1158 Return TRUE if all data references in the loop can be
1159 handled with respect to alignment. */
1161 bool
1162 vect_verify_datarefs_alignment (loop_vec_info vinfo)
1164 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
1165 struct data_reference *dr;
1166 unsigned int i;
1168 FOR_EACH_VEC_ELT (datarefs, i, dr)
1170 gimple *stmt = vect_dr_stmt (dr);
1171 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1173 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1174 continue;
1176 /* For interleaving, only the alignment of the first access matters. */
1177 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1178 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1179 continue;
1181 /* Strided accesses perform only component accesses, alignment is
1182 irrelevant for them. */
1183 if (STMT_VINFO_STRIDED_P (stmt_info)
1184 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1185 continue;
1187 if (! verify_data_ref_alignment (dr))
1188 return false;
1191 return true;
1194 /* Given an memory reference EXP return whether its alignment is less
1195 than its size. */
1197 static bool
1198 not_size_aligned (tree exp)
1200 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1201 return true;
1203 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1204 > get_object_alignment (exp));
1207 /* Function vector_alignment_reachable_p
1209 Return true if vector alignment for DR is reachable by peeling
1210 a few loop iterations. Return false otherwise. */
1212 static bool
1213 vector_alignment_reachable_p (struct data_reference *dr)
1215 gimple *stmt = vect_dr_stmt (dr);
1216 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1217 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1219 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1221 /* For interleaved access we peel only if number of iterations in
1222 the prolog loop ({VF - misalignment}), is a multiple of the
1223 number of the interleaved accesses. */
1224 int elem_size, mis_in_elements;
1226 /* FORNOW: handle only known alignment. */
1227 if (!known_alignment_for_access_p (dr))
1228 return false;
1230 poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1231 poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
1232 elem_size = vector_element_size (vector_size, nelements);
1233 mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
1235 if (!multiple_p (nelements - mis_in_elements, DR_GROUP_SIZE (stmt_info)))
1236 return false;
1239 /* If misalignment is known at the compile time then allow peeling
1240 only if natural alignment is reachable through peeling. */
1241 if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
1243 HOST_WIDE_INT elmsize =
1244 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1245 if (dump_enabled_p ())
1247 dump_printf_loc (MSG_NOTE, vect_location,
1248 "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1249 dump_printf (MSG_NOTE,
1250 ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
1252 if (DR_MISALIGNMENT (dr) % elmsize)
1254 if (dump_enabled_p ())
1255 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1256 "data size does not divide the misalignment.\n");
1257 return false;
1261 if (!known_alignment_for_access_p (dr))
1263 tree type = TREE_TYPE (DR_REF (dr));
1264 bool is_packed = not_size_aligned (DR_REF (dr));
1265 if (dump_enabled_p ())
1266 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1267 "Unknown misalignment, %snaturally aligned\n",
1268 is_packed ? "not " : "");
1269 return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1272 return true;
1276 /* Calculate the cost of the memory access represented by DR. */
1278 static void
1279 vect_get_data_access_cost (struct data_reference *dr,
1280 unsigned int *inside_cost,
1281 unsigned int *outside_cost,
1282 stmt_vector_for_cost *body_cost_vec,
1283 stmt_vector_for_cost *prologue_cost_vec)
1285 gimple *stmt = vect_dr_stmt (dr);
1286 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1287 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1288 int ncopies;
1290 if (PURE_SLP_STMT (stmt_info))
1291 ncopies = 1;
1292 else
1293 ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1295 if (DR_IS_READ (dr))
1296 vect_get_load_cost (stmt_info, ncopies, true, inside_cost, outside_cost,
1297 prologue_cost_vec, body_cost_vec, false);
1298 else
1299 vect_get_store_cost (stmt_info, ncopies, inside_cost, body_cost_vec);
1301 if (dump_enabled_p ())
1302 dump_printf_loc (MSG_NOTE, vect_location,
1303 "vect_get_data_access_cost: inside_cost = %d, "
1304 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1308 typedef struct _vect_peel_info
1310 struct data_reference *dr;
1311 int npeel;
1312 unsigned int count;
1313 } *vect_peel_info;
1315 typedef struct _vect_peel_extended_info
1317 struct _vect_peel_info peel_info;
1318 unsigned int inside_cost;
1319 unsigned int outside_cost;
1320 } *vect_peel_extended_info;
1323 /* Peeling hashtable helpers. */
1325 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1327 static inline hashval_t hash (const _vect_peel_info *);
1328 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1331 inline hashval_t
1332 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1334 return (hashval_t) peel_info->npeel;
1337 inline bool
1338 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1340 return (a->npeel == b->npeel);
1344 /* Insert DR into peeling hash table with NPEEL as key. */
1346 static void
1347 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1348 loop_vec_info loop_vinfo, struct data_reference *dr,
1349 int npeel)
1351 struct _vect_peel_info elem, *slot;
1352 _vect_peel_info **new_slot;
1353 bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1355 elem.npeel = npeel;
1356 slot = peeling_htab->find (&elem);
1357 if (slot)
1358 slot->count++;
1359 else
1361 slot = XNEW (struct _vect_peel_info);
1362 slot->npeel = npeel;
1363 slot->dr = dr;
1364 slot->count = 1;
1365 new_slot = peeling_htab->find_slot (slot, INSERT);
1366 *new_slot = slot;
1369 if (!supportable_dr_alignment
1370 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1371 slot->count += VECT_MAX_COST;
1375 /* Traverse peeling hash table to find peeling option that aligns maximum
1376 number of data accesses. */
1379 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1380 _vect_peel_extended_info *max)
1382 vect_peel_info elem = *slot;
1384 if (elem->count > max->peel_info.count
1385 || (elem->count == max->peel_info.count
1386 && max->peel_info.npeel > elem->npeel))
1388 max->peel_info.npeel = elem->npeel;
1389 max->peel_info.count = elem->count;
1390 max->peel_info.dr = elem->dr;
1393 return 1;
1396 /* Get the costs of peeling NPEEL iterations checking data access costs
1397 for all data refs. If UNKNOWN_MISALIGNMENT is true, we assume DR0's
1398 misalignment will be zero after peeling. */
1400 static void
1401 vect_get_peeling_costs_all_drs (vec<data_reference_p> datarefs,
1402 struct data_reference *dr0,
1403 unsigned int *inside_cost,
1404 unsigned int *outside_cost,
1405 stmt_vector_for_cost *body_cost_vec,
1406 stmt_vector_for_cost *prologue_cost_vec,
1407 unsigned int npeel,
1408 bool unknown_misalignment)
1410 unsigned i;
1411 data_reference *dr;
1413 FOR_EACH_VEC_ELT (datarefs, i, dr)
1415 gimple *stmt = vect_dr_stmt (dr);
1416 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1417 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1418 continue;
1420 /* For interleaving, only the alignment of the first access
1421 matters. */
1422 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1423 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1424 continue;
1426 /* Strided accesses perform only component accesses, alignment is
1427 irrelevant for them. */
1428 if (STMT_VINFO_STRIDED_P (stmt_info)
1429 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1430 continue;
1432 int save_misalignment;
1433 save_misalignment = DR_MISALIGNMENT (dr);
1434 if (npeel == 0)
1436 else if (unknown_misalignment && dr == dr0)
1437 SET_DR_MISALIGNMENT (dr, 0);
1438 else
1439 vect_update_misalignment_for_peel (dr, dr0, npeel);
1440 vect_get_data_access_cost (dr, inside_cost, outside_cost,
1441 body_cost_vec, prologue_cost_vec);
1442 SET_DR_MISALIGNMENT (dr, save_misalignment);
1446 /* Traverse peeling hash table and calculate cost for each peeling option.
1447 Find the one with the lowest cost. */
1450 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1451 _vect_peel_extended_info *min)
1453 vect_peel_info elem = *slot;
1454 int dummy;
1455 unsigned int inside_cost = 0, outside_cost = 0;
1456 gimple *stmt = vect_dr_stmt (elem->dr);
1457 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1458 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1459 stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1460 epilogue_cost_vec;
1462 prologue_cost_vec.create (2);
1463 body_cost_vec.create (2);
1464 epilogue_cost_vec.create (2);
1466 vect_get_peeling_costs_all_drs (LOOP_VINFO_DATAREFS (loop_vinfo),
1467 elem->dr, &inside_cost, &outside_cost,
1468 &body_cost_vec, &prologue_cost_vec,
1469 elem->npeel, false);
1471 body_cost_vec.release ();
1473 outside_cost += vect_get_known_peeling_cost
1474 (loop_vinfo, elem->npeel, &dummy,
1475 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1476 &prologue_cost_vec, &epilogue_cost_vec);
1478 /* Prologue and epilogue costs are added to the target model later.
1479 These costs depend only on the scalar iteration cost, the
1480 number of peeling iterations finally chosen, and the number of
1481 misaligned statements. So discard the information found here. */
1482 prologue_cost_vec.release ();
1483 epilogue_cost_vec.release ();
1485 if (inside_cost < min->inside_cost
1486 || (inside_cost == min->inside_cost
1487 && outside_cost < min->outside_cost))
1489 min->inside_cost = inside_cost;
1490 min->outside_cost = outside_cost;
1491 min->peel_info.dr = elem->dr;
1492 min->peel_info.npeel = elem->npeel;
1493 min->peel_info.count = elem->count;
1496 return 1;
1500 /* Choose best peeling option by traversing peeling hash table and either
1501 choosing an option with the lowest cost (if cost model is enabled) or the
1502 option that aligns as many accesses as possible. */
1504 static struct _vect_peel_extended_info
1505 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1506 loop_vec_info loop_vinfo)
1508 struct _vect_peel_extended_info res;
1510 res.peel_info.dr = NULL;
1512 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1514 res.inside_cost = INT_MAX;
1515 res.outside_cost = INT_MAX;
1516 peeling_htab->traverse <_vect_peel_extended_info *,
1517 vect_peeling_hash_get_lowest_cost> (&res);
1519 else
1521 res.peel_info.count = 0;
1522 peeling_htab->traverse <_vect_peel_extended_info *,
1523 vect_peeling_hash_get_most_frequent> (&res);
1524 res.inside_cost = 0;
1525 res.outside_cost = 0;
1528 return res;
1531 /* Return true if the new peeling NPEEL is supported. */
1533 static bool
1534 vect_peeling_supportable (loop_vec_info loop_vinfo, struct data_reference *dr0,
1535 unsigned npeel)
1537 unsigned i;
1538 struct data_reference *dr = NULL;
1539 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1540 gimple *stmt;
1541 stmt_vec_info stmt_info;
1542 enum dr_alignment_support supportable_dr_alignment;
1544 /* Ensure that all data refs can be vectorized after the peel. */
1545 FOR_EACH_VEC_ELT (datarefs, i, dr)
1547 int save_misalignment;
1549 if (dr == dr0)
1550 continue;
1552 stmt = vect_dr_stmt (dr);
1553 stmt_info = vinfo_for_stmt (stmt);
1554 /* For interleaving, only the alignment of the first access
1555 matters. */
1556 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1557 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1558 continue;
1560 /* Strided accesses perform only component accesses, alignment is
1561 irrelevant for them. */
1562 if (STMT_VINFO_STRIDED_P (stmt_info)
1563 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1564 continue;
1566 save_misalignment = DR_MISALIGNMENT (dr);
1567 vect_update_misalignment_for_peel (dr, dr0, npeel);
1568 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1569 SET_DR_MISALIGNMENT (dr, save_misalignment);
1571 if (!supportable_dr_alignment)
1572 return false;
1575 return true;
1578 /* Function vect_enhance_data_refs_alignment
1580 This pass will use loop versioning and loop peeling in order to enhance
1581 the alignment of data references in the loop.
1583 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1584 original loop is to be vectorized. Any other loops that are created by
1585 the transformations performed in this pass - are not supposed to be
1586 vectorized. This restriction will be relaxed.
1588 This pass will require a cost model to guide it whether to apply peeling
1589 or versioning or a combination of the two. For example, the scheme that
1590 intel uses when given a loop with several memory accesses, is as follows:
1591 choose one memory access ('p') which alignment you want to force by doing
1592 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1593 other accesses are not necessarily aligned, or (2) use loop versioning to
1594 generate one loop in which all accesses are aligned, and another loop in
1595 which only 'p' is necessarily aligned.
1597 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1598 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Ximmin Tian, International
1599 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1601 Devising a cost model is the most critical aspect of this work. It will
1602 guide us on which access to peel for, whether to use loop versioning, how
1603 many versions to create, etc. The cost model will probably consist of
1604 generic considerations as well as target specific considerations (on
1605 powerpc for example, misaligned stores are more painful than misaligned
1606 loads).
1608 Here are the general steps involved in alignment enhancements:
1610 -- original loop, before alignment analysis:
1611 for (i=0; i<N; i++){
1612 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1613 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1616 -- After vect_compute_data_refs_alignment:
1617 for (i=0; i<N; i++){
1618 x = q[i]; # DR_MISALIGNMENT(q) = 3
1619 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1622 -- Possibility 1: we do loop versioning:
1623 if (p is aligned) {
1624 for (i=0; i<N; i++){ # loop 1A
1625 x = q[i]; # DR_MISALIGNMENT(q) = 3
1626 p[i] = y; # DR_MISALIGNMENT(p) = 0
1629 else {
1630 for (i=0; i<N; i++){ # loop 1B
1631 x = q[i]; # DR_MISALIGNMENT(q) = 3
1632 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1636 -- Possibility 2: we do loop peeling:
1637 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1638 x = q[i];
1639 p[i] = y;
1641 for (i = 3; i < N; i++){ # loop 2A
1642 x = q[i]; # DR_MISALIGNMENT(q) = 0
1643 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1646 -- Possibility 3: combination of loop peeling and versioning:
1647 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1648 x = q[i];
1649 p[i] = y;
1651 if (p is aligned) {
1652 for (i = 3; i<N; i++){ # loop 3A
1653 x = q[i]; # DR_MISALIGNMENT(q) = 0
1654 p[i] = y; # DR_MISALIGNMENT(p) = 0
1657 else {
1658 for (i = 3; i<N; i++){ # loop 3B
1659 x = q[i]; # DR_MISALIGNMENT(q) = 0
1660 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1664 These loops are later passed to loop_transform to be vectorized. The
1665 vectorizer will use the alignment information to guide the transformation
1666 (whether to generate regular loads/stores, or with special handling for
1667 misalignment). */
1669 bool
1670 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1672 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1673 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1674 enum dr_alignment_support supportable_dr_alignment;
1675 struct data_reference *dr0 = NULL, *first_store = NULL;
1676 struct data_reference *dr;
1677 unsigned int i, j;
1678 bool do_peeling = false;
1679 bool do_versioning = false;
1680 bool stat;
1681 gimple *stmt;
1682 stmt_vec_info stmt_info;
1683 unsigned int npeel = 0;
1684 bool one_misalignment_known = false;
1685 bool one_misalignment_unknown = false;
1686 bool one_dr_unsupportable = false;
1687 struct data_reference *unsupportable_dr = NULL;
1688 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1689 unsigned possible_npeel_number = 1;
1690 tree vectype;
1691 unsigned int mis, same_align_drs_max = 0;
1692 hash_table<peel_info_hasher> peeling_htab (1);
1694 DUMP_VECT_SCOPE ("vect_enhance_data_refs_alignment");
1696 /* Reset data so we can safely be called multiple times. */
1697 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1698 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1700 /* While cost model enhancements are expected in the future, the high level
1701 view of the code at this time is as follows:
1703 A) If there is a misaligned access then see if peeling to align
1704 this access can make all data references satisfy
1705 vect_supportable_dr_alignment. If so, update data structures
1706 as needed and return true.
1708 B) If peeling wasn't possible and there is a data reference with an
1709 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1710 then see if loop versioning checks can be used to make all data
1711 references satisfy vect_supportable_dr_alignment. If so, update
1712 data structures as needed and return true.
1714 C) If neither peeling nor versioning were successful then return false if
1715 any data reference does not satisfy vect_supportable_dr_alignment.
1717 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1719 Note, Possibility 3 above (which is peeling and versioning together) is not
1720 being done at this time. */
1722 /* (1) Peeling to force alignment. */
1724 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1725 Considerations:
1726 + How many accesses will become aligned due to the peeling
1727 - How many accesses will become unaligned due to the peeling,
1728 and the cost of misaligned accesses.
1729 - The cost of peeling (the extra runtime checks, the increase
1730 in code size). */
1732 FOR_EACH_VEC_ELT (datarefs, i, dr)
1734 stmt = vect_dr_stmt (dr);
1735 stmt_info = vinfo_for_stmt (stmt);
1737 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1738 continue;
1740 /* For interleaving, only the alignment of the first access
1741 matters. */
1742 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1743 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1744 continue;
1746 /* For scatter-gather or invariant accesses there is nothing
1747 to enhance. */
1748 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1749 || integer_zerop (DR_STEP (dr)))
1750 continue;
1752 /* Strided accesses perform only component accesses, alignment is
1753 irrelevant for them. */
1754 if (STMT_VINFO_STRIDED_P (stmt_info)
1755 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1756 continue;
1758 supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1759 do_peeling = vector_alignment_reachable_p (dr);
1760 if (do_peeling)
1762 if (known_alignment_for_access_p (dr))
1764 unsigned int npeel_tmp = 0;
1765 bool negative = tree_int_cst_compare (DR_STEP (dr),
1766 size_zero_node) < 0;
1768 vectype = STMT_VINFO_VECTYPE (stmt_info);
1769 unsigned int target_align = DR_TARGET_ALIGNMENT (dr);
1770 unsigned int dr_size = vect_get_scalar_dr_size (dr);
1771 mis = (negative ? DR_MISALIGNMENT (dr) : -DR_MISALIGNMENT (dr));
1772 if (DR_MISALIGNMENT (dr) != 0)
1773 npeel_tmp = (mis & (target_align - 1)) / dr_size;
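/* For illustration, assuming target_align == 16, dr_size == 4 and
   DR_MISALIGNMENT (dr) == 8 for a positive-step access: mis is -8 and
   ((-8) & 15) / 4 == 2, so peeling two scalar iterations moves the
   access onto a 16-byte boundary.  */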
1775 /* For multiple types, it is possible that the bigger type access
1776 will have more than one peeling option. E.g., a loop with two
1777 types: one of size (vector size / 4), and the other one of
1778 size (vector size / 8). The vectorization factor will be 8. If both
1779 accesses are misaligned by 3, the first one needs one scalar
1780 iteration to be aligned, and the second one needs 5. But the
1781 first one will also be aligned by peeling 5 scalar
1782 iterations, and in that case both accesses will be aligned.
1783 Hence, in addition to the immediate peeling amount, we also want
1784 to try adding a full vector size, as long as we don't exceed
1785 the vectorization factor.
1786 We do this automatically for the cost model, since we calculate
1787 the cost for every peeling option. */
1788 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1790 poly_uint64 nscalars = (STMT_SLP_TYPE (stmt_info)
1791 ? vf * DR_GROUP_SIZE (stmt_info) : vf);
1792 possible_npeel_number
1793 = vect_get_num_vectors (nscalars, vectype);
1795 /* NPEEL_TMP is 0 when there is no misalignment, but also
1796 allow peeling a full vector's worth of elements (NELEMENTS). */
1797 if (DR_MISALIGNMENT (dr) == 0)
1798 possible_npeel_number++;
1801 /* Save info about DR in the hash table. Also include peeling
1802 amounts according to the explanation above. */
1803 for (j = 0; j < possible_npeel_number; j++)
1805 vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1806 dr, npeel_tmp);
1807 npeel_tmp += target_align / dr_size;
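/* Continuing the illustration above (target_align == 16, dr_size == 4),
   the candidate peeling amounts recorded here would be 2, 6, 10, ...,
   i.e. one full vector's worth of elements apart, with at most
   POSSIBLE_NPEEL_NUMBER entries.  */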
1810 one_misalignment_known = true;
1812 else
1814 /* If we don't know any misalignment values, we prefer
1815 peeling for the data-ref that has the maximum number of data-refs
1816 with the same alignment, unless the target prefers to align
1817 stores over loads. */
1818 unsigned same_align_drs
1819 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1820 if (!dr0
1821 || same_align_drs_max < same_align_drs)
1823 same_align_drs_max = same_align_drs;
1824 dr0 = dr;
1826 /* For data-refs with the same number of related
1827 accesses prefer the one where the misalign
1828 computation will be invariant in the outermost loop. */
1829 else if (same_align_drs_max == same_align_drs)
1831 struct loop *ivloop0, *ivloop;
1832 ivloop0 = outermost_invariant_loop_for_expr
1833 (loop, DR_BASE_ADDRESS (dr0));
1834 ivloop = outermost_invariant_loop_for_expr
1835 (loop, DR_BASE_ADDRESS (dr));
1836 if ((ivloop && !ivloop0)
1837 || (ivloop && ivloop0
1838 && flow_loop_nested_p (ivloop, ivloop0)))
1839 dr0 = dr;
1842 one_misalignment_unknown = true;
1844 /* Check for data refs with unsupportable alignment that
1845 can be peeled. */
1846 if (!supportable_dr_alignment)
1848 one_dr_unsupportable = true;
1849 unsupportable_dr = dr;
1852 if (!first_store && DR_IS_WRITE (dr))
1853 first_store = dr;
1856 else
1858 if (!aligned_access_p (dr))
1860 if (dump_enabled_p ())
1861 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1862 "vector alignment may not be reachable\n");
1863 break;
1868 /* Check if we can possibly peel the loop. */
1869 if (!vect_can_advance_ivs_p (loop_vinfo)
1870 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1871 || loop->inner)
1872 do_peeling = false;
1874 struct _vect_peel_extended_info peel_for_known_alignment;
1875 struct _vect_peel_extended_info peel_for_unknown_alignment;
1876 struct _vect_peel_extended_info best_peel;
1878 peel_for_unknown_alignment.inside_cost = INT_MAX;
1879 peel_for_unknown_alignment.outside_cost = INT_MAX;
1880 peel_for_unknown_alignment.peel_info.count = 0;
1882 if (do_peeling
1883 && one_misalignment_unknown)
1885 /* Check whether the target prefers stores over loads, i.e., whether
1886 misaligned stores are more expensive than misaligned loads (taking
1887 drs with the same alignment into account). */
1888 unsigned int load_inside_cost = 0;
1889 unsigned int load_outside_cost = 0;
1890 unsigned int store_inside_cost = 0;
1891 unsigned int store_outside_cost = 0;
1892 unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;
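/* With an unknown misalignment the number of iterations that will actually
   be peeled is unknown as well; half of the (assumed) vectorization factor
   is used here as a rough average estimate for costing purposes.  */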
1894 stmt_vector_for_cost dummy;
1895 dummy.create (2);
1896 vect_get_peeling_costs_all_drs (datarefs, dr0,
1897 &load_inside_cost,
1898 &load_outside_cost,
1899 &dummy, &dummy, estimated_npeels, true);
1900 dummy.release ();
1902 if (first_store)
1904 dummy.create (2);
1905 vect_get_peeling_costs_all_drs (datarefs, first_store,
1906 &store_inside_cost,
1907 &store_outside_cost,
1908 &dummy, &dummy,
1909 estimated_npeels, true);
1910 dummy.release ();
1912 else
1914 store_inside_cost = INT_MAX;
1915 store_outside_cost = INT_MAX;
1918 if (load_inside_cost > store_inside_cost
1919 || (load_inside_cost == store_inside_cost
1920 && load_outside_cost > store_outside_cost))
1922 dr0 = first_store;
1923 peel_for_unknown_alignment.inside_cost = store_inside_cost;
1924 peel_for_unknown_alignment.outside_cost = store_outside_cost;
1926 else
1928 peel_for_unknown_alignment.inside_cost = load_inside_cost;
1929 peel_for_unknown_alignment.outside_cost = load_outside_cost;
1932 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1933 prologue_cost_vec.create (2);
1934 epilogue_cost_vec.create (2);
1936 int dummy2;
1937 peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
1938 (loop_vinfo, estimated_npeels, &dummy2,
1939 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1940 &prologue_cost_vec, &epilogue_cost_vec);
1942 prologue_cost_vec.release ();
1943 epilogue_cost_vec.release ();
1945 peel_for_unknown_alignment.peel_info.count = 1
1946 + STMT_VINFO_SAME_ALIGN_REFS
1947 (vinfo_for_stmt (vect_dr_stmt (dr0))).length ();
1950 peel_for_unknown_alignment.peel_info.npeel = 0;
1951 peel_for_unknown_alignment.peel_info.dr = dr0;
1953 best_peel = peel_for_unknown_alignment;
1955 peel_for_known_alignment.inside_cost = INT_MAX;
1956 peel_for_known_alignment.outside_cost = INT_MAX;
1957 peel_for_known_alignment.peel_info.count = 0;
1958 peel_for_known_alignment.peel_info.dr = NULL;
1960 if (do_peeling && one_misalignment_known)
1962 /* Peeling is possible, and no data access is unsupportable when
1963 misaligned. So we try to choose the best possible peeling from
1964 the hash table. */
1965 peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
1966 (&peeling_htab, loop_vinfo);
1969 /* Compare costs of peeling for known and unknown alignment. */
1970 if (peel_for_known_alignment.peel_info.dr != NULL
1971 && peel_for_unknown_alignment.inside_cost
1972 >= peel_for_known_alignment.inside_cost)
1974 best_peel = peel_for_known_alignment;
1976 /* If the best peeling for known alignment has NPEEL == 0, perform no
1977 peeling at all except if there is an unsupportable dr that we can
1978 align. */
1979 if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
1980 do_peeling = false;
1983 /* If there is an unsupportable data ref, prefer this over all choices so far
1984 since we'd have to discard a chosen peeling except when it accidentally
1985 aligned the unsupportable data ref. */
1986 if (one_dr_unsupportable)
1987 dr0 = unsupportable_dr;
1988 else if (do_peeling)
1990 /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
1991 TODO: Use nopeel_outside_cost or get rid of it? */
1992 unsigned nopeel_inside_cost = 0;
1993 unsigned nopeel_outside_cost = 0;
1995 stmt_vector_for_cost dummy;
1996 dummy.create (2);
1997 vect_get_peeling_costs_all_drs (datarefs, NULL, &nopeel_inside_cost,
1998 &nopeel_outside_cost, &dummy, &dummy,
1999 0, false);
2000 dummy.release ();
2002 /* Add epilogue costs. As we do not peel for alignment here, no prologue
2003 costs will be recorded. */
2004 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2005 prologue_cost_vec.create (2);
2006 epilogue_cost_vec.create (2);
2008 int dummy2;
2009 nopeel_outside_cost += vect_get_known_peeling_cost
2010 (loop_vinfo, 0, &dummy2,
2011 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2012 &prologue_cost_vec, &epilogue_cost_vec);
2014 prologue_cost_vec.release ();
2015 epilogue_cost_vec.release ();
2017 npeel = best_peel.peel_info.npeel;
2018 dr0 = best_peel.peel_info.dr;
2020 /* If not peeling at all is no more expensive than the best peeling we
2021 have found so far, don't perform any peeling. */
2022 if (nopeel_inside_cost <= best_peel.inside_cost)
2023 do_peeling = false;
2026 if (do_peeling)
2028 stmt = vect_dr_stmt (dr0);
2029 stmt_info = vinfo_for_stmt (stmt);
2030 vectype = STMT_VINFO_VECTYPE (stmt_info);
2032 if (known_alignment_for_access_p (dr0))
2034 bool negative = tree_int_cst_compare (DR_STEP (dr0),
2035 size_zero_node) < 0;
2036 if (!npeel)
2038 /* Since it's known at compile time, compute the number of
2039 iterations in the peeled loop (the peeling factor) for use in
2040 updating DR_MISALIGNMENT values. The peeling factor is the
2041 vectorization factor minus the misalignment as an element
2042 count. */
2043 mis = negative ? DR_MISALIGNMENT (dr0) : -DR_MISALIGNMENT (dr0);
2044 unsigned int target_align = DR_TARGET_ALIGNMENT (dr0);
2045 npeel = ((mis & (target_align - 1))
2046 / vect_get_scalar_dr_size (dr0));
2049 /* For interleaved data access every iteration accesses all the
2050 members of the group, therefore we divide the number of iterations
2051 by the group size. */
2052 stmt_info = vinfo_for_stmt (vect_dr_stmt (dr0));
2053 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2054 npeel /= DR_GROUP_SIZE (stmt_info);
2056 if (dump_enabled_p ())
2057 dump_printf_loc (MSG_NOTE, vect_location,
2058 "Try peeling by %d\n", npeel);
2061 /* Ensure that all datarefs can be vectorized after the peel. */
2062 if (!vect_peeling_supportable (loop_vinfo, dr0, npeel))
2063 do_peeling = false;
2065 /* Check if all datarefs are supportable and log. */
2066 if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
2068 stat = vect_verify_datarefs_alignment (loop_vinfo);
2069 if (!stat)
2070 do_peeling = false;
2071 else
2072 return stat;
2075 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
2076 if (do_peeling)
2078 unsigned max_allowed_peel
2079 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
2080 if (max_allowed_peel != (unsigned)-1)
2082 unsigned max_peel = npeel;
2083 if (max_peel == 0)
2085 unsigned int target_align = DR_TARGET_ALIGNMENT (dr0);
2086 max_peel = target_align / vect_get_scalar_dr_size (dr0) - 1;
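/* NPEEL being 0 at this point means the misalignment of DR0 is not known
   at compile time, so assume the worst case: e.g. with a 16-byte target
   alignment and 4-byte elements, up to 3 iterations may end up being
   peeled.  */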
2088 if (max_peel > max_allowed_peel)
2090 do_peeling = false;
2091 if (dump_enabled_p ())
2092 dump_printf_loc (MSG_NOTE, vect_location,
2093 "Disable peeling, max peels reached: %d\n", max_peel);
2098 /* Cost model #2 - if peeling may result in a remaining loop not
2099 iterating enough to be vectorized then do not peel. Since this
2100 is a cost heuristic rather than a correctness decision, use the
2101 most likely runtime value for variable vectorization factors. */
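/* For illustration, with an assumed VF of 4 and NPEEL == 3, a loop known
   to run fewer than 7 iterations is rejected here, since after peeling it
   would not be guaranteed even one full vector iteration.  */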
2102 if (do_peeling
2103 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2105 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2106 unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
2107 if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
2108 < assumed_vf + max_peel)
2109 do_peeling = false;
2112 if (do_peeling)
2114 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2115 If the misalignment of DR_i is identical to that of dr0 then set
2116 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
2117 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2118 by the peeling factor times the element size of DR_i (MOD the
2119 vectorization factor times the size). Otherwise, the
2120 misalignment of DR_i must be set to unknown. */
2121 FOR_EACH_VEC_ELT (datarefs, i, dr)
2122 if (dr != dr0)
2124 /* Strided accesses perform only component accesses, alignment
2125 is irrelevant for them. */
2126 stmt_info = vinfo_for_stmt (vect_dr_stmt (dr));
2127 if (STMT_VINFO_STRIDED_P (stmt_info)
2128 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2129 continue;
2131 vect_update_misalignment_for_peel (dr, dr0, npeel);
2134 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
2135 if (npeel)
2136 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2137 else
2138 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2139 = DR_MISALIGNMENT (dr0);
2140 SET_DR_MISALIGNMENT (dr0, 0);
2141 if (dump_enabled_p ())
2143 dump_printf_loc (MSG_NOTE, vect_location,
2144 "Alignment of access forced using peeling.\n");
2145 dump_printf_loc (MSG_NOTE, vect_location,
2146 "Peeling for alignment will be applied.\n");
2149 /* The inside-loop cost will be accounted for in vectorizable_load
2150 and vectorizable_store correctly with adjusted alignments.
2151 Drop the body_cst_vec on the floor here. */
2152 stat = vect_verify_datarefs_alignment (loop_vinfo);
2153 gcc_assert (stat);
2154 return stat;
2158 /* (2) Versioning to force alignment. */
2160 /* Try versioning if:
2161 1) optimize loop for speed
2162 2) there is at least one unsupported misaligned data ref with an unknown
2163 misalignment, and
2164 3) all misaligned data refs with a known misalignment are supported, and
2165 4) the number of runtime alignment checks is within reason. */
2167 do_versioning =
2168 optimize_loop_nest_for_speed_p (loop)
2169 && (!loop->inner); /* FORNOW */
2171 if (do_versioning)
2173 FOR_EACH_VEC_ELT (datarefs, i, dr)
2175 stmt = vect_dr_stmt (dr);
2176 stmt_info = vinfo_for_stmt (stmt);
2178 /* For interleaving, only the alignment of the first access
2179 matters. */
2180 if (aligned_access_p (dr)
2181 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2182 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt))
2183 continue;
2185 if (STMT_VINFO_STRIDED_P (stmt_info))
2187 /* Strided loads perform only component accesses, alignment is
2188 irrelevant for them. */
2189 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
2190 continue;
2191 do_versioning = false;
2192 break;
2195 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
2197 if (!supportable_dr_alignment)
2199 gimple *stmt;
2200 int mask;
2201 tree vectype;
2203 if (known_alignment_for_access_p (dr)
2204 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2205 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
2207 do_versioning = false;
2208 break;
2211 stmt = vect_dr_stmt (dr);
2212 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
2213 gcc_assert (vectype);
2215 /* At present we don't support versioning for alignment
2216 with variable VF, since there's no guarantee that the
2217 VF is a power of two. We could relax this if we added
2218 a way of enforcing a power-of-two size. */
2219 unsigned HOST_WIDE_INT size;
2220 if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
2222 do_versioning = false;
2223 break;
2226 /* The rightmost bits of an aligned address must be zeros.
2227 Construct the mask needed for this test. For example,
2228 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2229 mask must be 15 = 0xf. */
2230 mask = size - 1;
2232 /* FORNOW: use the same mask to test all potentially unaligned
2233 references in the loop. The vectorizer currently supports
2234 a single vector size, see the reference to
2235 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
2236 vectorization factor is computed. */
2237 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
2238 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
2239 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2240 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
2241 vect_dr_stmt (dr));
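/* The versioning condition generated later effectively tests that
   ((addr_1 | ... | addr_n) & mask) == 0, i.e. that the address of each
   recorded reference has its low log2(vector-size) bits clear.  */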
2245 /* Versioning requires at least one misaligned data reference. */
2246 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2247 do_versioning = false;
2248 else if (!do_versioning)
2249 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2252 if (do_versioning)
2254 vec<gimple *> may_misalign_stmts
2255 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2256 gimple *stmt;
2258 /* It can now be assumed that the data references in the statements
2259 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2260 of the loop being vectorized. */
2261 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
2263 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2264 dr = STMT_VINFO_DATA_REF (stmt_info);
2265 SET_DR_MISALIGNMENT (dr, 0);
2266 if (dump_enabled_p ())
2267 dump_printf_loc (MSG_NOTE, vect_location,
2268 "Alignment of access forced using versioning.\n");
2271 if (dump_enabled_p ())
2272 dump_printf_loc (MSG_NOTE, vect_location,
2273 "Versioning for alignment will be applied.\n");
2275 /* Peeling and versioning can't be done together at this time. */
2276 gcc_assert (! (do_peeling && do_versioning));
2278 stat = vect_verify_datarefs_alignment (loop_vinfo);
2279 gcc_assert (stat);
2280 return stat;
2283 /* This point is reached if neither peeling nor versioning is being done. */
2284 gcc_assert (! (do_peeling || do_versioning));
2286 stat = vect_verify_datarefs_alignment (loop_vinfo);
2287 return stat;
2291 /* Function vect_find_same_alignment_drs.
2293 Update group and alignment relations according to the chosen
2294 vectorization factor. */
2296 static void
2297 vect_find_same_alignment_drs (struct data_dependence_relation *ddr)
2299 struct data_reference *dra = DDR_A (ddr);
2300 struct data_reference *drb = DDR_B (ddr);
2301 stmt_vec_info stmtinfo_a = vinfo_for_stmt (vect_dr_stmt (dra));
2302 stmt_vec_info stmtinfo_b = vinfo_for_stmt (vect_dr_stmt (drb));
2304 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
2305 return;
2307 if (dra == drb)
2308 return;
2310 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
2311 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
2312 return;
2314 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)
2315 || !operand_equal_p (DR_OFFSET (dra), DR_OFFSET (drb), 0)
2316 || !operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2317 return;
2319 /* Two references with distance zero have the same alignment. */
2320 poly_offset_int diff = (wi::to_poly_offset (DR_INIT (dra))
2321 - wi::to_poly_offset (DR_INIT (drb)));
2322 if (maybe_ne (diff, 0))
2324 /* Get the wider of the two alignments. */
2325 unsigned int align_a = (vect_calculate_target_alignment (dra)
2326 / BITS_PER_UNIT);
2327 unsigned int align_b = (vect_calculate_target_alignment (drb)
2328 / BITS_PER_UNIT);
2329 unsigned int max_align = MAX (align_a, align_b);
2331 /* Require the gap to be a multiple of the larger vector alignment. */
2332 if (!multiple_p (diff, max_align))
2333 return;
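/* For illustration: two 4-byte accesses whose DR_INITs differ by 16 bytes
   have the same misalignment with respect to a 16-byte vector alignment,
   so each is recorded in the other's SAME_ALIGN_REFS list below.  */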
2336 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2337 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
2338 if (dump_enabled_p ())
2340 dump_printf_loc (MSG_NOTE, vect_location,
2341 "accesses have the same alignment: ");
2342 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2343 dump_printf (MSG_NOTE, " and ");
2344 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2345 dump_printf (MSG_NOTE, "\n");
2350 /* Function vect_analyze_data_refs_alignment
2352 Analyze the alignment of the data-references in the loop.
2353 Return FALSE if a data reference is found that cannot be vectorized. */
2355 bool
2356 vect_analyze_data_refs_alignment (loop_vec_info vinfo)
2358 DUMP_VECT_SCOPE ("vect_analyze_data_refs_alignment");
2360 /* Mark groups of data references with same alignment using
2361 data dependence information. */
2362 vec<ddr_p> ddrs = vinfo->shared->ddrs;
2363 struct data_dependence_relation *ddr;
2364 unsigned int i;
2366 FOR_EACH_VEC_ELT (ddrs, i, ddr)
2367 vect_find_same_alignment_drs (ddr);
2369 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
2370 struct data_reference *dr;
2372 vect_record_base_alignments (vinfo);
2373 FOR_EACH_VEC_ELT (datarefs, i, dr)
2375 stmt_vec_info stmt_info = vinfo_for_stmt (vect_dr_stmt (dr));
2376 if (STMT_VINFO_VECTORIZABLE (stmt_info))
2377 vect_compute_data_ref_alignment (dr);
2380 return true;
2384 /* Analyze alignment of DRs of stmts in NODE. */
2386 static bool
2387 vect_slp_analyze_and_verify_node_alignment (slp_tree node)
2389 /* We vectorize from the first scalar stmt in the node unless
2390 the node is permuted in which case we start from the first
2391 element in the group. */
2392 gimple *first_stmt = SLP_TREE_SCALAR_STMTS (node)[0];
2393 data_reference_p first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2394 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2395 first_stmt = DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_stmt));
2397 data_reference_p dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2398 vect_compute_data_ref_alignment (dr);
2399 /* For creating the data-ref pointer we need alignment of the
2400 first element anyway. */
2401 if (dr != first_dr)
2402 vect_compute_data_ref_alignment (first_dr);
2403 if (! verify_data_ref_alignment (dr))
2405 if (dump_enabled_p ())
2406 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2407 "not vectorized: bad data alignment in basic "
2408 "block.\n");
2409 return false;
2412 return true;
2415 /* Function vect_slp_analyze_and_verify_instance_alignment
2417 Analyze the alignment of the data-references in the SLP instance.
2418 Return FALSE if a data reference is found that cannot be vectorized. */
2420 bool
2421 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
2423 DUMP_VECT_SCOPE ("vect_slp_analyze_and_verify_instance_alignment");
2425 slp_tree node;
2426 unsigned i;
2427 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2428 if (! vect_slp_analyze_and_verify_node_alignment (node))
2429 return false;
2431 node = SLP_INSTANCE_TREE (instance);
2432 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]))
2433 && ! vect_slp_analyze_and_verify_node_alignment
2434 (SLP_INSTANCE_TREE (instance)))
2435 return false;
2437 return true;
2441 /* Analyze groups of accesses: check that DR belongs to a group of
2442 accesses of legal size, step, etc. Detect gaps, single element
2443 interleaving, and other special cases. Set grouped access info.
2444 Collect groups of strided stores for further use in SLP analysis.
2445 Worker for vect_analyze_group_access. */
2447 static bool
2448 vect_analyze_group_access_1 (struct data_reference *dr)
2450 tree step = DR_STEP (dr);
2451 tree scalar_type = TREE_TYPE (DR_REF (dr));
2452 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2453 gimple *stmt = vect_dr_stmt (dr);
2454 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2455 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2456 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2457 HOST_WIDE_INT dr_step = -1;
2458 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2459 bool slp_impossible = false;
2461 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2462 size of the interleaving group (including gaps). */
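/* For example, loads of a[4*i], a[4*i+1] and a[4*i+2] with 4-byte elements
   have DR_STEP 16, so GROUPSIZE is 4 and the group ends with a gap of one
   element.  */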
2463 if (tree_fits_shwi_p (step))
2465 dr_step = tree_to_shwi (step);
2466 /* Check that STEP is a multiple of type size. Otherwise there is
2467 a non-element-sized gap at the end of the group which we
2468 cannot represent in DR_GROUP_GAP or DR_GROUP_SIZE.
2469 ??? As we can handle non-constant step fine here we should
2470 simply remove uses of DR_GROUP_GAP between the last and first
2471 element and instead rely on DR_STEP. DR_GROUP_SIZE then would
2472 simply not include that gap. */
2473 if ((dr_step % type_size) != 0)
2475 if (dump_enabled_p ())
2477 dump_printf_loc (MSG_NOTE, vect_location,
2478 "Step ");
2479 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2480 dump_printf (MSG_NOTE,
2481 " is not a multiple of the element size for ");
2482 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2483 dump_printf (MSG_NOTE, "\n");
2485 return false;
2487 groupsize = absu_hwi (dr_step) / type_size;
2489 else
2490 groupsize = 0;
2492 /* A non-consecutive access is possible only if it is part of an interleaving group. */
2493 if (!DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2495 /* Check whether this DR is part of an interleaving group of which it is
2496 the single element accessed in the loop. */
2498 /* Gaps are supported only for loads. STEP must be a multiple of the type
2499 size. */
2500 if (DR_IS_READ (dr)
2501 && (dr_step % type_size) == 0
2502 && groupsize > 0)
2504 DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2505 DR_GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2506 DR_GROUP_GAP (stmt_info) = groupsize - 1;
2507 if (dump_enabled_p ())
2509 dump_printf_loc (MSG_NOTE, vect_location,
2510 "Detected single element interleaving ");
2511 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2512 dump_printf (MSG_NOTE, " step ");
2513 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2514 dump_printf (MSG_NOTE, "\n");
2517 return true;
2520 if (dump_enabled_p ())
2522 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2523 "not consecutive access ");
2524 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2527 if (bb_vinfo)
2529 /* Mark the statement as unvectorizable. */
2530 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (vect_dr_stmt (dr))) = false;
2531 return true;
2534 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2535 STMT_VINFO_STRIDED_P (stmt_info) = true;
2536 return true;
2539 if (DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2541 /* First stmt in the interleaving chain. Check the chain. */
2542 gimple *next = DR_GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2543 struct data_reference *data_ref = dr;
2544 unsigned int count = 1;
2545 tree prev_init = DR_INIT (data_ref);
2546 gimple *prev = stmt;
2547 HOST_WIDE_INT diff, gaps = 0;
2549 /* By construction, all group members have INTEGER_CST DR_INITs. */
2550 while (next)
2552 /* Skip same data-refs. In case two or more stmts share a
2553 data-ref (supported only for loads), we vectorize only the first
2554 stmt, and the rest get their vectorized loads from the first
2555 one. */
2556 if (!tree_int_cst_compare (DR_INIT (data_ref),
2557 DR_INIT (STMT_VINFO_DATA_REF (
2558 vinfo_for_stmt (next)))))
2560 if (DR_IS_WRITE (data_ref))
2562 if (dump_enabled_p ())
2563 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2564 "Two store stmts share the same dr.\n");
2565 return false;
2568 if (dump_enabled_p ())
2569 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2570 "Two or more load stmts share the same dr.\n");
2572 /* For load use the same data-ref load. */
2573 DR_GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2575 prev = next;
2576 next = DR_GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2577 continue;
2580 prev = next;
2581 data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2583 /* All group members have the same STEP by construction. */
2584 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2586 /* Check that the distance between two accesses is equal to the type
2587 size. Otherwise, we have gaps. */
2588 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2589 - TREE_INT_CST_LOW (prev_init)) / type_size;
2590 if (diff != 1)
2592 /* FORNOW: SLP of accesses with gaps is not supported. */
2593 slp_impossible = true;
2594 if (DR_IS_WRITE (data_ref))
2596 if (dump_enabled_p ())
2597 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2598 "interleaved store with gaps\n");
2599 return false;
2602 gaps += diff - 1;
2605 last_accessed_element += diff;
2607 /* Store the gap from the previous member of the group. If there is no
2608 gap in the access, DR_GROUP_GAP is always 1. */
2609 DR_GROUP_GAP (vinfo_for_stmt (next)) = diff;
2611 prev_init = DR_INIT (data_ref);
2612 next = DR_GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2613 /* Count the number of data-refs in the chain. */
2614 count++;
2617 if (groupsize == 0)
2618 groupsize = count + gaps;
2620 /* This could be UINT_MAX but as we are generating code in a very
2621 inefficient way we have to cap earlier. See PR78699 for example. */
2622 if (groupsize > 4096)
2624 if (dump_enabled_p ())
2625 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2626 "group is too large\n");
2627 return false;
2630 /* Check that the size of the interleaving is equal to count for stores,
2631 i.e., that there are no gaps. */
2632 if (groupsize != count
2633 && !DR_IS_READ (dr))
2635 if (dump_enabled_p ())
2636 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2637 "interleaved store with gaps\n");
2638 return false;
2641 /* If there is a gap after the last load in the group it is the
2642 difference between the groupsize and the last accessed
2643 element.
2644 When there is no gap, this difference should be 0. */
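/* E.g., for a group of loads a[4*i], a[4*i+1] and a[4*i+2] of 4-byte
   elements (GROUPSIZE 4), the last accessed element is 3 and the gap
   recorded for the first stmt is 1.  */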
2645 DR_GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - last_accessed_element;
2647 DR_GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2648 if (dump_enabled_p ())
2650 dump_printf_loc (MSG_NOTE, vect_location,
2651 "Detected interleaving ");
2652 if (DR_IS_READ (dr))
2653 dump_printf (MSG_NOTE, "load ");
2654 else
2655 dump_printf (MSG_NOTE, "store ");
2656 dump_printf (MSG_NOTE, "of size %u starting with ",
2657 (unsigned)groupsize);
2658 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
2659 if (DR_GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
2660 dump_printf_loc (MSG_NOTE, vect_location,
2661 "There is a gap of %u elements after the group\n",
2662 DR_GROUP_GAP (vinfo_for_stmt (stmt)));
2665 /* SLP: create an SLP data structure for every interleaving group of
2666 stores for further analysis in vect_analyze_slp. */
2667 if (DR_IS_WRITE (dr) && !slp_impossible)
2669 if (loop_vinfo)
2670 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2671 if (bb_vinfo)
2672 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2676 return true;
2679 /* Analyze groups of accesses: check that DR belongs to a group of
2680 accesses of legal size, step, etc. Detect gaps, single element
2681 interleaving, and other special cases. Set grouped access info.
2682 Collect groups of strided stores for further use in SLP analysis. */
2684 static bool
2685 vect_analyze_group_access (struct data_reference *dr)
2687 if (!vect_analyze_group_access_1 (dr))
2689 /* Dissolve the group if present. */
2690 gimple *next;
2691 gimple *stmt = DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (vect_dr_stmt (dr)));
2692 while (stmt)
2694 stmt_vec_info vinfo = vinfo_for_stmt (stmt);
2695 next = DR_GROUP_NEXT_ELEMENT (vinfo);
2696 DR_GROUP_FIRST_ELEMENT (vinfo) = NULL;
2697 DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2698 stmt = next;
2700 return false;
2702 return true;
2705 /* Analyze the access pattern of the data-reference DR.
2706 In case of non-consecutive accesses call vect_analyze_group_access() to
2707 analyze groups of accesses. */
2709 static bool
2710 vect_analyze_data_ref_access (struct data_reference *dr)
2712 tree step = DR_STEP (dr);
2713 tree scalar_type = TREE_TYPE (DR_REF (dr));
2714 gimple *stmt = vect_dr_stmt (dr);
2715 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2716 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2717 struct loop *loop = NULL;
2719 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
2720 return true;
2722 if (loop_vinfo)
2723 loop = LOOP_VINFO_LOOP (loop_vinfo);
2725 if (loop_vinfo && !step)
2727 if (dump_enabled_p ())
2728 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2729 "bad data-ref access in loop\n");
2730 return false;
2733 /* Allow loads with zero step in inner-loop vectorization. */
2734 if (loop_vinfo && integer_zerop (step))
2736 DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2737 if (!nested_in_vect_loop_p (loop, stmt))
2738 return DR_IS_READ (dr);
2739 /* Allow references with zero step for outer loops marked
2740 with pragma omp simd only - it guarantees absence of
2741 loop-carried dependencies between inner loop iterations. */
2742 if (loop->safelen < 2)
2744 if (dump_enabled_p ())
2745 dump_printf_loc (MSG_NOTE, vect_location,
2746 "zero step in inner loop of nest\n");
2747 return false;
2751 if (loop && nested_in_vect_loop_p (loop, stmt))
2753 /* Interleaved accesses are not yet supported within outer-loop
2754 vectorization for references in the inner-loop. */
2755 DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2757 /* For the rest of the analysis we use the outer-loop step. */
2758 step = STMT_VINFO_DR_STEP (stmt_info);
2759 if (integer_zerop (step))
2761 if (dump_enabled_p ())
2762 dump_printf_loc (MSG_NOTE, vect_location,
2763 "zero step in outer loop.\n");
2764 return DR_IS_READ (dr);
2768 /* Consecutive? */
2769 if (TREE_CODE (step) == INTEGER_CST)
2771 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2772 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2773 || (dr_step < 0
2774 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2776 /* Mark that it is not interleaving. */
2777 DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2778 return true;
2782 if (loop && nested_in_vect_loop_p (loop, stmt))
2784 if (dump_enabled_p ())
2785 dump_printf_loc (MSG_NOTE, vect_location,
2786 "grouped access in outer loop.\n");
2787 return false;
2791 /* Assume this is a DR handled by non-constant strided load case. */
2792 if (TREE_CODE (step) != INTEGER_CST)
2793 return (STMT_VINFO_STRIDED_P (stmt_info)
2794 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2795 || vect_analyze_group_access (dr)));
2797 /* Not a consecutive access - check whether it is part of an interleaving group. */
2798 return vect_analyze_group_access (dr);
2801 /* Compare two data-references DRA and DRB in order to sort them into
2802 chunks suitable for grouping. */
2804 static int
2805 dr_group_sort_cmp (const void *dra_, const void *drb_)
2807 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2808 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2809 int cmp;
2811 /* Stabilize sort. */
2812 if (dra == drb)
2813 return 0;
2815 /* DRs in different loops never belong to the same group. */
2816 loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father;
2817 loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father;
2818 if (loopa != loopb)
2819 return loopa->num < loopb->num ? -1 : 1;
2821 /* Ordering of DRs according to base. */
2822 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2823 DR_BASE_ADDRESS (drb));
2824 if (cmp != 0)
2825 return cmp;
2827 /* And according to DR_OFFSET. */
2828 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2829 if (cmp != 0)
2830 return cmp;
2832 /* Put reads before writes. */
2833 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2834 return DR_IS_READ (dra) ? -1 : 1;
2836 /* Then sort after access size. */
2837 cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2838 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2839 if (cmp != 0)
2840 return cmp;
2842 /* And after step. */
2843 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2844 if (cmp != 0)
2845 return cmp;
2847 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
2848 cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
2849 if (cmp == 0)
2850 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2851 return cmp;
2854 /* If OP is the result of a conversion, return the unconverted value,
2855 otherwise return null. */
2857 static tree
2858 strip_conversion (tree op)
2860 if (TREE_CODE (op) != SSA_NAME)
2861 return NULL_TREE;
2862 gimple *stmt = SSA_NAME_DEF_STMT (op);
2863 if (!is_gimple_assign (stmt)
2864 || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt)))
2865 return NULL_TREE;
2866 return gimple_assign_rhs1 (stmt);
2869 /* Return true if vectorizable_* routines can handle statements STMT1
2870 and STMT2 being in a single group. */
2872 static bool
2873 can_group_stmts_p (gimple *stmt1, gimple *stmt2)
2875 if (gimple_assign_single_p (stmt1))
2876 return gimple_assign_single_p (stmt2);
2878 if (is_gimple_call (stmt1) && gimple_call_internal_p (stmt1))
2880 /* Check for two masked loads or two masked stores. */
2881 if (!is_gimple_call (stmt2) || !gimple_call_internal_p (stmt2))
2882 return false;
2883 internal_fn ifn = gimple_call_internal_fn (stmt1);
2884 if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE)
2885 return false;
2886 if (ifn != gimple_call_internal_fn (stmt2))
2887 return false;
2889 /* Check that the masks are the same. Cope with casts of masks,
2890 like those created by build_mask_conversion. */
2891 tree mask1 = gimple_call_arg (stmt1, 2);
2892 tree mask2 = gimple_call_arg (stmt2, 2);
2893 if (!operand_equal_p (mask1, mask2, 0))
2895 mask1 = strip_conversion (mask1);
2896 if (!mask1)
2897 return false;
2898 mask2 = strip_conversion (mask2);
2899 if (!mask2)
2900 return false;
2901 if (!operand_equal_p (mask1, mask2, 0))
2902 return false;
2904 return true;
2907 return false;
2910 /* Function vect_analyze_data_ref_accesses.
2912 Analyze the access pattern of all the data references in the loop.
2914 FORNOW: the only access pattern that is considered vectorizable is a
2915 simple step 1 (consecutive) access.
2917 FORNOW: handle only arrays and pointer accesses. */
2919 bool
2920 vect_analyze_data_ref_accesses (vec_info *vinfo)
2922 unsigned int i;
2923 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
2924 struct data_reference *dr;
2926 DUMP_VECT_SCOPE ("vect_analyze_data_ref_accesses");
2928 if (datarefs.is_empty ())
2929 return true;
2931 /* Sort the array of datarefs to make building the interleaving chains
2932 linear. Don't modify the original vector's order; it is needed for
2933 determining what dependencies are reversed. */
2934 vec<data_reference_p> datarefs_copy = datarefs.copy ();
2935 datarefs_copy.qsort (dr_group_sort_cmp);
2937 /* Build the interleaving chains. */
2938 for (i = 0; i < datarefs_copy.length () - 1;)
2940 data_reference_p dra = datarefs_copy[i];
2941 stmt_vec_info stmtinfo_a = vinfo_for_stmt (vect_dr_stmt (dra));
2942 stmt_vec_info lastinfo = NULL;
2943 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
2944 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a))
2946 ++i;
2947 continue;
2949 for (i = i + 1; i < datarefs_copy.length (); ++i)
2951 data_reference_p drb = datarefs_copy[i];
2952 stmt_vec_info stmtinfo_b = vinfo_for_stmt (vect_dr_stmt (drb));
2953 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b)
2954 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
2955 break;
2957 /* ??? Imperfect sorting (non-compatible types, non-modulo
2958 accesses, same accesses) can lead to a group being artificially
2959 split here as we don't just skip over those. If it really
2960 matters we can push those to a worklist and re-iterate
2961 over them. Then we can just skip ahead to the next DR here. */
2963 /* DRs in a different loop should not be put into the same
2964 interleaving group. */
2965 if (gimple_bb (DR_STMT (dra))->loop_father
2966 != gimple_bb (DR_STMT (drb))->loop_father)
2967 break;
2969 /* Check that the data-refs have same first location (except init)
2970 and they are both either store or load (not load and store,
2971 not masked loads or stores). */
2972 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2973 || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2974 DR_BASE_ADDRESS (drb)) != 0
2975 || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
2976 || !can_group_stmts_p (vect_dr_stmt (dra), vect_dr_stmt (drb)))
2977 break;
2979 /* Check that the data-refs have the same constant size. */
2980 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2981 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2982 if (!tree_fits_uhwi_p (sza)
2983 || !tree_fits_uhwi_p (szb)
2984 || !tree_int_cst_equal (sza, szb))
2985 break;
2987 /* Check that the data-refs have the same step. */
2988 if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
2989 break;
2991 /* Check the types are compatible.
2992 ??? We don't distinguish this during sorting. */
2993 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2994 TREE_TYPE (DR_REF (drb))))
2995 break;
2997 /* Check that the DR_INITs are compile-time constants. */
2998 if (TREE_CODE (DR_INIT (dra)) != INTEGER_CST
2999 || TREE_CODE (DR_INIT (drb)) != INTEGER_CST)
3000 break;
3002 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
3003 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
3004 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
3005 HOST_WIDE_INT init_prev
3006 = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]));
3007 gcc_assert (init_a <= init_b
3008 && init_a <= init_prev
3009 && init_prev <= init_b);
3011 /* Do not place the same access in the interleaving chain twice. */
3012 if (init_b == init_prev)
3014 gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]))
3015 < gimple_uid (DR_STMT (drb)));
3016 /* ??? For now we simply "drop" the later reference which is
3017 otherwise the same rather than finishing off this group.
3018 In the end we'd want to re-process duplicates forming
3019 multiple groups from the refs, likely by just collecting
3020 all candidates (including duplicates and split points
3021 below) in a vector and then process them together. */
3022 continue;
3025 /* If init_b == init_a + the size of the type * k, we have an
3026 interleaving, and DRA is accessed before DRB. */
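/* For illustration: with 4-byte elements, inits 0 and 8 can form an
   interleaving group (k == 2), while inits 0 and 6 cannot, because 6 is
   not a multiple of the element size.  */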
3027 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
3028 if (type_size_a == 0
3029 || (init_b - init_a) % type_size_a != 0)
3030 break;
3032 /* If we have a store, the accesses are adjacent. This splits
3033 groups into chunks we support (we don't support vectorization
3034 of stores with gaps). */
3035 if (!DR_IS_READ (dra) && init_b - init_prev != type_size_a)
3036 break;
3038 /* If the step (when it is constant and non-zero) is greater than the
3039 difference between the data-refs' inits, this splits groups into
3040 suitable sizes. */
3041 if (tree_fits_shwi_p (DR_STEP (dra)))
3043 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
3044 if (step != 0 && step <= (init_b - init_a))
3045 break;
3048 if (dump_enabled_p ())
3050 dump_printf_loc (MSG_NOTE, vect_location,
3051 "Detected interleaving ");
3052 if (DR_IS_READ (dra))
3053 dump_printf (MSG_NOTE, "load ");
3054 else
3055 dump_printf (MSG_NOTE, "store ");
3056 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
3057 dump_printf (MSG_NOTE, " and ");
3058 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
3059 dump_printf (MSG_NOTE, "\n");
3062 /* Link the found element into the group list. */
3063 if (!DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3065 DR_GROUP_FIRST_ELEMENT (stmtinfo_a) = vect_dr_stmt (dra);
3066 lastinfo = stmtinfo_a;
3068 DR_GROUP_FIRST_ELEMENT (stmtinfo_b) = vect_dr_stmt (dra);
3069 DR_GROUP_NEXT_ELEMENT (lastinfo) = vect_dr_stmt (drb);
3070 lastinfo = stmtinfo_b;
3074 FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
3075 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (vect_dr_stmt (dr)))
3076 && !vect_analyze_data_ref_access (dr))
3078 if (dump_enabled_p ())
3079 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3080 "not vectorized: complicated access pattern.\n");
3082 if (is_a <bb_vec_info> (vinfo))
3084 /* Mark the statement as not vectorizable. */
3085 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (vect_dr_stmt (dr))) = false;
3086 continue;
3088 else
3090 datarefs_copy.release ();
3091 return false;
3095 datarefs_copy.release ();
3096 return true;
3099 /* Function vect_vfa_segment_size.
3101 Input:
3102 DR: The data reference.
3103 LENGTH_FACTOR: segment length to consider.
3105 Return a value suitable for the dr_with_seg_len::seg_len field.
3106 This is the "distance travelled" by the pointer from the first
3107 iteration in the segment to the last. Note that it does not include
3108 the size of the access; in effect it only describes the first byte. */
3110 static tree
3111 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
3113 length_factor = size_binop (MINUS_EXPR,
3114 fold_convert (sizetype, length_factor),
3115 size_one_node);
3116 return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr)),
3117 length_factor);
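/* For illustration: with DR_STEP 4 and LENGTH_FACTOR equal to a
   vectorization factor of 8, the segment length is 4 * (8 - 1) == 28 bytes;
   the bytes of the access itself are accounted for separately by
   vect_vfa_access_size below.  */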
3120 /* Return a value that, when added to abs (vect_vfa_segment_size (dr)),
3121 gives the worst-case number of bytes covered by the segment. */
3123 static unsigned HOST_WIDE_INT
3124 vect_vfa_access_size (data_reference *dr)
3126 stmt_vec_info stmt_vinfo = vinfo_for_stmt (vect_dr_stmt (dr));
3127 tree ref_type = TREE_TYPE (DR_REF (dr));
3128 unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type));
3129 unsigned HOST_WIDE_INT access_size = ref_size;
3130 if (DR_GROUP_FIRST_ELEMENT (stmt_vinfo))
3132 gcc_assert (DR_GROUP_FIRST_ELEMENT (stmt_vinfo) == vect_dr_stmt (dr));
3133 access_size *= DR_GROUP_SIZE (stmt_vinfo) - DR_GROUP_GAP (stmt_vinfo);
3135 if (STMT_VINFO_VEC_STMT (stmt_vinfo)
3136 && (vect_supportable_dr_alignment (dr, false)
3137 == dr_explicit_realign_optimized))
3139 /* We might access a full vector's worth. */
3140 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
3141 access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size;
3143 return access_size;
3146 /* Get the minimum alignment for all the scalar accesses that DR describes. */
3148 static unsigned int
3149 vect_vfa_align (const data_reference *dr)
3151 return TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr)));
3154 /* Function vect_compile_time_alias.
3156 Given data references A and B with equal base and offset, see whether
3157 the alias relation can be decided at compilation time. Return 1 if
3158 it can and the references alias, 0 if it can and the references do
3159 not alias, and -1 if we cannot decide at compile time. SEGMENT_LENGTH_A,
3160 SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
3161 of dr_with_seg_len::{seg_len,access_size} for A and B. */
3163 static int
3164 vect_compile_time_alias (struct data_reference *a, struct data_reference *b,
3165 tree segment_length_a, tree segment_length_b,
3166 unsigned HOST_WIDE_INT access_size_a,
3167 unsigned HOST_WIDE_INT access_size_b)
3169 poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a));
3170 poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b));
3171 poly_uint64 const_length_a;
3172 poly_uint64 const_length_b;
3174 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3175 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3176 [a, a+12) */
3177 if (tree_int_cst_compare (DR_STEP (a), size_zero_node) < 0)
3179 const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
3180 offset_a = (offset_a + access_size_a) - const_length_a;
3182 else
3183 const_length_a = tree_to_poly_uint64 (segment_length_a);
3184 if (tree_int_cst_compare (DR_STEP (b), size_zero_node) < 0)
3186 const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
3187 offset_b = (offset_b + access_size_b) - const_length_b;
3189 else
3190 const_length_b = tree_to_poly_uint64 (segment_length_b);
3192 const_length_a += access_size_a;
3193 const_length_b += access_size_b;
3195 if (ranges_known_overlap_p (offset_a, const_length_a,
3196 offset_b, const_length_b))
3197 return 1;
3199 if (!ranges_maybe_overlap_p (offset_a, const_length_a,
3200 offset_b, const_length_b))
3201 return 0;
3203 return -1;
3206 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3207 in DDR is >= VF. */
3209 static bool
3210 dependence_distance_ge_vf (data_dependence_relation *ddr,
3211 unsigned int loop_depth, poly_uint64 vf)
3213 if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3214 || DDR_NUM_DIST_VECTS (ddr) == 0)
3215 return false;
3217 /* If the dependence is exact, we should have limited the VF instead. */
3218 gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3220 unsigned int i;
3221 lambda_vector dist_v;
3222 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3224 HOST_WIDE_INT dist = dist_v[loop_depth];
3225 if (dist != 0
3226 && !(dist > 0 && DDR_REVERSED_P (ddr))
3227 && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
3228 return false;
3231 if (dump_enabled_p ())
3233 dump_printf_loc (MSG_NOTE, vect_location,
3234 "dependence distance between ");
3235 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
3236 dump_printf (MSG_NOTE, " and ");
3237 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
3238 dump_printf (MSG_NOTE, " is >= VF\n");
3241 return true;
3244 /* Dump LOWER_BOUND using flags DUMP_KIND. Dumps are known to be enabled. */
3246 static void
3247 dump_lower_bound (dump_flags_t dump_kind, const vec_lower_bound &lower_bound)
3249 dump_printf (dump_kind, "%s (", lower_bound.unsigned_p ? "unsigned" : "abs");
3250 dump_generic_expr (dump_kind, TDF_SLIM, lower_bound.expr);
3251 dump_printf (dump_kind, ") >= ");
3252 dump_dec (dump_kind, lower_bound.min_value);
3255 /* Record that the vectorized loop requires the vec_lower_bound described
3256 by EXPR, UNSIGNED_P and MIN_VALUE. */
3258 static void
3259 vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p,
3260 poly_uint64 min_value)
3262 vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3263 for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3264 if (operand_equal_p (lower_bounds[i].expr, expr, 0))
3266 unsigned_p &= lower_bounds[i].unsigned_p;
3267 min_value = upper_bound (lower_bounds[i].min_value, min_value);
3268 if (lower_bounds[i].unsigned_p != unsigned_p
3269 || maybe_lt (lower_bounds[i].min_value, min_value))
3271 lower_bounds[i].unsigned_p = unsigned_p;
3272 lower_bounds[i].min_value = min_value;
3273 if (dump_enabled_p ())
3275 dump_printf_loc (MSG_NOTE, vect_location,
3276 "updating run-time check to ");
3277 dump_lower_bound (MSG_NOTE, lower_bounds[i]);
3278 dump_printf (MSG_NOTE, "\n");
3281 return;
3284 vec_lower_bound lower_bound (expr, unsigned_p, min_value);
3285 if (dump_enabled_p ())
3287 dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that ");
3288 dump_lower_bound (MSG_NOTE, lower_bound);
3289 dump_printf (MSG_NOTE, "\n");
3291 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (lower_bound);
3294 /* Return true if it's unlikely that the step of the vectorized form of DR
3295 will span fewer than GAP bytes. */
3297 static bool
3298 vect_small_gap_p (loop_vec_info loop_vinfo, data_reference *dr, poly_int64 gap)
3300 stmt_vec_info stmt_info = vinfo_for_stmt (vect_dr_stmt (dr));
3301 HOST_WIDE_INT count
3302 = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
3303 if (DR_GROUP_FIRST_ELEMENT (stmt_info))
3304 count *= DR_GROUP_SIZE (vinfo_for_stmt (DR_GROUP_FIRST_ELEMENT (stmt_info)));
3305 return estimated_poly_value (gap) <= count * vect_get_scalar_dr_size (dr);
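/* For illustration: with an estimated VF of 4 and 4-byte scalar accesses,
   the vectorized step is expected to cover 16 bytes, so any GAP of at most
   16 bytes counts as small here.  */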
3308 /* Return true if we know that there is no alias between DR_A and DR_B
3309 when abs (DR_STEP (DR_A)) >= N for some N. When returning true, set
3310 *LOWER_BOUND_OUT to this N. */
3312 static bool
3313 vectorizable_with_step_bound_p (data_reference *dr_a, data_reference *dr_b,
3314 poly_uint64 *lower_bound_out)
3316 /* Check that there is a constant gap of known sign between DR_A
3317 and DR_B. */
3318 poly_int64 init_a, init_b;
3319 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0)
3320 || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0)
3321 || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)
3322 || !poly_int_tree_p (DR_INIT (dr_a), &init_a)
3323 || !poly_int_tree_p (DR_INIT (dr_b), &init_b)
3324 || !ordered_p (init_a, init_b))
3325 return false;
3327 /* Sort DR_A and DR_B by the address they access. */
3328 if (maybe_lt (init_b, init_a))
3330 std::swap (init_a, init_b);
3331 std::swap (dr_a, dr_b);
3334 /* If the two accesses could be dependent within a scalar iteration,
3335 make sure that we'd retain their order. */
3336 if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_a), init_b)
3337 && !vect_preserves_scalar_order_p (vect_dr_stmt (dr_a),
3338 vect_dr_stmt (dr_b)))
3339 return false;
3341 /* There is no alias if abs (DR_STEP) is greater than or equal to
3342 the bytes spanned by the combination of the two accesses. */
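/* For illustration: two 4-byte accesses whose DR_INITs differ by 4 bytes
   get a lower bound of 8; once abs (DR_STEP) >= 8, the per-iteration
   ranges of the two accesses cannot overlap.  */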
3343 *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_b) - init_a;
3344 return true;
3347 /* Function vect_prune_runtime_alias_test_list.
3349 Prune a list of ddrs to be tested at run-time by versioning for alias.
3350 Merge several alias checks into one if possible.
3351 Return FALSE if the resulting list of ddrs is longer than allowed by
3352 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
3354 bool
3355 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3357 typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3358 hash_set <tree_pair_hash> compared_objects;
3360 vec<ddr_p> may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3361 vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3362 = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3363 vec<vec_object_pair> &check_unequal_addrs
3364 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3365 poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3366 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3368 ddr_p ddr;
3369 unsigned int i;
3370 tree length_factor;
3372 DUMP_VECT_SCOPE ("vect_prune_runtime_alias_test_list");
3374 /* Step values are irrelevant for aliasing if the number of vector
3375 iterations is equal to the number of scalar iterations (which can
3376 happen for fully-SLP loops). */
3377 bool ignore_step_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U);
3379 if (!ignore_step_p)
3381 /* Convert the checks for nonzero steps into bound tests. */
3382 tree value;
3383 FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value)
3384 vect_check_lower_bound (loop_vinfo, value, true, 1);
3387 if (may_alias_ddrs.is_empty ())
3388 return true;
3390 comp_alias_ddrs.create (may_alias_ddrs.length ());
3392 unsigned int loop_depth
3393 = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3394 LOOP_VINFO_LOOP_NEST (loop_vinfo));
3396 /* First, we collect all data ref pairs for aliasing checks. */
3397 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3399 int comp_res;
3400 poly_uint64 lower_bound;
3401 struct data_reference *dr_a, *dr_b;
3402 gimple *dr_group_first_a, *dr_group_first_b;
3403 tree segment_length_a, segment_length_b;
3404 unsigned HOST_WIDE_INT access_size_a, access_size_b;
3405 unsigned int align_a, align_b;
3406 gimple *stmt_a, *stmt_b;
3408 /* Ignore the alias if the VF we chose ended up being no greater
3409 than the dependence distance. */
3410 if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3411 continue;
3413 if (DDR_OBJECT_A (ddr))
3415 vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3416 if (!compared_objects.add (new_pair))
3418 if (dump_enabled_p ())
3420 dump_printf_loc (MSG_NOTE, vect_location, "checking that ");
3421 dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.first);
3422 dump_printf (MSG_NOTE, " and ");
3423 dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.second);
3424 dump_printf (MSG_NOTE, " have different addresses\n");
3426 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3428 continue;
3431 dr_a = DDR_A (ddr);
3432 stmt_a = vect_dr_stmt (DDR_A (ddr));
3434 dr_b = DDR_B (ddr);
3435 stmt_b = vect_dr_stmt (DDR_B (ddr));
3437 /* Skip the pair if inter-iteration dependencies are irrelevant
3438 and intra-iteration dependencies are guaranteed to be honored. */
3439 if (ignore_step_p
3440 && (vect_preserves_scalar_order_p (stmt_a, stmt_b)
3441 || vectorizable_with_step_bound_p (dr_a, dr_b, &lower_bound)))
3443 if (dump_enabled_p ())
3445 dump_printf_loc (MSG_NOTE, vect_location,
3446 "no need for alias check between ");
3447 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
3448 dump_printf (MSG_NOTE, " and ");
3449 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
3450 dump_printf (MSG_NOTE, " when VF is 1\n");
3452 continue;
3455 /* See whether we can handle the alias using a bounds check on
3456 the step, and whether that's likely to be the best approach.
3457 (It might not be, for example, if the minimum step is much larger
3458 than the number of bytes handled by one vector iteration.) */
3459 if (!ignore_step_p
3460 && TREE_CODE (DR_STEP (dr_a)) != INTEGER_CST
3461 && vectorizable_with_step_bound_p (dr_a, dr_b, &lower_bound)
3462 && (vect_small_gap_p (loop_vinfo, dr_a, lower_bound)
3463 || vect_small_gap_p (loop_vinfo, dr_b, lower_bound)))
3465 bool unsigned_p = dr_known_forward_stride_p (dr_a);
3466 if (dump_enabled_p ())
3468 dump_printf_loc (MSG_NOTE, vect_location, "no alias between ");
3469 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
3470 dump_printf (MSG_NOTE, " and ");
3471 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
3472 dump_printf (MSG_NOTE, " when the step ");
3473 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_STEP (dr_a));
3474 dump_printf (MSG_NOTE, " is outside ");
3475 if (unsigned_p)
3476 dump_printf (MSG_NOTE, "[0");
3477 else
3479 dump_printf (MSG_NOTE, "(");
3480 dump_dec (MSG_NOTE, poly_int64 (-lower_bound));
3482 dump_printf (MSG_NOTE, ", ");
3483 dump_dec (MSG_NOTE, lower_bound);
3484 dump_printf (MSG_NOTE, ")\n");
3486 vect_check_lower_bound (loop_vinfo, DR_STEP (dr_a), unsigned_p,
3487 lower_bound);
3488 continue;
3491 dr_group_first_a = DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
3492 if (dr_group_first_a)
3494 stmt_a = dr_group_first_a;
3495 dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
3498 dr_group_first_b = DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
3499 if (dr_group_first_b)
3501 stmt_b = dr_group_first_b;
3502 dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
3505 if (ignore_step_p)
3507 segment_length_a = size_zero_node;
3508 segment_length_b = size_zero_node;
3510 else
3512 if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
3513 length_factor = scalar_loop_iters;
3514 else
3515 length_factor = size_int (vect_factor);
3516 segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
3517 segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
3519 access_size_a = vect_vfa_access_size (dr_a);
3520 access_size_b = vect_vfa_access_size (dr_b);
3521 align_a = vect_vfa_align (dr_a);
3522 align_b = vect_vfa_align (dr_b);
3524 comp_res = data_ref_compare_tree (DR_BASE_ADDRESS (dr_a),
3525 DR_BASE_ADDRESS (dr_b));
3526 if (comp_res == 0)
3527 comp_res = data_ref_compare_tree (DR_OFFSET (dr_a),
3528 DR_OFFSET (dr_b));
3530 /* See whether the alias is known at compilation time. */
3531 if (comp_res == 0
3532 && TREE_CODE (DR_STEP (dr_a)) == INTEGER_CST
3533 && TREE_CODE (DR_STEP (dr_b)) == INTEGER_CST
3534 && poly_int_tree_p (segment_length_a)
3535 && poly_int_tree_p (segment_length_b))
3537 int res = vect_compile_time_alias (dr_a, dr_b,
3538 segment_length_a,
3539 segment_length_b,
3540 access_size_a,
3541 access_size_b);
3542 if (res >= 0 && dump_enabled_p ())
3544 dump_printf_loc (MSG_NOTE, vect_location,
3545 "can tell at compile time that ");
3546 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
3547 dump_printf (MSG_NOTE, " and ");
3548 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
3549 if (res == 0)
3550 dump_printf (MSG_NOTE, " do not alias\n");
3551 else
3552 dump_printf (MSG_NOTE, " alias\n");
3555 if (res == 0)
3556 continue;
3558 if (res == 1)
3560 if (dump_enabled_p ())
3561 dump_printf_loc (MSG_NOTE, vect_location,
3562 "not vectorized: compilation time alias.\n");
3563 return false;
3567 dr_with_seg_len_pair_t dr_with_seg_len_pair
3568 (dr_with_seg_len (dr_a, segment_length_a, access_size_a, align_a),
3569 dr_with_seg_len (dr_b, segment_length_b, access_size_b, align_b));
3571 /* Canonicalize pairs by sorting the two DR members. */
3572 if (comp_res > 0)
3573 std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
3575 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3578 prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);
3580 unsigned int count = (comp_alias_ddrs.length ()
3581 + check_unequal_addrs.length ());
3583 dump_printf_loc (MSG_NOTE, vect_location,
3584 "improved number of alias checks from %d to %d\n",
3585 may_alias_ddrs.length (), count);
3586 if ((int) count > PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
3588 if (dump_enabled_p ())
3589 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3590 "number of versioning for alias "
3591 "run-time tests exceeds %d "
3592 "(--param vect-max-version-for-alias-checks)\n",
3593 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
3594 return false;
3597 return true;
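/* As an illustrative sketch (the array names below are hypothetical, and the
   real test also folds in the access sizes and alignments recorded above),
   each pair that survives the pruning above is later turned into a runtime
   segment-overlap test.  For a loop such as

     for (i = 0; i < n; i++)
       a[i] = b[i] + 1;

   where A and B may alias, the versioned form is roughly

     if (a + segment_a <= b || b + segment_b <= a)
       ... vectorized loop ...
     else
       ... scalar fallback loop ...  */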
3600 /* Check whether we can use an internal function for a gather load
3601 or scatter store. READ_P is true for loads and false for stores.
3602 MASKED_P is true if the load or store is conditional. MEMORY_TYPE is
3603 the type of the memory elements being loaded or stored. OFFSET_BITS
3604 is the number of bits in each scalar offset and OFFSET_SIGN is the
3605 sign of the offset. SCALE is the amount by which the offset should
3606 be multiplied *after* it has been converted to address width.
3608 Return true if the function is supported, storing the function
3609 id in *IFN_OUT and the type of a vector element in *ELEMENT_TYPE_OUT. */
3611 bool
3612 vect_gather_scatter_fn_p (bool read_p, bool masked_p, tree vectype,
3613 tree memory_type, unsigned int offset_bits,
3614 signop offset_sign, int scale,
3615 internal_fn *ifn_out, tree *element_type_out)
3617 unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
3618 unsigned int element_bits = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (vectype)));
3619 if (offset_bits > element_bits)
3620 /* Internal functions require the offset to be the same width as
3621 the vector elements. We can extend narrower offsets, but it isn't
3622 safe to truncate wider offsets. */
3623 return false;
3625 if (element_bits != memory_bits)
3626 /* For now the vector elements must be the same width as the
3627 memory elements. */
3628 return false;
3630 /* Work out which function we need. */
3631 internal_fn ifn;
3632 if (read_p)
3633 ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
3634 else
3635 ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
3637 /* Test whether the target supports this combination. */
3638 if (!internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
3639 offset_sign, scale))
3640 return false;
3642 *ifn_out = ifn;
3643 *element_type_out = TREE_TYPE (vectype);
3644 return true;
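/* For illustration only (DST, SRC and IDX are hypothetical names): in a
   loop such as

     for (i = 0; i < n; i++)
       dst[i] = src[idx[i]];

   with 32-bit elements and 32-bit offsets, this predicate asks whether the
   target implements IFN_GATHER_LOAD (or IFN_MASK_GATHER_LOAD when the access
   is conditional) for that vector type, memory type, offset sign and scale;
   the scatter-store case is the mirror image for stores.  */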
3647 /* CALL is a call to an internal gather load or scatter store function.
3648 Describe the operation in INFO. */
3650 static void
3651 vect_describe_gather_scatter_call (gcall *call, gather_scatter_info *info)
3653 stmt_vec_info stmt_info = vinfo_for_stmt (call);
3654 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3655 data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3657 info->ifn = gimple_call_internal_fn (call);
3658 info->decl = NULL_TREE;
3659 info->base = gimple_call_arg (call, 0);
3660 info->offset = gimple_call_arg (call, 1);
3661 info->offset_dt = vect_unknown_def_type;
3662 info->offset_vectype = NULL_TREE;
3663 info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
3664 info->element_type = TREE_TYPE (vectype);
3665 info->memory_type = TREE_TYPE (DR_REF (dr));
3668 /* Return true if a non-affine read or write in STMT is suitable for a
3669 gather load or scatter store. Describe the operation in *INFO if so. */
3671 bool
3672 vect_check_gather_scatter (gimple *stmt, loop_vec_info loop_vinfo,
3673 gather_scatter_info *info)
3675 HOST_WIDE_INT scale = 1;
3676 poly_int64 pbitpos, pbitsize;
3677 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3678 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3679 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3680 tree offtype = NULL_TREE;
3681 tree decl = NULL_TREE, base, off;
3682 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3683 tree memory_type = TREE_TYPE (DR_REF (dr));
3684 machine_mode pmode;
3685 int punsignedp, reversep, pvolatilep = 0;
3686 internal_fn ifn;
3687 tree element_type;
3688 bool masked_p = false;
3690 /* See whether this is already a call to a gather/scatter internal function.
3691 If not, see whether it's a masked load or store. */
3692 gcall *call = dyn_cast <gcall *> (stmt);
3693 if (call && gimple_call_internal_p (call))
3695 ifn = gimple_call_internal_fn (stmt);
3696 if (internal_gather_scatter_fn_p (ifn))
3698 vect_describe_gather_scatter_call (call, info);
3699 return true;
3701 masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE);
3704 /* True if we should aim to use internal functions rather than
3705 built-in functions. */
3706 bool use_ifn_p = (DR_IS_READ (dr)
3707 ? supports_vec_gather_load_p ()
3708 : supports_vec_scatter_store_p ());
3710 base = DR_REF (dr);
3711 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3712 see if we can use the def stmt of the address. */
3713 if (masked_p
3714 && TREE_CODE (base) == MEM_REF
3715 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3716 && integer_zerop (TREE_OPERAND (base, 1))
3717 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3719 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3720 if (is_gimple_assign (def_stmt)
3721 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3722 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3725 /* The gather and scatter builtins need an address of the form
3726 loop_invariant + vector * {1, 2, 4, 8}
3727 or
3728 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3729 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3730 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3731 multiplications and additions in it. To get a vector, we need
3732 a single SSA_NAME that will be defined in the loop and will
3733 contain everything that is not loop invariant and that can be
3734 vectorized. The following code attempts to find such a preexisting
3735 SSA_NAME OFF and put the loop invariants into a tree BASE
3736 that can be gimplified before the loop. */
3737 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3738 &punsignedp, &reversep, &pvolatilep);
3739 if (reversep)
3740 return false;
3742 poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
3744 if (TREE_CODE (base) == MEM_REF)
3746 if (!integer_zerop (TREE_OPERAND (base, 1)))
3748 if (off == NULL_TREE)
3749 off = wide_int_to_tree (sizetype, mem_ref_offset (base));
3750 else
3751 off = size_binop (PLUS_EXPR, off,
3752 fold_convert (sizetype, TREE_OPERAND (base, 1)));
3754 base = TREE_OPERAND (base, 0);
3756 else
3757 base = build_fold_addr_expr (base);
3759 if (off == NULL_TREE)
3760 off = size_zero_node;
3762 /* If base is not loop invariant, then if off is 0 we start with just
3763 the constant offset in the loop invariant BASE and continue with base
3764 as OFF; otherwise give up.
3765 We could handle that case by gimplifying the addition of base + off
3766 into some SSA_NAME and use that as off, but for now punt. */
3767 if (!expr_invariant_in_loop_p (loop, base))
3769 if (!integer_zerop (off))
3770 return false;
3771 off = base;
3772 base = size_int (pbytepos);
3774 /* Otherwise put base + constant offset into the loop invariant BASE
3775 and continue with OFF. */
3776 else
3778 base = fold_convert (sizetype, base);
3779 base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
3782 /* OFF at this point may be either a SSA_NAME or some tree expression
3783 from get_inner_reference. Try to peel off loop invariants from it
3784 into BASE as long as possible. */
3785 STRIP_NOPS (off);
3786 while (offtype == NULL_TREE)
3788 enum tree_code code;
3789 tree op0, op1, add = NULL_TREE;
3791 if (TREE_CODE (off) == SSA_NAME)
3793 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3795 if (expr_invariant_in_loop_p (loop, off))
3796 return false;
3798 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3799 break;
3801 op0 = gimple_assign_rhs1 (def_stmt);
3802 code = gimple_assign_rhs_code (def_stmt);
3803 op1 = gimple_assign_rhs2 (def_stmt);
3805 else
3807 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3808 return false;
3809 code = TREE_CODE (off);
3810 extract_ops_from_tree (off, &code, &op0, &op1);
3812 switch (code)
3814 case POINTER_PLUS_EXPR:
3815 case PLUS_EXPR:
3816 if (expr_invariant_in_loop_p (loop, op0))
3818 add = op0;
3819 off = op1;
3820 do_add:
3821 add = fold_convert (sizetype, add);
3822 if (scale != 1)
3823 add = size_binop (MULT_EXPR, add, size_int (scale));
3824 base = size_binop (PLUS_EXPR, base, add);
3825 continue;
3827 if (expr_invariant_in_loop_p (loop, op1))
3829 add = op1;
3830 off = op0;
3831 goto do_add;
3833 break;
3834 case MINUS_EXPR:
3835 if (expr_invariant_in_loop_p (loop, op1))
3837 add = fold_convert (sizetype, op1);
3838 add = size_binop (MINUS_EXPR, size_zero_node, add);
3839 off = op0;
3840 goto do_add;
3842 break;
3843 case MULT_EXPR:
3844 if (scale == 1 && tree_fits_shwi_p (op1))
3846 int new_scale = tree_to_shwi (op1);
3847 /* Only treat this as a scaling operation if the target
3848 supports it. */
3849 if (use_ifn_p
3850 && !vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p,
3851 vectype, memory_type, 1,
3852 TYPE_SIGN (TREE_TYPE (op0)),
3853 new_scale, &ifn,
3854 &element_type))
3855 break;
3856 scale = new_scale;
3857 off = op0;
3858 continue;
3860 break;
3861 case SSA_NAME:
3862 off = op0;
3863 continue;
3864 CASE_CONVERT:
3865 if (!POINTER_TYPE_P (TREE_TYPE (op0))
3866 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3867 break;
3868 if (TYPE_PRECISION (TREE_TYPE (op0))
3869 == TYPE_PRECISION (TREE_TYPE (off)))
3871 off = op0;
3872 continue;
3875 /* The internal functions need the offset to be the same width
3876 as the elements of VECTYPE. Don't include operations that
3877 cast the offset from that width to a different width. */
3878 if (use_ifn_p
3879 && (int_size_in_bytes (TREE_TYPE (vectype))
3880 == int_size_in_bytes (TREE_TYPE (off))))
3881 break;
3883 if (TYPE_PRECISION (TREE_TYPE (op0))
3884 < TYPE_PRECISION (TREE_TYPE (off)))
3886 off = op0;
3887 offtype = TREE_TYPE (off);
3888 STRIP_NOPS (off);
3889 continue;
3891 break;
3892 default:
3893 break;
3895 break;
3898 /* If at the end OFF still isn't a SSA_NAME or isn't
3899 defined in the loop, punt. */
3900 if (TREE_CODE (off) != SSA_NAME
3901 || expr_invariant_in_loop_p (loop, off))
3902 return false;
3904 if (offtype == NULL_TREE)
3905 offtype = TREE_TYPE (off);
3907 if (use_ifn_p)
3909 if (!vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p, vectype,
3910 memory_type, TYPE_PRECISION (offtype),
3911 TYPE_SIGN (offtype), scale, &ifn,
3912 &element_type))
3913 return false;
3915 else
3917 if (DR_IS_READ (dr))
3919 if (targetm.vectorize.builtin_gather)
3920 decl = targetm.vectorize.builtin_gather (vectype, offtype, scale);
3922 else
3924 if (targetm.vectorize.builtin_scatter)
3925 decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale);
3928 if (!decl)
3929 return false;
3931 ifn = IFN_LAST;
3932 element_type = TREE_TYPE (vectype);
3935 info->ifn = ifn;
3936 info->decl = decl;
3937 info->base = base;
3938 info->offset = off;
3939 info->offset_dt = vect_unknown_def_type;
3940 info->offset_vectype = NULL_TREE;
3941 info->scale = scale;
3942 info->element_type = element_type;
3943 info->memory_type = memory_type;
3944 return true;
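/* A rough illustration of the decomposition above (P and K are hypothetical
   names, and the exact shape depends on how the address was folded): for an
   access

     ... = p[k[i]];

   where P is a loop-invariant int pointer and K[I] is loaded in the loop,
   the reference is peeled into

     base  = p            (plus any constant byte offset)
     off   = the SSA name holding k[i], after stripping a widening cast
     scale = sizeof (int)

   which is exactly the loop_invariant + vector * scale form required by the
   gather/scatter builtins and internal functions.  */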
3947 /* Find the data references in STMT, analyze them with respect to LOOP and
3948 append them to DATAREFS. Return false if datarefs in this stmt cannot
3949 be handled. */
3951 bool
3952 vect_find_stmt_data_reference (loop_p loop, gimple *stmt,
3953 vec<data_reference_p> *datarefs)
3955 /* We can ignore clobbers for dataref analysis - they are removed during
3956 loop vectorization and BB vectorization checks dependences with a
3957 stmt walk. */
3958 if (gimple_clobber_p (stmt))
3959 return true;
3961 if (gimple_has_volatile_ops (stmt))
3963 if (dump_enabled_p ())
3965 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3966 "not vectorized: volatile type ");
3967 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3969 return false;
3972 if (stmt_can_throw_internal (stmt))
3974 if (dump_enabled_p ())
3976 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3977 "not vectorized: statement can throw an "
3978 "exception ");
3979 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3981 return false;
3984 auto_vec<data_reference_p, 2> refs;
3985 if (!find_data_references_in_stmt (loop, stmt, &refs))
3986 return false;
3988 if (refs.is_empty ())
3989 return true;
3991 if (refs.length () > 1)
3993 if (dump_enabled_p ())
3995 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3996 "not vectorized: more than one data ref "
3997 "in stmt: ");
3998 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4000 return false;
4003 if (gcall *call = dyn_cast <gcall *> (stmt))
4004 if (!gimple_call_internal_p (call)
4005 || (gimple_call_internal_fn (call) != IFN_MASK_LOAD
4006 && gimple_call_internal_fn (call) != IFN_MASK_STORE))
4008 if (dump_enabled_p ())
4010 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4011 "not vectorized: dr in a call ");
4012 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4014 return false;
4017 data_reference_p dr = refs.pop ();
4018 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
4019 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
4021 if (dump_enabled_p ())
4023 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4024 "not vectorized: statement is bitfield "
4025 "access ");
4026 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4028 return false;
4031 if (DR_BASE_ADDRESS (dr)
4032 && TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
4034 if (dump_enabled_p ())
4035 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4036 "not vectorized: base addr of dr is a "
4037 "constant\n");
4038 return false;
4041 /* Check whether this may be a SIMD lane access and adjust the
4042 DR to make it easier for us to handle it. */
4043 if (loop
4044 && loop->simduid
4045 && (!DR_BASE_ADDRESS (dr)
4046 || !DR_OFFSET (dr)
4047 || !DR_INIT (dr)
4048 || !DR_STEP (dr)))
4050 struct data_reference *newdr
4051 = create_data_ref (NULL, loop_containing_stmt (stmt), DR_REF (dr), stmt,
4052 DR_IS_READ (dr), DR_IS_CONDITIONAL_IN_STMT (dr));
4053 if (DR_BASE_ADDRESS (newdr)
4054 && DR_OFFSET (newdr)
4055 && DR_INIT (newdr)
4056 && DR_STEP (newdr)
4057 && integer_zerop (DR_STEP (newdr)))
4059 tree off = DR_OFFSET (newdr);
4060 STRIP_NOPS (off);
4061 if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
4062 && TREE_CODE (off) == MULT_EXPR
4063 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
4065 tree step = TREE_OPERAND (off, 1);
4066 off = TREE_OPERAND (off, 0);
4067 STRIP_NOPS (off);
4068 if (CONVERT_EXPR_P (off)
4069 && (TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off, 0)))
4070 < TYPE_PRECISION (TREE_TYPE (off))))
4071 off = TREE_OPERAND (off, 0);
4072 if (TREE_CODE (off) == SSA_NAME)
4074 gimple *def = SSA_NAME_DEF_STMT (off);
4075 tree reft = TREE_TYPE (DR_REF (newdr));
4076 if (is_gimple_call (def)
4077 && gimple_call_internal_p (def)
4078 && (gimple_call_internal_fn (def) == IFN_GOMP_SIMD_LANE))
4080 tree arg = gimple_call_arg (def, 0);
4081 gcc_assert (TREE_CODE (arg) == SSA_NAME);
4082 arg = SSA_NAME_VAR (arg);
4083 if (arg == loop->simduid
4084 /* For now. */
4085 && tree_int_cst_equal (TYPE_SIZE_UNIT (reft), step))
4087 DR_OFFSET (newdr) = ssize_int (0);
4088 DR_STEP (newdr) = step;
4089 DR_OFFSET_ALIGNMENT (newdr) = BIGGEST_ALIGNMENT;
4090 DR_STEP_ALIGNMENT (newdr)
4091 = highest_pow2_factor (step);
4092 /* Mark as simd-lane access. */
4093 newdr->aux = (void *)-1;
4094 free_data_ref (dr);
4095 datarefs->safe_push (newdr);
4096 return true;
4102 free_data_ref (newdr);
4105 datarefs->safe_push (dr);
4106 return true;
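/* Sketch of the SIMD lane case handled above (the temporary names are
   hypothetical): a privatized variable in an OpenMP simd loop is lowered to
   an array indexed by the lane number, e.g.

     D.tmp[.GOMP_SIMD_LANE (simduid)] = ...;

   The data-ref for such an access initially has a non-affine offset of the
   form lane * sizeof (element); the code above rewrites it into a zero
   offset with step sizeof (element) and marks it as a simd-lane access, so
   that consecutive lanes can be treated as one contiguous vector access.  */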
4109 /* Function vect_analyze_data_refs.
4111 Find all the data references in the loop or basic block.
4113 The general structure of the analysis of data refs in the vectorizer is as
4114 follows:
4115 1- vect_analyze_data_refs(loop/bb): call
4116 compute_data_dependences_for_loop/bb to find and analyze all data-refs
4117 in the loop/bb and their dependences.
4118 2- vect_analyze_dependences(): apply dependence testing using ddrs.
4119 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
4120 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
4124 bool
4125 vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf)
4127 struct loop *loop = NULL;
4128 unsigned int i;
4129 struct data_reference *dr;
4130 tree scalar_type;
4132 DUMP_VECT_SCOPE ("vect_analyze_data_refs");
4134 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4135 loop = LOOP_VINFO_LOOP (loop_vinfo);
4137 /* Go through the data-refs, check that the analysis succeeded. Update
4138 pointer from stmt_vec_info struct to DR and vectype. */
4140 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
4141 FOR_EACH_VEC_ELT (datarefs, i, dr)
4143 gimple *stmt;
4144 stmt_vec_info stmt_info;
4145 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
4146 poly_uint64 vf;
4148 gcc_assert (DR_REF (dr));
4149 stmt = vect_dr_stmt (dr);
4150 stmt_info = vinfo_for_stmt (stmt);
4152 /* Check that analysis of the data-ref succeeded. */
4153 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
4154 || !DR_STEP (dr))
4156 bool maybe_gather
4157 = DR_IS_READ (dr)
4158 && !TREE_THIS_VOLATILE (DR_REF (dr))
4159 && (targetm.vectorize.builtin_gather != NULL
4160 || supports_vec_gather_load_p ());
4161 bool maybe_scatter
4162 = DR_IS_WRITE (dr)
4163 && !TREE_THIS_VOLATILE (DR_REF (dr))
4164 && (targetm.vectorize.builtin_scatter != NULL
4165 || supports_vec_scatter_store_p ());
4167 /* If target supports vector gather loads or scatter stores,
4168 see if they can be used. */
4169 if (is_a <loop_vec_info> (vinfo)
4170 && !nested_in_vect_loop_p (loop, stmt))
4172 if (maybe_gather || maybe_scatter)
4174 if (maybe_gather)
4175 gatherscatter = GATHER;
4176 else
4177 gatherscatter = SCATTER;
4181 if (gatherscatter == SG_NONE)
4183 if (dump_enabled_p ())
4185 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4186 "not vectorized: data ref analysis "
4187 "failed ");
4188 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4190 if (is_a <bb_vec_info> (vinfo))
4192 /* In BB vectorization the ref can still participate
4193 in dependence analysis, we just can't vectorize it. */
4194 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4195 continue;
4197 return false;
4201 /* See if this was detected as SIMD lane access. */
4202 if (dr->aux == (void *)-1)
4204 if (nested_in_vect_loop_p (loop, stmt))
4206 if (dump_enabled_p ())
4208 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4209 "not vectorized: data ref analysis "
4210 "failed ");
4211 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4213 return false;
4215 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
4218 tree base = get_base_address (DR_REF (dr));
4219 if (base && VAR_P (base) && DECL_NONALIASED (base))
4221 if (dump_enabled_p ())
4223 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4224 "not vectorized: base object not addressable "
4225 "for stmt: ");
4226 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4228 if (is_a <bb_vec_info> (vinfo))
4230 /* In BB vectorization the ref can still participate
4231 in dependence analysis, we just can't vectorize it. */
4232 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4233 continue;
4235 return false;
4238 if (is_a <loop_vec_info> (vinfo)
4239 && DR_STEP (dr)
4240 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
4242 if (nested_in_vect_loop_p (loop, stmt))
4244 if (dump_enabled_p ())
4246 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4247 "not vectorized: not suitable for strided "
4248 "load ");
4249 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4251 return false;
4253 STMT_VINFO_STRIDED_P (stmt_info) = true;
4256 /* Update DR field in stmt_vec_info struct. */
4258 /* If the dataref is in an inner-loop of the loop that is considered for
4259 vectorization, we also want to analyze the access relative to
4260 the outer-loop (DR contains information only relative to the
4261 inner-most enclosing loop). We do that by building a reference to the
4262 first location accessed by the inner-loop, and analyze it relative to
4263 the outer-loop. */
4264 if (loop && nested_in_vect_loop_p (loop, stmt))
4266 /* Build a reference to the first location accessed by the
4267 inner loop: *(BASE + INIT + OFFSET). By construction,
4268 this address must be invariant in the inner loop, so we
4269 can consider it as being used in the outer loop. */
4270 tree base = unshare_expr (DR_BASE_ADDRESS (dr));
4271 tree offset = unshare_expr (DR_OFFSET (dr));
4272 tree init = unshare_expr (DR_INIT (dr));
4273 tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
4274 init, offset);
4275 tree init_addr = fold_build_pointer_plus (base, init_offset);
4276 tree init_ref = build_fold_indirect_ref (init_addr);
4278 if (dump_enabled_p ())
4280 dump_printf_loc (MSG_NOTE, vect_location,
4281 "analyze in outer loop: ");
4282 dump_generic_expr (MSG_NOTE, TDF_SLIM, init_ref);
4283 dump_printf (MSG_NOTE, "\n");
4286 if (!dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
4287 init_ref, loop))
4288 /* dr_analyze_innermost already explained the failure. */
4289 return false;
4291 if (dump_enabled_p ())
4293 dump_printf_loc (MSG_NOTE, vect_location,
4294 "\touter base_address: ");
4295 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4296 STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
4297 dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
4298 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4299 STMT_VINFO_DR_OFFSET (stmt_info));
4300 dump_printf (MSG_NOTE,
4301 "\n\touter constant offset from base address: ");
4302 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4303 STMT_VINFO_DR_INIT (stmt_info));
4304 dump_printf (MSG_NOTE, "\n\touter step: ");
4305 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4306 STMT_VINFO_DR_STEP (stmt_info));
4307 dump_printf (MSG_NOTE, "\n\touter base alignment: %d\n",
4308 STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info));
4309 dump_printf (MSG_NOTE, "\n\touter base misalignment: %d\n",
4310 STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info));
4311 dump_printf (MSG_NOTE, "\n\touter offset alignment: %d\n",
4312 STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info));
4313 dump_printf (MSG_NOTE, "\n\touter step alignment: %d\n",
4314 STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
4318 gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
4319 STMT_VINFO_DATA_REF (stmt_info) = dr;
4321 /* Set vectype for STMT. */
4322 scalar_type = TREE_TYPE (DR_REF (dr));
4323 STMT_VINFO_VECTYPE (stmt_info)
4324 = get_vectype_for_scalar_type (scalar_type);
4325 if (!STMT_VINFO_VECTYPE (stmt_info))
4327 if (dump_enabled_p ())
4329 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4330 "not vectorized: no vectype for stmt: ");
4331 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4332 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
4333 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
4334 scalar_type);
4335 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
4338 if (is_a <bb_vec_info> (vinfo))
4340 /* No vector type is fine, the ref can still participate
4341 in dependence analysis, we just can't vectorize it. */
4342 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4343 continue;
4345 return false;
4347 else
4349 if (dump_enabled_p ())
4351 dump_printf_loc (MSG_NOTE, vect_location,
4352 "got vectype for stmt: ");
4353 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
4354 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4355 STMT_VINFO_VECTYPE (stmt_info));
4356 dump_printf (MSG_NOTE, "\n");
4360 /* Adjust the minimal vectorization factor according to the
4361 vector type. */
4362 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
4363 *min_vf = upper_bound (*min_vf, vf);
4365 if (gatherscatter != SG_NONE)
4367 gather_scatter_info gs_info;
4368 if (!vect_check_gather_scatter (stmt, as_a <loop_vec_info> (vinfo),
4369 &gs_info)
4370 || !get_vectype_for_scalar_type (TREE_TYPE (gs_info.offset)))
4372 if (dump_enabled_p ())
4374 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4375 (gatherscatter == GATHER) ?
4376 "not vectorized: not suitable for gather "
4377 "load " :
4378 "not vectorized: not suitable for scatter "
4379 "store ");
4380 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4382 return false;
4384 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
4388 /* We used to stop processing and prune the list here. Verify we no
4389 longer need to. */
4390 gcc_assert (i == datarefs.length ());
4392 return true;
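/* Rough illustration of the outer-loop analysis performed above (IN is a
   hypothetical array, echoing the example used elsewhere in this file):
   when vectorizing the I loop of

     for (i = 0; i < n; i++)
       for (j = 0; j < m; j++)
	 s += in[i + j];

   the inner-loop data-ref for IN[I + J] is re-analyzed via the reference
   *(BASE + INIT + OFFSET) built above, giving an outer base address of &IN,
   an outer step of sizeof (*in), and the remaining outer-loop behavior
   recorded in STMT_VINFO_DR_WRT_VEC_LOOP.  */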
4396 /* Function vect_get_new_vect_var.
4398 Create and return a new variable. The current naming scheme prepends
4399 the prefix "vect_", "vectp_", "stmp_" or "mask_" (depending on the value
4400 of VAR_KIND) to NAME if it is provided, and uses the bare prefix
4401 otherwise. */
4403 tree
4404 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
4406 const char *prefix;
4407 tree new_vect_var;
4409 switch (var_kind)
4411 case vect_simple_var:
4412 prefix = "vect";
4413 break;
4414 case vect_scalar_var:
4415 prefix = "stmp";
4416 break;
4417 case vect_mask_var:
4418 prefix = "mask";
4419 break;
4420 case vect_pointer_var:
4421 prefix = "vectp";
4422 break;
4423 default:
4424 gcc_unreachable ();
4427 if (name)
4429 char* tmp = concat (prefix, "_", name, NULL);
4430 new_vect_var = create_tmp_reg (type, tmp);
4431 free (tmp);
4433 else
4434 new_vect_var = create_tmp_reg (type, prefix);
4436 return new_vect_var;
4439 /* Like vect_get_new_vect_var but return an SSA name. */
4441 tree
4442 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
4444 const char *prefix;
4445 tree new_vect_var;
4447 switch (var_kind)
4449 case vect_simple_var:
4450 prefix = "vect";
4451 break;
4452 case vect_scalar_var:
4453 prefix = "stmp";
4454 break;
4455 case vect_pointer_var:
4456 prefix = "vectp";
4457 break;
4458 default:
4459 gcc_unreachable ();
4462 if (name)
4464 char* tmp = concat (prefix, "_", name, NULL);
4465 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4466 free (tmp);
4468 else
4469 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4471 return new_vect_var;
4474 /* Duplicate ptr info and set alignment/misalignment on NAME from DR. */
4476 static void
4477 vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr)
4479 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr));
4480 int misalign = DR_MISALIGNMENT (dr);
4481 if (misalign == DR_MISALIGNMENT_UNKNOWN)
4482 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4483 else
4484 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name),
4485 DR_TARGET_ALIGNMENT (dr), misalign);
4488 /* Function vect_create_addr_base_for_vector_ref.
4490 Create an expression that computes the address of the first memory location
4491 that will be accessed for a data reference.
4493 Input:
4494 STMT: The statement containing the data reference.
4495 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4496 OFFSET: Optional. If supplied, it is added to the initial address.
4497 LOOP: Specify relative to which loop-nest the address should be computed.
4498 For example, when the dataref is in an inner-loop nested in an
4499 outer-loop that is now being vectorized, LOOP can be either the
4500 outer-loop, or the inner-loop. The first memory location accessed
4501 by the following dataref ('in' points to short):
4503 for (i=0; i<N; i++)
4504 for (j=0; j<M; j++)
4505 s += in[i+j]
4507 is as follows:
4508 if LOOP=i_loop: &in (relative to i_loop)
4509 if LOOP=j_loop: &in+i*2B (relative to j_loop)
4510 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
4511 initial address. Unlike OFFSET, which is number of elements to
4512 be added, BYTE_OFFSET is measured in bytes.
4514 Output:
4515 1. Return an SSA_NAME whose value is the address of the memory location of
4516 the first vector of the data reference.
4517 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4518 these statement(s) which define the returned SSA_NAME.
4520 FORNOW: We are only handling array accesses with step 1. */
4522 tree
4523 vect_create_addr_base_for_vector_ref (gimple *stmt,
4524 gimple_seq *new_stmt_list,
4525 tree offset,
4526 tree byte_offset)
4528 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4529 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4530 const char *base_name;
4531 tree addr_base;
4532 tree dest;
4533 gimple_seq seq = NULL;
4534 tree vect_ptr_type;
4535 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
4536 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4537 innermost_loop_behavior *drb = vect_dr_behavior (dr);
4539 tree data_ref_base = unshare_expr (drb->base_address);
4540 tree base_offset = unshare_expr (drb->offset);
4541 tree init = unshare_expr (drb->init);
4543 if (loop_vinfo)
4544 base_name = get_name (data_ref_base);
4545 else
4547 base_offset = ssize_int (0);
4548 init = ssize_int (0);
4549 base_name = get_name (DR_REF (dr));
4552 /* Create base_offset */
4553 base_offset = size_binop (PLUS_EXPR,
4554 fold_convert (sizetype, base_offset),
4555 fold_convert (sizetype, init));
4557 if (offset)
4559 offset = fold_build2 (MULT_EXPR, sizetype,
4560 fold_convert (sizetype, offset), step);
4561 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4562 base_offset, offset);
4564 if (byte_offset)
4566 byte_offset = fold_convert (sizetype, byte_offset);
4567 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4568 base_offset, byte_offset);
4571 /* base + base_offset */
4572 if (loop_vinfo)
4573 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4574 else
4576 addr_base = build1 (ADDR_EXPR,
4577 build_pointer_type (TREE_TYPE (DR_REF (dr))),
4578 unshare_expr (DR_REF (dr)));
4581 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4582 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4583 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4584 gimple_seq_add_seq (new_stmt_list, seq);
4586 if (DR_PTR_INFO (dr)
4587 && TREE_CODE (addr_base) == SSA_NAME
4588 && !SSA_NAME_PTR_INFO (addr_base))
4590 vect_duplicate_ssa_name_ptr_info (addr_base, dr);
4591 if (offset || byte_offset)
4592 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4595 if (dump_enabled_p ())
4597 dump_printf_loc (MSG_NOTE, vect_location, "created ");
4598 dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
4599 dump_printf (MSG_NOTE, "\n");
4602 return addr_base;
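/* Worked illustration of the OFFSET vs. BYTE_OFFSET distinction above,
   assuming a data reference to 2-byte (short) elements: passing OFFSET == 3
   contributes 3 * 2 == 6 bytes to the initial address, because OFFSET is
   scaled by the element size, whereas passing BYTE_OFFSET == 3 contributes
   exactly 3 bytes.  Roughly:

     addr = base_address + base_offset + init
	    + offset * sizeof (element)
	    + byte_offset  */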
4606 /* Function vect_create_data_ref_ptr.
4608 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4609 location accessed in the loop by STMT, along with the def-use update
4610 chain to appropriately advance the pointer through the loop iterations.
4611 Also set aliasing information for the pointer. This pointer is used by
4612 the callers to this function to create a memory reference expression for
4613 vector load/store access.
4615 Input:
4616 1. STMT: a stmt that references memory. Expected to be of the form
4617 GIMPLE_ASSIGN <name, data-ref> or
4618 GIMPLE_ASSIGN <data-ref, name>.
4619 2. AGGR_TYPE: the type of the reference, which should be either a vector
4620 or an array.
4621 3. AT_LOOP: the loop where the vector memref is to be created.
4622 4. OFFSET (optional): an offset to be added to the initial address accessed
4623 by the data-ref in STMT.
4624 5. BSI: location where the new stmts are to be placed if there is no loop
4625 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4626 pointing to the initial address.
4627 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4628 to the initial address accessed by the data-ref in STMT. This is
4629 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4630 in bytes.
4631 8. IV_STEP (optional, defaults to NULL): the amount that should be added
4632 to the IV during each iteration of the loop. NULL says to move
4633 by one copy of AGGR_TYPE up or down, depending on the step of the
4634 data reference.
4636 Output:
4637 1. Declare a new ptr to vector_type, and have it point to the base of the
4638 data reference (the initial address accessed by the data reference).
4639 For example, for vector of type V8HI, the following code is generated:
4641 v8hi *ap;
4642 ap = (v8hi *)initial_address;
4644 if OFFSET is not supplied:
4645 initial_address = &a[init];
4646 if OFFSET is supplied:
4647 initial_address = &a[init + OFFSET];
4648 if BYTE_OFFSET is supplied:
4649 initial_address = &a[init] + BYTE_OFFSET;
4651 Return the initial_address in INITIAL_ADDRESS.
4653 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4654 update the pointer in each iteration of the loop.
4656 Return the increment stmt that updates the pointer in PTR_INCR.
4658 3. Set INV_P to true if the access pattern of the data reference in the
4659 vectorized loop is invariant. Set it to false otherwise.
4661 4. Return the pointer. */
4663 tree
4664 vect_create_data_ref_ptr (gimple *stmt, tree aggr_type, struct loop *at_loop,
4665 tree offset, tree *initial_address,
4666 gimple_stmt_iterator *gsi, gimple **ptr_incr,
4667 bool only_init, bool *inv_p, tree byte_offset,
4668 tree iv_step)
4670 const char *base_name;
4671 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4672 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4673 struct loop *loop = NULL;
4674 bool nested_in_vect_loop = false;
4675 struct loop *containing_loop = NULL;
4676 tree aggr_ptr_type;
4677 tree aggr_ptr;
4678 tree new_temp;
4679 gimple_seq new_stmt_list = NULL;
4680 edge pe = NULL;
4681 basic_block new_bb;
4682 tree aggr_ptr_init;
4683 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4684 tree aptr;
4685 gimple_stmt_iterator incr_gsi;
4686 bool insert_after;
4687 tree indx_before_incr, indx_after_incr;
4688 gimple *incr;
4689 tree step;
4690 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4692 gcc_assert (iv_step != NULL_TREE
4693 || TREE_CODE (aggr_type) == ARRAY_TYPE
4694 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4696 if (loop_vinfo)
4698 loop = LOOP_VINFO_LOOP (loop_vinfo);
4699 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4700 containing_loop = (gimple_bb (stmt))->loop_father;
4701 pe = loop_preheader_edge (loop);
4703 else
4705 gcc_assert (bb_vinfo);
4706 only_init = true;
4707 *ptr_incr = NULL;
4710 /* Check the step (evolution) of the load in LOOP, and record
4711 whether it's invariant. */
4712 step = vect_dr_behavior (dr)->step;
4713 if (integer_zerop (step))
4714 *inv_p = true;
4715 else
4716 *inv_p = false;
4718 /* Create an expression for the first address accessed by this load
4719 in LOOP. */
4720 base_name = get_name (DR_BASE_ADDRESS (dr));
4722 if (dump_enabled_p ())
4724 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4725 dump_printf_loc (MSG_NOTE, vect_location,
4726 "create %s-pointer variable to type: ",
4727 get_tree_code_name (TREE_CODE (aggr_type)));
4728 dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4729 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4730 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4731 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4732 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4733 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4734 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4735 else
4736 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4737 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4738 dump_printf (MSG_NOTE, "\n");
4741 /* (1) Create the new aggregate-pointer variable.
4742 Vector and array types inherit the alias set of their component
4743 type by default so we need to use a ref-all pointer if the data
4744 reference does not conflict with the created aggregated data
4745 reference because it is not addressable. */
4746 bool need_ref_all = false;
4747 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4748 get_alias_set (DR_REF (dr))))
4749 need_ref_all = true;
4750 /* Likewise for any of the data references in the stmt group. */
4751 else if (DR_GROUP_SIZE (stmt_info) > 1)
4753 gimple *orig_stmt = DR_GROUP_FIRST_ELEMENT (stmt_info);
4756 stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
4757 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4758 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4759 get_alias_set (DR_REF (sdr))))
4761 need_ref_all = true;
4762 break;
4764 orig_stmt = DR_GROUP_NEXT_ELEMENT (sinfo);
4766 while (orig_stmt);
4768 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4769 need_ref_all);
4770 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4773 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4774 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4775 def-use update cycles for the pointer: one relative to the outer-loop
4776 (LOOP), which is what steps (3) and (4) below do. The other is relative
4777 to the inner-loop (which is the inner-most loop containing the dataref),
4778 and this is done by step (5) below.
4780 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4781 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4782 redundant. Steps (3),(4) create the following:
4784 vp0 = &base_addr;
4785 LOOP: vp1 = phi(vp0,vp2)
4788 vp2 = vp1 + step
4789 goto LOOP
4791 If there is an inner-loop nested in loop, then step (5) will also be
4792 applied, and an additional update in the inner-loop will be created:
4794 vp0 = &base_addr;
4795 LOOP: vp1 = phi(vp0,vp2)
4797 inner: vp3 = phi(vp1,vp4)
4798 vp4 = vp3 + inner_step
4799 if () goto inner
4801 vp2 = vp1 + step
4802 if () goto LOOP */
4804 /* (2) Calculate the initial address of the aggregate-pointer, and set
4805 the aggregate-pointer to point to it before the loop. */
4807 /* Create: &(base[init_val+offset]+byte_offset) in the loop preheader. */
4809 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
4810 offset, byte_offset);
4811 if (new_stmt_list)
4813 if (pe)
4815 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4816 gcc_assert (!new_bb);
4818 else
4819 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4822 *initial_address = new_temp;
4823 aggr_ptr_init = new_temp;
4825 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4826 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4827 inner-loop nested in LOOP (during outer-loop vectorization). */
4829 /* No update in loop is required. */
4830 if (only_init && (!loop_vinfo || at_loop == loop))
4831 aptr = aggr_ptr_init;
4832 else
4834 if (iv_step == NULL_TREE)
4836 /* The step of the aggregate pointer is the type size. */
4837 iv_step = TYPE_SIZE_UNIT (aggr_type);
4838 /* One exception to the above is when the scalar step of the load in
4839 LOOP is zero. In this case the step here is also zero. */
4840 if (*inv_p)
4841 iv_step = size_zero_node;
4842 else if (tree_int_cst_sgn (step) == -1)
4843 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4846 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4848 create_iv (aggr_ptr_init,
4849 fold_convert (aggr_ptr_type, iv_step),
4850 aggr_ptr, loop, &incr_gsi, insert_after,
4851 &indx_before_incr, &indx_after_incr);
4852 incr = gsi_stmt (incr_gsi);
4853 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4855 /* Copy the points-to information if it exists. */
4856 if (DR_PTR_INFO (dr))
4858 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr);
4859 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr);
4861 if (ptr_incr)
4862 *ptr_incr = incr;
4864 aptr = indx_before_incr;
4867 if (!nested_in_vect_loop || only_init)
4868 return aptr;
4871 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4872 nested in LOOP, if it exists. */
4874 gcc_assert (nested_in_vect_loop);
4875 if (!only_init)
4877 standard_iv_increment_position (containing_loop, &incr_gsi,
4878 &insert_after);
4879 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4880 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4881 &indx_after_incr);
4882 incr = gsi_stmt (incr_gsi);
4883 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4885 /* Copy the points-to information if it exists. */
4886 if (DR_PTR_INFO (dr))
4888 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr);
4889 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr);
4891 if (ptr_incr)
4892 *ptr_incr = incr;
4894 return indx_before_incr;
4896 else
4897 gcc_unreachable ();
4901 /* Function bump_vector_ptr
4903 Increment a pointer (to a vector type) by vector-size. If requested,
4904 i.e. if PTR-INCR is given, then also connect the new increment stmt
4905 to the existing def-use update-chain of the pointer, by modifying
4906 the PTR_INCR as illustrated below:
4908 The pointer def-use update-chain before this function:
4909 DATAREF_PTR = phi (p_0, p_2)
4910 ....
4911 PTR_INCR: p_2 = DATAREF_PTR + step
4913 The pointer def-use update-chain after this function:
4914 DATAREF_PTR = phi (p_0, p_2)
4915 ....
4916 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4917 ....
4918 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4920 Input:
4921 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4922 in the loop.
4923 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4924 the loop. The increment amount across iterations is expected
4925 to be vector_size.
4926 BSI - location where the new update stmt is to be placed.
4927 STMT - the original scalar memory-access stmt that is being vectorized.
4928 BUMP - optional. The offset by which to bump the pointer. If not given,
4929 the offset is assumed to be vector_size.
4931 Output: Return NEW_DATAREF_PTR as illustrated above.
4935 tree
4936 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4937 gimple *stmt, tree bump)
4939 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4940 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4941 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4942 tree update = TYPE_SIZE_UNIT (vectype);
4943 gassign *incr_stmt;
4944 ssa_op_iter iter;
4945 use_operand_p use_p;
4946 tree new_dataref_ptr;
4948 if (bump)
4949 update = bump;
4951 if (TREE_CODE (dataref_ptr) == SSA_NAME)
4952 new_dataref_ptr = copy_ssa_name (dataref_ptr);
4953 else
4954 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4955 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4956 dataref_ptr, update);
4957 vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4959 /* Copy the points-to information if it exists. */
4960 if (DR_PTR_INFO (dr))
4962 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4963 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4966 if (!ptr_incr)
4967 return new_dataref_ptr;
4969 /* Update the vector-pointer's cross-iteration increment. */
4970 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4972 tree use = USE_FROM_PTR (use_p);
4974 if (use == dataref_ptr)
4975 SET_USE (use_p, new_dataref_ptr);
4976 else
4977 gcc_assert (operand_equal_p (use, update, 0));
4980 return new_dataref_ptr;
4984 /* Copy memory reference info such as base/clique from the SRC reference
4985 to the DEST MEM_REF. */
4987 void
4988 vect_copy_ref_info (tree dest, tree src)
4990 if (TREE_CODE (dest) != MEM_REF)
4991 return;
4993 tree src_base = src;
4994 while (handled_component_p (src_base))
4995 src_base = TREE_OPERAND (src_base, 0);
4996 if (TREE_CODE (src_base) != MEM_REF
4997 && TREE_CODE (src_base) != TARGET_MEM_REF)
4998 return;
5000 MR_DEPENDENCE_CLIQUE (dest) = MR_DEPENDENCE_CLIQUE (src_base);
5001 MR_DEPENDENCE_BASE (dest) = MR_DEPENDENCE_BASE (src_base);
5005 /* Function vect_create_destination_var.
5007 Create a new temporary of type VECTYPE. */
5009 tree
5010 vect_create_destination_var (tree scalar_dest, tree vectype)
5012 tree vec_dest;
5013 const char *name;
5014 char *new_name;
5015 tree type;
5016 enum vect_var_kind kind;
5018 kind = vectype
5019 ? VECTOR_BOOLEAN_TYPE_P (vectype)
5020 ? vect_mask_var
5021 : vect_simple_var
5022 : vect_scalar_var;
5023 type = vectype ? vectype : TREE_TYPE (scalar_dest);
5025 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
5027 name = get_name (scalar_dest);
5028 if (name)
5029 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
5030 else
5031 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
5032 vec_dest = vect_get_new_vect_var (type, kind, new_name);
5033 free (new_name);
5035 return vec_dest;
5038 /* Function vect_grouped_store_supported.
5040 Returns TRUE if interleave high and interleave low permutations
5041 are supported, and FALSE otherwise. */
5043 bool
5044 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
5046 machine_mode mode = TYPE_MODE (vectype);
5048 /* vect_permute_store_chain requires the group size to be equal to 3 or
5049 be a power of two. */
5050 if (count != 3 && exact_log2 (count) == -1)
5052 if (dump_enabled_p ())
5053 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5054 "the size of the group of accesses"
5055 " is not a power of 2 or not equal to 3\n");
5056 return false;
5059 /* Check that the permutation is supported. */
5060 if (VECTOR_MODE_P (mode))
5062 unsigned int i;
5063 if (count == 3)
5065 unsigned int j0 = 0, j1 = 0, j2 = 0;
5066 unsigned int i, j;
5068 unsigned int nelt;
5069 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5071 if (dump_enabled_p ())
5072 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5073 "cannot handle groups of 3 stores for"
5074 " variable-length vectors\n");
5075 return false;
5078 vec_perm_builder sel (nelt, nelt, 1);
5079 sel.quick_grow (nelt);
5080 vec_perm_indices indices;
5081 for (j = 0; j < 3; j++)
5083 int nelt0 = ((3 - j) * nelt) % 3;
5084 int nelt1 = ((3 - j) * nelt + 1) % 3;
5085 int nelt2 = ((3 - j) * nelt + 2) % 3;
5086 for (i = 0; i < nelt; i++)
5088 if (3 * i + nelt0 < nelt)
5089 sel[3 * i + nelt0] = j0++;
5090 if (3 * i + nelt1 < nelt)
5091 sel[3 * i + nelt1] = nelt + j1++;
5092 if (3 * i + nelt2 < nelt)
5093 sel[3 * i + nelt2] = 0;
5095 indices.new_vector (sel, 2, nelt);
5096 if (!can_vec_perm_const_p (mode, indices))
5098 if (dump_enabled_p ())
5099 dump_printf (MSG_MISSED_OPTIMIZATION,
5100 "permutation op not supported by target.\n");
5101 return false;
5104 for (i = 0; i < nelt; i++)
5106 if (3 * i + nelt0 < nelt)
5107 sel[3 * i + nelt0] = 3 * i + nelt0;
5108 if (3 * i + nelt1 < nelt)
5109 sel[3 * i + nelt1] = 3 * i + nelt1;
5110 if (3 * i + nelt2 < nelt)
5111 sel[3 * i + nelt2] = nelt + j2++;
5113 indices.new_vector (sel, 2, nelt);
5114 if (!can_vec_perm_const_p (mode, indices))
5116 if (dump_enabled_p ())
5117 dump_printf (MSG_MISSED_OPTIMIZATION,
5118 "permutation op not supported by target.\n");
5119 return false;
5122 return true;
5124 else
5126 /* If COUNT is not equal to 3, only a power of 2 is supported. */
5127 gcc_assert (pow2p_hwi (count));
5128 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5130 /* The encoding has 2 interleaved stepped patterns. */
5131 vec_perm_builder sel (nelt, 2, 3);
5132 sel.quick_grow (6);
5133 for (i = 0; i < 3; i++)
5135 sel[i * 2] = i;
5136 sel[i * 2 + 1] = i + nelt;
5138 vec_perm_indices indices (sel, 2, nelt);
5139 if (can_vec_perm_const_p (mode, indices))
5141 for (i = 0; i < 6; i++)
5142 sel[i] += exact_div (nelt, 2);
5143 indices.new_vector (sel, 2, nelt);
5144 if (can_vec_perm_const_p (mode, indices))
5145 return true;
5150 if (dump_enabled_p ())
5151 dump_printf (MSG_MISSED_OPTIMIZATION,
5152 "permutation op not supported by target.\n");
5153 return false;
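/* For illustration (OUT, A, B and C are hypothetical names): a store group
   such as

     for (i = 0; i < n; i++)
       {
	 out[3 * i + 0] = a[i];
	 out[3 * i + 1] = b[i];
	 out[3 * i + 2] = c[i];
       }

   is accepted when the two three-way VEC_PERM_EXPR selectors built above are
   supported, and any power-of-two group size (2, 4, 8, ...) is accepted when
   the interleave high/low selectors are supported; a group of, say, 5 stores
   is rejected.  */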
5157 /* Return TRUE if vec_{mask_}store_lanes is available for COUNT vectors of
5158 type VECTYPE. MASKED_P says whether the masked form is needed. */
5160 bool
5161 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5162 bool masked_p)
5164 if (masked_p)
5165 return vect_lanes_optab_supported_p ("vec_mask_store_lanes",
5166 vec_mask_store_lanes_optab,
5167 vectype, count);
5168 else
5169 return vect_lanes_optab_supported_p ("vec_store_lanes",
5170 vec_store_lanes_optab,
5171 vectype, count);
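/* Roughly how callers choose between the two store strategies (a sketch,
   not a verbatim copy of any caller): a store-lanes instruction, when the
   target provides one, writes the whole interleaved group in a single
   operation and is generally preferred over the explicit permutations
   checked by vect_grouped_store_supported.

     if (vect_store_lanes_supported (vectype, group_size, masked_p))
       ... emit IFN_STORE_LANES / IFN_MASK_STORE_LANES ...
     else if (vect_grouped_store_supported (vectype, group_size))
       ... emit the permutations from vect_permute_store_chain ...  */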
5175 /* Function vect_permute_store_chain.
5177 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
5178 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
5179 the data correctly for the stores. Return the final references for stores
5180 in RESULT_CHAIN.
5182 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5183 The input is 4 vectors each containing 8 elements. We assign a number to
5184 each element, the input sequence is:
5186 1st vec: 0 1 2 3 4 5 6 7
5187 2nd vec: 8 9 10 11 12 13 14 15
5188 3rd vec: 16 17 18 19 20 21 22 23
5189 4th vec: 24 25 26 27 28 29 30 31
5191 The output sequence should be:
5193 1st vec: 0 8 16 24 1 9 17 25
5194 2nd vec: 2 10 18 26 3 11 19 27
5195 3rd vec: 4 12 20 28 5 13 21 29
5196 4th vec: 6 14 22 30 7 15 23 31
5198 i.e., we interleave the contents of the four vectors in their order.
5200 We use interleave_high/low instructions to create such output. The input of
5201 each interleave_high/low operation is two vectors:
5202 1st vec 2nd vec
5203 0 1 2 3 4 5 6 7
5204 the even elements of the result vector are obtained left-to-right from the
5205 high/low elements of the first vector. The odd elements of the result are
5206 obtained left-to-right from the high/low elements of the second vector.
5207 The output of interleave_high will be: 0 4 1 5
5208 and of interleave_low: 2 6 3 7
5211 The permutation is done in log LENGTH stages. In each stage interleave_high
5212 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5213 where the first argument is taken from the first half of DR_CHAIN and the
5214 second argument from its second half.
5215 In our example,
5217 I1: interleave_high (1st vec, 3rd vec)
5218 I2: interleave_low (1st vec, 3rd vec)
5219 I3: interleave_high (2nd vec, 4th vec)
5220 I4: interleave_low (2nd vec, 4th vec)
5222 The output for the first stage is:
5224 I1: 0 16 1 17 2 18 3 19
5225 I2: 4 20 5 21 6 22 7 23
5226 I3: 8 24 9 25 10 26 11 27
5227 I4: 12 28 13 29 14 30 15 31
5229 The output of the second stage, i.e. the final result is:
5231 I1: 0 8 16 24 1 9 17 25
5232 I2: 2 10 18 26 3 11 19 27
5233 I3: 4 12 20 28 5 13 21 29
5234 I4: 6 14 22 30 7 15 23 31. */
5236 void
5237 vect_permute_store_chain (vec<tree> dr_chain,
5238 unsigned int length,
5239 gimple *stmt,
5240 gimple_stmt_iterator *gsi,
5241 vec<tree> *result_chain)
5243 tree vect1, vect2, high, low;
5244 gimple *perm_stmt;
5245 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5246 tree perm_mask_low, perm_mask_high;
5247 tree data_ref;
5248 tree perm3_mask_low, perm3_mask_high;
5249 unsigned int i, j, n, log_length = exact_log2 (length);
5251 result_chain->quick_grow (length);
5252 memcpy (result_chain->address (), dr_chain.address (),
5253 length * sizeof (tree));
5255 if (length == 3)
5257 /* vect_grouped_store_supported ensures that this is constant. */
5258 unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5259 unsigned int j0 = 0, j1 = 0, j2 = 0;
5261 vec_perm_builder sel (nelt, nelt, 1);
5262 sel.quick_grow (nelt);
5263 vec_perm_indices indices;
5264 for (j = 0; j < 3; j++)
5266 int nelt0 = ((3 - j) * nelt) % 3;
5267 int nelt1 = ((3 - j) * nelt + 1) % 3;
5268 int nelt2 = ((3 - j) * nelt + 2) % 3;
5270 for (i = 0; i < nelt; i++)
5272 if (3 * i + nelt0 < nelt)
5273 sel[3 * i + nelt0] = j0++;
5274 if (3 * i + nelt1 < nelt)
5275 sel[3 * i + nelt1] = nelt + j1++;
5276 if (3 * i + nelt2 < nelt)
5277 sel[3 * i + nelt2] = 0;
5279 indices.new_vector (sel, 2, nelt);
5280 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5282 for (i = 0; i < nelt; i++)
5284 if (3 * i + nelt0 < nelt)
5285 sel[3 * i + nelt0] = 3 * i + nelt0;
5286 if (3 * i + nelt1 < nelt)
5287 sel[3 * i + nelt1] = 3 * i + nelt1;
5288 if (3 * i + nelt2 < nelt)
5289 sel[3 * i + nelt2] = nelt + j2++;
5291 indices.new_vector (sel, 2, nelt);
5292 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5294 vect1 = dr_chain[0];
5295 vect2 = dr_chain[1];
5297 /* Create interleaving stmt:
5298 low = VEC_PERM_EXPR <vect1, vect2,
5299 {j, nelt, *, j + 1, nelt + j + 1, *,
5300 j + 2, nelt + j + 2, *, ...}> */
5301 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5302 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5303 vect2, perm3_mask_low);
5304 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5306 vect1 = data_ref;
5307 vect2 = dr_chain[2];
5308 /* Create interleaving stmt:
5309 low = VEC_PERM_EXPR <vect1, vect2,
5310 {0, 1, nelt + j, 3, 4, nelt + j + 1,
5311 6, 7, nelt + j + 2, ...}> */
5312 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5313 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5314 vect2, perm3_mask_high);
5315 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5316 (*result_chain)[j] = data_ref;
5319 else
5321 /* If length is not equal to 3 then only a power of 2 is supported. */
5322 gcc_assert (pow2p_hwi (length));
5324 /* The encoding has 2 interleaved stepped patterns. */
5325 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5326 vec_perm_builder sel (nelt, 2, 3);
5327 sel.quick_grow (6);
5328 for (i = 0; i < 3; i++)
5330 sel[i * 2] = i;
5331 sel[i * 2 + 1] = i + nelt;
5333 vec_perm_indices indices (sel, 2, nelt);
5334 perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5336 for (i = 0; i < 6; i++)
5337 sel[i] += exact_div (nelt, 2);
5338 indices.new_vector (sel, 2, nelt);
5339 perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5341 for (i = 0, n = log_length; i < n; i++)
5343 for (j = 0; j < length/2; j++)
5345 vect1 = dr_chain[j];
5346 vect2 = dr_chain[j+length/2];
5348 /* Create interleaving stmt:
5349 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
5350 ...}> */
5351 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
5352 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
5353 vect2, perm_mask_high);
5354 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5355 (*result_chain)[2*j] = high;
5357 /* Create interleaving stmt:
5358 low = VEC_PERM_EXPR <vect1, vect2,
5359 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
5360 ...}> */
5361 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
5362 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
5363 vect2, perm_mask_low);
5364 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5365 (*result_chain)[2*j+1] = low;
5367 memcpy (dr_chain.address (), result_chain->address (),
5368 length * sizeof (tree));
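/* Illustrative sketch only, not used by the vectorizer: a scalar model of
   the power-of-two interleaving that vect_permute_store_chain performs,
   operating on plain int arrays instead of vector SSA names.  CHAIN points
   to LENGTH arrays of NELT elements each (NELT assumed even) and is
   overwritten with the permuted order; the inner loop mirrors the
   interleave_high/low VEC_PERM_EXPRs built above.  The function name and
   the int-array representation are assumptions made for illustration.  */

static inline void
example_store_permute (int **chain, unsigned int length, unsigned int nelt)
{
  unsigned int log_length = exact_log2 (length);
  int **next = XNEWVEC (int *, length);
  for (unsigned int i = 0; i < length; i++)
    next[i] = XNEWVEC (int, nelt);

  for (unsigned int stage = 0; stage < log_length; stage++)
    {
      for (unsigned int j = 0; j < length / 2; j++)
        {
          int *v1 = chain[j], *v2 = chain[j + length / 2];
          for (unsigned int i = 0; i < nelt / 2; i++)
            {
              /* interleave_high: selector {0, nelt, 1, nelt + 1, ...}.  */
              next[2 * j][2 * i] = v1[i];
              next[2 * j][2 * i + 1] = v2[i];
              /* interleave_low: selector {nelt/2, nelt*3/2, ...}.  */
              next[2 * j + 1][2 * i] = v1[nelt / 2 + i];
              next[2 * j + 1][2 * i + 1] = v2[nelt / 2 + i];
            }
        }
      for (unsigned int i = 0; i < length; i++)
        memcpy (chain[i], next[i], nelt * sizeof (int));
    }

  for (unsigned int i = 0; i < length; i++)
    XDELETEVEC (next[i]);
  XDELETEVEC (next);
}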
5373 /* Function vect_setup_realignment
5375 This function is called when vectorizing an unaligned load using
5376 the dr_explicit_realign[_optimized] scheme.
5377 This function generates the following code at the loop prolog:
5379 p = initial_addr;
5380 x msq_init = *(floor(p)); # prolog load
5381 realignment_token = call target_builtin;
5382 loop:
5383 x msq = phi (msq_init, ---)
5385 The stmts marked with x are generated only for the case of
5386 dr_explicit_realign_optimized.
5388 The code above sets up a new (vector) pointer, pointing to the first
5389 location accessed by STMT, and a "floor-aligned" load using that pointer.
5390 It also generates code to compute the "realignment-token" (if the relevant
5391 target hook was defined), and creates a phi-node at the loop-header bb
5392 whose arguments are the result of the prolog-load (created by this
5393 function) and the result of a load that takes place in the loop (to be
5394 created by the caller to this function).
5396 For the case of dr_explicit_realign_optimized:
5397 The caller to this function uses the phi-result (msq) to create the
5398 realignment code inside the loop, and sets up the missing phi argument,
5399 as follows:
5400 loop:
5401 msq = phi (msq_init, lsq)
5402 lsq = *(floor(p')); # load in loop
5403 result = realign_load (msq, lsq, realignment_token);
5405 For the case of dr_explicit_realign:
5406 loop:
5407 msq = *(floor(p)); # load in loop
5408 p' = p + (VS-1);
5409 lsq = *(floor(p')); # load in loop
5410 result = realign_load (msq, lsq, realignment_token);
5412 Input:
5413 STMT - (scalar) load stmt to be vectorized. This load accesses
5414 a memory location that may be unaligned.
5415 GSI - place where new code is to be inserted.
5416 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5417 is used.
5419 Output:
5420 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5421 target hook, if defined.
5422 Return value - the result of the loop-header phi node. */
5424 tree
5425 vect_setup_realignment (gimple *stmt, gimple_stmt_iterator *gsi,
5426 tree *realignment_token,
5427 enum dr_alignment_support alignment_support_scheme,
5428 tree init_addr,
5429 struct loop **at_loop)
5431 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5432 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5433 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5434 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
5435 struct loop *loop = NULL;
5436 edge pe = NULL;
5437 tree scalar_dest = gimple_assign_lhs (stmt);
5438 tree vec_dest;
5439 gimple *inc;
5440 tree ptr;
5441 tree data_ref;
5442 basic_block new_bb;
5443 tree msq_init = NULL_TREE;
5444 tree new_temp;
5445 gphi *phi_stmt;
5446 tree msq = NULL_TREE;
5447 gimple_seq stmts = NULL;
5448 bool inv_p;
5449 bool compute_in_loop = false;
5450 bool nested_in_vect_loop = false;
5451 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
5452 struct loop *loop_for_initial_load = NULL;
5454 if (loop_vinfo)
5456 loop = LOOP_VINFO_LOOP (loop_vinfo);
5457 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
5460 gcc_assert (alignment_support_scheme == dr_explicit_realign
5461 || alignment_support_scheme == dr_explicit_realign_optimized);
5463 /* We need to generate three things:
5464 1. the misalignment computation
5465 2. the extra vector load (for the optimized realignment scheme).
5466 3. the phi node for the two vectors from which the realignment is
5467 done (for the optimized realignment scheme). */
5469 /* 1. Determine where to generate the misalignment computation.
5471 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5472 calculation will be generated by this function, outside the loop (in the
5473 preheader). Otherwise, INIT_ADDR had already been computed for us by the
5474 caller, inside the loop.
5476 Background: If the misalignment remains fixed throughout the iterations of
5477 the loop, then both realignment schemes are applicable, and also the
5478 misalignment computation can be done outside LOOP. This is because we are
5479 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5480 are a multiple of VS (the Vector Size), and therefore the misalignment in
5481 different vectorized LOOP iterations is always the same.
5482 The problem arises only if the memory access is in an inner-loop nested
5483 inside LOOP, which is now being vectorized using outer-loop vectorization.
5484 This is the only case when the misalignment of the memory access may not
5485 remain fixed throughout the iterations of the inner-loop (as explained in
5486 detail in vect_supportable_dr_alignment). In this case, not only is the
5487 optimized realignment scheme not applicable, but also the misalignment
5488 computation (and generation of the realignment token that is passed to
5489 REALIGN_LOAD) have to be done inside the loop.
5491 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5492 or not, which in turn determines if the misalignment is computed inside
5493 the inner-loop, or outside LOOP. */
5495 if (init_addr != NULL_TREE || !loop_vinfo)
5497 compute_in_loop = true;
5498 gcc_assert (alignment_support_scheme == dr_explicit_realign);
5502 /* 2. Determine where to generate the extra vector load.
5504 For the optimized realignment scheme, instead of generating two vector
5505 loads in each iteration, we generate a single extra vector load in the
5506 preheader of the loop, and in each iteration reuse the result of the
5507 vector load from the previous iteration. In case the memory access is in
5508 an inner-loop nested inside LOOP, which is now being vectorized using
5509 outer-loop vectorization, we need to determine whether this initial vector
5510 load should be generated at the preheader of the inner-loop, or can be
5511 generated at the preheader of LOOP. If the memory access has no evolution
5512 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5513 to be generated inside LOOP (in the preheader of the inner-loop). */
5515 if (nested_in_vect_loop)
5517 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5518 bool invariant_in_outerloop =
5519 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5520 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5522 else
5523 loop_for_initial_load = loop;
5524 if (at_loop)
5525 *at_loop = loop_for_initial_load;
5527 if (loop_for_initial_load)
5528 pe = loop_preheader_edge (loop_for_initial_load);
5530 /* 3. For the case of the optimized realignment, create the first vector
5531 load at the loop preheader. */
5533 if (alignment_support_scheme == dr_explicit_realign_optimized)
5535 /* Create msq_init = *(floor(p1)) in the loop preheader */
5536 gassign *new_stmt;
5538 gcc_assert (!compute_in_loop);
5539 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5540 ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
5541 NULL_TREE, &init_addr, NULL, &inc,
5542 true, &inv_p);
5543 if (TREE_CODE (ptr) == SSA_NAME)
5544 new_temp = copy_ssa_name (ptr);
5545 else
5546 new_temp = make_ssa_name (TREE_TYPE (ptr));
5547 unsigned int align = DR_TARGET_ALIGNMENT (dr);
5548 new_stmt = gimple_build_assign
5549 (new_temp, BIT_AND_EXPR, ptr,
5550 build_int_cst (TREE_TYPE (ptr), -(HOST_WIDE_INT) align));
5551 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5552 gcc_assert (!new_bb);
5553 data_ref
5554 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5555 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5556 vect_copy_ref_info (data_ref, DR_REF (dr));
5557 new_stmt = gimple_build_assign (vec_dest, data_ref);
5558 new_temp = make_ssa_name (vec_dest, new_stmt);
5559 gimple_assign_set_lhs (new_stmt, new_temp);
5560 if (pe)
5562 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5563 gcc_assert (!new_bb);
5565 else
5566 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5568 msq_init = gimple_assign_lhs (new_stmt);
5571 /* 4. Create realignment token using a target builtin, if available.
5572 It is done either inside the containing loop, or before LOOP (as
5573 determined above). */
5575 if (targetm.vectorize.builtin_mask_for_load)
5577 gcall *new_stmt;
5578 tree builtin_decl;
5580 /* Compute INIT_ADDR - the initial address accessed by this memref. */
5581 if (!init_addr)
5583 /* Generate the INIT_ADDR computation outside LOOP. */
5584 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
5585 NULL_TREE);
5586 if (loop)
5588 pe = loop_preheader_edge (loop);
5589 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5590 gcc_assert (!new_bb);
5592 else
5593 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5596 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5597 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5598 vec_dest =
5599 vect_create_destination_var (scalar_dest,
5600 gimple_call_return_type (new_stmt));
5601 new_temp = make_ssa_name (vec_dest, new_stmt);
5602 gimple_call_set_lhs (new_stmt, new_temp);
5604 if (compute_in_loop)
5605 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5606 else
5608 /* Generate the misalignment computation outside LOOP. */
5609 pe = loop_preheader_edge (loop);
5610 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5611 gcc_assert (!new_bb);
5614 *realignment_token = gimple_call_lhs (new_stmt);
5616 /* The result of the CALL_EXPR to this builtin is determined from
5617 the value of the parameter, and no global variables are touched,
5618 which makes the builtin a "const" function. Requiring the
5619 builtin to have the "const" attribute makes it unnecessary
5620 to call mark_call_clobbered. */
5621 gcc_assert (TREE_READONLY (builtin_decl));
5624 if (alignment_support_scheme == dr_explicit_realign)
5625 return msq;
5627 gcc_assert (!compute_in_loop);
5628 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5631 /* 5. Create msq = phi <msq_init, lsq> in loop */
5633 pe = loop_preheader_edge (containing_loop);
5634 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5635 msq = make_ssa_name (vec_dest);
5636 phi_stmt = create_phi_node (msq, containing_loop->header);
5637 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5639 return msq;
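/* Illustrative sketch only, not part of the vectorizer: a scalar model of
   what REALIGN_LOAD <msq, lsq, token> computes when the realignment token
   behaves like the misalignment in elements - it selects the NELT
   consecutive elements starting RT elements into the concatenation of MSQ
   and LSQ.  Real targets encode the token as a permutation mask obtained
   from builtin_mask_for_load; the element-offset form and the names here
   are assumptions made for illustration.  */

static inline void
example_realign_load (const int *msq, const int *lsq, unsigned int rt,
                      unsigned int nelt, int *result)
{
  for (unsigned int i = 0; i < nelt; i++)
    result[i] = rt + i < nelt ? msq[rt + i] : lsq[rt + i - nelt];
}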
5643 /* Function vect_grouped_load_supported.
5645 COUNT is the size of the load group (the number of statements plus the
5646 number of gaps). SINGLE_ELEMENT_P is true if there is actually
5647 only one statement, with a gap of COUNT - 1.
5649 Returns true if a suitable permute exists. */
5651 bool
5652 vect_grouped_load_supported (tree vectype, bool single_element_p,
5653 unsigned HOST_WIDE_INT count)
5655 machine_mode mode = TYPE_MODE (vectype);
5657 /* If this is single-element interleaving with an element distance
5658 that leaves unused vector loads around, punt - we would at least create
5659 very sub-optimal code in that case (and blow up memory;
5660 see PR65518). */
5661 if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype)))
5663 if (dump_enabled_p ())
5664 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5665 "single-element interleaving not supported "
5666 "for non-adjacent vector loads\n");
5667 return false;
5670 /* vect_permute_load_chain requires the group size to be equal to 3 or
5671 be a power of two. */
5672 if (count != 3 && exact_log2 (count) == -1)
5674 if (dump_enabled_p ())
5675 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5676 "the size of the group of accesses"
5677 " is not a power of 2 or not equal to 3\n");
5678 return false;
5681 /* Check that the permutation is supported. */
5682 if (VECTOR_MODE_P (mode))
5684 unsigned int i, j;
5685 if (count == 3)
5687 unsigned int nelt;
5688 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5690 if (dump_enabled_p ())
5691 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5692 "cannot handle groups of 3 loads for"
5693 " variable-length vectors\n");
5694 return false;
5697 vec_perm_builder sel (nelt, nelt, 1);
5698 sel.quick_grow (nelt);
5699 vec_perm_indices indices;
5700 unsigned int k;
5701 for (k = 0; k < 3; k++)
5703 for (i = 0; i < nelt; i++)
5704 if (3 * i + k < 2 * nelt)
5705 sel[i] = 3 * i + k;
5706 else
5707 sel[i] = 0;
5708 indices.new_vector (sel, 2, nelt);
5709 if (!can_vec_perm_const_p (mode, indices))
5711 if (dump_enabled_p ())
5712 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5713 "shuffle of 3 loads is not supported by"
5714 " target\n");
5715 return false;
5717 for (i = 0, j = 0; i < nelt; i++)
5718 if (3 * i + k < 2 * nelt)
5719 sel[i] = i;
5720 else
5721 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5722 indices.new_vector (sel, 2, nelt);
5723 if (!can_vec_perm_const_p (mode, indices))
5725 if (dump_enabled_p ())
5726 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5727 "shuffle of 3 loads is not supported by"
5728 " target\n");
5729 return false;
5732 return true;
5734 else
5736 /* If length is not equal to 3 then only a power of 2 is supported. */
5737 gcc_assert (pow2p_hwi (count));
5738 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5740 /* The encoding has a single stepped pattern. */
5741 vec_perm_builder sel (nelt, 1, 3);
5742 sel.quick_grow (3);
5743 for (i = 0; i < 3; i++)
5744 sel[i] = i * 2;
5745 vec_perm_indices indices (sel, 2, nelt);
5746 if (can_vec_perm_const_p (mode, indices))
5748 for (i = 0; i < 3; i++)
5749 sel[i] = i * 2 + 1;
5750 indices.new_vector (sel, 2, nelt);
5751 if (can_vec_perm_const_p (mode, indices))
5752 return true;
5757 if (dump_enabled_p ())
5758 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5759 "extract even/odd not supported by target\n");
5760 return false;
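/* Illustrative sketch only: the index vector behind the first "shuffle of
   3 loads" test above, for a constant NELT and a given K (0 <= K < 3).
   It is {K, 3 + K, 6 + K, ...}, padded with don't-care zeros once the
   index would step past the two concatenated input vectors, exactly as in
   the loop above.  The helper name is an assumption for illustration.  */

static inline void
example_load3_indices (unsigned int nelt, unsigned int k, unsigned int *sel)
{
  for (unsigned int i = 0; i < nelt; i++)
    sel[i] = 3 * i + k < 2 * nelt ? 3 * i + k : 0;
}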
5763 /* Return TRUE if vec_{mask_}load_lanes is available for COUNT vectors of
5764 type VECTYPE. MASKED_P says whether the masked form is needed. */
5766 bool
5767 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5768 bool masked_p)
5770 if (masked_p)
5771 return vect_lanes_optab_supported_p ("vec_mask_load_lanes",
5772 vec_mask_load_lanes_optab,
5773 vectype, count);
5774 else
5775 return vect_lanes_optab_supported_p ("vec_load_lanes",
5776 vec_load_lanes_optab,
5777 vectype, count);
5780 /* Function vect_permute_load_chain.
5782 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5783 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5784 the input data correctly. Return the final references for loads in
5785 RESULT_CHAIN.
5787 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5788 The input is 4 vectors each containing 8 elements. We assign a number to each
5789 element; the input sequence is:
5791 1st vec: 0 1 2 3 4 5 6 7
5792 2nd vec: 8 9 10 11 12 13 14 15
5793 3rd vec: 16 17 18 19 20 21 22 23
5794 4th vec: 24 25 26 27 28 29 30 31
5796 The output sequence should be:
5798 1st vec: 0 4 8 12 16 20 24 28
5799 2nd vec: 1 5 9 13 17 21 25 29
5800 3rd vec: 2 6 10 14 18 22 26 30
5801 4th vec: 3 7 11 15 19 23 27 31
5803 i.e., the first output vector should contain the first elements of each
5804 interleaving group, etc.
5806 We use extract_even/odd instructions to create such output. The input of
5807 each extract_even/odd operation is two vectors
5808 1st vec 2nd vec
5809 0 1 2 3 4 5 6 7
5811 and the output is the vector of extracted even/odd elements. The output of
5812 extract_even will be: 0 2 4 6
5813 and of extract_odd: 1 3 5 7
5816 The permutation is done in log LENGTH stages. In each stage extract_even
5817 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5818 their order. In our example,
5820 E1: extract_even (1st vec, 2nd vec)
5821 E2: extract_odd (1st vec, 2nd vec)
5822 E3: extract_even (3rd vec, 4th vec)
5823 E4: extract_odd (3rd vec, 4th vec)
5825 The output for the first stage will be:
5827 E1: 0 2 4 6 8 10 12 14
5828 E2: 1 3 5 7 9 11 13 15
5829 E3: 16 18 20 22 24 26 28 30
5830 E4: 17 19 21 23 25 27 29 31
5832 In order to proceed and create the correct sequence for the next stage (or
5833 for the correct output, if the second stage is the last one, as in our
5834 example), we first put the output of the extract_even operation and then the
5835 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5836 The input for the second stage is:
5838 1st vec (E1): 0 2 4 6 8 10 12 14
5839 2nd vec (E3): 16 18 20 22 24 26 28 30
5840 3rd vec (E2): 1 3 5 7 9 11 13 15
5841 4th vec (E4): 17 19 21 23 25 27 29 31
5843 The output of the second stage:
5845 E1: 0 4 8 12 16 20 24 28
5846 E2: 2 6 10 14 18 22 26 30
5847 E3: 1 5 9 13 17 21 25 29
5848 E4: 3 7 11 15 19 23 27 31
5850 And RESULT_CHAIN after reordering:
5852 1st vec (E1): 0 4 8 12 16 20 24 28
5853 2nd vec (E3): 1 5 9 13 17 21 25 29
5854 3rd vec (E2): 2 6 10 14 18 22 26 30
5855 4th vec (E4): 3 7 11 15 19 23 27 31. */
5857 static void
5858 vect_permute_load_chain (vec<tree> dr_chain,
5859 unsigned int length,
5860 gimple *stmt,
5861 gimple_stmt_iterator *gsi,
5862 vec<tree> *result_chain)
5864 tree data_ref, first_vect, second_vect;
5865 tree perm_mask_even, perm_mask_odd;
5866 tree perm3_mask_low, perm3_mask_high;
5867 gimple *perm_stmt;
5868 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5869 unsigned int i, j, log_length = exact_log2 (length);
5871 result_chain->quick_grow (length);
5872 memcpy (result_chain->address (), dr_chain.address (),
5873 length * sizeof (tree));
5875 if (length == 3)
5877 /* vect_grouped_load_supported ensures that this is constant. */
5878 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5879 unsigned int k;
5881 vec_perm_builder sel (nelt, nelt, 1);
5882 sel.quick_grow (nelt);
5883 vec_perm_indices indices;
5884 for (k = 0; k < 3; k++)
5886 for (i = 0; i < nelt; i++)
5887 if (3 * i + k < 2 * nelt)
5888 sel[i] = 3 * i + k;
5889 else
5890 sel[i] = 0;
5891 indices.new_vector (sel, 2, nelt);
5892 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5894 for (i = 0, j = 0; i < nelt; i++)
5895 if (3 * i + k < 2 * nelt)
5896 sel[i] = i;
5897 else
5898 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5899 indices.new_vector (sel, 2, nelt);
5900 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5902 first_vect = dr_chain[0];
5903 second_vect = dr_chain[1];
5905 /* Create interleaving stmt (low part of):
5906 low = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5907 ...}> */
5908 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5909 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5910 second_vect, perm3_mask_low);
5911 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5913 /* Create interleaving stmt (high part of):
5914 high = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5915 ...}> */
5916 first_vect = data_ref;
5917 second_vect = dr_chain[2];
5918 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5919 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5920 second_vect, perm3_mask_high);
5921 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5922 (*result_chain)[k] = data_ref;
5925 else
5927 /* If length is not equal to 3 then only a power of 2 is supported. */
5928 gcc_assert (pow2p_hwi (length));
5930 /* The encoding has a single stepped pattern. */
5931 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5932 vec_perm_builder sel (nelt, 1, 3);
5933 sel.quick_grow (3);
5934 for (i = 0; i < 3; ++i)
5935 sel[i] = i * 2;
5936 vec_perm_indices indices (sel, 2, nelt);
5937 perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);
5939 for (i = 0; i < 3; ++i)
5940 sel[i] = i * 2 + 1;
5941 indices.new_vector (sel, 2, nelt);
5942 perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
5944 for (i = 0; i < log_length; i++)
5946 for (j = 0; j < length; j += 2)
5948 first_vect = dr_chain[j];
5949 second_vect = dr_chain[j+1];
5951 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5952 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5953 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5954 first_vect, second_vect,
5955 perm_mask_even);
5956 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5957 (*result_chain)[j/2] = data_ref;
5959 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5960 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5961 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5962 first_vect, second_vect,
5963 perm_mask_odd);
5964 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5965 (*result_chain)[j/2+length/2] = data_ref;
5967 memcpy (dr_chain.address (), result_chain->address (),
5968 length * sizeof (tree));
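/* Illustrative sketch only, not used by the vectorizer: a scalar model of
   the power-of-two extract-even/odd permutation that
   vect_permute_load_chain performs, on plain int arrays instead of vector
   SSA names.  CHAIN points to LENGTH arrays of NELT elements (NELT assumed
   even) and is overwritten with the reordered data; the stores mirror the
   placement into RESULT_CHAIN above.  The function name and the array
   representation are assumptions made for illustration.  */

static inline void
example_load_permute (int **chain, unsigned int length, unsigned int nelt)
{
  unsigned int log_length = exact_log2 (length);
  int **next = XNEWVEC (int *, length);
  for (unsigned int i = 0; i < length; i++)
    next[i] = XNEWVEC (int, nelt);

  for (unsigned int stage = 0; stage < log_length; stage++)
    {
      for (unsigned int j = 0; j < length; j += 2)
        {
          int *v1 = chain[j], *v2 = chain[j + 1];
          for (unsigned int i = 0; i < nelt / 2; i++)
            {
              /* extract_even: selector {0, 2, 4, ...} of v1 ++ v2.  */
              next[j / 2][i] = v1[2 * i];
              next[j / 2][nelt / 2 + i] = v2[2 * i];
              /* extract_odd: selector {1, 3, 5, ...} of v1 ++ v2.  */
              next[j / 2 + length / 2][i] = v1[2 * i + 1];
              next[j / 2 + length / 2][nelt / 2 + i] = v2[2 * i + 1];
            }
        }
      for (unsigned int i = 0; i < length; i++)
        memcpy (chain[i], next[i], nelt * sizeof (int));
    }

  for (unsigned int i = 0; i < length; i++)
    XDELETEVEC (next[i]);
  XDELETEVEC (next);
}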
5973 /* Function vect_shift_permute_load_chain.
5975 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
5976 a sequence of stmts to reorder the input data accordingly.
5977 Return the final references for loads in RESULT_CHAIN.
5978 Return true if successful, false otherwise.
5980 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5981 The input is 3 vectors each containing 8 elements. We assign a
5982 number to each element; the input sequence is:
5984 1st vec: 0 1 2 3 4 5 6 7
5985 2nd vec: 8 9 10 11 12 13 14 15
5986 3rd vec: 16 17 18 19 20 21 22 23
5988 The output sequence should be:
5990 1st vec: 0 3 6 9 12 15 18 21
5991 2nd vec: 1 4 7 10 13 16 19 22
5992 3rd vec: 2 5 8 11 14 17 20 23
5994 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
5996 First we shuffle all 3 vectors to get correct elements order:
5998 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
5999 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
6000 3rd vec: (16 19 22) (17 20 23) (18 21)
6002 Next we unite and shift vector 3 times:
6004 1st step:
6005 shift right by 6 the concatenation of:
6006 "1st vec" and "2nd vec"
6007 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
6008 "2nd vec" and "3rd vec"
6009 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
6010 "3rd vec" and "1st vec"
6011 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
6012 | New vectors |
6014 So that now new vectors are:
6016 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
6017 2nd vec: (10 13) (16 19 22) (17 20 23)
6018 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
6020 2nd step:
6021 shift right by 5 the concatenation of:
6022 "1st vec" and "3rd vec"
6023 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
6024 "2nd vec" and "1st vec"
6025 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
6026 "3rd vec" and "2nd vec"
6027 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
6028 | New vectors |
6030 So that now new vectors are:
6032 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
6033 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
6034 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
6036 3rd step:
6037 shift right by 5 the concatenation of:
6038 "1st vec" and "1st vec"
6039 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
6040 shift right by 3 the concatenation of:
6041 "2nd vec" and "2nd vec"
6042 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
6043 | New vectors |
6045 So that now all vectors are READY:
6046 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
6047 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
6048 3rd vec: ( 1 4 7) (10 13) (16 19 22)
6050 This algorithm is faster than the one in vect_permute_load_chain if:
6051 1. "shift of a concatenation" is faster than general permutation.
6052 This is usually so.
6053 2. The TARGET machine can't execute vector instructions in parallel.
6054 This is because each step of the algorithm depends on the previous one.
6055 The algorithm in vect_permute_load_chain is much more parallel.
6057 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
6060 static bool
6061 vect_shift_permute_load_chain (vec<tree> dr_chain,
6062 unsigned int length,
6063 gimple *stmt,
6064 gimple_stmt_iterator *gsi,
6065 vec<tree> *result_chain)
6067 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
6068 tree perm2_mask1, perm2_mask2, perm3_mask;
6069 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
6070 gimple *perm_stmt;
6072 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
6073 unsigned int i;
6074 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6075 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6077 unsigned HOST_WIDE_INT nelt, vf;
6078 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt)
6079 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
6080 /* Not supported for variable-length vectors. */
6081 return false;
6083 vec_perm_builder sel (nelt, nelt, 1);
6084 sel.quick_grow (nelt);
6086 result_chain->quick_grow (length);
6087 memcpy (result_chain->address (), dr_chain.address (),
6088 length * sizeof (tree));
6090 if (pow2p_hwi (length) && vf > 4)
6092 unsigned int j, log_length = exact_log2 (length);
6093 for (i = 0; i < nelt / 2; ++i)
6094 sel[i] = i * 2;
6095 for (i = 0; i < nelt / 2; ++i)
6096 sel[nelt / 2 + i] = i * 2 + 1;
6097 vec_perm_indices indices (sel, 2, nelt);
6098 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6100 if (dump_enabled_p ())
6101 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6102 "shuffle of 2 fields structure is not \
6103 supported by target\n");
6104 return false;
6106 perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);
6108 for (i = 0; i < nelt / 2; ++i)
6109 sel[i] = i * 2 + 1;
6110 for (i = 0; i < nelt / 2; ++i)
6111 sel[nelt / 2 + i] = i * 2;
6112 indices.new_vector (sel, 2, nelt);
6113 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6115 if (dump_enabled_p ())
6116 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6117 "shuffle of 2 fields structure is not \
6118 supported by target\n");
6119 return false;
6121 perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);
6123 /* Generating permutation constant to shift all elements.
6124 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
6125 for (i = 0; i < nelt; i++)
6126 sel[i] = nelt / 2 + i;
6127 indices.new_vector (sel, 2, nelt);
6128 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6130 if (dump_enabled_p ())
6131 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6132 "shift permutation is not supported by target\n");
6133 return false;
6135 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6137 /* Generating permutation constant to select vector from 2.
6138 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
6139 for (i = 0; i < nelt / 2; i++)
6140 sel[i] = i;
6141 for (i = nelt / 2; i < nelt; i++)
6142 sel[i] = nelt + i;
6143 indices.new_vector (sel, 2, nelt);
6144 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6146 if (dump_enabled_p ())
6147 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6148 "select is not supported by target\n");
6149 return false;
6151 select_mask = vect_gen_perm_mask_checked (vectype, indices);
6153 for (i = 0; i < log_length; i++)
6155 for (j = 0; j < length; j += 2)
6157 first_vect = dr_chain[j];
6158 second_vect = dr_chain[j + 1];
6160 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6161 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6162 first_vect, first_vect,
6163 perm2_mask1);
6164 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6165 vect[0] = data_ref;
6167 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6168 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6169 second_vect, second_vect,
6170 perm2_mask2);
6171 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6172 vect[1] = data_ref;
6174 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
6175 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6176 vect[0], vect[1], shift1_mask);
6177 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6178 (*result_chain)[j/2 + length/2] = data_ref;
6180 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
6181 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6182 vect[0], vect[1], select_mask);
6183 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6184 (*result_chain)[j/2] = data_ref;
6186 memcpy (dr_chain.address (), result_chain->address (),
6187 length * sizeof (tree));
6189 return true;
6191 if (length == 3 && vf > 2)
6193 unsigned int k = 0, l = 0;
6195 /* Generating permutation constant to get all elements in right order.
6196 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
6197 for (i = 0; i < nelt; i++)
6199 if (3 * k + (l % 3) >= nelt)
6201 k = 0;
6202 l += (3 - (nelt % 3));
6204 sel[i] = 3 * k + (l % 3);
6205 k++;
6207 vec_perm_indices indices (sel, 2, nelt);
6208 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6210 if (dump_enabled_p ())
6211 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6212 "shuffle of 3 fields structure is not \
6213 supported by target\n");
6214 return false;
6216 perm3_mask = vect_gen_perm_mask_checked (vectype, indices);
6218 /* Generating permutation constant to shift all elements.
6219 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
6220 for (i = 0; i < nelt; i++)
6221 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
6222 indices.new_vector (sel, 2, nelt);
6223 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6225 if (dump_enabled_p ())
6226 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6227 "shift permutation is not supported by target\n");
6228 return false;
6230 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6232 /* Generating permutation constant to shift all elements.
6233 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6234 for (i = 0; i < nelt; i++)
6235 sel[i] = 2 * (nelt / 3) + 1 + i;
6236 indices.new_vector (sel, 2, nelt);
6237 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6239 if (dump_enabled_p ())
6240 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6241 "shift permutation is not supported by target\n");
6242 return false;
6244 shift2_mask = vect_gen_perm_mask_checked (vectype, indices);
6246 /* Generating permutation constant to shift all elements.
6247 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
6248 for (i = 0; i < nelt; i++)
6249 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
6250 indices.new_vector (sel, 2, nelt);
6251 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6253 if (dump_enabled_p ())
6254 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6255 "shift permutation is not supported by target\n");
6256 return false;
6258 shift3_mask = vect_gen_perm_mask_checked (vectype, indices);
6260 /* Generating permutation constant to shift all elements.
6261 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6262 for (i = 0; i < nelt; i++)
6263 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
6264 indices.new_vector (sel, 2, nelt);
6265 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6267 if (dump_enabled_p ())
6268 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6269 "shift permutation is not supported by target\n");
6270 return false;
6272 shift4_mask = vect_gen_perm_mask_checked (vectype, indices);
6274 for (k = 0; k < 3; k++)
6276 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
6277 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6278 dr_chain[k], dr_chain[k],
6279 perm3_mask);
6280 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6281 vect[k] = data_ref;
6284 for (k = 0; k < 3; k++)
6286 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
6287 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6288 vect[k % 3], vect[(k + 1) % 3],
6289 shift1_mask);
6290 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6291 vect_shift[k] = data_ref;
6294 for (k = 0; k < 3; k++)
6296 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
6297 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6298 vect_shift[(4 - k) % 3],
6299 vect_shift[(3 - k) % 3],
6300 shift2_mask);
6301 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6302 vect[k] = data_ref;
6305 (*result_chain)[3 - (nelt % 3)] = vect[2];
6307 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
6308 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
6309 vect[0], shift3_mask);
6310 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6311 (*result_chain)[nelt % 3] = data_ref;
6313 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
6314 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
6315 vect[1], shift4_mask);
6316 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6317 (*result_chain)[0] = data_ref;
6318 return true;
6320 return false;
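/* Illustrative sketch only: the "shift of a concatenation" primitive that
   the shiftN_masks above implement, modelled on plain int arrays.  It
   selects the NELT consecutive elements starting OFS elements into the
   concatenation of V1 followed by V2, i.e. the VEC_PERM_EXPR whose
   selector is {OFS, OFS + 1, ..., OFS + NELT - 1}.  The name and array
   representation are assumptions made for illustration.  */

static inline void
example_shift_concat (const int *v1, const int *v2, unsigned int nelt,
                      unsigned int ofs, int *result)
{
  for (unsigned int i = 0; i < nelt; i++)
    result[i] = ofs + i < nelt ? v1[ofs + i] : v2[ofs + i - nelt];
}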
6323 /* Function vect_transform_grouped_load.
6325 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
6326 to perform their permutation and ascribe the resulting vectorized statements to
6327 the scalar statements.
6330 void
6331 vect_transform_grouped_load (gimple *stmt, vec<tree> dr_chain, int size,
6332 gimple_stmt_iterator *gsi)
6334 machine_mode mode;
6335 vec<tree> result_chain = vNULL;
6337 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
6338 RESULT_CHAIN is the output of vect_permute_load_chain; it contains the permuted
6339 vectors that are ready for vector computation. */
6340 result_chain.create (size);
6342 /* If the reassociation width for the vector type is 2 or greater, the target
6343 machine can execute 2 or more vector instructions in parallel. Otherwise try
6344 to get the chain for the load group using vect_shift_permute_load_chain. */
6345 mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
6346 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
6347 || pow2p_hwi (size)
6348 || !vect_shift_permute_load_chain (dr_chain, size, stmt,
6349 gsi, &result_chain))
6350 vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
6351 vect_record_grouped_load_vectors (stmt, result_chain);
6352 result_chain.release ();
6355 /* RESULT_CHAIN contains the output of a group of grouped loads that were
6356 generated as part of the vectorization of STMT. Assign the statement
6357 for each vector to the associated scalar statement. */
6359 void
6360 vect_record_grouped_load_vectors (gimple *stmt, vec<tree> result_chain)
6362 gimple *first_stmt = DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
6363 gimple *next_stmt, *new_stmt;
6364 unsigned int i, gap_count;
6365 tree tmp_data_ref;
6367 /* Put a permuted data-ref in the VECTORIZED_STMT field.
6368 Since we scan the chain starting from its first node, their order
6369 corresponds to the order of data-refs in RESULT_CHAIN. */
6370 next_stmt = first_stmt;
6371 gap_count = 1;
6372 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
6374 if (!next_stmt)
6375 break;
6377 /* Skip the gaps. Loads created for the gaps will be removed by dead
6378 code elimination pass later. No need to check for the first stmt in
6379 the group, since it always exists.
6380 DR_GROUP_GAP is the number of steps in elements from the previous
6381 access (if there is no gap DR_GROUP_GAP is 1). We skip loads that
6382 correspond to the gaps. */
6383 if (next_stmt != first_stmt
6384 && gap_count < DR_GROUP_GAP (vinfo_for_stmt (next_stmt)))
6386 gap_count++;
6387 continue;
6390 while (next_stmt)
6392 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
6393 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
6394 copies, and we put the new vector statement in the first available
6395 RELATED_STMT. */
6396 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
6397 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
6398 else
6400 if (!DR_GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
6402 gimple *prev_stmt =
6403 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
6404 gimple *rel_stmt =
6405 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
6406 while (rel_stmt)
6408 prev_stmt = rel_stmt;
6409 rel_stmt =
6410 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
6413 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
6414 new_stmt;
6418 next_stmt = DR_GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
6419 gap_count = 1;
6420 /* If NEXT_STMT accesses the same DR as the previous statement,
6421 put the same TMP_DATA_REF as its vectorized statement; otherwise
6422 get the next data-ref from RESULT_CHAIN. */
6423 if (!next_stmt || !DR_GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
6424 break;
6429 /* Function vect_can_force_dr_alignment_p.
6431 Returns whether the alignment of a DECL can be forced to be aligned
6432 on an ALIGNMENT-bit boundary. */
6434 bool
6435 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
6437 if (!VAR_P (decl))
6438 return false;
6440 if (decl_in_symtab_p (decl)
6441 && !symtab_node::get (decl)->can_increase_alignment_p ())
6442 return false;
6444 if (TREE_STATIC (decl))
6445 return (alignment <= MAX_OFILE_ALIGNMENT);
6446 else
6447 return (alignment <= MAX_STACK_ALIGNMENT);
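/* Illustrative sketch only, an assumed usage pattern rather than code taken
   from this file: how a caller might raise a variable's alignment once the
   check above has succeeded, using the existing SET_DECL_ALIGN and
   DECL_USER_ALIGN accessors.  ALIGNMENT is in bits, as for
   vect_can_force_dr_alignment_p; the helper name is an assumption.  */

static inline void
example_force_decl_alignment (tree decl, unsigned int alignment)
{
  if (vect_can_force_dr_alignment_p (decl, alignment)
      && DECL_ALIGN (decl) < alignment)
    {
      SET_DECL_ALIGN (decl, alignment);
      DECL_USER_ALIGN (decl) = 1;
    }
}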
6451 /* Return whether the data reference DR is supported with respect to its
6452 alignment.
6453 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
6454 if it is aligned, i.e., check if it is possible to vectorize it with different
6455 alignment. */
6457 enum dr_alignment_support
6458 vect_supportable_dr_alignment (struct data_reference *dr,
6459 bool check_aligned_accesses)
6461 gimple *stmt = vect_dr_stmt (dr);
6462 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6463 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6464 machine_mode mode = TYPE_MODE (vectype);
6465 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6466 struct loop *vect_loop = NULL;
6467 bool nested_in_vect_loop = false;
6469 if (aligned_access_p (dr) && !check_aligned_accesses)
6470 return dr_aligned;
6472 /* For now assume all conditional loads/stores support unaligned
6473 access without any special code. */
6474 if (is_gimple_call (stmt)
6475 && gimple_call_internal_p (stmt)
6476 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
6477 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
6478 return dr_unaligned_supported;
6480 if (loop_vinfo)
6482 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
6483 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
6486 /* Possibly unaligned access. */
6488 /* We can choose between using the implicit realignment scheme (generating
6489 a misaligned_move stmt) and the explicit realignment scheme (generating
6490 aligned loads with a REALIGN_LOAD). There are two variants to the
6491 explicit realignment scheme: optimized, and unoptimized.
6492 We can optimize the realignment only if the step between consecutive
6493 vector loads is equal to the vector size. Since the vector memory
6494 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
6495 is guaranteed that the misalignment amount remains the same throughout the
6496 execution of the vectorized loop. Therefore, we can create the
6497 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
6498 at the loop preheader.
6500 However, in the case of outer-loop vectorization, when vectorizing a
6501 memory access in the inner-loop nested within the LOOP that is now being
6502 vectorized, while it is guaranteed that the misalignment of the
6503 vectorized memory access will remain the same in different outer-loop
6504 iterations, it is *not* guaranteed that it will remain the same throughout
6505 the execution of the inner-loop. This is because the inner-loop advances
6506 with the original scalar step (and not in steps of VS). If the inner-loop
6507 step happens to be a multiple of VS, then the misalignment remains fixed
6508 and we can use the optimized realignment scheme. For example:
6510 for (i=0; i<N; i++)
6511 for (j=0; j<M; j++)
6512 s += a[i+j];
6514 When vectorizing the i-loop in the above example, the step between
6515 consecutive vector loads is 1, and so the misalignment does not remain
6516 fixed across the execution of the inner-loop, and the realignment cannot
6517 be optimized (as illustrated in the following pseudo vectorized loop):
6519 for (i=0; i<N; i+=4)
6520 for (j=0; j<M; j++){
6521 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
6522 // when j is {0,1,2,3,4,5,6,7,...} respectively.
6523 // (assuming that we start from an aligned address).
6526 We therefore have to use the unoptimized realignment scheme:
6528 for (i=0; i<N; i+=4)
6529 for (j=k; j<M; j+=4)
6530 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
6531 // that the misalignment of the initial address is
6532 // 0).
6534 The loop can then be vectorized as follows:
6536 for (k=0; k<4; k++){
6537 rt = get_realignment_token (&vp[k]);
6538 for (i=0; i<N; i+=4){
6539 v1 = vp[i+k];
6540 for (j=k; j<M; j+=4){
6541 v2 = vp[i+j+VS-1];
6542 va = REALIGN_LOAD <v1,v2,rt>;
6543 vs += va;
6544 v1 = v2;
6547 } */
6549 if (DR_IS_READ (dr))
6551 bool is_packed = false;
6552 tree type = (TREE_TYPE (DR_REF (dr)));
6554 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
6555 && (!targetm.vectorize.builtin_mask_for_load
6556 || targetm.vectorize.builtin_mask_for_load ()))
6558 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6560 /* If we are doing SLP then the accesses need not have the
6561 same alignment, instead it depends on the SLP group size. */
6562 if (loop_vinfo
6563 && STMT_SLP_TYPE (stmt_info)
6564 && !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6565 * DR_GROUP_SIZE (vinfo_for_stmt
6566 (DR_GROUP_FIRST_ELEMENT (stmt_info))),
6567 TYPE_VECTOR_SUBPARTS (vectype)))
6569 else if (!loop_vinfo
6570 || (nested_in_vect_loop
6571 && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
6572 GET_MODE_SIZE (TYPE_MODE (vectype)))))
6573 return dr_explicit_realign;
6574 else
6575 return dr_explicit_realign_optimized;
6577 if (!known_alignment_for_access_p (dr))
6578 is_packed = not_size_aligned (DR_REF (dr));
6580 if (targetm.vectorize.support_vector_misalignment
6581 (mode, type, DR_MISALIGNMENT (dr), is_packed))
6582 /* Can't software pipeline the loads, but can at least do them. */
6583 return dr_unaligned_supported;
6585 else
6587 bool is_packed = false;
6588 tree type = (TREE_TYPE (DR_REF (dr)));
6590 if (!known_alignment_for_access_p (dr))
6591 is_packed = not_size_aligned (DR_REF (dr));
6593 if (targetm.vectorize.support_vector_misalignment
6594 (mode, type, DR_MISALIGNMENT (dr), is_packed))
6595 return dr_unaligned_supported;
6598 /* Unsupported. */
6599 return dr_unaligned_unsupported;
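/* Illustrative sketch only, an assumed usage pattern rather than code taken
   from this file: a caller typically branches on the enum returned by
   vect_supportable_dr_alignment, treating every value other than
   dr_unaligned_unsupported as vectorizable.  The helper name is an
   assumption made for illustration.  */

static inline bool
example_dr_alignment_ok_p (struct data_reference *dr)
{
  enum dr_alignment_support support
    = vect_supportable_dr_alignment (dr, false);
  return support != dr_unaligned_unsupported;
}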