gcc/tree-vect-data-refs.c
/* Data References Analysis and Manipulation Utilities for Vectorization.
   Copyright (C) 2003-2018 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>
   and Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "memmodel.h"
32 #include "tm_p.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "cgraph.h"
36 #include "dumpfile.h"
37 #include "alias.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "tree-eh.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop.h"
47 #include "cfgloop.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
50 #include "expr.h"
51 #include "builtins.h"
52 #include "params.h"
53 #include "tree-cfg.h"
54 #include "tree-hash-traits.h"
55 #include "vec-perm-indices.h"
56 #include "internal-fn.h"
/* Return true if load- or store-lanes optab OPTAB is implemented for
   COUNT vectors of type VECTYPE.  NAME is the name of OPTAB.  */

static bool
vect_lanes_optab_supported_p (const char *name, convert_optab optab,
                              tree vectype, unsigned HOST_WIDE_INT count)
{
  machine_mode mode, array_mode;
  bool limit_p;

  mode = TYPE_MODE (vectype);
  if (!targetm.array_mode (mode, count).exists (&array_mode))
    {
      poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
      limit_p = !targetm.array_mode_supported_p (mode, count);
      if (!int_mode_for_size (bits, limit_p).exists (&array_mode))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "no array mode for %s["
                             HOST_WIDE_INT_PRINT_DEC "]\n",
                             GET_MODE_NAME (mode), count);
          return false;
        }
    }

  if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "cannot use %s<%s><%s>\n", name,
                         GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
      return false;
    }

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
                     GET_MODE_NAME (mode));

  return true;
}
/* Return the smallest scalar part of STMT.
   This is used to determine the vectype of the stmt.  We generally set the
   vectype according to the type of the result (lhs).  For stmts whose
   result-type is different than the type of the arguments (e.g., demotion,
   promotion), vectype will be reset appropriately (later).  Note that we have
   to visit the smallest datatype in this function, because that determines the
   VF.  If the smallest datatype in the loop is present only as the rhs of a
   promotion operation - we'd miss it.
   Such a case, where a variable of this datatype does not appear in the lhs
   anywhere in the loop, can only occur if it's an invariant: e.g.:
   'int_x = (int) short_inv', which we'd expect to have been optimized away by
   invariant motion.  However, we cannot rely on invariant motion to always
   take invariants out of the loop, and so in the case of promotion we also
   have to check the rhs.
   LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
   types.  */

tree
vect_get_smallest_scalar_type (gimple *stmt, HOST_WIDE_INT *lhs_size_unit,
                               HOST_WIDE_INT *rhs_size_unit)
{
  tree scalar_type = gimple_expr_type (stmt);
  HOST_WIDE_INT lhs, rhs;

  /* During the analysis phase, this function is called on arbitrary
     statements that might not have scalar results.  */
  if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
    return scalar_type;

  lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));

  if (is_gimple_assign (stmt)
      && (gimple_assign_cast_p (stmt)
          || gimple_assign_rhs_code (stmt) == DOT_PROD_EXPR
          || gimple_assign_rhs_code (stmt) == WIDEN_SUM_EXPR
          || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
          || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
          || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
    {
      tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));

      rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
      if (rhs < lhs)
        scalar_type = rhs_type;
    }

  *lhs_size_unit = lhs;
  *rhs_size_unit = rhs;
  return scalar_type;
}
/* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
   tested at run-time.  Return TRUE if DDR was successfully inserted.
   Return false if versioning is not supported.  */

static bool
vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);

  if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
    return false;

  if (!runtime_alias_check_p (ddr, loop,
                              optimize_loop_nest_for_speed_p (loop)))
    return false;

  LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
  return true;
}
/* Record that loop LOOP_VINFO needs to check that VALUE is nonzero.  */

static void
vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value)
{
  vec<tree> checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo);
  for (unsigned int i = 0; i < checks.length (); ++i)
    if (checks[i] == value)
      return;

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location, "need run-time check that ");
      dump_generic_expr (MSG_NOTE, TDF_SLIM, value);
      dump_printf (MSG_NOTE, " is nonzero\n");
    }
  LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value);
}
/* Return true if we know that the order of vectorized STMT_A and
   vectorized STMT_B will be the same as the order of STMT_A and STMT_B.
   At least one of the statements is a write.  */

static bool
vect_preserves_scalar_order_p (gimple *stmt_a, gimple *stmt_b)
{
  stmt_vec_info stmtinfo_a = vinfo_for_stmt (stmt_a);
  stmt_vec_info stmtinfo_b = vinfo_for_stmt (stmt_b);

  /* Single statements are always kept in their original order.  */
  if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
      && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
    return true;

  /* STMT_A and STMT_B belong to overlapping groups.  All loads in a
     group are emitted at the position of the first scalar load and all
     stores in a group are emitted at the position of the last scalar store.
     Thus writes will happen no earlier than their current position
     (but could happen later) while reads will happen no later than their
     current position (but could happen earlier).  Reordering is therefore
     only possible if the first access is a write.  */
  if (is_pattern_stmt_p (stmtinfo_a))
    stmt_a = STMT_VINFO_RELATED_STMT (stmtinfo_a);
  if (is_pattern_stmt_p (stmtinfo_b))
    stmt_b = STMT_VINFO_RELATED_STMT (stmtinfo_b);
  gimple *earlier_stmt = get_earlier_stmt (stmt_a, stmt_b);
  return !DR_IS_WRITE (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt)));
}
/* A subroutine of vect_analyze_data_ref_dependence.  Handle
   DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
   distances.  These distances are conservatively correct but they don't
   reflect a guaranteed dependence.

   Return true if this function does all the work necessary to avoid
   an alias or false if the caller should use the dependence distances
   to limit the vectorization factor in the usual way.  LOOP_DEPTH is
   the depth of the loop described by LOOP_VINFO and the other arguments
   are as for vect_analyze_data_ref_dependence.  */

static bool
vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
                                       loop_vec_info loop_vinfo,
                                       int loop_depth, unsigned int *max_vf)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  lambda_vector dist_v;
  unsigned int i;
  FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
    {
      int dist = dist_v[loop_depth];
      if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
        {
          /* If the user asserted safelen >= DIST consecutive iterations
             can be executed concurrently, assume independence.

             ??? An alternative would be to add the alias check even
             in this case, and vectorize the fallback loop with the
             maximum VF set to safelen.  However, if the user has
             explicitly given a length, it's less likely that that
             would be a win.  */
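          /* LOOP->SAFELEN is set, for example, by the 'safelen' clause
             of '#pragma omp simd'.  */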
          if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
            {
              if ((unsigned int) loop->safelen < *max_vf)
                *max_vf = loop->safelen;
              LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
              continue;
            }

          /* For dependence distances of 2 or more, we have the option
             of limiting VF or checking for an alias at runtime.
             Prefer to check at runtime if we can, to avoid limiting
             the VF unnecessarily when the bases are in fact independent.

             Note that the alias checks will be removed if the VF ends up
             being small enough.  */
          return (!STMT_VINFO_GATHER_SCATTER_P
                    (vinfo_for_stmt (DR_STMT (DDR_A (ddr))))
                  && !STMT_VINFO_GATHER_SCATTER_P
                       (vinfo_for_stmt (DR_STMT (DDR_B (ddr))))
                  && vect_mark_for_runtime_alias_test (ddr, loop_vinfo));
        }
    }
  return true;
}
/* Function vect_analyze_data_ref_dependence.

   Return TRUE if there (might) exist a dependence between a memory-reference
   DRA and a memory-reference DRB.  When versioning for alias may check a
   dependence at run-time, return FALSE.  Adjust *MAX_VF according to
   the data dependence.  */

static bool
vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
                                  loop_vec_info loop_vinfo,
                                  unsigned int *max_vf)
{
  unsigned int i;
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  struct data_reference *dra = DDR_A (ddr);
  struct data_reference *drb = DDR_B (ddr);
  stmt_vec_info stmtinfo_a = vinfo_for_stmt (vect_dr_stmt (dra));
  stmt_vec_info stmtinfo_b = vinfo_for_stmt (vect_dr_stmt (drb));
  lambda_vector dist_v;
  unsigned int loop_depth;

  /* In loop analysis all data references should be vectorizable.  */
  if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
      || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
    gcc_unreachable ();

  /* Independent data accesses.  */
  if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
    return false;

  if (dra == drb
      || (DR_IS_READ (dra) && DR_IS_READ (drb)))
    return false;

  /* We do not have to consider dependences between accesses that belong
     to the same group, unless the stride could be smaller than the
     group size.  */
  if (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
      && (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
          == DR_GROUP_FIRST_ELEMENT (stmtinfo_b))
      && !STMT_VINFO_STRIDED_P (stmtinfo_a))
    return false;

  /* Even if we have an anti-dependence then, as the vectorized loop covers at
     least two scalar iterations, there is always also a true dependence.
     As the vectorizer does not re-order loads and stores we can ignore
     the anti-dependence if TBAA can disambiguate both DRs similar to the
     case with known negative distance anti-dependences (positive
     distance anti-dependences would violate TBAA constraints).  */
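  /* For example, under strict-aliasing rules a read of an 'int' object
     and a write through a 'float *' cannot refer to the same memory,
     so alias_sets_conflict_p disambiguates the two references.  */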
  if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
       || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
      && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
                                 get_alias_set (DR_REF (drb))))
    return false;

  /* Unknown data dependence.  */
  if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
    {
      /* If user asserted safelen consecutive iterations can be
         executed concurrently, assume independence.  */
      if (loop->safelen >= 2)
        {
          if ((unsigned int) loop->safelen < *max_vf)
            *max_vf = loop->safelen;
          LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
          return false;
        }

      if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
          || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "versioning for alias not supported for: "
                               "can't determine dependence between ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                 DR_REF (dra));
              dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                 DR_REF (drb));
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }
          return true;
        }

      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "versioning for alias required: "
                           "can't determine dependence between ");
          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                             DR_REF (dra));
          dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                             DR_REF (drb));
          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
        }

      /* Add to list of ddrs that need to be tested at run-time.  */
      return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
    }

  /* Known data dependence.  */
  if (DDR_NUM_DIST_VECTS (ddr) == 0)
    {
      /* If user asserted safelen consecutive iterations can be
         executed concurrently, assume independence.  */
      if (loop->safelen >= 2)
        {
          if ((unsigned int) loop->safelen < *max_vf)
            *max_vf = loop->safelen;
          LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
          return false;
        }

      if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
          || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "versioning for alias not supported for: "
                               "bad dist vector for ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                 DR_REF (dra));
              dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                 DR_REF (drb));
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }
          return true;
        }

      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "versioning for alias required: "
                           "bad dist vector for ");
          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
          dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
        }

      /* Add to list of ddrs that need to be tested at run-time.  */
      return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
    }

  loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));

  if (DDR_COULD_BE_INDEPENDENT_P (ddr)
      && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
                                                loop_depth, max_vf))
    return false;

  FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
    {
      int dist = dist_v[loop_depth];

      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "dependence distance = %d.\n", dist);

      if (dist == 0)
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_NOTE, vect_location,
                               "dependence distance == 0 between ");
              dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
              dump_printf (MSG_NOTE, " and ");
              dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
              dump_printf (MSG_NOTE, "\n");
            }

          /* When we perform grouped accesses and perform implicit CSE
             by detecting equal accesses and doing disambiguation with
             runtime alias tests like for
                .. = a[i];
                .. = a[i+1];
                a[i] = ..;
                a[i+1] = ..;
                *p = ..;
                .. = a[i];
                .. = a[i+1];
             where we will end up loading { a[i], a[i+1] } once, make
             sure that inserting group loads before the first load and
             stores after the last store will do the right thing.
             Similar for groups like
                a[i] = ...;
                ... = a[i];
                a[i+1] = ...;
             where loads from the group interleave with the store.  */
          if (!vect_preserves_scalar_order_p (vect_dr_stmt (dra),
                                              vect_dr_stmt (drb)))
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "READ_WRITE dependence in interleaving.\n");
              return true;
            }

          if (loop->safelen < 2)
            {
              tree indicator = dr_zero_step_indicator (dra);
              if (!indicator || integer_zerop (indicator))
                {
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "access also has a zero step\n");
                  return true;
                }
              else if (TREE_CODE (indicator) != INTEGER_CST)
                vect_check_nonzero_value (loop_vinfo, indicator);
            }
          continue;
        }

      if (dist > 0 && DDR_REVERSED_P (ddr))
        {
          /* If DDR_REVERSED_P the order of the data-refs in DDR was
             reversed (to make distance vector positive), and the actual
             distance is negative.  */
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "dependence distance negative.\n");
          /* Record a negative dependence distance to later limit the
             amount of stmt copying / unrolling we can perform.
             Only need to handle read-after-write dependence.  */
          if (DR_IS_READ (drb)
              && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
                  || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
            STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
          continue;
        }

      unsigned int abs_dist = abs (dist);
      if (abs_dist >= 2 && abs_dist < *max_vf)
        {
          /* The dependence distance requires reduction of the maximal
             vectorization factor.  */
          *max_vf = abs (dist);
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "adjusting maximal vectorization factor to %i\n",
                             *max_vf);
        }

      if (abs_dist >= *max_vf)
        {
          /* Dependence distance does not create dependence, as far as
             vectorization is concerned, in this case.  */
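          /* For example, with a dependence distance of 4 and VF == 4 the
             conflicting accesses fall into different vector iterations,
             so the scalar order is preserved.  */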
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "dependence distance >= VF.\n");
          continue;
        }

      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "not vectorized, possible dependence "
                           "between data-refs ");
          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
          dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
        }

      return true;
    }

  return false;
}
/* Function vect_analyze_data_ref_dependences.

   Examine all the data references in the loop, and make sure there do not
   exist any data dependences between them.  Set *MAX_VF according to
   the maximum vectorization factor the data dependences allow.  */

bool
vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
                                   unsigned int *max_vf)
{
  unsigned int i;
  struct data_dependence_relation *ddr;

  DUMP_VECT_SCOPE ("vect_analyze_data_ref_dependences");

  if (!LOOP_VINFO_DDRS (loop_vinfo).exists ())
    {
      LOOP_VINFO_DDRS (loop_vinfo)
        .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
                 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
      /* We need read-read dependences to compute
         STMT_VINFO_SAME_ALIGN_REFS.  */
      if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
                                    &LOOP_VINFO_DDRS (loop_vinfo),
                                    LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
        return false;
    }

  LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;

  /* For epilogues we either have no aliases or alias versioning
     was applied to original loop.  Therefore we may just get max_vf
     using VF of original loop.  */
  if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
    *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
  else
    FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
      if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
        return false;

  return true;
}
/* Function vect_slp_analyze_data_ref_dependence.

   Return TRUE if there (might) exist a dependence between a memory-reference
   DRA and a memory-reference DRB.  */

static bool
vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
{
  struct data_reference *dra = DDR_A (ddr);
  struct data_reference *drb = DDR_B (ddr);

  /* We need to check dependences of statements marked as unvectorizable
     as well, they still can prohibit vectorization.  */

  /* Independent data accesses.  */
  if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
    return false;

  if (dra == drb)
    return false;

  /* Read-read is OK.  */
  if (DR_IS_READ (dra) && DR_IS_READ (drb))
    return false;

  /* If dra and drb are part of the same interleaving chain consider
     them independent.  */
  if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (vect_dr_stmt (dra)))
      && (DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (vect_dr_stmt (dra)))
          == DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (vect_dr_stmt (drb)))))
    return false;

  /* Unknown data dependence.  */
  if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
    {
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "can't determine dependence between ");
          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
          dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
        }
    }
  else if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location,
                       "determined dependence between ");
      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
      dump_printf (MSG_NOTE, " and ");
      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
      dump_printf (MSG_NOTE, "\n");
    }

  return true;
}
/* Analyze dependences involved in the transform of SLP NODE.  STORES
   contain the vector of scalar stores of this instance if we are
   disambiguating the loads.  */

static bool
vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
                                   vec<gimple *> stores, gimple *last_store)
{
  /* This walks over all stmts involved in the SLP load/store done
     in NODE verifying we can sink them up to the last stmt in the
     group.  */
  gimple *last_access = vect_find_last_scalar_stmt_in_slp (node);
  for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
    {
      gimple *access = SLP_TREE_SCALAR_STMTS (node)[k];
      if (access == last_access)
        continue;
      data_reference *dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (access));
      ao_ref ref;
      bool ref_initialized_p = false;
      for (gimple_stmt_iterator gsi = gsi_for_stmt (access);
           gsi_stmt (gsi) != last_access; gsi_next (&gsi))
        {
          gimple *stmt = gsi_stmt (gsi);
          if (! gimple_vuse (stmt)
              || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
            continue;

          /* If we couldn't record a (single) data reference for this
             stmt we have to resort to the alias oracle.  */
          data_reference *dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
          if (!dr_b)
            {
              /* We are moving a store or sinking a load - this means
                 we cannot use TBAA for disambiguation.  */
              if (!ref_initialized_p)
                ao_ref_init (&ref, DR_REF (dr_a));
              if (stmt_may_clobber_ref_p_1 (stmt, &ref, false)
                  || ref_maybe_used_by_stmt_p (stmt, &ref, false))
                return false;
              continue;
            }

          bool dependent = false;
          /* If we run into a store of this same instance (we've just
             marked those) then delay dependence checking until we run
             into the last store because this is where it will have
             been sunk to (and we verify if we can do that as well).  */
          if (gimple_visited_p (stmt))
            {
              if (stmt != last_store)
                continue;
              unsigned i;
              gimple *store;
              FOR_EACH_VEC_ELT (stores, i, store)
                {
                  data_reference *store_dr
                    = STMT_VINFO_DATA_REF (vinfo_for_stmt (store));
                  ddr_p ddr = initialize_data_dependence_relation
                                (dr_a, store_dr, vNULL);
                  dependent = vect_slp_analyze_data_ref_dependence (ddr);
                  free_dependence_relation (ddr);
                  if (dependent)
                    break;
                }
            }
          else
            {
              ddr_p ddr = initialize_data_dependence_relation (dr_a,
                                                               dr_b, vNULL);
              dependent = vect_slp_analyze_data_ref_dependence (ddr);
              free_dependence_relation (ddr);
            }
          if (dependent)
            return false;
        }
    }
  return true;
}
/* Function vect_slp_analyze_instance_dependence.

   Examine all the data references in the basic-block, and make sure there
   do not exist any data dependences between them.  */

bool
vect_slp_analyze_instance_dependence (slp_instance instance)
{
  DUMP_VECT_SCOPE ("vect_slp_analyze_instance_dependence");

  /* The stores of this instance are at the root of the SLP tree.  */
  slp_tree store = SLP_INSTANCE_TREE (instance);
  if (! STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (store)[0])))
    store = NULL;

  /* Verify we can sink stores to the vectorized stmt insert location.  */
  gimple *last_store = NULL;
  if (store)
    {
      if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
        return false;

      /* Mark stores in this instance and remember the last one.  */
      last_store = vect_find_last_scalar_stmt_in_slp (store);
      for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
        gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], true);
    }

  bool res = true;

  /* Verify we can sink loads to the vectorized stmt insert location,
     special-casing stores of this instance.  */
  slp_tree load;
  unsigned int i;
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
    if (! vect_slp_analyze_node_dependences (instance, load,
                                             store
                                             ? SLP_TREE_SCALAR_STMTS (store)
                                             : vNULL, last_store))
      {
        res = false;
        break;
      }

  /* Unset the visited flag.  */
  if (store)
    for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
      gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], false);

  return res;
}
/* Record in VINFO the base alignment guarantee given by DRB.  STMT is
   the statement that contains DRB, which is useful for recording in the
   dump file.  */

static void
vect_record_base_alignment (vec_info *vinfo, gimple *stmt,
                            innermost_loop_behavior *drb)
{
  bool existed;
  innermost_loop_behavior *&entry
    = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
  if (!existed || entry->base_alignment < drb->base_alignment)
    {
      entry = drb;
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_NOTE, vect_location,
                           "recording new base alignment for ");
          dump_generic_expr (MSG_NOTE, TDF_SLIM, drb->base_address);
          dump_printf (MSG_NOTE, "\n");
          dump_printf_loc (MSG_NOTE, vect_location,
                           "  alignment:    %d\n", drb->base_alignment);
          dump_printf_loc (MSG_NOTE, vect_location,
                           "  misalignment: %d\n", drb->base_misalignment);
          dump_printf_loc (MSG_NOTE, vect_location,
                           "  based on:     ");
          dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
        }
    }
}
/* If the region we're going to vectorize is reached, all unconditional
   data references occur at least once.  We can therefore pool the base
   alignment guarantees from each unconditional reference.  Do this by
   going through all the data references in VINFO and checking whether
   the containing statement makes the reference unconditionally.  If so,
   record the alignment of the base address in VINFO so that it can be
   used for all other references with the same base.  */

void
vect_record_base_alignments (vec_info *vinfo)
{
  loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
  struct loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
  data_reference *dr;
  unsigned int i;
  FOR_EACH_VEC_ELT (vinfo->shared->datarefs, i, dr)
    {
      gimple *stmt = vect_dr_stmt (dr);
      stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
      if (!DR_IS_CONDITIONAL_IN_STMT (dr)
          && STMT_VINFO_VECTORIZABLE (stmt_info)
          && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
        {
          vect_record_base_alignment (vinfo, stmt, &DR_INNERMOST (dr));

          /* If DR is nested in the loop that is being vectorized, we can also
             record the alignment of the base wrt the outer loop.  */
          if (loop && nested_in_vect_loop_p (loop, stmt))
            vect_record_base_alignment
              (vinfo, stmt, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
        }
    }
}
/* Return the target alignment for the vectorized form of DR.  */

static unsigned int
vect_calculate_target_alignment (struct data_reference *dr)
{
  gimple *stmt = vect_dr_stmt (dr);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  return targetm.vectorize.preferred_vector_alignment (vectype);
}
/* Function vect_compute_data_ref_alignment

   Compute the misalignment of the data reference DR.

   Output:
   1. DR_MISALIGNMENT (DR) is defined.

   FOR NOW: No analysis is actually performed.  Misalignment is calculated
   only for trivial cases.  TODO.  */

static void
vect_compute_data_ref_alignment (struct data_reference *dr)
{
  gimple *stmt = vect_dr_stmt (dr);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  vec_base_alignments *base_alignments = &stmt_info->vinfo->base_alignments;
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct loop *loop = NULL;
  tree ref = DR_REF (dr);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "vect_compute_data_ref_alignment:\n");

  if (loop_vinfo)
    loop = LOOP_VINFO_LOOP (loop_vinfo);

  /* Initialize misalignment to unknown.  */
  SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);

  if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
    return;

  innermost_loop_behavior *drb = vect_dr_behavior (dr);
  bool step_preserves_misalignment_p;

  unsigned HOST_WIDE_INT vector_alignment
    = vect_calculate_target_alignment (dr) / BITS_PER_UNIT;
  DR_TARGET_ALIGNMENT (dr) = vector_alignment;

  /* No step for BB vectorization.  */
  if (!loop)
    {
      gcc_assert (integer_zerop (drb->step));
      step_preserves_misalignment_p = true;
    }

  /* In case the dataref is in an inner-loop of the loop that is being
     vectorized (LOOP), we use the base and misalignment information
     relative to the outer-loop (LOOP).  This is ok only if the misalignment
     stays the same throughout the execution of the inner-loop, which is why
     we have to check that the stride of the dataref in the inner-loop evenly
     divides by the vector alignment.  */
  else if (nested_in_vect_loop_p (loop, stmt))
    {
      step_preserves_misalignment_p
        = (DR_STEP_ALIGNMENT (dr) % vector_alignment) == 0;

      if (dump_enabled_p ())
        {
          if (step_preserves_misalignment_p)
            dump_printf_loc (MSG_NOTE, vect_location,
                             "inner step divides the vector alignment.\n");
          else
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "inner step doesn't divide the vector"
                             " alignment.\n");
        }
    }

  /* Similarly we can only use base and misalignment information relative to
     an innermost loop if the misalignment stays the same throughout the
     execution of the loop.  As above, this is the case if the stride of
     the dataref evenly divides by the alignment.  */
  else
    {
      poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
      step_preserves_misalignment_p
        = multiple_p (DR_STEP_ALIGNMENT (dr) * vf, vector_alignment);

      if (!step_preserves_misalignment_p && dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "step doesn't divide the vector alignment.\n");
    }

  unsigned int base_alignment = drb->base_alignment;
  unsigned int base_misalignment = drb->base_misalignment;

  /* Calculate the maximum of the pooled base address alignment and the
     alignment that we can compute for DR itself.  */
  innermost_loop_behavior **entry = base_alignments->get (drb->base_address);
  if (entry && base_alignment < (*entry)->base_alignment)
    {
      base_alignment = (*entry)->base_alignment;
      base_misalignment = (*entry)->base_misalignment;
    }

  if (drb->offset_alignment < vector_alignment
      || !step_preserves_misalignment_p
      /* We need to know whether the step wrt the vectorized loop is
         negative when computing the starting misalignment below.  */
      || TREE_CODE (drb->step) != INTEGER_CST)
    {
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "Unknown alignment for access: ");
          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
        }
      return;
    }

  if (base_alignment < vector_alignment)
    {
      unsigned int max_alignment;
      tree base = get_base_for_alignment (drb->base_address, &max_alignment);
      if (max_alignment < vector_alignment
          || !vect_can_force_dr_alignment_p (base,
                                             vector_alignment * BITS_PER_UNIT))
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_NOTE, vect_location,
                               "can't force alignment of ref: ");
              dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
              dump_printf (MSG_NOTE, "\n");
            }
          return;
        }

      /* Force the alignment of the decl.
         NOTE: This is the only change to the code we make during
         the analysis phase, before deciding to vectorize the loop.  */
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
          dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
          dump_printf (MSG_NOTE, "\n");
        }

      DR_VECT_AUX (dr)->base_decl = base;
      DR_VECT_AUX (dr)->base_misaligned = true;
      base_misalignment = 0;
    }
  poly_int64 misalignment
    = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();

  /* If this is a backward running DR then first access in the larger
     vectype actually is N-1 elements before the address in the DR.
     Adjust misalign accordingly.  */
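  /* For example, a V4SI access with a scalar step of -4 bytes really
     starts 12 bytes below the DR address, so (nunits - 1) * step,
     i.e. -12, is added to the misalignment.  */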
  if (tree_int_cst_sgn (drb->step) < 0)
    /* PLUS because STEP is negative.  */
    misalignment += ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
                     * TREE_INT_CST_LOW (drb->step));

  unsigned int const_misalignment;
  if (!known_misalignment (misalignment, vector_alignment,
                           &const_misalignment))
    {
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "Non-constant misalignment for access: ");
          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
        }
      return;
    }

  SET_DR_MISALIGNMENT (dr, const_misalignment);

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                       "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
    }

  return;
}
/* Function vect_update_misalignment_for_peel.
   Sets DR's misalignment
   - to 0 if it has the same alignment as DR_PEEL,
   - to the misalignment computed using NPEEL if DR's misalignment is known,
   - to -1 (unknown) otherwise.

   DR - the data reference whose misalignment is to be adjusted.
   DR_PEEL - the data reference whose misalignment is being made
             zero in the vector loop by the peel.
   NPEEL - the number of iterations in the peel loop if the misalignment
           of DR_PEEL is known at compile time.  */

static void
vect_update_misalignment_for_peel (struct data_reference *dr,
                                   struct data_reference *dr_peel, int npeel)
{
  unsigned int i;
  vec<dr_p> same_aligned_drs;
  struct data_reference *current_dr;
  int dr_size = vect_get_scalar_dr_size (dr);
  int dr_peel_size = vect_get_scalar_dr_size (dr_peel);
  stmt_vec_info stmt_info = vinfo_for_stmt (vect_dr_stmt (dr));
  stmt_vec_info peel_stmt_info = vinfo_for_stmt (vect_dr_stmt (dr_peel));

  /* For interleaved data accesses the step in the loop must be multiplied by
     the size of the interleaving group.  */
  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
    dr_size *= DR_GROUP_SIZE (vinfo_for_stmt (DR_GROUP_FIRST_ELEMENT (stmt_info)));
  if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
    dr_peel_size *= DR_GROUP_SIZE (peel_stmt_info);

  /* It can be assumed that the data refs with the same alignment as dr_peel
     are aligned in the vector loop.  */
  same_aligned_drs
    = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (vect_dr_stmt (dr_peel)));
  FOR_EACH_VEC_ELT (same_aligned_drs, i, current_dr)
    {
      if (current_dr != dr)
        continue;
      gcc_assert (!known_alignment_for_access_p (dr)
                  || !known_alignment_for_access_p (dr_peel)
                  || (DR_MISALIGNMENT (dr) / dr_size
                      == DR_MISALIGNMENT (dr_peel) / dr_peel_size));
      SET_DR_MISALIGNMENT (dr, 0);
      return;
    }

  if (known_alignment_for_access_p (dr)
      && known_alignment_for_access_p (dr_peel))
    {
      bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
      int misal = DR_MISALIGNMENT (dr);
      misal += negative ? -npeel * dr_size : npeel * dr_size;
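      /* DR_TARGET_ALIGNMENT is a power of two, so the adjusted misalignment
         can be reduced modulo the target alignment with a mask.  */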
      misal &= DR_TARGET_ALIGNMENT (dr) - 1;
      SET_DR_MISALIGNMENT (dr, misal);
      return;
    }

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment "
                     "to unknown (-1).\n");
  SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
}
/* Function verify_data_ref_alignment

   Return TRUE if DR can be handled with respect to alignment.  */

static bool
verify_data_ref_alignment (data_reference_p dr)
{
  enum dr_alignment_support supportable_dr_alignment
    = vect_supportable_dr_alignment (dr, false);
  if (!supportable_dr_alignment)
    {
      if (dump_enabled_p ())
        {
          if (DR_IS_READ (dr))
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "not vectorized: unsupported unaligned load.");
          else
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "not vectorized: unsupported unaligned "
                             "store.");

          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                             DR_REF (dr));
          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
        }
      return false;
    }

  if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "Vectorizing an unaligned access.\n");

  return true;
}
/* Function vect_verify_datarefs_alignment

   Return TRUE if all data references in the loop can be
   handled with respect to alignment.  */

bool
vect_verify_datarefs_alignment (loop_vec_info vinfo)
{
  vec<data_reference_p> datarefs = vinfo->shared->datarefs;
  struct data_reference *dr;
  unsigned int i;

  FOR_EACH_VEC_ELT (datarefs, i, dr)
    {
      gimple *stmt = vect_dr_stmt (dr);
      stmt_vec_info stmt_info = vinfo_for_stmt (stmt);

      if (!STMT_VINFO_RELEVANT_P (stmt_info))
        continue;

      /* For interleaving, only the alignment of the first access matters.  */
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
          && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt)
        continue;

      /* Strided accesses perform only component accesses, alignment is
         irrelevant for them.  */
      if (STMT_VINFO_STRIDED_P (stmt_info)
          && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
        continue;

      if (! verify_data_ref_alignment (dr))
        return false;
    }

  return true;
}
/* Given a memory reference EXP return whether its alignment is less
   than its size.  */

static bool
not_size_aligned (tree exp)
{
  if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
    return true;

  return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
          > get_object_alignment (exp));
}
/* Function vector_alignment_reachable_p

   Return true if vector alignment for DR is reachable by peeling
   a few loop iterations.  Return false otherwise.  */

static bool
vector_alignment_reachable_p (struct data_reference *dr)
{
  gimple *stmt = vect_dr_stmt (dr);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);

  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
    {
      /* For interleaved access we peel only if number of iterations in
         the prolog loop ({VF - misalignment}), is a multiple of the
         number of the interleaved accesses.  */
      int elem_size, mis_in_elements;

      /* FORNOW: handle only known alignment.  */
      if (!known_alignment_for_access_p (dr))
        return false;

      poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
      poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
      elem_size = vector_element_size (vector_size, nelements);
      mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;

      if (!multiple_p (nelements - mis_in_elements, DR_GROUP_SIZE (stmt_info)))
        return false;
    }

  /* If misalignment is known at the compile time then allow peeling
     only if natural alignment is reachable through peeling.  */
  if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
    {
      HOST_WIDE_INT elmsize =
                int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_NOTE, vect_location,
                           "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
          dump_printf (MSG_NOTE,
                       ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
        }
      if (DR_MISALIGNMENT (dr) % elmsize)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "data size does not divide the misalignment.\n");
          return false;
        }
    }

  if (!known_alignment_for_access_p (dr))
    {
      tree type = TREE_TYPE (DR_REF (dr));
      bool is_packed = not_size_aligned (DR_REF (dr));
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "Unknown misalignment, %snaturally aligned\n",
                         is_packed ? "not " : "");
      return targetm.vectorize.vector_alignment_reachable (type, is_packed);
    }

  return true;
}
/* Calculate the cost of the memory access represented by DR.  */

static void
vect_get_data_access_cost (struct data_reference *dr,
                           unsigned int *inside_cost,
                           unsigned int *outside_cost,
                           stmt_vector_for_cost *body_cost_vec,
                           stmt_vector_for_cost *prologue_cost_vec)
{
  gimple *stmt = vect_dr_stmt (dr);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  int ncopies;

  if (PURE_SLP_STMT (stmt_info))
    ncopies = 1;
  else
    ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));

  if (DR_IS_READ (dr))
    vect_get_load_cost (stmt_info, ncopies, true, inside_cost, outside_cost,
                        prologue_cost_vec, body_cost_vec, false);
  else
    vect_get_store_cost (stmt_info, ncopies, inside_cost, body_cost_vec);

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "vect_get_data_access_cost: inside_cost = %d, "
                     "outside_cost = %d.\n", *inside_cost, *outside_cost);
}
typedef struct _vect_peel_info
{
  struct data_reference *dr;
  int npeel;
  unsigned int count;
} *vect_peel_info;

typedef struct _vect_peel_extended_info
{
  struct _vect_peel_info peel_info;
  unsigned int inside_cost;
  unsigned int outside_cost;
} *vect_peel_extended_info;


/* Peeling hashtable helpers.  */

struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
{
  static inline hashval_t hash (const _vect_peel_info *);
  static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
};

inline hashval_t
peel_info_hasher::hash (const _vect_peel_info *peel_info)
{
  return (hashval_t) peel_info->npeel;
}

inline bool
peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
{
  return (a->npeel == b->npeel);
}


/* Insert DR into peeling hash table with NPEEL as key.  */

static void
vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
                          loop_vec_info loop_vinfo, struct data_reference *dr,
                          int npeel)
{
  struct _vect_peel_info elem, *slot;
  _vect_peel_info **new_slot;
  bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);

  elem.npeel = npeel;
  slot = peeling_htab->find (&elem);
  if (slot)
    slot->count++;
  else
    {
      slot = XNEW (struct _vect_peel_info);
      slot->npeel = npeel;
      slot->dr = dr;
      slot->count = 1;
      new_slot = peeling_htab->find_slot (slot, INSERT);
      *new_slot = slot;
    }

  if (!supportable_dr_alignment
      && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
    slot->count += VECT_MAX_COST;
}
/* Traverse peeling hash table to find peeling option that aligns maximum
   number of data accesses.  */

int
vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
                                     _vect_peel_extended_info *max)
{
  vect_peel_info elem = *slot;

  if (elem->count > max->peel_info.count
      || (elem->count == max->peel_info.count
          && max->peel_info.npeel > elem->npeel))
    {
      max->peel_info.npeel = elem->npeel;
      max->peel_info.count = elem->count;
      max->peel_info.dr = elem->dr;
    }

  return 1;
}
/* Get the costs of peeling NPEEL iterations checking data access costs
   for all data refs.  If UNKNOWN_MISALIGNMENT is true, we assume DR0's
   misalignment will be zero after peeling.  */

static void
vect_get_peeling_costs_all_drs (vec<data_reference_p> datarefs,
                                struct data_reference *dr0,
                                unsigned int *inside_cost,
                                unsigned int *outside_cost,
                                stmt_vector_for_cost *body_cost_vec,
                                stmt_vector_for_cost *prologue_cost_vec,
                                unsigned int npeel,
                                bool unknown_misalignment)
{
  unsigned i;
  data_reference *dr;

  FOR_EACH_VEC_ELT (datarefs, i, dr)
    {
      gimple *stmt = vect_dr_stmt (dr);
      stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
      if (!STMT_VINFO_RELEVANT_P (stmt_info))
        continue;

      /* For interleaving, only the alignment of the first access
         matters.  */
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
          && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt)
        continue;

      /* Strided accesses perform only component accesses, alignment is
         irrelevant for them.  */
      if (STMT_VINFO_STRIDED_P (stmt_info)
          && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
        continue;

      int save_misalignment;
      save_misalignment = DR_MISALIGNMENT (dr);
      if (npeel == 0)
        ;
      else if (unknown_misalignment && dr == dr0)
        SET_DR_MISALIGNMENT (dr, 0);
      else
        vect_update_misalignment_for_peel (dr, dr0, npeel);
      vect_get_data_access_cost (dr, inside_cost, outside_cost,
                                 body_cost_vec, prologue_cost_vec);
      SET_DR_MISALIGNMENT (dr, save_misalignment);
    }
}
/* Traverse peeling hash table and calculate cost for each peeling option.
   Find the one with the lowest cost.  */

int
vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
                                   _vect_peel_extended_info *min)
{
  vect_peel_info elem = *slot;
  int dummy;
  unsigned int inside_cost = 0, outside_cost = 0;
  gimple *stmt = vect_dr_stmt (elem->dr);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
                       epilogue_cost_vec;

  prologue_cost_vec.create (2);
  body_cost_vec.create (2);
  epilogue_cost_vec.create (2);

  vect_get_peeling_costs_all_drs (LOOP_VINFO_DATAREFS (loop_vinfo),
                                  elem->dr, &inside_cost, &outside_cost,
                                  &body_cost_vec, &prologue_cost_vec,
                                  elem->npeel, false);

  body_cost_vec.release ();

  outside_cost += vect_get_known_peeling_cost
    (loop_vinfo, elem->npeel, &dummy,
     &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
     &prologue_cost_vec, &epilogue_cost_vec);

  /* Prologue and epilogue costs are added to the target model later.
     These costs depend only on the scalar iteration cost, the
     number of peeling iterations finally chosen, and the number of
     misaligned statements.  So discard the information found here.  */
  prologue_cost_vec.release ();
  epilogue_cost_vec.release ();

  if (inside_cost < min->inside_cost
      || (inside_cost == min->inside_cost
          && outside_cost < min->outside_cost))
    {
      min->inside_cost = inside_cost;
      min->outside_cost = outside_cost;
      min->peel_info.dr = elem->dr;
      min->peel_info.npeel = elem->npeel;
      min->peel_info.count = elem->count;
    }

  return 1;
}
/* Choose best peeling option by traversing peeling hash table and either
   choosing an option with the lowest cost (if cost model is enabled) or the
   option that aligns as many accesses as possible.  */

static struct _vect_peel_extended_info
vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
                                       loop_vec_info loop_vinfo)
{
  struct _vect_peel_extended_info res;

  res.peel_info.dr = NULL;

  if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
    {
      res.inside_cost = INT_MAX;
      res.outside_cost = INT_MAX;
      peeling_htab->traverse <_vect_peel_extended_info *,
                              vect_peeling_hash_get_lowest_cost> (&res);
    }
  else
    {
      res.peel_info.count = 0;
      peeling_htab->traverse <_vect_peel_extended_info *,
                              vect_peeling_hash_get_most_frequent> (&res);
      res.inside_cost = 0;
      res.outside_cost = 0;
    }

  return res;
}
/* Return true if the new peeling NPEEL is supported.  */

static bool
vect_peeling_supportable (loop_vec_info loop_vinfo, struct data_reference *dr0,
                          unsigned npeel)
{
  unsigned i;
  struct data_reference *dr = NULL;
  vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
  gimple *stmt;
  stmt_vec_info stmt_info;
  enum dr_alignment_support supportable_dr_alignment;

  /* Ensure that all data refs can be vectorized after the peel.  */
  FOR_EACH_VEC_ELT (datarefs, i, dr)
    {
      int save_misalignment;

      if (dr == dr0)
        continue;

      stmt = vect_dr_stmt (dr);
      stmt_info = vinfo_for_stmt (stmt);
      /* For interleaving, only the alignment of the first access
         matters.  */
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
          && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt)
        continue;

      /* Strided accesses perform only component accesses, alignment is
         irrelevant for them.  */
      if (STMT_VINFO_STRIDED_P (stmt_info)
          && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
        continue;

      save_misalignment = DR_MISALIGNMENT (dr);
      vect_update_misalignment_for_peel (dr, dr0, npeel);
      supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
      SET_DR_MISALIGNMENT (dr, save_misalignment);

      if (!supportable_dr_alignment)
        return false;
    }

  return true;
}
/* Function vect_enhance_data_refs_alignment

   This pass will use loop versioning and loop peeling in order to enhance
   the alignment of data references in the loop.

   FOR NOW: we assume that whatever versioning/peeling takes place, only the
   original loop is to be vectorized.  Any other loops that are created by
   the transformations performed in this pass - are not supposed to be
   vectorized.  This restriction will be relaxed.

   This pass will require a cost model to guide it whether to apply peeling
   or versioning or a combination of the two.  For example, the scheme that
   intel uses when given a loop with several memory accesses, is as follows:
   choose one memory access ('p') which alignment you want to force by doing
   peeling.  Then, either (1) generate a loop in which 'p' is aligned and all
   other accesses are not necessarily aligned, or (2) use loop versioning to
   generate one loop in which all accesses are aligned, and another loop in
   which only 'p' is necessarily aligned.

   ("Automatic Intra-Register Vectorization for the Intel Architecture",
   Aart J.C. Bik, Milind Girkar, Paul M. Grey and Ximmin Tian, International
   Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)

   Devising a cost model is the most critical aspect of this work.  It will
   guide us on which access to peel for, whether to use loop versioning, how
   many versions to create, etc.  The cost model will probably consist of
   generic considerations as well as target specific considerations (on
   powerpc for example, misaligned stores are more painful than misaligned
   loads).

   Here are the general steps involved in alignment enhancements:

     -- original loop, before alignment analysis:
        for (i=0; i<N; i++){
          x = q[i];                     # DR_MISALIGNMENT(q) = unknown
          p[i] = y;                     # DR_MISALIGNMENT(p) = unknown
        }

     -- After vect_compute_data_refs_alignment:
        for (i=0; i<N; i++){
          x = q[i];                     # DR_MISALIGNMENT(q) = 3
          p[i] = y;                     # DR_MISALIGNMENT(p) = unknown
        }

     -- Possibility 1: we do loop versioning:
     if (p is aligned) {
        for (i=0; i<N; i++){            # loop 1A
          x = q[i];                     # DR_MISALIGNMENT(q) = 3
          p[i] = y;                     # DR_MISALIGNMENT(p) = 0
        }
     }
     else {
        for (i=0; i<N; i++){            # loop 1B
          x = q[i];                     # DR_MISALIGNMENT(q) = 3
          p[i] = y;                     # DR_MISALIGNMENT(p) = unaligned
        }
     }

     -- Possibility 2: we do loop peeling:
     for (i = 0; i < 3; i++){           # (scalar loop, not to be vectorized).
        x = q[i];
        p[i] = y;
     }
     for (i = 3; i < N; i++){           # loop 2A
        x = q[i];                       # DR_MISALIGNMENT(q) = 0
        p[i] = y;                       # DR_MISALIGNMENT(p) = unknown
     }

     -- Possibility 3: combination of loop peeling and versioning:
     for (i = 0; i < 3; i++){           # (scalar loop, not to be vectorized).
        x = q[i];
        p[i] = y;
     }
     if (p is aligned) {
        for (i = 3; i<N; i++){          # loop 3A
          x = q[i];                     # DR_MISALIGNMENT(q) = 0
          p[i] = y;                     # DR_MISALIGNMENT(p) = 0
        }
     }
     else {
        for (i = 3; i<N; i++){          # loop 3B
          x = q[i];                     # DR_MISALIGNMENT(q) = 0
          p[i] = y;                     # DR_MISALIGNMENT(p) = unaligned
        }
     }

  These loops are later passed to loop_transform to be vectorized.  The
  vectorizer will use the alignment information to guide the transformation
  (whether to generate regular loads/stores, or with special handling for
  misalignment).  */
1668 bool
1669 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1671 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1672 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1673 enum dr_alignment_support supportable_dr_alignment;
1674 struct data_reference *dr0 = NULL, *first_store = NULL;
1675 struct data_reference *dr;
1676 unsigned int i, j;
1677 bool do_peeling = false;
1678 bool do_versioning = false;
1679 bool stat;
1680 gimple *stmt;
1681 stmt_vec_info stmt_info;
1682 unsigned int npeel = 0;
1683 bool one_misalignment_known = false;
1684 bool one_misalignment_unknown = false;
1685 bool one_dr_unsupportable = false;
1686 struct data_reference *unsupportable_dr = NULL;
1687 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1688 unsigned possible_npeel_number = 1;
1689 tree vectype;
1690 unsigned int mis, same_align_drs_max = 0;
1691 hash_table<peel_info_hasher> peeling_htab (1);
1693 DUMP_VECT_SCOPE ("vect_enhance_data_refs_alignment");
1695 /* Reset data so we can safely be called multiple times. */
1696 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1697 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1699 /* While cost model enhancements are expected in the future, the high level
1700 view of the code at this time is as follows:
1702 A) If there is a misaligned access then see if peeling to align
1703 this access can make all data references satisfy
1704 vect_supportable_dr_alignment. If so, update data structures
1705 as needed and return true.
1707 B) If peeling wasn't possible and there is a data reference with an
1708 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1709 then see if loop versioning checks can be used to make all data
1710 references satisfy vect_supportable_dr_alignment. If so, update
1711 data structures as needed and return true.
1713 C) If neither peeling nor versioning were successful then return false if
1714 any data reference does not satisfy vect_supportable_dr_alignment.
1716 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1718 Note, Possibility 3 above (which is peeling and versioning together) is not
1719 being done at this time. */
1721 /* (1) Peeling to force alignment. */
1723 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1724 Considerations:
1725 + How many accesses will become aligned due to the peeling
1726 - How many accesses will become unaligned due to the peeling,
1727 and the cost of misaligned accesses.
1728 - The cost of peeling (the extra runtime checks, the increase
1729 in code size). */
1731 FOR_EACH_VEC_ELT (datarefs, i, dr)
1733 stmt = vect_dr_stmt (dr);
1734 stmt_info = vinfo_for_stmt (stmt);
1736 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1737 continue;
1739 /* For interleaving, only the alignment of the first access
1740 matters. */
1741 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1742 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1743 continue;
1745 /* For scatter-gather or invariant accesses there is nothing
1746 to enhance. */
1747 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1748 || integer_zerop (DR_STEP (dr)))
1749 continue;
1751 /* Strided accesses perform only component accesses, alignment is
1752 irrelevant for them. */
1753 if (STMT_VINFO_STRIDED_P (stmt_info)
1754 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1755 continue;
1757 supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1758 do_peeling = vector_alignment_reachable_p (dr);
1759 if (do_peeling)
1761 if (known_alignment_for_access_p (dr))
1763 unsigned int npeel_tmp = 0;
1764 bool negative = tree_int_cst_compare (DR_STEP (dr),
1765 size_zero_node) < 0;
1767 vectype = STMT_VINFO_VECTYPE (stmt_info);
1768 unsigned int target_align = DR_TARGET_ALIGNMENT (dr);
1769 unsigned int dr_size = vect_get_scalar_dr_size (dr);
1770 mis = (negative ? DR_MISALIGNMENT (dr) : -DR_MISALIGNMENT (dr));
1771 if (DR_MISALIGNMENT (dr) != 0)
1772 npeel_tmp = (mis & (target_align - 1)) / dr_size;
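/* Worked example (purely illustrative numbers): with a positive step,
   DR_MISALIGNMENT == 8, target_align == 16 and dr_size == 4, we get
   mis == -8 and npeel_tmp == ((-8) & 15) / 4 == 2, i.e. peeling two
   scalar iterations advances the access by 8 bytes and makes it
   16-byte aligned.  */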
1774 /* For multiple types, it is possible that the bigger type access
1775 will have more than one peeling option. E.g., a loop with two
1776 types: one of size (vector size / 4), and the other one of
1777 size (vector size / 8). Vectorization factor will be 8. If both
1778 accesses are misaligned by 3, the first one needs one scalar
1779 iteration to be aligned, and the second one needs 5. But the
1780 first one will be aligned also by peeling 5 scalar
1781 iterations, and in that case both accesses will be aligned.
1782 Hence, except for the immediate peeling amount, we also want
1783 to try to add full vector size, while we don't exceed
1784 vectorization factor.
1785 We do this automatically under the cost model, since we calculate
1786 the cost of every peeling option. */
1787 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1789 poly_uint64 nscalars = (STMT_SLP_TYPE (stmt_info)
1790 ? vf * DR_GROUP_SIZE (stmt_info) : vf);
1791 possible_npeel_number
1792 = vect_get_num_vectors (nscalars, vectype);
1794 /* NPEEL_TMP is 0 when there is no misalignment, but also
1795 allow peeling NELEMENTS. */
1796 if (DR_MISALIGNMENT (dr) == 0)
1797 possible_npeel_number++;
1800 /* Save info about DR in the hash table. Also include peeling
1801 amounts according to the explanation above. */
1802 for (j = 0; j < possible_npeel_number; j++)
1804 vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1805 dr, npeel_tmp);
1806 npeel_tmp += target_align / dr_size;
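/* Continuing the illustrative example above: with target_align == 16,
   dr_size == 4 and possible_npeel_number == 2, the candidate peel
   amounts recorded in the hash table are 2 and 2 + 16/4 == 6.  */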
1809 one_misalignment_known = true;
1811 else
1813 /* If we don't know any misalignment values, we prefer
1814 peeling for the data-ref that has the maximum number of data-refs
1815 with the same alignment, unless the target prefers aligning
1816 stores over loads. */
1817 unsigned same_align_drs
1818 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1819 if (!dr0
1820 || same_align_drs_max < same_align_drs)
1822 same_align_drs_max = same_align_drs;
1823 dr0 = dr;
1825 /* For data-refs with the same number of related
1826 accesses prefer the one where the misalign
1827 computation will be invariant in the outermost loop. */
1828 else if (same_align_drs_max == same_align_drs)
1830 struct loop *ivloop0, *ivloop;
1831 ivloop0 = outermost_invariant_loop_for_expr
1832 (loop, DR_BASE_ADDRESS (dr0));
1833 ivloop = outermost_invariant_loop_for_expr
1834 (loop, DR_BASE_ADDRESS (dr));
1835 if ((ivloop && !ivloop0)
1836 || (ivloop && ivloop0
1837 && flow_loop_nested_p (ivloop, ivloop0)))
1838 dr0 = dr;
1841 one_misalignment_unknown = true;
1843 /* Check for data refs with unsupportable alignment that
1844 can be peeled. */
1845 if (!supportable_dr_alignment)
1847 one_dr_unsupportable = true;
1848 unsupportable_dr = dr;
1851 if (!first_store && DR_IS_WRITE (dr))
1852 first_store = dr;
1855 else
1857 if (!aligned_access_p (dr))
1859 if (dump_enabled_p ())
1860 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1861 "vector alignment may not be reachable\n");
1862 break;
1867 /* Check if we can possibly peel the loop. */
1868 if (!vect_can_advance_ivs_p (loop_vinfo)
1869 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1870 || loop->inner)
1871 do_peeling = false;
1873 struct _vect_peel_extended_info peel_for_known_alignment;
1874 struct _vect_peel_extended_info peel_for_unknown_alignment;
1875 struct _vect_peel_extended_info best_peel;
1877 peel_for_unknown_alignment.inside_cost = INT_MAX;
1878 peel_for_unknown_alignment.outside_cost = INT_MAX;
1879 peel_for_unknown_alignment.peel_info.count = 0;
1881 if (do_peeling
1882 && one_misalignment_unknown)
1884 /* Check whether the target prefers stores over loads, i.e., whether
1885 misaligned stores are more expensive than misaligned loads (taking
1886 drs with the same alignment into account). */
1887 unsigned int load_inside_cost = 0;
1888 unsigned int load_outside_cost = 0;
1889 unsigned int store_inside_cost = 0;
1890 unsigned int store_outside_cost = 0;
1891 unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;
1893 stmt_vector_for_cost dummy;
1894 dummy.create (2);
1895 vect_get_peeling_costs_all_drs (datarefs, dr0,
1896 &load_inside_cost,
1897 &load_outside_cost,
1898 &dummy, &dummy, estimated_npeels, true);
1899 dummy.release ();
1901 if (first_store)
1903 dummy.create (2);
1904 vect_get_peeling_costs_all_drs (datarefs, first_store,
1905 &store_inside_cost,
1906 &store_outside_cost,
1907 &dummy, &dummy,
1908 estimated_npeels, true);
1909 dummy.release ();
1911 else
1913 store_inside_cost = INT_MAX;
1914 store_outside_cost = INT_MAX;
1917 if (load_inside_cost > store_inside_cost
1918 || (load_inside_cost == store_inside_cost
1919 && load_outside_cost > store_outside_cost))
1921 dr0 = first_store;
1922 peel_for_unknown_alignment.inside_cost = store_inside_cost;
1923 peel_for_unknown_alignment.outside_cost = store_outside_cost;
1925 else
1927 peel_for_unknown_alignment.inside_cost = load_inside_cost;
1928 peel_for_unknown_alignment.outside_cost = load_outside_cost;
1931 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1932 prologue_cost_vec.create (2);
1933 epilogue_cost_vec.create (2);
1935 int dummy2;
1936 peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
1937 (loop_vinfo, estimated_npeels, &dummy2,
1938 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1939 &prologue_cost_vec, &epilogue_cost_vec);
1941 prologue_cost_vec.release ();
1942 epilogue_cost_vec.release ();
1944 peel_for_unknown_alignment.peel_info.count = 1
1945 + STMT_VINFO_SAME_ALIGN_REFS
1946 (vinfo_for_stmt (vect_dr_stmt (dr0))).length ();
1949 peel_for_unknown_alignment.peel_info.npeel = 0;
1950 peel_for_unknown_alignment.peel_info.dr = dr0;
1952 best_peel = peel_for_unknown_alignment;
1954 peel_for_known_alignment.inside_cost = INT_MAX;
1955 peel_for_known_alignment.outside_cost = INT_MAX;
1956 peel_for_known_alignment.peel_info.count = 0;
1957 peel_for_known_alignment.peel_info.dr = NULL;
1959 if (do_peeling && one_misalignment_known)
1961 /* Peeling is possible, but no data access requires alignment in order
1962 to be supported. So we try to choose the best possible peeling from
1963 the hash table. */
1964 peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
1965 (&peeling_htab, loop_vinfo);
1968 /* Compare costs of peeling for known and unknown alignment. */
1969 if (peel_for_known_alignment.peel_info.dr != NULL
1970 && peel_for_unknown_alignment.inside_cost
1971 >= peel_for_known_alignment.inside_cost)
1973 best_peel = peel_for_known_alignment;
1975 /* If the best peeling for known alignment has NPEEL == 0, perform no
1976 peeling at all except if there is an unsupportable dr that we can
1977 align. */
1978 if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
1979 do_peeling = false;
1982 /* If there is an unsupportable data ref, prefer this over all choices so far
1983 since we'd have to discard a chosen peeling except when it accidentally
1984 aligned the unsupportable data ref. */
1985 if (one_dr_unsupportable)
1986 dr0 = unsupportable_dr;
1987 else if (do_peeling)
1989 /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
1990 TODO: Use nopeel_outside_cost or get rid of it? */
1991 unsigned nopeel_inside_cost = 0;
1992 unsigned nopeel_outside_cost = 0;
1994 stmt_vector_for_cost dummy;
1995 dummy.create (2);
1996 vect_get_peeling_costs_all_drs (datarefs, NULL, &nopeel_inside_cost,
1997 &nopeel_outside_cost, &dummy, &dummy,
1998 0, false);
1999 dummy.release ();
2001 /* Add epilogue costs. As we do not peel for alignment here, no prologue
2002 costs will be recorded. */
2003 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2004 prologue_cost_vec.create (2);
2005 epilogue_cost_vec.create (2);
2007 int dummy2;
2008 nopeel_outside_cost += vect_get_known_peeling_cost
2009 (loop_vinfo, 0, &dummy2,
2010 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2011 &prologue_cost_vec, &epilogue_cost_vec);
2013 prologue_cost_vec.release ();
2014 epilogue_cost_vec.release ();
2016 npeel = best_peel.peel_info.npeel;
2017 dr0 = best_peel.peel_info.dr;
2019 /* If not peeling is no more expensive than the best peeling we
2020 have found so far, don't perform any peeling. */
2021 if (nopeel_inside_cost <= best_peel.inside_cost)
2022 do_peeling = false;
2025 if (do_peeling)
2027 stmt = vect_dr_stmt (dr0);
2028 stmt_info = vinfo_for_stmt (stmt);
2029 vectype = STMT_VINFO_VECTYPE (stmt_info);
2031 if (known_alignment_for_access_p (dr0))
2033 bool negative = tree_int_cst_compare (DR_STEP (dr0),
2034 size_zero_node) < 0;
2035 if (!npeel)
2037 /* Since it's known at compile time, compute the number of
2038 iterations in the peeled loop (the peeling factor) for use in
2039 updating DR_MISALIGNMENT values. The peeling factor is the
2040 vectorization factor minus the misalignment as an element
2041 count. */
2042 mis = negative ? DR_MISALIGNMENT (dr0) : -DR_MISALIGNMENT (dr0);
2043 unsigned int target_align = DR_TARGET_ALIGNMENT (dr0);
2044 npeel = ((mis & (target_align - 1))
2045 / vect_get_scalar_dr_size (dr0));
2048 /* For interleaved data access every iteration accesses all the
2049 members of the group, therefore we divide the number of iterations
2050 by the group size. */
2051 stmt_info = vinfo_for_stmt (vect_dr_stmt (dr0));
2052 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2053 npeel /= DR_GROUP_SIZE (stmt_info);
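/* Illustrative example: if the element-count peel amount computed
   above is 4 and DR0 is the first access of an interleaving group of
   size 2, each scalar iteration covers both group members, so the
   number of peeled iterations becomes 4 / 2 == 2.  */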
2055 if (dump_enabled_p ())
2056 dump_printf_loc (MSG_NOTE, vect_location,
2057 "Try peeling by %d\n", npeel);
2060 /* Ensure that all datarefs can be vectorized after the peel. */
2061 if (!vect_peeling_supportable (loop_vinfo, dr0, npeel))
2062 do_peeling = false;
2064 /* Check if all datarefs are supportable and log. */
2065 if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
2067 stat = vect_verify_datarefs_alignment (loop_vinfo);
2068 if (!stat)
2069 do_peeling = false;
2070 else
2071 return stat;
2074 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
2075 if (do_peeling)
2077 unsigned max_allowed_peel
2078 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
2079 if (max_allowed_peel != (unsigned)-1)
2081 unsigned max_peel = npeel;
2082 if (max_peel == 0)
2084 unsigned int target_align = DR_TARGET_ALIGNMENT (dr0);
2085 max_peel = target_align / vect_get_scalar_dr_size (dr0) - 1;
2087 if (max_peel > max_allowed_peel)
2089 do_peeling = false;
2090 if (dump_enabled_p ())
2091 dump_printf_loc (MSG_NOTE, vect_location,
2092 "Disable peeling, max peels reached: %d\n", max_peel);
2097 /* Cost model #2 - if peeling may result in a remaining loop not
2098 iterating enough to be vectorized then do not peel. Since this
2099 is a cost heuristic rather than a correctness decision, use the
2100 most likely runtime value for variable vectorization factors. */
2101 if (do_peeling
2102 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2104 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2105 unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
2106 if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
2107 < assumed_vf + max_peel)
2108 do_peeling = false;
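/* Illustrative example: with assumed_vf == 4 and npeel == 0 we get
   max_peel == 3, so a loop known to execute fewer than 4 + 3 == 7
   iterations is not peeled.  */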
2111 if (do_peeling)
2113 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2114 If the misalignment of DR_i is identical to that of dr0 then set
2115 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
2116 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2117 by the peeling factor times the element size of DR_i (MOD the
2118 vectorization factor times the size). Otherwise, the
2119 misalignment of DR_i must be set to unknown. */
2120 FOR_EACH_VEC_ELT (datarefs, i, dr)
2121 if (dr != dr0)
2123 /* Strided accesses perform only component accesses, alignment
2124 is irrelevant for them. */
2125 stmt_info = vinfo_for_stmt (vect_dr_stmt (dr));
2126 if (STMT_VINFO_STRIDED_P (stmt_info)
2127 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2128 continue;
2130 vect_update_misalignment_for_peel (dr, dr0, npeel);
2133 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
2134 if (npeel)
2135 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2136 else
2137 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2138 = DR_MISALIGNMENT (dr0);
2139 SET_DR_MISALIGNMENT (dr0, 0);
2140 if (dump_enabled_p ())
2142 dump_printf_loc (MSG_NOTE, vect_location,
2143 "Alignment of access forced using peeling.\n");
2144 dump_printf_loc (MSG_NOTE, vect_location,
2145 "Peeling for alignment will be applied.\n");
2148 /* The inside-loop cost will be accounted for in vectorizable_load
2149 and vectorizable_store correctly with adjusted alignments.
2150 Drop the body_cst_vec on the floor here. */
2151 stat = vect_verify_datarefs_alignment (loop_vinfo);
2152 gcc_assert (stat);
2153 return stat;
2157 /* (2) Versioning to force alignment. */
2159 /* Try versioning if:
2160 1) the loop is optimized for speed, and
2161 2) there is at least one unsupported misaligned data ref with an unknown
2162 misalignment, and
2163 3) all misaligned data refs with a known misalignment are supported, and
2164 4) the number of runtime alignment checks is within reason. */
2166 do_versioning =
2167 optimize_loop_nest_for_speed_p (loop)
2168 && (!loop->inner); /* FORNOW */
2170 if (do_versioning)
2172 FOR_EACH_VEC_ELT (datarefs, i, dr)
2174 stmt = vect_dr_stmt (dr);
2175 stmt_info = vinfo_for_stmt (stmt);
2177 /* For interleaving, only the alignment of the first access
2178 matters. */
2179 if (aligned_access_p (dr)
2180 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2181 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt))
2182 continue;
2184 if (STMT_VINFO_STRIDED_P (stmt_info))
2186 /* Strided loads perform only component accesses, alignment is
2187 irrelevant for them. */
2188 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
2189 continue;
2190 do_versioning = false;
2191 break;
2194 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
2196 if (!supportable_dr_alignment)
2198 gimple *stmt;
2199 int mask;
2200 tree vectype;
2202 if (known_alignment_for_access_p (dr)
2203 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2204 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
2206 do_versioning = false;
2207 break;
2210 stmt = vect_dr_stmt (dr);
2211 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
2212 gcc_assert (vectype);
2214 /* At present we don't support versioning for alignment
2215 with variable VF, since there's no guarantee that the
2216 VF is a power of two. We could relax this if we added
2217 a way of enforcing a power-of-two size. */
2218 unsigned HOST_WIDE_INT size;
2219 if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
2221 do_versioning = false;
2222 break;
2225 /* The rightmost bits of an aligned address must be zeros.
2226 Construct the mask needed for this test. For example,
2227 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2228 mask must be 15 = 0xf. */
2229 mask = size - 1;
2231 /* FORNOW: use the same mask to test all potentially unaligned
2232 references in the loop. The vectorizer currently supports
2233 a single vector size, see the reference to
2234 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
2235 vectorization factor is computed. */
2236 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
2237 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
2238 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2239 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
2240 vect_dr_stmt (dr));
2244 /* Versioning requires at least one misaligned data reference. */
2245 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2246 do_versioning = false;
2247 else if (!do_versioning)
2248 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2251 if (do_versioning)
2253 vec<gimple *> may_misalign_stmts
2254 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2255 gimple *stmt;
2257 /* It can now be assumed that the data references in the statements
2258 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2259 of the loop being vectorized. */
2260 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
2262 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2263 dr = STMT_VINFO_DATA_REF (stmt_info);
2264 SET_DR_MISALIGNMENT (dr, 0);
2265 if (dump_enabled_p ())
2266 dump_printf_loc (MSG_NOTE, vect_location,
2267 "Alignment of access forced using versioning.\n");
2270 if (dump_enabled_p ())
2271 dump_printf_loc (MSG_NOTE, vect_location,
2272 "Versioning for alignment will be applied.\n");
2274 /* Peeling and versioning can't be done together at this time. */
2275 gcc_assert (! (do_peeling && do_versioning));
2277 stat = vect_verify_datarefs_alignment (loop_vinfo);
2278 gcc_assert (stat);
2279 return stat;
2282 /* This point is reached if neither peeling nor versioning is being done. */
2283 gcc_assert (! (do_peeling || do_versioning));
2285 stat = vect_verify_datarefs_alignment (loop_vinfo);
2286 return stat;
2290 /* Function vect_find_same_alignment_drs.
2292 Update group and alignment relations according to the chosen
2293 vectorization factor. */
2295 static void
2296 vect_find_same_alignment_drs (struct data_dependence_relation *ddr)
2298 struct data_reference *dra = DDR_A (ddr);
2299 struct data_reference *drb = DDR_B (ddr);
2300 stmt_vec_info stmtinfo_a = vinfo_for_stmt (vect_dr_stmt (dra));
2301 stmt_vec_info stmtinfo_b = vinfo_for_stmt (vect_dr_stmt (drb));
2303 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
2304 return;
2306 if (dra == drb)
2307 return;
2309 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
2310 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
2311 return;
2313 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)
2314 || !operand_equal_p (DR_OFFSET (dra), DR_OFFSET (drb), 0)
2315 || !operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2316 return;
2318 /* Two references with distance zero have the same alignment. */
2319 poly_offset_int diff = (wi::to_poly_offset (DR_INIT (dra))
2320 - wi::to_poly_offset (DR_INIT (drb)));
2321 if (maybe_ne (diff, 0))
2323 /* Get the wider of the two alignments. */
2324 unsigned int align_a = (vect_calculate_target_alignment (dra)
2325 / BITS_PER_UNIT);
2326 unsigned int align_b = (vect_calculate_target_alignment (drb)
2327 / BITS_PER_UNIT);
2328 unsigned int max_align = MAX (align_a, align_b);
2330 /* Require the gap to be a multiple of the larger vector alignment. */
2331 if (!multiple_p (diff, max_align))
2332 return;
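/* Illustrative example: if both references have a target alignment of
   16 bytes, a DR_INIT gap of 32 bytes lets them be recorded as having
   the same alignment, whereas a gap of 20 bytes does not.  */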
2335 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2336 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
2337 if (dump_enabled_p ())
2339 dump_printf_loc (MSG_NOTE, vect_location,
2340 "accesses have the same alignment: ");
2341 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2342 dump_printf (MSG_NOTE, " and ");
2343 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2344 dump_printf (MSG_NOTE, "\n");
2349 /* Function vect_analyze_data_refs_alignment
2351 Analyze the alignment of the data-references in the loop.
2352 Return FALSE if a data reference is found that cannot be vectorized. */
2354 bool
2355 vect_analyze_data_refs_alignment (loop_vec_info vinfo)
2357 DUMP_VECT_SCOPE ("vect_analyze_data_refs_alignment");
2359 /* Mark groups of data references with same alignment using
2360 data dependence information. */
2361 vec<ddr_p> ddrs = vinfo->shared->ddrs;
2362 struct data_dependence_relation *ddr;
2363 unsigned int i;
2365 FOR_EACH_VEC_ELT (ddrs, i, ddr)
2366 vect_find_same_alignment_drs (ddr);
2368 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
2369 struct data_reference *dr;
2371 vect_record_base_alignments (vinfo);
2372 FOR_EACH_VEC_ELT (datarefs, i, dr)
2374 stmt_vec_info stmt_info = vinfo_for_stmt (vect_dr_stmt (dr));
2375 if (STMT_VINFO_VECTORIZABLE (stmt_info))
2376 vect_compute_data_ref_alignment (dr);
2379 return true;
2383 /* Analyze alignment of DRs of stmts in NODE. */
2385 static bool
2386 vect_slp_analyze_and_verify_node_alignment (slp_tree node)
2388 /* We vectorize from the first scalar stmt in the node unless
2389 the node is permuted, in which case we start from the first
2390 element in the group. */
2391 gimple *first_stmt = SLP_TREE_SCALAR_STMTS (node)[0];
2392 data_reference_p first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2393 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2394 first_stmt = DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_stmt));
2396 data_reference_p dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2397 vect_compute_data_ref_alignment (dr);
2398 /* For creating the data-ref pointer we need alignment of the
2399 first element anyway. */
2400 if (dr != first_dr)
2401 vect_compute_data_ref_alignment (first_dr);
2402 if (! verify_data_ref_alignment (dr))
2404 if (dump_enabled_p ())
2405 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2406 "not vectorized: bad data alignment in basic "
2407 "block.\n");
2408 return false;
2411 return true;
2414 /* Function vect_slp_analyze_and_verify_instance_alignment
2416 Analyze the alignment of the data-references in the SLP instance.
2417 Return FALSE if a data reference is found that cannot be vectorized. */
2419 bool
2420 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
2422 DUMP_VECT_SCOPE ("vect_slp_analyze_and_verify_instance_alignment");
2424 slp_tree node;
2425 unsigned i;
2426 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2427 if (! vect_slp_analyze_and_verify_node_alignment (node))
2428 return false;
2430 node = SLP_INSTANCE_TREE (instance);
2431 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]))
2432 && ! vect_slp_analyze_and_verify_node_alignment
2433 (SLP_INSTANCE_TREE (instance)))
2434 return false;
2436 return true;
2440 /* Analyze groups of accesses: check that DR belongs to a group of
2441 accesses of legal size, step, etc. Detect gaps, single element
2442 interleaving, and other special cases. Set grouped access info.
2443 Collect groups of strided stores for further use in SLP analysis.
2444 Worker for vect_analyze_group_access. */
2446 static bool
2447 vect_analyze_group_access_1 (struct data_reference *dr)
2449 tree step = DR_STEP (dr);
2450 tree scalar_type = TREE_TYPE (DR_REF (dr));
2451 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2452 gimple *stmt = vect_dr_stmt (dr);
2453 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2454 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2455 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2456 HOST_WIDE_INT dr_step = -1;
2457 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2458 bool slp_impossible = false;
2460 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2461 size of the interleaving group (including gaps). */
2462 if (tree_fits_shwi_p (step))
2464 dr_step = tree_to_shwi (step);
2465 /* Check that STEP is a multiple of type size. Otherwise there is
2466 a non-element-sized gap at the end of the group which we
2467 cannot represent in DR_GROUP_GAP or DR_GROUP_SIZE.
2468 ??? As we can handle non-constant step fine here we should
2469 simply remove uses of DR_GROUP_GAP between the last and first
2470 element and instead rely on DR_STEP. DR_GROUP_SIZE then would
2471 simply not include that gap. */
2472 if ((dr_step % type_size) != 0)
2474 if (dump_enabled_p ())
2476 dump_printf_loc (MSG_NOTE, vect_location,
2477 "Step ");
2478 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2479 dump_printf (MSG_NOTE,
2480 " is not a multiple of the element size for ");
2481 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2482 dump_printf (MSG_NOTE, "\n");
2484 return false;
2486 groupsize = absu_hwi (dr_step) / type_size;
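/* E.g. (illustrative): a step of 32 bytes over 4-byte elements gives
   an interleaving group size of 8.  */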
2488 else
2489 groupsize = 0;
2491 /* A non-consecutive access is possible only if it is part of an interleaving group. */
2492 if (!DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2494 /* Check whether this DR is part of an interleaving group, and is a single
2495 element of the group that is accessed in the loop. */
2497 /* Gaps are supported only for loads. STEP must be a multiple of the type
2498 size. */
2499 if (DR_IS_READ (dr)
2500 && (dr_step % type_size) == 0
2501 && groupsize > 0)
2503 DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2504 DR_GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2505 DR_GROUP_GAP (stmt_info) = groupsize - 1;
2506 if (dump_enabled_p ())
2508 dump_printf_loc (MSG_NOTE, vect_location,
2509 "Detected single element interleaving ");
2510 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2511 dump_printf (MSG_NOTE, " step ");
2512 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2513 dump_printf (MSG_NOTE, "\n");
2516 return true;
2519 if (dump_enabled_p ())
2521 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2522 "not consecutive access ");
2523 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2526 if (bb_vinfo)
2528 /* Mark the statement as unvectorizable. */
2529 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (vect_dr_stmt (dr))) = false;
2530 return true;
2533 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2534 STMT_VINFO_STRIDED_P (stmt_info) = true;
2535 return true;
2538 if (DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2540 /* First stmt in the interleaving chain. Check the chain. */
2541 gimple *next = DR_GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2542 struct data_reference *data_ref = dr;
2543 unsigned int count = 1;
2544 tree prev_init = DR_INIT (data_ref);
2545 gimple *prev = stmt;
2546 HOST_WIDE_INT diff, gaps = 0;
2548 /* By construction, all group members have INTEGER_CST DR_INITs. */
2549 while (next)
2551 /* Skip same data-refs. In case two or more stmts share a
2552 data-ref (supported only for loads), we vectorize only the first
2553 stmt, and the rest get their vectorized loads from the first
2554 one. */
2555 if (!tree_int_cst_compare (DR_INIT (data_ref),
2556 DR_INIT (STMT_VINFO_DATA_REF (
2557 vinfo_for_stmt (next)))))
2559 if (DR_IS_WRITE (data_ref))
2561 if (dump_enabled_p ())
2562 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2563 "Two store stmts share the same dr.\n");
2564 return false;
2567 if (dump_enabled_p ())
2568 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2569 "Two or more load stmts share the same dr.\n");
2571 /* For load use the same data-ref load. */
2572 DR_GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2574 prev = next;
2575 next = DR_GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2576 continue;
2579 prev = next;
2580 data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2582 /* All group members have the same STEP by construction. */
2583 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2585 /* Check that the distance between two accesses is equal to the type
2586 size. Otherwise, we have gaps. */
2587 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2588 - TREE_INT_CST_LOW (prev_init)) / type_size;
2589 if (diff != 1)
2591 /* FORNOW: SLP of accesses with gaps is not supported. */
2592 slp_impossible = true;
2593 if (DR_IS_WRITE (data_ref))
2595 if (dump_enabled_p ())
2596 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2597 "interleaved store with gaps\n");
2598 return false;
2601 gaps += diff - 1;
2604 last_accessed_element += diff;
2606 /* Store the gap from the previous member of the group. If there is no
2607 gap in the access, DR_GROUP_GAP is always 1. */
2608 DR_GROUP_GAP (vinfo_for_stmt (next)) = diff;
2610 prev_init = DR_INIT (data_ref);
2611 next = DR_GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2612 /* Count the number of data-refs in the chain. */
2613 count++;
2616 if (groupsize == 0)
2617 groupsize = count + gaps;
2619 /* This could be UINT_MAX but as we are generating code in a very
2620 inefficient way we have to cap earlier. See PR78699 for example. */
2621 if (groupsize > 4096)
2623 if (dump_enabled_p ())
2624 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2625 "group is too large\n");
2626 return false;
2629 /* Check that the size of the interleaving is equal to count for stores,
2630 i.e., that there are no gaps. */
2631 if (groupsize != count
2632 && !DR_IS_READ (dr))
2634 if (dump_enabled_p ())
2635 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2636 "interleaved store with gaps\n");
2637 return false;
2640 /* If there is a gap after the last load in the group it is the
2641 difference between the groupsize and the last accessed
2642 element.
2643 When there is no gap, this difference should be 0. */
2644 DR_GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - last_accessed_element;
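/* Illustrative example: a load group with a 16-byte step over 4-byte
   elements (groupsize 4) whose members sit at byte offsets 0 and 4
   ends with last_accessed_element == 2, so a gap of 2 elements is
   recorded after the group.  */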
2646 DR_GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2647 if (dump_enabled_p ())
2649 dump_printf_loc (MSG_NOTE, vect_location,
2650 "Detected interleaving ");
2651 if (DR_IS_READ (dr))
2652 dump_printf (MSG_NOTE, "load ");
2653 else
2654 dump_printf (MSG_NOTE, "store ");
2655 dump_printf (MSG_NOTE, "of size %u starting with ",
2656 (unsigned)groupsize);
2657 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
2658 if (DR_GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
2659 dump_printf_loc (MSG_NOTE, vect_location,
2660 "There is a gap of %u elements after the group\n",
2661 DR_GROUP_GAP (vinfo_for_stmt (stmt)));
2664 /* SLP: create an SLP data structure for every interleaving group of
2665 stores for further analysis in vect_analyze_slp. */
2666 if (DR_IS_WRITE (dr) && !slp_impossible)
2668 if (loop_vinfo)
2669 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2670 if (bb_vinfo)
2671 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2675 return true;
2678 /* Analyze groups of accesses: check that DR belongs to a group of
2679 accesses of legal size, step, etc. Detect gaps, single element
2680 interleaving, and other special cases. Set grouped access info.
2681 Collect groups of strided stores for further use in SLP analysis. */
2683 static bool
2684 vect_analyze_group_access (struct data_reference *dr)
2686 if (!vect_analyze_group_access_1 (dr))
2688 /* Dissolve the group if present. */
2689 gimple *next;
2690 gimple *stmt = DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (vect_dr_stmt (dr)));
2691 while (stmt)
2693 stmt_vec_info vinfo = vinfo_for_stmt (stmt);
2694 next = DR_GROUP_NEXT_ELEMENT (vinfo);
2695 DR_GROUP_FIRST_ELEMENT (vinfo) = NULL;
2696 DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2697 stmt = next;
2699 return false;
2701 return true;
2704 /* Analyze the access pattern of the data-reference DR.
2705 In case of non-consecutive accesses call vect_analyze_group_access() to
2706 analyze groups of accesses. */
2708 static bool
2709 vect_analyze_data_ref_access (struct data_reference *dr)
2711 tree step = DR_STEP (dr);
2712 tree scalar_type = TREE_TYPE (DR_REF (dr));
2713 gimple *stmt = vect_dr_stmt (dr);
2714 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2715 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2716 struct loop *loop = NULL;
2718 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
2719 return true;
2721 if (loop_vinfo)
2722 loop = LOOP_VINFO_LOOP (loop_vinfo);
2724 if (loop_vinfo && !step)
2726 if (dump_enabled_p ())
2727 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2728 "bad data-ref access in loop\n");
2729 return false;
2732 /* Allow loads with zero step in inner-loop vectorization. */
2733 if (loop_vinfo && integer_zerop (step))
2735 DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2736 if (!nested_in_vect_loop_p (loop, stmt))
2737 return DR_IS_READ (dr);
2738 /* Allow references with zero step for outer loops marked
2739 with pragma omp simd only - it guarantees absence of
2740 loop-carried dependencies between inner loop iterations. */
2741 if (loop->safelen < 2)
2743 if (dump_enabled_p ())
2744 dump_printf_loc (MSG_NOTE, vect_location,
2745 "zero step in inner loop of nest\n");
2746 return false;
2750 if (loop && nested_in_vect_loop_p (loop, stmt))
2752 /* Interleaved accesses are not yet supported within outer-loop
2753 vectorization for references in the inner-loop. */
2754 DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2756 /* For the rest of the analysis we use the outer-loop step. */
2757 step = STMT_VINFO_DR_STEP (stmt_info);
2758 if (integer_zerop (step))
2760 if (dump_enabled_p ())
2761 dump_printf_loc (MSG_NOTE, vect_location,
2762 "zero step in outer loop.\n");
2763 return DR_IS_READ (dr);
2767 /* Consecutive? */
2768 if (TREE_CODE (step) == INTEGER_CST)
2770 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2771 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2772 || (dr_step < 0
2773 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2775 /* Mark that it is not interleaving. */
2776 DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2777 return true;
2781 if (loop && nested_in_vect_loop_p (loop, stmt))
2783 if (dump_enabled_p ())
2784 dump_printf_loc (MSG_NOTE, vect_location,
2785 "grouped access in outer loop.\n");
2786 return false;
2790 /* Assume this is a DR handled by the non-constant strided load case. */
2791 if (TREE_CODE (step) != INTEGER_CST)
2792 return (STMT_VINFO_STRIDED_P (stmt_info)
2793 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2794 || vect_analyze_group_access (dr)));
2796 /* Non-consecutive access - check whether it is part of an interleaving group. */
2797 return vect_analyze_group_access (dr);
2800 /* Comparison function used to sort data-references DRA and DRB into
2801 chunks suitable for grouping. */
2803 static int
2804 dr_group_sort_cmp (const void *dra_, const void *drb_)
2806 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2807 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2808 int cmp;
2810 /* Stabilize sort. */
2811 if (dra == drb)
2812 return 0;
2814 /* DRs in different loops never belong to the same group. */
2815 loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father;
2816 loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father;
2817 if (loopa != loopb)
2818 return loopa->num < loopb->num ? -1 : 1;
2820 /* Ordering of DRs according to base. */
2821 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2822 DR_BASE_ADDRESS (drb));
2823 if (cmp != 0)
2824 return cmp;
2826 /* And according to DR_OFFSET. */
2827 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2828 if (cmp != 0)
2829 return cmp;
2831 /* Put reads before writes. */
2832 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2833 return DR_IS_READ (dra) ? -1 : 1;
2835 /* Then sort by access size. */
2836 cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2837 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2838 if (cmp != 0)
2839 return cmp;
2841 /* And by step. */
2842 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2843 if (cmp != 0)
2844 return cmp;
2846 /* Then sort by DR_INIT. In case of identical DRs, sort by stmt UID. */
2847 cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
2848 if (cmp == 0)
2849 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2850 return cmp;
2853 /* If OP is the result of a conversion, return the unconverted value,
2854 otherwise return null. */
2856 static tree
2857 strip_conversion (tree op)
2859 if (TREE_CODE (op) != SSA_NAME)
2860 return NULL_TREE;
2861 gimple *stmt = SSA_NAME_DEF_STMT (op);
2862 if (!is_gimple_assign (stmt)
2863 || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt)))
2864 return NULL_TREE;
2865 return gimple_assign_rhs1 (stmt);
2868 /* Return true if vectorizable_* routines can handle statements STMT1
2869 and STMT2 being in a single group. */
2871 static bool
2872 can_group_stmts_p (gimple *stmt1, gimple *stmt2)
2874 if (gimple_assign_single_p (stmt1))
2875 return gimple_assign_single_p (stmt2);
2877 if (is_gimple_call (stmt1) && gimple_call_internal_p (stmt1))
2879 /* Check for two masked loads or two masked stores. */
2880 if (!is_gimple_call (stmt2) || !gimple_call_internal_p (stmt2))
2881 return false;
2882 internal_fn ifn = gimple_call_internal_fn (stmt1);
2883 if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE)
2884 return false;
2885 if (ifn != gimple_call_internal_fn (stmt2))
2886 return false;
2888 /* Check that the masks are the same. Cope with casts of masks,
2889 like those created by build_mask_conversion. */
2890 tree mask1 = gimple_call_arg (stmt1, 2);
2891 tree mask2 = gimple_call_arg (stmt2, 2);
2892 if (!operand_equal_p (mask1, mask2, 0))
2894 mask1 = strip_conversion (mask1);
2895 if (!mask1)
2896 return false;
2897 mask2 = strip_conversion (mask2);
2898 if (!mask2)
2899 return false;
2900 if (!operand_equal_p (mask1, mask2, 0))
2901 return false;
2903 return true;
2906 return false;
2909 /* Function vect_analyze_data_ref_accesses.
2911 Analyze the access pattern of all the data references in the loop.
2913 FORNOW: the only access pattern that is considered vectorizable is a
2914 simple step 1 (consecutive) access.
2916 FORNOW: handle only arrays and pointer accesses. */
2918 bool
2919 vect_analyze_data_ref_accesses (vec_info *vinfo)
2921 unsigned int i;
2922 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
2923 struct data_reference *dr;
2925 DUMP_VECT_SCOPE ("vect_analyze_data_ref_accesses");
2927 if (datarefs.is_empty ())
2928 return true;
2930 /* Sort the array of datarefs to make building the interleaving chains
2931 linear. Don't modify the original vector's order; it is needed for
2932 determining what dependencies are reversed. */
2933 vec<data_reference_p> datarefs_copy = datarefs.copy ();
2934 datarefs_copy.qsort (dr_group_sort_cmp);
2936 /* Build the interleaving chains. */
2937 for (i = 0; i < datarefs_copy.length () - 1;)
2939 data_reference_p dra = datarefs_copy[i];
2940 stmt_vec_info stmtinfo_a = vinfo_for_stmt (vect_dr_stmt (dra));
2941 stmt_vec_info lastinfo = NULL;
2942 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
2943 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a))
2945 ++i;
2946 continue;
2948 for (i = i + 1; i < datarefs_copy.length (); ++i)
2950 data_reference_p drb = datarefs_copy[i];
2951 stmt_vec_info stmtinfo_b = vinfo_for_stmt (vect_dr_stmt (drb));
2952 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b)
2953 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
2954 break;
2956 /* ??? Imperfect sorting (non-compatible types, non-modulo
2957 accesses, same accesses) can lead to a group being artificially
2958 split here, as we don't just skip over those. If it really
2959 matters we can push those to a worklist and re-iterate
2960 over them. Then we can just skip ahead to the next DR here. */
2962 /* DRs in a different loop should not be put into the same
2963 interleaving group. */
2964 if (gimple_bb (DR_STMT (dra))->loop_father
2965 != gimple_bb (DR_STMT (drb))->loop_father)
2966 break;
2968 /* Check that the data-refs have the same first location (except init)
2969 and that they are both either stores or loads (not a load and a
2970 store, and not mismatched masked accesses). */
2971 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2972 || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2973 DR_BASE_ADDRESS (drb)) != 0
2974 || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
2975 || !can_group_stmts_p (vect_dr_stmt (dra), vect_dr_stmt (drb)))
2976 break;
2978 /* Check that the data-refs have the same constant size. */
2979 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2980 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2981 if (!tree_fits_uhwi_p (sza)
2982 || !tree_fits_uhwi_p (szb)
2983 || !tree_int_cst_equal (sza, szb))
2984 break;
2986 /* Check that the data-refs have the same step. */
2987 if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
2988 break;
2990 /* Check the types are compatible.
2991 ??? We don't distinguish this during sorting. */
2992 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2993 TREE_TYPE (DR_REF (drb))))
2994 break;
2996 /* Check that the DR_INITs are compile-time constants. */
2997 if (TREE_CODE (DR_INIT (dra)) != INTEGER_CST
2998 || TREE_CODE (DR_INIT (drb)) != INTEGER_CST)
2999 break;
3001 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
3002 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
3003 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
3004 HOST_WIDE_INT init_prev
3005 = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]));
3006 gcc_assert (init_a <= init_b
3007 && init_a <= init_prev
3008 && init_prev <= init_b);
3010 /* Do not place the same access in the interleaving chain twice. */
3011 if (init_b == init_prev)
3013 gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]))
3014 < gimple_uid (DR_STMT (drb)));
3015 /* ??? For now we simply "drop" the later reference which is
3016 otherwise the same rather than finishing off this group.
3017 In the end we'd want to re-process duplicates forming
3018 multiple groups from the refs, likely by just collecting
3019 all candidates (including duplicates and split points
3020 below) in a vector and then processing them together. */
3021 continue;
3024 /* If init_b == init_a + the size of the type * k, we have an
3025 interleaving, and DRA is accessed before DRB. */
3026 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
3027 if (type_size_a == 0
3028 || (init_b - init_a) % type_size_a != 0)
3029 break;
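/* Illustrative example: for 4-byte elements, init_a == 0 and
   init_b == 8 pass this check (DRB can join DRA's interleaving
   group, with DRA accessed first), whereas init_b == 6 would split
   the group here.  */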
3031 /* If we have a store, the accesses are adjacent. This splits
3032 groups into chunks we support (we don't support vectorization
3033 of stores with gaps). */
3034 if (!DR_IS_READ (dra) && init_b - init_prev != type_size_a)
3035 break;
3037 /* If the step (when constant and nonzero) is greater than the
3038 difference between the data-refs' inits, this splits groups into
3039 suitable sizes. */
3040 if (tree_fits_shwi_p (DR_STEP (dra)))
3042 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
3043 if (step != 0 && step <= (init_b - init_a))
3044 break;
3047 if (dump_enabled_p ())
3049 dump_printf_loc (MSG_NOTE, vect_location,
3050 "Detected interleaving ");
3051 if (DR_IS_READ (dra))
3052 dump_printf (MSG_NOTE, "load ");
3053 else
3054 dump_printf (MSG_NOTE, "store ");
3055 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
3056 dump_printf (MSG_NOTE, " and ");
3057 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
3058 dump_printf (MSG_NOTE, "\n");
3061 /* Link the found element into the group list. */
3062 if (!DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3064 DR_GROUP_FIRST_ELEMENT (stmtinfo_a) = vect_dr_stmt (dra);
3065 lastinfo = stmtinfo_a;
3067 DR_GROUP_FIRST_ELEMENT (stmtinfo_b) = vect_dr_stmt (dra);
3068 DR_GROUP_NEXT_ELEMENT (lastinfo) = vect_dr_stmt (drb);
3069 lastinfo = stmtinfo_b;
3073 FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
3074 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (vect_dr_stmt (dr)))
3075 && !vect_analyze_data_ref_access (dr))
3077 if (dump_enabled_p ())
3078 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3079 "not vectorized: complicated access pattern.\n");
3081 if (is_a <bb_vec_info> (vinfo))
3083 /* Mark the statement as not vectorizable. */
3084 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (vect_dr_stmt (dr))) = false;
3085 continue;
3087 else
3089 datarefs_copy.release ();
3090 return false;
3094 datarefs_copy.release ();
3095 return true;
3098 /* Function vect_vfa_segment_size.
3100 Input:
3101 DR: The data reference.
3102 LENGTH_FACTOR: segment length to consider.
3104 Return a value suitable for the dr_with_seg_len::seg_len field.
3105 This is the "distance travelled" by the pointer from the first
3106 iteration in the segment to the last. Note that it does not include
3107 the size of the access; in effect it only describes the first byte. */
3109 static tree
3110 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
3112 length_factor = size_binop (MINUS_EXPR,
3113 fold_convert (sizetype, length_factor),
3114 size_one_node);
3115 return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr)),
3116 length_factor);
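/* E.g. (illustrative) for the function above: with DR_STEP == 4 bytes
   and LENGTH_FACTOR == 8, the segment length is 4 * (8 - 1) == 28
   bytes; the size of the access itself is accounted for separately by
   vect_vfa_access_size.  */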
3119 /* Return a value that, when added to abs (vect_vfa_segment_size (dr)),
3120 gives the worst-case number of bytes covered by the segment. */
3122 static unsigned HOST_WIDE_INT
3123 vect_vfa_access_size (data_reference *dr)
3125 stmt_vec_info stmt_vinfo = vinfo_for_stmt (vect_dr_stmt (dr));
3126 tree ref_type = TREE_TYPE (DR_REF (dr));
3127 unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type));
3128 unsigned HOST_WIDE_INT access_size = ref_size;
3129 if (DR_GROUP_FIRST_ELEMENT (stmt_vinfo))
3131 gcc_assert (DR_GROUP_FIRST_ELEMENT (stmt_vinfo) == vect_dr_stmt (dr));
3132 access_size *= DR_GROUP_SIZE (stmt_vinfo) - DR_GROUP_GAP (stmt_vinfo);
3134 if (STMT_VINFO_VEC_STMT (stmt_vinfo)
3135 && (vect_supportable_dr_alignment (dr, false)
3136 == dr_explicit_realign_optimized))
3138 /* We might access a full vector's worth. */
3139 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
3140 access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size;
3142 return access_size;
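/* Illustrative example: 4-byte scalar accesses that head a group of
   size 4 with a gap of 1 give an access size of 4 * (4 - 1) == 12
   bytes (assuming no realignment scheme is in use).  */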
3145 /* Get the minimum alignment for all the scalar accesses that DR describes. */
3147 static unsigned int
3148 vect_vfa_align (const data_reference *dr)
3150 return TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr)));
3153 /* Function vect_compile_time_alias.
3155 Given data references A and B with equal base and offset, see whether
3156 the alias relation can be decided at compilation time. Return 1 if
3157 it can and the references alias, 0 if it can and the references do
3158 not alias, and -1 if we cannot decide at compile time. SEGMENT_LENGTH_A,
3159 SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
3160 of dr_with_seg_len::{seg_len,access_size} for A and B. */
3162 static int
3163 vect_compile_time_alias (struct data_reference *a, struct data_reference *b,
3164 tree segment_length_a, tree segment_length_b,
3165 unsigned HOST_WIDE_INT access_size_a,
3166 unsigned HOST_WIDE_INT access_size_b)
3168 poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a));
3169 poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b));
3170 poly_uint64 const_length_a;
3171 poly_uint64 const_length_b;
3173 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3174 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3175 [a, a+12) */
3176 if (tree_int_cst_compare (DR_STEP (a), size_zero_node) < 0)
3178 const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
3179 offset_a = (offset_a + access_size_a) - const_length_a;
3181 else
3182 const_length_a = tree_to_poly_uint64 (segment_length_a);
3183 if (tree_int_cst_compare (DR_STEP (b), size_zero_node) < 0)
3185 const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
3186 offset_b = (offset_b + access_size_b) - const_length_b;
3188 else
3189 const_length_b = tree_to_poly_uint64 (segment_length_b);
3191 const_length_a += access_size_a;
3192 const_length_b += access_size_b;
3194 if (ranges_known_overlap_p (offset_a, const_length_a,
3195 offset_b, const_length_b))
3196 return 1;
3198 if (!ranges_maybe_overlap_p (offset_a, const_length_a,
3199 offset_b, const_length_b))
3200 return 0;
3202 return -1;
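/* Illustrative example: with positive steps, DR_INITs of 0 and 32,
   segment lengths of 16 and access sizes of 4, the covered ranges are
   [0, 20) and [32, 52), so 0 (no alias) is returned; if the second
   DR_INIT were 16 the ranges would be [0, 20) and [16, 36) and 1
   (known alias) would be returned.  */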
3205 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3206 in DDR is >= VF. */
3208 static bool
3209 dependence_distance_ge_vf (data_dependence_relation *ddr,
3210 unsigned int loop_depth, poly_uint64 vf)
3212 if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3213 || DDR_NUM_DIST_VECTS (ddr) == 0)
3214 return false;
3216 /* If the dependence is exact, we should have limited the VF instead. */
3217 gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3219 unsigned int i;
3220 lambda_vector dist_v;
3221 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3223 HOST_WIDE_INT dist = dist_v[loop_depth];
3224 if (dist != 0
3225 && !(dist > 0 && DDR_REVERSED_P (ddr))
3226 && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
3227 return false;
3230 if (dump_enabled_p ())
3232 dump_printf_loc (MSG_NOTE, vect_location,
3233 "dependence distance between ");
3234 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
3235 dump_printf (MSG_NOTE, " and ");
3236 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
3237 dump_printf (MSG_NOTE, " is >= VF\n");
3240 return true;
3243 /* Dump LOWER_BOUND using flags DUMP_KIND. Dumps are known to be enabled. */
3245 static void
3246 dump_lower_bound (dump_flags_t dump_kind, const vec_lower_bound &lower_bound)
3248 dump_printf (dump_kind, "%s (", lower_bound.unsigned_p ? "unsigned" : "abs");
3249 dump_generic_expr (dump_kind, TDF_SLIM, lower_bound.expr);
3250 dump_printf (dump_kind, ") >= ");
3251 dump_dec (dump_kind, lower_bound.min_value);
3254 /* Record that the vectorized loop requires the vec_lower_bound described
3255 by EXPR, UNSIGNED_P and MIN_VALUE. */
3257 static void
3258 vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p,
3259 poly_uint64 min_value)
3261 vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3262 for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3263 if (operand_equal_p (lower_bounds[i].expr, expr, 0))
3265 unsigned_p &= lower_bounds[i].unsigned_p;
3266 min_value = upper_bound (lower_bounds[i].min_value, min_value);
3267 if (lower_bounds[i].unsigned_p != unsigned_p
3268 || maybe_lt (lower_bounds[i].min_value, min_value))
3270 lower_bounds[i].unsigned_p = unsigned_p;
3271 lower_bounds[i].min_value = min_value;
3272 if (dump_enabled_p ())
3274 dump_printf_loc (MSG_NOTE, vect_location,
3275 "updating run-time check to ");
3276 dump_lower_bound (MSG_NOTE, lower_bounds[i]);
3277 dump_printf (MSG_NOTE, "\n");
3280 return;
3283 vec_lower_bound lower_bound (expr, unsigned_p, min_value);
3284 if (dump_enabled_p ())
3286 dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that ");
3287 dump_lower_bound (MSG_NOTE, lower_bound);
3288 dump_printf (MSG_NOTE, "\n");
3290 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (lower_bound);
3293 /* Return true if it's unlikely that the step of the vectorized form of DR
3294 will span fewer than GAP bytes. */
3296 static bool
3297 vect_small_gap_p (loop_vec_info loop_vinfo, data_reference *dr, poly_int64 gap)
3299 stmt_vec_info stmt_info = vinfo_for_stmt (vect_dr_stmt (dr));
3300 HOST_WIDE_INT count
3301 = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
3302 if (DR_GROUP_FIRST_ELEMENT (stmt_info))
3303 count *= DR_GROUP_SIZE (vinfo_for_stmt (DR_GROUP_FIRST_ELEMENT (stmt_info)));
3304 return estimated_poly_value (gap) <= count * vect_get_scalar_dr_size (dr);
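/* Illustrative example: with an estimated vectorization factor of 4,
   a group size of 2 and 4-byte elements, gaps of up to 4 * 2 * 4 == 32
   bytes are considered small.  */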
3307 /* Return true if we know that there is no alias between DR_A and DR_B
3308 when abs (DR_STEP (DR_A)) >= N for some N. When returning true, set
3309 *LOWER_BOUND_OUT to this N. */
3311 static bool
3312 vectorizable_with_step_bound_p (data_reference *dr_a, data_reference *dr_b,
3313 poly_uint64 *lower_bound_out)
3315 /* Check that there is a constant gap of known sign between DR_A
3316 and DR_B. */
3317 poly_int64 init_a, init_b;
3318 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0)
3319 || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0)
3320 || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)
3321 || !poly_int_tree_p (DR_INIT (dr_a), &init_a)
3322 || !poly_int_tree_p (DR_INIT (dr_b), &init_b)
3323 || !ordered_p (init_a, init_b))
3324 return false;
3326 /* Sort DR_A and DR_B by the address they access. */
3327 if (maybe_lt (init_b, init_a))
3329 std::swap (init_a, init_b);
3330 std::swap (dr_a, dr_b);
3333 /* If the two accesses could be dependent within a scalar iteration,
3334 make sure that we'd retain their order. */
3335 if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_a), init_b)
3336 && !vect_preserves_scalar_order_p (vect_dr_stmt (dr_a),
3337 vect_dr_stmt (dr_b)))
3338 return false;
3340 /* There is no alias if abs (DR_STEP) is greater than or equal to
3341 the bytes spanned by the combination of the two accesses. */
3342 *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_b) - init_a;
3343 return true;
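/* Illustrative example: two 4-byte accesses with the same base, offset
   and step whose DR_INITs are 0 and 12 give *LOWER_BOUND_OUT
   == 12 + 4 - 0 == 16, so any |DR_STEP| >= 16 bytes rules out an
   alias.  */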
3346 /* Function vect_prune_runtime_alias_test_list.
3348 Prune a list of ddrs to be tested at run-time by versioning for alias.
3349 Merge several alias checks into one if possible.
3350 Return FALSE if the resulting list of ddrs is longer than allowed by
3351 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
3353 bool
3354 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3356 typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3357 hash_set <tree_pair_hash> compared_objects;
3359 vec<ddr_p> may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3360 vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3361 = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3362 vec<vec_object_pair> &check_unequal_addrs
3363 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3364 poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3365 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3367 ddr_p ddr;
3368 unsigned int i;
3369 tree length_factor;
3371 DUMP_VECT_SCOPE ("vect_prune_runtime_alias_test_list");
3373 /* Step values are irrelevant for aliasing if the number of vector
3374 iterations is equal to the number of scalar iterations (which can
3375 happen for fully-SLP loops). */
3376 bool ignore_step_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U);
3378 if (!ignore_step_p)
3380 /* Convert the checks for nonzero steps into bound tests. */
3381 tree value;
3382 FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value)
3383 vect_check_lower_bound (loop_vinfo, value, true, 1);
3386 if (may_alias_ddrs.is_empty ())
3387 return true;
3389 comp_alias_ddrs.create (may_alias_ddrs.length ());
3391 unsigned int loop_depth
3392 = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3393 LOOP_VINFO_LOOP_NEST (loop_vinfo));
3395 /* First, we collect all data ref pairs for aliasing checks. */
3396 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3398 int comp_res;
3399 poly_uint64 lower_bound;
3400 struct data_reference *dr_a, *dr_b;
3401 gimple *dr_group_first_a, *dr_group_first_b;
3402 tree segment_length_a, segment_length_b;
3403 unsigned HOST_WIDE_INT access_size_a, access_size_b;
3404 unsigned int align_a, align_b;
3405 gimple *stmt_a, *stmt_b;
3407 /* Ignore the alias if the VF we chose ended up being no greater
3408 than the dependence distance. */
3409 if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3410 continue;
3412 if (DDR_OBJECT_A (ddr))
3414 vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3415 if (!compared_objects.add (new_pair))
3417 if (dump_enabled_p ())
3419 dump_printf_loc (MSG_NOTE, vect_location, "checking that ");
3420 dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.first);
3421 dump_printf (MSG_NOTE, " and ");
3422 dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.second);
3423 dump_printf (MSG_NOTE, " have different addresses\n");
3425 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3427 continue;
3430 dr_a = DDR_A (ddr);
3431 stmt_a = vect_dr_stmt (DDR_A (ddr));
3433 dr_b = DDR_B (ddr);
3434 stmt_b = vect_dr_stmt (DDR_B (ddr));
3436 /* Skip the pair if inter-iteration dependencies are irrelevant
3437 and intra-iteration dependencies are guaranteed to be honored. */
3438 if (ignore_step_p
3439 && (vect_preserves_scalar_order_p (stmt_a, stmt_b)
3440 || vectorizable_with_step_bound_p (dr_a, dr_b, &lower_bound)))
3442 if (dump_enabled_p ())
3444 dump_printf_loc (MSG_NOTE, vect_location,
3445 "no need for alias check between ");
3446 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
3447 dump_printf (MSG_NOTE, " and ");
3448 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
3449 dump_printf (MSG_NOTE, " when VF is 1\n");
3451 continue;
3454 /* See whether we can handle the alias using a bounds check on
3455 the step, and whether that's likely to be the best approach.
3456 (It might not be, for example, if the minimum step is much larger
3457 than the number of bytes handled by one vector iteration.) */
3458 if (!ignore_step_p
3459 && TREE_CODE (DR_STEP (dr_a)) != INTEGER_CST
3460 && vectorizable_with_step_bound_p (dr_a, dr_b, &lower_bound)
3461 && (vect_small_gap_p (loop_vinfo, dr_a, lower_bound)
3462 || vect_small_gap_p (loop_vinfo, dr_b, lower_bound)))
3464 bool unsigned_p = dr_known_forward_stride_p (dr_a);
3465 if (dump_enabled_p ())
3467 dump_printf_loc (MSG_NOTE, vect_location, "no alias between ");
3468 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
3469 dump_printf (MSG_NOTE, " and ");
3470 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
3471 dump_printf (MSG_NOTE, " when the step ");
3472 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_STEP (dr_a));
3473 dump_printf (MSG_NOTE, " is outside ");
3474 if (unsigned_p)
3475 dump_printf (MSG_NOTE, "[0");
3476 else
3478 dump_printf (MSG_NOTE, "(");
3479 dump_dec (MSG_NOTE, poly_int64 (-lower_bound));
3481 dump_printf (MSG_NOTE, ", ");
3482 dump_dec (MSG_NOTE, lower_bound);
3483 dump_printf (MSG_NOTE, ")\n");
3485 vect_check_lower_bound (loop_vinfo, DR_STEP (dr_a), unsigned_p,
3486 lower_bound);
3487 continue;
3490 dr_group_first_a = DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
3491 if (dr_group_first_a)
3493 stmt_a = dr_group_first_a;
3494 dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
3497 dr_group_first_b = DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
3498 if (dr_group_first_b)
3500 stmt_b = dr_group_first_b;
3501 dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
3504 if (ignore_step_p)
3506 segment_length_a = size_zero_node;
3507 segment_length_b = size_zero_node;
3509 else
3511 if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
3512 length_factor = scalar_loop_iters;
3513 else
3514 length_factor = size_int (vect_factor);
3515 segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
3516 segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
3518 access_size_a = vect_vfa_access_size (dr_a);
3519 access_size_b = vect_vfa_access_size (dr_b);
3520 align_a = vect_vfa_align (dr_a);
3521 align_b = vect_vfa_align (dr_b);
3523 comp_res = data_ref_compare_tree (DR_BASE_ADDRESS (dr_a),
3524 DR_BASE_ADDRESS (dr_b));
3525 if (comp_res == 0)
3526 comp_res = data_ref_compare_tree (DR_OFFSET (dr_a),
3527 DR_OFFSET (dr_b));
3529 /* See whether the alias is known at compilation time. */
3530 if (comp_res == 0
3531 && TREE_CODE (DR_STEP (dr_a)) == INTEGER_CST
3532 && TREE_CODE (DR_STEP (dr_b)) == INTEGER_CST
3533 && poly_int_tree_p (segment_length_a)
3534 && poly_int_tree_p (segment_length_b))
3536 int res = vect_compile_time_alias (dr_a, dr_b,
3537 segment_length_a,
3538 segment_length_b,
3539 access_size_a,
3540 access_size_b);
3541 if (res >= 0 && dump_enabled_p ())
3543 dump_printf_loc (MSG_NOTE, vect_location,
3544 "can tell at compile time that ");
3545 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
3546 dump_printf (MSG_NOTE, " and ");
3547 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
3548 if (res == 0)
3549 dump_printf (MSG_NOTE, " do not alias\n");
3550 else
3551 dump_printf (MSG_NOTE, " alias\n");
3554 if (res == 0)
3555 continue;
3557 if (res == 1)
3559 if (dump_enabled_p ())
3560 dump_printf_loc (MSG_NOTE, vect_location,
3561 "not vectorized: compilation time alias.\n");
3562 return false;
3566 dr_with_seg_len_pair_t dr_with_seg_len_pair
3567 (dr_with_seg_len (dr_a, segment_length_a, access_size_a, align_a),
3568 dr_with_seg_len (dr_b, segment_length_b, access_size_b, align_b));
3570 /* Canonicalize pairs by sorting the two DR members. */
3571 if (comp_res > 0)
3572 std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
3574 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3577 prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);
3579 unsigned int count = (comp_alias_ddrs.length ()
3580 + check_unequal_addrs.length ());
3582 dump_printf_loc (MSG_NOTE, vect_location,
3583 "improved number of alias checks from %d to %d\n",
3584 may_alias_ddrs.length (), count);
3585 if ((int) count > PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
3587 if (dump_enabled_p ())
3588 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3589 "number of versioning for alias "
3590 "run-time tests exceeds %d "
3591 "(--param vect-max-version-for-alias-checks)\n",
3592 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
3593 return false;
3596 return true;
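/* As a rough illustration (all names here are invented), each pair that
   survives the pruning above is later turned into a runtime overlap test
   of approximately this shape when the loop is versioned:

     const char *a_lo = addr_a, *a_hi = addr_a + seg_len_a;
     const char *b_lo = addr_b, *b_hi = addr_b + seg_len_b;
     _Bool no_alias = a_hi <= b_lo || b_hi <= a_lo;

   The vectorized copy of the loop is entered only if every such test (and
   every address-inequality test collected above) succeeds; otherwise the
   scalar loop runs.  The real checks also take into account the access
   size and alignment recorded in each dr_with_seg_len.  */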
3599 /* Check whether we can use an internal function for a gather load
3600 or scatter store. READ_P is true for loads and false for stores.
3601 MASKED_P is true if the load or store is conditional. MEMORY_TYPE is
3602 the type of the memory elements being loaded or stored. OFFSET_BITS
3603 is the number of bits in each scalar offset and OFFSET_SIGN is the
3604 sign of the offset. SCALE is the amount by which the offset should
3605 be multiplied *after* it has been converted to address width.
3607 Return true if the function is supported, storing the function
3608 id in *IFN_OUT and the type of a vector element in *ELEMENT_TYPE_OUT. */
3610 bool
3611 vect_gather_scatter_fn_p (bool read_p, bool masked_p, tree vectype,
3612 tree memory_type, unsigned int offset_bits,
3613 signop offset_sign, int scale,
3614 internal_fn *ifn_out, tree *element_type_out)
3616 unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
3617 unsigned int element_bits = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (vectype)));
3618 if (offset_bits > element_bits)
3619 /* Internal functions require the offset to be the same width as
3620 the vector elements. We can extend narrower offsets, but it isn't
3621 safe to truncate wider offsets. */
3622 return false;
3624 if (element_bits != memory_bits)
3625 /* For now the vector elements must be the same width as the
3626 memory elements. */
3627 return false;
3629 /* Work out which function we need. */
3630 internal_fn ifn;
3631 if (read_p)
3632 ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
3633 else
3634 ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
3636 /* Test whether the target supports this combination. */
3637 if (!internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
3638 offset_sign, scale))
3639 return false;
3641 *ifn_out = ifn;
3642 *element_type_out = TREE_TYPE (vectype);
3643 return true;
3646 /* CALL is a call to an internal gather load or scatter store function.
3647 Describe the operation in INFO. */
3649 static void
3650 vect_describe_gather_scatter_call (gcall *call, gather_scatter_info *info)
3652 stmt_vec_info stmt_info = vinfo_for_stmt (call);
3653 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3654 data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3656 info->ifn = gimple_call_internal_fn (call);
3657 info->decl = NULL_TREE;
3658 info->base = gimple_call_arg (call, 0);
3659 info->offset = gimple_call_arg (call, 1);
3660 info->offset_dt = vect_unknown_def_type;
3661 info->offset_vectype = NULL_TREE;
3662 info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
3663 info->element_type = TREE_TYPE (vectype);
3664 info->memory_type = TREE_TYPE (DR_REF (dr));
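/* Purely for illustration (SSA names invented): once such a call exists
   in the IL it looks like

     _7 = .GATHER_LOAD (base_1, offsets_2, 4);

   where base_1 is the loop-invariant base address (argument 0), offsets_2
   the vector of offsets (argument 1) and 4 the scale (argument 2), exactly
   the arguments decoded above; the masked variants carry an additional
   mask operand.  */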
3667 /* Return true if a non-affine read or write in STMT is suitable for a
3668 gather load or scatter store. Describe the operation in *INFO if so. */
3670 bool
3671 vect_check_gather_scatter (gimple *stmt, loop_vec_info loop_vinfo,
3672 gather_scatter_info *info)
3674 HOST_WIDE_INT scale = 1;
3675 poly_int64 pbitpos, pbitsize;
3676 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3677 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3678 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3679 tree offtype = NULL_TREE;
3680 tree decl = NULL_TREE, base, off;
3681 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3682 tree memory_type = TREE_TYPE (DR_REF (dr));
3683 machine_mode pmode;
3684 int punsignedp, reversep, pvolatilep = 0;
3685 internal_fn ifn;
3686 tree element_type;
3687 bool masked_p = false;
3689 /* See whether this is already a call to a gather/scatter internal function.
3690 If not, see whether it's a masked load or store. */
3691 gcall *call = dyn_cast <gcall *> (stmt);
3692 if (call && gimple_call_internal_p (call))
3694 ifn = gimple_call_internal_fn (stmt);
3695 if (internal_gather_scatter_fn_p (ifn))
3697 vect_describe_gather_scatter_call (call, info);
3698 return true;
3700 masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE);
3703 /* True if we should aim to use internal functions rather than
3704 built-in functions. */
3705 bool use_ifn_p = (DR_IS_READ (dr)
3706 ? supports_vec_gather_load_p ()
3707 : supports_vec_scatter_store_p ());
3709 base = DR_REF (dr);
3710 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3711 see if we can use the def stmt of the address. */
3712 if (masked_p
3713 && TREE_CODE (base) == MEM_REF
3714 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3715 && integer_zerop (TREE_OPERAND (base, 1))
3716 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3718 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3719 if (is_gimple_assign (def_stmt)
3720 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3721 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3724 /* The gather and scatter builtins need an address of the form
3725 loop_invariant + vector * {1, 2, 4, 8}
3726 or
3727 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3728 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3729 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3730 multiplications and additions in it. To get a vector, we need
3731 a single SSA_NAME that will be defined in the loop and will
3732 contain everything that is not loop invariant and that can be
3733 vectorized. The following code attempts to find such a preexisting
3734 SSA_NAME OFF and put the loop invariants into a tree BASE
3735 that can be gimplified before the loop. */
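/* As an example (array and variable names invented): in a scalar loop
   such as

     for (i = 0; i < n; i++)
       sum += data[idx[i]];

   the analysis below should end up, roughly, with BASE = &data (loop
   invariant), OFF = the SSA_NAME carrying the value of idx[i] (defined
   in the loop) and SCALE = sizeof (data[0]).  */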
3736 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3737 &punsignedp, &reversep, &pvolatilep);
3738 if (reversep)
3739 return false;
3741 poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
3743 if (TREE_CODE (base) == MEM_REF)
3745 if (!integer_zerop (TREE_OPERAND (base, 1)))
3747 if (off == NULL_TREE)
3748 off = wide_int_to_tree (sizetype, mem_ref_offset (base));
3749 else
3750 off = size_binop (PLUS_EXPR, off,
3751 fold_convert (sizetype, TREE_OPERAND (base, 1)));
3753 base = TREE_OPERAND (base, 0);
3755 else
3756 base = build_fold_addr_expr (base);
3758 if (off == NULL_TREE)
3759 off = size_zero_node;
3761 /* If base is not loop invariant, then, if off is 0, we start with just
3762 the constant offset in the loop invariant BASE and continue with base
3763 as OFF; otherwise give up.
3764 We could handle that case by gimplifying the addition of base + off
3765 into some SSA_NAME and use that as off, but for now punt. */
3766 if (!expr_invariant_in_loop_p (loop, base))
3768 if (!integer_zerop (off))
3769 return false;
3770 off = base;
3771 base = size_int (pbytepos);
3773 /* Otherwise put base + constant offset into the loop invariant BASE
3774 and continue with OFF. */
3775 else
3777 base = fold_convert (sizetype, base);
3778 base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
3781 /* OFF at this point may be either a SSA_NAME or some tree expression
3782 from get_inner_reference. Try to peel off loop invariants from it
3783 into BASE as long as possible. */
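/* Small hypothetical example of the peeling below: if OFF is currently
   (sizetype) k + i, where k is loop invariant and i is defined in the
   loop, the PLUS_EXPR case adds k (multiplied by SCALE) to BASE and
   continues with OFF = i; a multiplication by a constant, e.g. i * 4,
   can instead be absorbed into SCALE (for the internal-function path
   only if the target supports that scale).  */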
3784 STRIP_NOPS (off);
3785 while (offtype == NULL_TREE)
3787 enum tree_code code;
3788 tree op0, op1, add = NULL_TREE;
3790 if (TREE_CODE (off) == SSA_NAME)
3792 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3794 if (expr_invariant_in_loop_p (loop, off))
3795 return false;
3797 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3798 break;
3800 op0 = gimple_assign_rhs1 (def_stmt);
3801 code = gimple_assign_rhs_code (def_stmt);
3802 op1 = gimple_assign_rhs2 (def_stmt);
3804 else
3806 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3807 return false;
3808 code = TREE_CODE (off);
3809 extract_ops_from_tree (off, &code, &op0, &op1);
3811 switch (code)
3813 case POINTER_PLUS_EXPR:
3814 case PLUS_EXPR:
3815 if (expr_invariant_in_loop_p (loop, op0))
3817 add = op0;
3818 off = op1;
3819 do_add:
3820 add = fold_convert (sizetype, add);
3821 if (scale != 1)
3822 add = size_binop (MULT_EXPR, add, size_int (scale));
3823 base = size_binop (PLUS_EXPR, base, add);
3824 continue;
3826 if (expr_invariant_in_loop_p (loop, op1))
3828 add = op1;
3829 off = op0;
3830 goto do_add;
3832 break;
3833 case MINUS_EXPR:
3834 if (expr_invariant_in_loop_p (loop, op1))
3836 add = fold_convert (sizetype, op1);
3837 add = size_binop (MINUS_EXPR, size_zero_node, add);
3838 off = op0;
3839 goto do_add;
3841 break;
3842 case MULT_EXPR:
3843 if (scale == 1 && tree_fits_shwi_p (op1))
3845 int new_scale = tree_to_shwi (op1);
3846 /* Only treat this as a scaling operation if the target
3847 supports it. */
3848 if (use_ifn_p
3849 && !vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p,
3850 vectype, memory_type, 1,
3851 TYPE_SIGN (TREE_TYPE (op0)),
3852 new_scale, &ifn,
3853 &element_type))
3854 break;
3855 scale = new_scale;
3856 off = op0;
3857 continue;
3859 break;
3860 case SSA_NAME:
3861 off = op0;
3862 continue;
3863 CASE_CONVERT:
3864 if (!POINTER_TYPE_P (TREE_TYPE (op0))
3865 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3866 break;
3867 if (TYPE_PRECISION (TREE_TYPE (op0))
3868 == TYPE_PRECISION (TREE_TYPE (off)))
3870 off = op0;
3871 continue;
3874 /* The internal functions need the offset to be the same width
3875 as the elements of VECTYPE. Don't include operations that
3876 cast the offset from that width to a different width. */
3877 if (use_ifn_p
3878 && (int_size_in_bytes (TREE_TYPE (vectype))
3879 == int_size_in_bytes (TREE_TYPE (off))))
3880 break;
3882 if (TYPE_PRECISION (TREE_TYPE (op0))
3883 < TYPE_PRECISION (TREE_TYPE (off)))
3885 off = op0;
3886 offtype = TREE_TYPE (off);
3887 STRIP_NOPS (off);
3888 continue;
3890 break;
3891 default:
3892 break;
3894 break;
3897 /* If at the end OFF still isn't a SSA_NAME or isn't
3898 defined in the loop, punt. */
3899 if (TREE_CODE (off) != SSA_NAME
3900 || expr_invariant_in_loop_p (loop, off))
3901 return false;
3903 if (offtype == NULL_TREE)
3904 offtype = TREE_TYPE (off);
3906 if (use_ifn_p)
3908 if (!vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p, vectype,
3909 memory_type, TYPE_PRECISION (offtype),
3910 TYPE_SIGN (offtype), scale, &ifn,
3911 &element_type))
3912 return false;
3914 else
3916 if (DR_IS_READ (dr))
3918 if (targetm.vectorize.builtin_gather)
3919 decl = targetm.vectorize.builtin_gather (vectype, offtype, scale);
3921 else
3923 if (targetm.vectorize.builtin_scatter)
3924 decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale);
3927 if (!decl)
3928 return false;
3930 ifn = IFN_LAST;
3931 element_type = TREE_TYPE (vectype);
3934 info->ifn = ifn;
3935 info->decl = decl;
3936 info->base = base;
3937 info->offset = off;
3938 info->offset_dt = vect_unknown_def_type;
3939 info->offset_vectype = NULL_TREE;
3940 info->scale = scale;
3941 info->element_type = element_type;
3942 info->memory_type = memory_type;
3943 return true;
3946 /* Find the data references in STMT, analyze them with respect to LOOP and
3947 append them to DATAREFS. Return false if datarefs in this stmt cannot
3948 be handled. */
3950 bool
3951 vect_find_stmt_data_reference (loop_p loop, gimple *stmt,
3952 vec<data_reference_p> *datarefs)
3954 /* We can ignore clobbers for dataref analysis - they are removed during
3955 loop vectorization and BB vectorization checks dependences with a
3956 stmt walk. */
3957 if (gimple_clobber_p (stmt))
3958 return true;
3960 if (gimple_has_volatile_ops (stmt))
3962 if (dump_enabled_p ())
3964 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3965 "not vectorized: volatile type ");
3966 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3968 return false;
3971 if (stmt_can_throw_internal (stmt))
3973 if (dump_enabled_p ())
3975 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3976 "not vectorized: statement can throw an "
3977 "exception ");
3978 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3980 return false;
3983 auto_vec<data_reference_p, 2> refs;
3984 if (!find_data_references_in_stmt (loop, stmt, &refs))
3985 return false;
3987 if (refs.is_empty ())
3988 return true;
3990 if (refs.length () > 1)
3992 if (dump_enabled_p ())
3994 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3995 "not vectorized: more than one data ref "
3996 "in stmt: ");
3997 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3999 return false;
4002 if (gcall *call = dyn_cast <gcall *> (stmt))
4003 if (!gimple_call_internal_p (call)
4004 || (gimple_call_internal_fn (call) != IFN_MASK_LOAD
4005 && gimple_call_internal_fn (call) != IFN_MASK_STORE))
4007 if (dump_enabled_p ())
4009 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4010 "not vectorized: dr in a call ");
4011 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4013 return false;
4016 data_reference_p dr = refs.pop ();
4017 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
4018 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
4020 if (dump_enabled_p ())
4022 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4023 "not vectorized: statement is bitfield "
4024 "access ");
4025 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4027 return false;
4030 if (DR_BASE_ADDRESS (dr)
4031 && TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
4033 if (dump_enabled_p ())
4034 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4035 "not vectorized: base addr of dr is a "
4036 "constant\n");
4037 return false;
4040 /* Check whether this may be a SIMD lane access and adjust the
4041 DR to make it easier for us to handle it. */
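/* Hypothetical illustration: for the privatized arrays created by OpenMP
   simd lowering, the access is roughly of the form

     tmp[.GOMP_SIMD_LANE (simduid_3)] = ...;

   i.e. its offset is lane * sizeof (element).  The code below recognizes
   that shape and rewrites the DR as a zero-offset access whose step is
   the element size, marking it as a simd-lane access.  */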
4042 if (loop
4043 && loop->simduid
4044 && (!DR_BASE_ADDRESS (dr)
4045 || !DR_OFFSET (dr)
4046 || !DR_INIT (dr)
4047 || !DR_STEP (dr)))
4049 struct data_reference *newdr
4050 = create_data_ref (NULL, loop_containing_stmt (stmt), DR_REF (dr), stmt,
4051 DR_IS_READ (dr), DR_IS_CONDITIONAL_IN_STMT (dr));
4052 if (DR_BASE_ADDRESS (newdr)
4053 && DR_OFFSET (newdr)
4054 && DR_INIT (newdr)
4055 && DR_STEP (newdr)
4056 && integer_zerop (DR_STEP (newdr)))
4058 tree off = DR_OFFSET (newdr);
4059 STRIP_NOPS (off);
4060 if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
4061 && TREE_CODE (off) == MULT_EXPR
4062 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
4064 tree step = TREE_OPERAND (off, 1);
4065 off = TREE_OPERAND (off, 0);
4066 STRIP_NOPS (off);
4067 if (CONVERT_EXPR_P (off)
4068 && (TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off, 0)))
4069 < TYPE_PRECISION (TREE_TYPE (off))))
4070 off = TREE_OPERAND (off, 0);
4071 if (TREE_CODE (off) == SSA_NAME)
4073 gimple *def = SSA_NAME_DEF_STMT (off);
4074 tree reft = TREE_TYPE (DR_REF (newdr));
4075 if (is_gimple_call (def)
4076 && gimple_call_internal_p (def)
4077 && (gimple_call_internal_fn (def) == IFN_GOMP_SIMD_LANE))
4079 tree arg = gimple_call_arg (def, 0);
4080 gcc_assert (TREE_CODE (arg) == SSA_NAME);
4081 arg = SSA_NAME_VAR (arg);
4082 if (arg == loop->simduid
4083 /* For now. */
4084 && tree_int_cst_equal (TYPE_SIZE_UNIT (reft), step))
4086 DR_OFFSET (newdr) = ssize_int (0);
4087 DR_STEP (newdr) = step;
4088 DR_OFFSET_ALIGNMENT (newdr) = BIGGEST_ALIGNMENT;
4089 DR_STEP_ALIGNMENT (newdr)
4090 = highest_pow2_factor (step);
4091 /* Mark as simd-lane access. */
4092 newdr->aux = (void *)-1;
4093 free_data_ref (dr);
4094 datarefs->safe_push (newdr);
4095 return true;
4101 free_data_ref (newdr);
4104 datarefs->safe_push (dr);
4105 return true;
4108 /* Function vect_analyze_data_refs.
4110 Find all the data references in the loop or basic block.
4112 The general structure of the analysis of data refs in the vectorizer is as
4113 follows:
4114 1- vect_analyze_data_refs(loop/bb): call
4115 compute_data_dependences_for_loop/bb to find and analyze all data-refs
4116 in the loop/bb and their dependences.
4117 2- vect_analyze_dependences(): apply dependence testing using ddrs.
4118 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
4119 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
4123 bool
4124 vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf)
4126 struct loop *loop = NULL;
4127 unsigned int i;
4128 struct data_reference *dr;
4129 tree scalar_type;
4131 DUMP_VECT_SCOPE ("vect_analyze_data_refs");
4133 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4134 loop = LOOP_VINFO_LOOP (loop_vinfo);
4136 /* Go through the data-refs, check that the analysis succeeded. Update
4137 pointer from stmt_vec_info struct to DR and vectype. */
4139 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
4140 FOR_EACH_VEC_ELT (datarefs, i, dr)
4142 gimple *stmt;
4143 stmt_vec_info stmt_info;
4144 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
4145 poly_uint64 vf;
4147 gcc_assert (DR_REF (dr));
4148 stmt = vect_dr_stmt (dr);
4149 stmt_info = vinfo_for_stmt (stmt);
4151 /* Check that analysis of the data-ref succeeded. */
4152 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
4153 || !DR_STEP (dr))
4155 bool maybe_gather
4156 = DR_IS_READ (dr)
4157 && !TREE_THIS_VOLATILE (DR_REF (dr))
4158 && (targetm.vectorize.builtin_gather != NULL
4159 || supports_vec_gather_load_p ());
4160 bool maybe_scatter
4161 = DR_IS_WRITE (dr)
4162 && !TREE_THIS_VOLATILE (DR_REF (dr))
4163 && (targetm.vectorize.builtin_scatter != NULL
4164 || supports_vec_scatter_store_p ());
4166 /* If the target supports vector gather loads or scatter stores,
4167 see if they can be used. */
4168 if (is_a <loop_vec_info> (vinfo)
4169 && !nested_in_vect_loop_p (loop, stmt))
4171 if (maybe_gather || maybe_scatter)
4173 if (maybe_gather)
4174 gatherscatter = GATHER;
4175 else
4176 gatherscatter = SCATTER;
4180 if (gatherscatter == SG_NONE)
4182 if (dump_enabled_p ())
4184 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4185 "not vectorized: data ref analysis "
4186 "failed ");
4187 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4189 if (is_a <bb_vec_info> (vinfo))
4191 /* In BB vectorization the ref can still participate
4192 in dependence analysis, we just can't vectorize it. */
4193 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4194 continue;
4196 return false;
4200 /* See if this was detected as SIMD lane access. */
4201 if (dr->aux == (void *)-1)
4203 if (nested_in_vect_loop_p (loop, stmt))
4205 if (dump_enabled_p ())
4207 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4208 "not vectorized: data ref analysis "
4209 "failed ");
4210 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4212 return false;
4214 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
4217 tree base = get_base_address (DR_REF (dr));
4218 if (base && VAR_P (base) && DECL_NONALIASED (base))
4220 if (dump_enabled_p ())
4222 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4223 "not vectorized: base object not addressable "
4224 "for stmt: ");
4225 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4227 if (is_a <bb_vec_info> (vinfo))
4229 /* In BB vectorization the ref can still participate
4230 in dependence analysis, we just can't vectorize it. */
4231 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4232 continue;
4234 return false;
4237 if (is_a <loop_vec_info> (vinfo)
4238 && DR_STEP (dr)
4239 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
4241 if (nested_in_vect_loop_p (loop, stmt))
4243 if (dump_enabled_p ())
4245 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4246 "not vectorized: not suitable for strided "
4247 "load ");
4248 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4250 return false;
4252 STMT_VINFO_STRIDED_P (stmt_info) = true;
4255 /* Update DR field in stmt_vec_info struct. */
4257 /* If the dataref is in an inner-loop of the loop that is considered
4258 for vectorization, we also want to analyze the access relative to
4259 the outer-loop (DR contains information only relative to the
4260 inner-most enclosing loop). We do that by building a reference to the
4261 first location accessed by the inner-loop, and analyze it relative to
4262 the outer-loop. */
4263 if (loop && nested_in_vect_loop_p (loop, stmt))
4265 /* Build a reference to the first location accessed by the
4266 inner loop: *(BASE + INIT + OFFSET). By construction,
4267 this address must be invariant in the inner loop, so we
4268 can consider it as being used in the outer loop. */
4269 tree base = unshare_expr (DR_BASE_ADDRESS (dr));
4270 tree offset = unshare_expr (DR_OFFSET (dr));
4271 tree init = unshare_expr (DR_INIT (dr));
4272 tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
4273 init, offset);
4274 tree init_addr = fold_build_pointer_plus (base, init_offset);
4275 tree init_ref = build_fold_indirect_ref (init_addr);
4277 if (dump_enabled_p ())
4279 dump_printf_loc (MSG_NOTE, vect_location,
4280 "analyze in outer loop: ");
4281 dump_generic_expr (MSG_NOTE, TDF_SLIM, init_ref);
4282 dump_printf (MSG_NOTE, "\n");
4285 if (!dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
4286 init_ref, loop))
4287 /* dr_analyze_innermost already explained the failure. */
4288 return false;
4290 if (dump_enabled_p ())
4292 dump_printf_loc (MSG_NOTE, vect_location,
4293 "\touter base_address: ");
4294 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4295 STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
4296 dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
4297 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4298 STMT_VINFO_DR_OFFSET (stmt_info));
4299 dump_printf (MSG_NOTE,
4300 "\n\touter constant offset from base address: ");
4301 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4302 STMT_VINFO_DR_INIT (stmt_info));
4303 dump_printf (MSG_NOTE, "\n\touter step: ");
4304 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4305 STMT_VINFO_DR_STEP (stmt_info));
4306 dump_printf (MSG_NOTE, "\n\touter base alignment: %d\n",
4307 STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info));
4308 dump_printf (MSG_NOTE, "\n\touter base misalignment: %d\n",
4309 STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info));
4310 dump_printf (MSG_NOTE, "\n\touter offset alignment: %d\n",
4311 STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info));
4312 dump_printf (MSG_NOTE, "\n\touter step alignment: %d\n",
4313 STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
4317 gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
4318 STMT_VINFO_DATA_REF (stmt_info) = dr;
4320 /* Set vectype for STMT. */
4321 scalar_type = TREE_TYPE (DR_REF (dr));
4322 STMT_VINFO_VECTYPE (stmt_info)
4323 = get_vectype_for_scalar_type (scalar_type);
4324 if (!STMT_VINFO_VECTYPE (stmt_info))
4326 if (dump_enabled_p ())
4328 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4329 "not vectorized: no vectype for stmt: ");
4330 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4331 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
4332 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
4333 scalar_type);
4334 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
4337 if (is_a <bb_vec_info> (vinfo))
4339 /* No vector type is fine, the ref can still participate
4340 in dependence analysis, we just can't vectorize it. */
4341 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4342 continue;
4344 return false;
4346 else
4348 if (dump_enabled_p ())
4350 dump_printf_loc (MSG_NOTE, vect_location,
4351 "got vectype for stmt: ");
4352 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
4353 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4354 STMT_VINFO_VECTYPE (stmt_info));
4355 dump_printf (MSG_NOTE, "\n");
4359 /* Adjust the minimal vectorization factor according to the
4360 vector type. */
4361 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
4362 *min_vf = upper_bound (*min_vf, vf);
4364 if (gatherscatter != SG_NONE)
4366 gather_scatter_info gs_info;
4367 if (!vect_check_gather_scatter (stmt, as_a <loop_vec_info> (vinfo),
4368 &gs_info)
4369 || !get_vectype_for_scalar_type (TREE_TYPE (gs_info.offset)))
4371 if (dump_enabled_p ())
4373 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4374 (gatherscatter == GATHER) ?
4375 "not vectorized: not suitable for gather "
4376 "load " :
4377 "not vectorized: not suitable for scatter "
4378 "store ");
4379 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4381 return false;
4383 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
4387 /* We used to stop processing and prune the list here. Verify we no
4388 longer need to. */
4389 gcc_assert (i == datarefs.length ());
4391 return true;
4395 /* Function vect_get_new_vect_var.
4397 Returns a name for a new variable. The current naming scheme prepends a
4398 prefix such as "vect_" or "vectp_" (depending on the value of VAR_KIND)
4399 to the name of each vectorizer-generated variable, with NAME appended to
4400 the prefix if provided. */
4402 tree
4403 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
4405 const char *prefix;
4406 tree new_vect_var;
4408 switch (var_kind)
4410 case vect_simple_var:
4411 prefix = "vect";
4412 break;
4413 case vect_scalar_var:
4414 prefix = "stmp";
4415 break;
4416 case vect_mask_var:
4417 prefix = "mask";
4418 break;
4419 case vect_pointer_var:
4420 prefix = "vectp";
4421 break;
4422 default:
4423 gcc_unreachable ();
4426 if (name)
4428 char* tmp = concat (prefix, "_", name, NULL);
4429 new_vect_var = create_tmp_reg (type, tmp);
4430 free (tmp);
4432 else
4433 new_vect_var = create_tmp_reg (type, prefix);
4435 return new_vect_var;
4438 /* Like vect_get_new_vect_var but return an SSA name. */
4440 tree
4441 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
4443 const char *prefix;
4444 tree new_vect_var;
4446 switch (var_kind)
4448 case vect_simple_var:
4449 prefix = "vect";
4450 break;
4451 case vect_scalar_var:
4452 prefix = "stmp";
4453 break;
4454 case vect_pointer_var:
4455 prefix = "vectp";
4456 break;
4457 default:
4458 gcc_unreachable ();
4461 if (name)
4463 char* tmp = concat (prefix, "_", name, NULL);
4464 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4465 free (tmp);
4467 else
4468 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4470 return new_vect_var;
4473 /* Duplicate ptr info and set alignment/misalignment on NAME from DR. */
4475 static void
4476 vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr)
4478 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr));
4479 int misalign = DR_MISALIGNMENT (dr);
4480 if (misalign == DR_MISALIGNMENT_UNKNOWN)
4481 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4482 else
4483 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name),
4484 DR_TARGET_ALIGNMENT (dr), misalign);
4487 /* Function vect_create_addr_base_for_vector_ref.
4489 Create an expression that computes the address of the first memory location
4490 that will be accessed for a data reference.
4492 Input:
4493 STMT: The statement containing the data reference.
4494 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4495 OFFSET: Optional. If supplied, it is added to the initial address.
4496 LOOP: Specify relative to which loop-nest should the address be computed.
4497 For example, when the dataref is in an inner-loop nested in an
4498 outer-loop that is now being vectorized, LOOP can be either the
4499 outer-loop, or the inner-loop. The first memory location accessed
4500 by the following dataref ('in' points to short):
4502 for (i=0; i<N; i++)
4503 for (j=0; j<M; j++)
4504 s += in[i+j]
4506 is as follows:
4507 if LOOP=i_loop: &in (relative to i_loop)
4508 if LOOP=j_loop: &in+i*2B (relative to j_loop)
4509 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
4510 initial address. Unlike OFFSET, which is number of elements to
4511 be added, BYTE_OFFSET is measured in bytes.
4513 Output:
4514 1. Return an SSA_NAME whose value is the address of the memory location of
4515 the first vector of the data reference.
4516 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4517 these statement(s) which define the returned SSA_NAME.
4519 FORNOW: We are only handling array accesses with step 1. */
4521 tree
4522 vect_create_addr_base_for_vector_ref (gimple *stmt,
4523 gimple_seq *new_stmt_list,
4524 tree offset,
4525 tree byte_offset)
4527 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4528 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4529 const char *base_name;
4530 tree addr_base;
4531 tree dest;
4532 gimple_seq seq = NULL;
4533 tree vect_ptr_type;
4534 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
4535 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4536 innermost_loop_behavior *drb = vect_dr_behavior (dr);
4538 tree data_ref_base = unshare_expr (drb->base_address);
4539 tree base_offset = unshare_expr (drb->offset);
4540 tree init = unshare_expr (drb->init);
4542 if (loop_vinfo)
4543 base_name = get_name (data_ref_base);
4544 else
4546 base_offset = ssize_int (0);
4547 init = ssize_int (0);
4548 base_name = get_name (DR_REF (dr));
4551 /* Create base_offset */
4552 base_offset = size_binop (PLUS_EXPR,
4553 fold_convert (sizetype, base_offset),
4554 fold_convert (sizetype, init));
4556 if (offset)
4558 offset = fold_build2 (MULT_EXPR, sizetype,
4559 fold_convert (sizetype, offset), step);
4560 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4561 base_offset, offset);
4563 if (byte_offset)
4565 byte_offset = fold_convert (sizetype, byte_offset);
4566 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4567 base_offset, byte_offset);
4570 /* base + base_offset */
4571 if (loop_vinfo)
4572 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4573 else
4575 addr_base = build1 (ADDR_EXPR,
4576 build_pointer_type (TREE_TYPE (DR_REF (dr))),
4577 unshare_expr (DR_REF (dr)));
4580 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4581 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4582 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4583 gimple_seq_add_seq (new_stmt_list, seq);
4585 if (DR_PTR_INFO (dr)
4586 && TREE_CODE (addr_base) == SSA_NAME
4587 && !SSA_NAME_PTR_INFO (addr_base))
4589 vect_duplicate_ssa_name_ptr_info (addr_base, dr);
4590 if (offset || byte_offset)
4591 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4594 if (dump_enabled_p ())
4596 dump_printf_loc (MSG_NOTE, vect_location, "created ");
4597 dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
4598 dump_printf (MSG_NOTE, "\n");
4601 return addr_base;
4605 /* Function vect_create_data_ref_ptr.
4607 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4608 location accessed in the loop by STMT, along with the def-use update
4609 chain to appropriately advance the pointer through the loop iterations.
4610 Also set aliasing information for the pointer. This pointer is used by
4611 the callers to this function to create a memory reference expression for
4612 vector load/store access.
4614 Input:
4615 1. STMT: a stmt that references memory. Expected to be of the form
4616 GIMPLE_ASSIGN <name, data-ref> or
4617 GIMPLE_ASSIGN <data-ref, name>.
4618 2. AGGR_TYPE: the type of the reference, which should be either a vector
4619 or an array.
4620 3. AT_LOOP: the loop where the vector memref is to be created.
4621 4. OFFSET (optional): an offset to be added to the initial address accessed
4622 by the data-ref in STMT.
4623 5. BSI: location where the new stmts are to be placed if there is no loop
4624 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4625 pointing to the initial address.
4626 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4627 to the initial address accessed by the data-ref in STMT. This is
4628 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4629 in bytes.
4630 8. IV_STEP (optional, defaults to NULL): the amount that should be added
4631 to the IV during each iteration of the loop. NULL says to move
4632 by one copy of AGGR_TYPE up or down, depending on the step of the
4633 data reference.
4635 Output:
4636 1. Declare a new ptr to vector_type, and have it point to the base of the
4637 data reference (initial address accessed by the data reference).
4638 For example, for vector of type V8HI, the following code is generated:
4640 v8hi *ap;
4641 ap = (v8hi *)initial_address;
4643 if OFFSET is not supplied:
4644 initial_address = &a[init];
4645 if OFFSET is supplied:
4646 initial_address = &a[init + OFFSET];
4647 if BYTE_OFFSET is supplied:
4648 initial_address = &a[init] + BYTE_OFFSET;
4650 Return the initial_address in INITIAL_ADDRESS.
4652 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4653 update the pointer in each iteration of the loop.
4655 Return the increment stmt that updates the pointer in PTR_INCR.
4657 3. Set INV_P to true if the access pattern of the data reference in the
4658 vectorized loop is invariant. Set it to false otherwise.
4660 4. Return the pointer. */
4662 tree
4663 vect_create_data_ref_ptr (gimple *stmt, tree aggr_type, struct loop *at_loop,
4664 tree offset, tree *initial_address,
4665 gimple_stmt_iterator *gsi, gimple **ptr_incr,
4666 bool only_init, bool *inv_p, tree byte_offset,
4667 tree iv_step)
4669 const char *base_name;
4670 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4671 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4672 struct loop *loop = NULL;
4673 bool nested_in_vect_loop = false;
4674 struct loop *containing_loop = NULL;
4675 tree aggr_ptr_type;
4676 tree aggr_ptr;
4677 tree new_temp;
4678 gimple_seq new_stmt_list = NULL;
4679 edge pe = NULL;
4680 basic_block new_bb;
4681 tree aggr_ptr_init;
4682 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4683 tree aptr;
4684 gimple_stmt_iterator incr_gsi;
4685 bool insert_after;
4686 tree indx_before_incr, indx_after_incr;
4687 gimple *incr;
4688 tree step;
4689 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4691 gcc_assert (iv_step != NULL_TREE
4692 || TREE_CODE (aggr_type) == ARRAY_TYPE
4693 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4695 if (loop_vinfo)
4697 loop = LOOP_VINFO_LOOP (loop_vinfo);
4698 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4699 containing_loop = (gimple_bb (stmt))->loop_father;
4700 pe = loop_preheader_edge (loop);
4702 else
4704 gcc_assert (bb_vinfo);
4705 only_init = true;
4706 *ptr_incr = NULL;
4709 /* Check the step (evolution) of the load in LOOP, and record
4710 whether it's invariant. */
4711 step = vect_dr_behavior (dr)->step;
4712 if (integer_zerop (step))
4713 *inv_p = true;
4714 else
4715 *inv_p = false;
4717 /* Create an expression for the first address accessed by this load
4718 in LOOP. */
4719 base_name = get_name (DR_BASE_ADDRESS (dr));
4721 if (dump_enabled_p ())
4723 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4724 dump_printf_loc (MSG_NOTE, vect_location,
4725 "create %s-pointer variable to type: ",
4726 get_tree_code_name (TREE_CODE (aggr_type)));
4727 dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4728 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4729 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4730 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4731 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4732 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4733 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4734 else
4735 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4736 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4737 dump_printf (MSG_NOTE, "\n");
4740 /* (1) Create the new aggregate-pointer variable.
4741 Vector and array types inherit the alias set of their component
4742 type by default so we need to use a ref-all pointer if the data
4743 reference does not conflict with the created aggregated data
4744 reference because it is not addressable. */
4745 bool need_ref_all = false;
4746 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4747 get_alias_set (DR_REF (dr))))
4748 need_ref_all = true;
4749 /* Likewise for any of the data references in the stmt group. */
4750 else if (DR_GROUP_SIZE (stmt_info) > 1)
4752 gimple *orig_stmt = DR_GROUP_FIRST_ELEMENT (stmt_info);
4755 stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
4756 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4757 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4758 get_alias_set (DR_REF (sdr))))
4760 need_ref_all = true;
4761 break;
4763 orig_stmt = DR_GROUP_NEXT_ELEMENT (sinfo);
4765 while (orig_stmt);
4767 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4768 need_ref_all);
4769 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4772 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4773 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4774 def-use update cycles for the pointer: one relative to the outer-loop
4775 (LOOP), which is what steps (3) and (4) below do. The other is relative
4776 to the inner-loop (which is the inner-most loop containing the dataref),
4777 and this is done by step (5) below.
4779 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4780 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4781 redundant. Steps (3),(4) create the following:
4783 vp0 = &base_addr;
4784 LOOP: vp1 = phi(vp0,vp2)
4787 vp2 = vp1 + step
4788 goto LOOP
4790 If there is an inner-loop nested in loop, then step (5) will also be
4791 applied, and an additional update in the inner-loop will be created:
4793 vp0 = &base_addr;
4794 LOOP: vp1 = phi(vp0,vp2)
4796 inner: vp3 = phi(vp1,vp4)
4797 vp4 = vp3 + inner_step
4798 if () goto inner
4800 vp2 = vp1 + step
4801 if () goto LOOP */
4803 /* (2) Calculate the initial address of the aggregate-pointer, and set
4804 the aggregate-pointer to point to it before the loop. */
4806 /* Create: (&(base[init_val+offset]) + byte_offset) in the loop preheader. */
4808 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
4809 offset, byte_offset);
4810 if (new_stmt_list)
4812 if (pe)
4814 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4815 gcc_assert (!new_bb);
4817 else
4818 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4821 *initial_address = new_temp;
4822 aggr_ptr_init = new_temp;
4824 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4825 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4826 inner-loop nested in LOOP (during outer-loop vectorization). */
4828 /* No update in loop is required. */
4829 if (only_init && (!loop_vinfo || at_loop == loop))
4830 aptr = aggr_ptr_init;
4831 else
4833 if (iv_step == NULL_TREE)
4835 /* The step of the aggregate pointer is the type size. */
4836 iv_step = TYPE_SIZE_UNIT (aggr_type);
4837 /* One exception to the above is when the scalar step of the load in
4838 LOOP is zero. In this case the step here is also zero. */
4839 if (*inv_p)
4840 iv_step = size_zero_node;
4841 else if (tree_int_cst_sgn (step) == -1)
4842 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4845 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4847 create_iv (aggr_ptr_init,
4848 fold_convert (aggr_ptr_type, iv_step),
4849 aggr_ptr, loop, &incr_gsi, insert_after,
4850 &indx_before_incr, &indx_after_incr);
4851 incr = gsi_stmt (incr_gsi);
4852 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4854 /* Copy the points-to information if it exists. */
4855 if (DR_PTR_INFO (dr))
4857 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr);
4858 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr);
4860 if (ptr_incr)
4861 *ptr_incr = incr;
4863 aptr = indx_before_incr;
4866 if (!nested_in_vect_loop || only_init)
4867 return aptr;
4870 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4871 nested in LOOP, if it exists. */
4873 gcc_assert (nested_in_vect_loop);
4874 if (!only_init)
4876 standard_iv_increment_position (containing_loop, &incr_gsi,
4877 &insert_after);
4878 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4879 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4880 &indx_after_incr);
4881 incr = gsi_stmt (incr_gsi);
4882 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4884 /* Copy the points-to information if it exists. */
4885 if (DR_PTR_INFO (dr))
4887 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr);
4888 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr);
4890 if (ptr_incr)
4891 *ptr_incr = incr;
4893 return indx_before_incr;
4895 else
4896 gcc_unreachable ();
4900 /* Function bump_vector_ptr
4902 Increment a pointer (to a vector type) by vector-size. If requested,
4903 i.e. if PTR-INCR is given, then also connect the new increment stmt
4904 to the existing def-use update-chain of the pointer, by modifying
4905 the PTR_INCR as illustrated below:
4907 The pointer def-use update-chain before this function:
4908 DATAREF_PTR = phi (p_0, p_2)
4909 ....
4910 PTR_INCR: p_2 = DATAREF_PTR + step
4912 The pointer def-use update-chain after this function:
4913 DATAREF_PTR = phi (p_0, p_2)
4914 ....
4915 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4916 ....
4917 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4919 Input:
4920 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4921 in the loop.
4922 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4923 the loop. The increment amount across iterations is expected
4924 to be vector_size.
4925 BSI - location where the new update stmt is to be placed.
4926 STMT - the original scalar memory-access stmt that is being vectorized.
4927 BUMP - optional. The offset by which to bump the pointer. If not given,
4928 the offset is assumed to be vector_size.
4930 Output: Return NEW_DATAREF_PTR as illustrated above.
4934 tree
4935 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4936 gimple *stmt, tree bump)
4938 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4939 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4940 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4941 tree update = TYPE_SIZE_UNIT (vectype);
4942 gassign *incr_stmt;
4943 ssa_op_iter iter;
4944 use_operand_p use_p;
4945 tree new_dataref_ptr;
4947 if (bump)
4948 update = bump;
4950 if (TREE_CODE (dataref_ptr) == SSA_NAME)
4951 new_dataref_ptr = copy_ssa_name (dataref_ptr);
4952 else
4953 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4954 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4955 dataref_ptr, update);
4956 vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4958 /* Copy the points-to information if it exists. */
4959 if (DR_PTR_INFO (dr))
4961 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4962 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4965 if (!ptr_incr)
4966 return new_dataref_ptr;
4968 /* Update the vector-pointer's cross-iteration increment. */
4969 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4971 tree use = USE_FROM_PTR (use_p);
4973 if (use == dataref_ptr)
4974 SET_USE (use_p, new_dataref_ptr);
4975 else
4976 gcc_assert (operand_equal_p (use, update, 0));
4979 return new_dataref_ptr;
4983 /* Copy memory reference info such as base/clique from the SRC reference
4984 to the DEST MEM_REF. */
4986 void
4987 vect_copy_ref_info (tree dest, tree src)
4989 if (TREE_CODE (dest) != MEM_REF)
4990 return;
4992 tree src_base = src;
4993 while (handled_component_p (src_base))
4994 src_base = TREE_OPERAND (src_base, 0);
4995 if (TREE_CODE (src_base) != MEM_REF
4996 && TREE_CODE (src_base) != TARGET_MEM_REF)
4997 return;
4999 MR_DEPENDENCE_CLIQUE (dest) = MR_DEPENDENCE_CLIQUE (src_base);
5000 MR_DEPENDENCE_BASE (dest) = MR_DEPENDENCE_BASE (src_base);
5004 /* Function vect_create_destination_var.
5006 Create a new temporary of type VECTYPE. */
5008 tree
5009 vect_create_destination_var (tree scalar_dest, tree vectype)
5011 tree vec_dest;
5012 const char *name;
5013 char *new_name;
5014 tree type;
5015 enum vect_var_kind kind;
5017 kind = vectype
5018 ? VECTOR_BOOLEAN_TYPE_P (vectype)
5019 ? vect_mask_var
5020 : vect_simple_var
5021 : vect_scalar_var;
5022 type = vectype ? vectype : TREE_TYPE (scalar_dest);
5024 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
5026 name = get_name (scalar_dest);
5027 if (name)
5028 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
5029 else
5030 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
5031 vec_dest = vect_get_new_vect_var (type, kind, new_name);
5032 free (new_name);
5034 return vec_dest;
5037 /* Function vect_grouped_store_supported.
5039 Returns TRUE if interleave high and interleave low permutations
5040 are supported, and FALSE otherwise. */
5042 bool
5043 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
5045 machine_mode mode = TYPE_MODE (vectype);
5047 /* vect_permute_store_chain requires the group size to be equal to 3 or
5048 be a power of two. */
5049 if (count != 3 && exact_log2 (count) == -1)
5051 if (dump_enabled_p ())
5052 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5053 "the size of the group of accesses"
5054 " is not a power of 2 or not eqaul to 3\n");
5055 return false;
5058 /* Check that the permutation is supported. */
5059 if (VECTOR_MODE_P (mode))
5061 unsigned int i;
5062 if (count == 3)
5064 unsigned int j0 = 0, j1 = 0, j2 = 0;
5065 unsigned int i, j;
5067 unsigned int nelt;
5068 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5070 if (dump_enabled_p ())
5071 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5072 "cannot handle groups of 3 stores for"
5073 " variable-length vectors\n");
5074 return false;
5077 vec_perm_builder sel (nelt, nelt, 1);
5078 sel.quick_grow (nelt);
5079 vec_perm_indices indices;
5080 for (j = 0; j < 3; j++)
5082 int nelt0 = ((3 - j) * nelt) % 3;
5083 int nelt1 = ((3 - j) * nelt + 1) % 3;
5084 int nelt2 = ((3 - j) * nelt + 2) % 3;
5085 for (i = 0; i < nelt; i++)
5087 if (3 * i + nelt0 < nelt)
5088 sel[3 * i + nelt0] = j0++;
5089 if (3 * i + nelt1 < nelt)
5090 sel[3 * i + nelt1] = nelt + j1++;
5091 if (3 * i + nelt2 < nelt)
5092 sel[3 * i + nelt2] = 0;
5094 indices.new_vector (sel, 2, nelt);
5095 if (!can_vec_perm_const_p (mode, indices))
5097 if (dump_enabled_p ())
5098 dump_printf (MSG_MISSED_OPTIMIZATION,
5099 "permutation op not supported by target.\n");
5100 return false;
5103 for (i = 0; i < nelt; i++)
5105 if (3 * i + nelt0 < nelt)
5106 sel[3 * i + nelt0] = 3 * i + nelt0;
5107 if (3 * i + nelt1 < nelt)
5108 sel[3 * i + nelt1] = 3 * i + nelt1;
5109 if (3 * i + nelt2 < nelt)
5110 sel[3 * i + nelt2] = nelt + j2++;
5112 indices.new_vector (sel, 2, nelt);
5113 if (!can_vec_perm_const_p (mode, indices))
5115 if (dump_enabled_p ())
5116 dump_printf (MSG_MISSED_OPTIMIZATION,
5117 "permutation op not supported by target.\n");
5118 return false;
5121 return true;
5123 else
5125 /* If length is not equal to 3 then only power of 2 is supported. */
5126 gcc_assert (pow2p_hwi (count));
5127 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5129 /* The encoding has 2 interleaved stepped patterns. */
5130 vec_perm_builder sel (nelt, 2, 3);
5131 sel.quick_grow (6);
5132 for (i = 0; i < 3; i++)
5134 sel[i * 2] = i;
5135 sel[i * 2 + 1] = i + nelt;
5137 vec_perm_indices indices (sel, 2, nelt);
5138 if (can_vec_perm_const_p (mode, indices))
5140 for (i = 0; i < 6; i++)
5141 sel[i] += exact_div (nelt, 2);
5142 indices.new_vector (sel, 2, nelt);
5143 if (can_vec_perm_const_p (mode, indices))
5144 return true;
5149 if (dump_enabled_p ())
5150 dump_printf (MSG_MISSED_OPTIMIZATION,
5151 "permutaion op not supported by target.\n");
5152 return false;
5156 /* Return TRUE if vec_{mask_}store_lanes is available for COUNT vectors of
5157 type VECTYPE. MASKED_P says whether the masked form is needed. */
5159 bool
5160 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5161 bool masked_p)
5163 if (masked_p)
5164 return vect_lanes_optab_supported_p ("vec_mask_store_lanes",
5165 vec_mask_store_lanes_optab,
5166 vectype, count);
5167 else
5168 return vect_lanes_optab_supported_p ("vec_store_lanes",
5169 vec_store_lanes_optab,
5170 vectype, count);
5174 /* Function vect_permute_store_chain.
5176 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
5177 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
5178 the data correctly for the stores. Return the final references for stores
5179 in RESULT_CHAIN.
5181 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5182 The input is 4 vectors each containing 8 elements. We assign a number to
5183 each element, the input sequence is:
5185 1st vec: 0 1 2 3 4 5 6 7
5186 2nd vec: 8 9 10 11 12 13 14 15
5187 3rd vec: 16 17 18 19 20 21 22 23
5188 4th vec: 24 25 26 27 28 29 30 31
5190 The output sequence should be:
5192 1st vec: 0 8 16 24 1 9 17 25
5193 2nd vec: 2 10 18 26 3 11 19 27
5194 3rd vec: 4 12 20 28 5 13 21 29
5195 4th vec: 6 14 22 30 7 15 23 31
5197 i.e., we interleave the contents of the four vectors in their order.
5199 We use interleave_high/low instructions to create such output. The input of
5200 each interleave_high/low operation is two vectors:
5201 1st vec 2nd vec
5202 0 1 2 3 4 5 6 7
5203 the even elements of the result vector are obtained left-to-right from the
5204 high/low elements of the first vector. The odd elements of the result are
5205 obtained left-to-right from the high/low elements of the second vector.
5206 The output of interleave_high will be: 0 4 1 5
5207 and of interleave_low: 2 6 3 7
5210 The permutation is done in log LENGTH stages. In each stage interleave_high
5211 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5212 where the first argument is taken from the first half of DR_CHAIN and the
5213 second argument from its second half.
5214 In our example,
5216 I1: interleave_high (1st vec, 3rd vec)
5217 I2: interleave_low (1st vec, 3rd vec)
5218 I3: interleave_high (2nd vec, 4th vec)
5219 I4: interleave_low (2nd vec, 4th vec)
5221 The output for the first stage is:
5223 I1: 0 16 1 17 2 18 3 19
5224 I2: 4 20 5 21 6 22 7 23
5225 I3: 8 24 9 25 10 26 11 27
5226 I4: 12 28 13 29 14 30 15 31
5228 The output of the second stage, i.e. the final result is:
5230 I1: 0 8 16 24 1 9 17 25
5231 I2: 2 10 18 26 3 11 19 27
5232 I3: 4 12 20 28 5 13 21 29
5233 I4: 6 14 22 30 7 15 23 31. */
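/* A small worked example for the LENGTH == 3 case below (illustration
   only): with four-element vectors A = {a0,a1,a2,a3}, B = {b0,b1,b2,b3}
   and C = {c0,c1,c2,c3}, the first output vector is built in two steps:

     T = VEC_PERM_EXPR <A, B, {0, 4, 0, 1}>    i.e. {a0, b0, a0, a1}
     R = VEC_PERM_EXPR <T, C, {0, 1, 4, 3}>    i.e. {a0, b0, c0, a1}

   which is the start of the 3-way interleaving a0 b0 c0 a1 b1 c1 ...;
   the other two output vectors are produced the same way with shifted
   selectors.  */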
5235 void
5236 vect_permute_store_chain (vec<tree> dr_chain,
5237 unsigned int length,
5238 gimple *stmt,
5239 gimple_stmt_iterator *gsi,
5240 vec<tree> *result_chain)
5242 tree vect1, vect2, high, low;
5243 gimple *perm_stmt;
5244 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5245 tree perm_mask_low, perm_mask_high;
5246 tree data_ref;
5247 tree perm3_mask_low, perm3_mask_high;
5248 unsigned int i, j, n, log_length = exact_log2 (length);
5250 result_chain->quick_grow (length);
5251 memcpy (result_chain->address (), dr_chain.address (),
5252 length * sizeof (tree));
5254 if (length == 3)
5256 /* vect_grouped_store_supported ensures that this is constant. */
5257 unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5258 unsigned int j0 = 0, j1 = 0, j2 = 0;
5260 vec_perm_builder sel (nelt, nelt, 1);
5261 sel.quick_grow (nelt);
5262 vec_perm_indices indices;
5263 for (j = 0; j < 3; j++)
5265 int nelt0 = ((3 - j) * nelt) % 3;
5266 int nelt1 = ((3 - j) * nelt + 1) % 3;
5267 int nelt2 = ((3 - j) * nelt + 2) % 3;
5269 for (i = 0; i < nelt; i++)
5271 if (3 * i + nelt0 < nelt)
5272 sel[3 * i + nelt0] = j0++;
5273 if (3 * i + nelt1 < nelt)
5274 sel[3 * i + nelt1] = nelt + j1++;
5275 if (3 * i + nelt2 < nelt)
5276 sel[3 * i + nelt2] = 0;
5278 indices.new_vector (sel, 2, nelt);
5279 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5281 for (i = 0; i < nelt; i++)
5283 if (3 * i + nelt0 < nelt)
5284 sel[3 * i + nelt0] = 3 * i + nelt0;
5285 if (3 * i + nelt1 < nelt)
5286 sel[3 * i + nelt1] = 3 * i + nelt1;
5287 if (3 * i + nelt2 < nelt)
5288 sel[3 * i + nelt2] = nelt + j2++;
5290 indices.new_vector (sel, 2, nelt);
5291 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5293 vect1 = dr_chain[0];
5294 vect2 = dr_chain[1];
5296 /* Create interleaving stmt:
5297 low = VEC_PERM_EXPR <vect1, vect2,
5298 {j, nelt, *, j + 1, nelt + j + 1, *,
5299 j + 2, nelt + j + 2, *, ...}> */
5300 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5301 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5302 vect2, perm3_mask_low);
5303 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5305 vect1 = data_ref;
5306 vect2 = dr_chain[2];
5307 /* Create interleaving stmt:
5308 low = VEC_PERM_EXPR <vect1, vect2,
5309 {0, 1, nelt + j, 3, 4, nelt + j + 1,
5310 6, 7, nelt + j + 2, ...}> */
5311 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5312 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5313 vect2, perm3_mask_high);
5314 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5315 (*result_chain)[j] = data_ref;
5318 else
5320 /* If length is not equal to 3 then only power of 2 is supported. */
5321 gcc_assert (pow2p_hwi (length));
5323 /* The encoding has 2 interleaved stepped patterns. */
5324 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5325 vec_perm_builder sel (nelt, 2, 3);
5326 sel.quick_grow (6);
5327 for (i = 0; i < 3; i++)
5329 sel[i * 2] = i;
5330 sel[i * 2 + 1] = i + nelt;
5332 vec_perm_indices indices (sel, 2, nelt);
5333 perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5335 for (i = 0; i < 6; i++)
5336 sel[i] += exact_div (nelt, 2);
5337 indices.new_vector (sel, 2, nelt);
5338 perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5340 for (i = 0, n = log_length; i < n; i++)
5342 for (j = 0; j < length/2; j++)
5344 vect1 = dr_chain[j];
5345 vect2 = dr_chain[j+length/2];
5347 /* Create interleaving stmt:
5348 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
5349 ...}> */
5350 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
5351 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
5352 vect2, perm_mask_high);
5353 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5354 (*result_chain)[2*j] = high;
5356 /* Create interleaving stmt:
5357 low = VEC_PERM_EXPR <vect1, vect2,
5358 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
5359 ...}> */
5360 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
5361 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
5362 vect2, perm_mask_low);
5363 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5364 (*result_chain)[2*j+1] = low;
5366 memcpy (dr_chain.address (), result_chain->address (),
5367 length * sizeof (tree));
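#if 0
/* Editorial sketch (not part of the vectorizer, kept under "#if 0"):
   a scalar model of the interleave_high/interleave_low stages described
   in the comment before vect_permute_store_chain, specialised to the
   example's LENGTH == 4 and NELT == 8 and operating on element ids only.
   Running the two stages on { 0..7, 8..15, 16..23, 24..31 } reproduces
   the final I1..I4 rows shown above.  */
static void
model_permute_store_chain (int chain[4][8])
{
  int out[4][8];
  for (unsigned stage = 0; stage < 2; stage++)	/* log2 (LENGTH) stages.  */
    {
      for (unsigned j = 0; j < 2; j++)		/* LENGTH / 2 pairs.  */
	for (unsigned i = 0; i < 4; i++)	/* NELT / 2 element pairs.  */
	  {
	    /* interleave_high of vector J and vector J + LENGTH / 2.  */
	    out[2 * j][2 * i] = chain[j][i];
	    out[2 * j][2 * i + 1] = chain[j + 2][i];
	    /* interleave_low of the same pair.  */
	    out[2 * j + 1][2 * i] = chain[j][4 + i];
	    out[2 * j + 1][2 * i + 1] = chain[j + 2][4 + i];
	  }
      memcpy (chain, out, sizeof (out));
    }
}
#endif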
5372 /* Function vect_setup_realignment
5374 This function is called when vectorizing an unaligned load using
5375 the dr_explicit_realign[_optimized] scheme.
5376 This function generates the following code at the loop prolog:
5378 p = initial_addr;
5379 x msq_init = *(floor(p)); # prolog load
5380 realignment_token = call target_builtin;
5381 loop:
5382 x msq = phi (msq_init, ---)
5384 The stmts marked with x are generated only for the case of
5385 dr_explicit_realign_optimized.
5387 The code above sets up a new (vector) pointer, pointing to the first
5388 location accessed by STMT, and a "floor-aligned" load using that pointer.
5389 It also generates code to compute the "realignment-token" (if the relevant
5390 target hook was defined), and creates a phi-node at the loop-header bb
5391 whose arguments are the result of the prolog-load (created by this
5392 function) and the result of a load that takes place in the loop (to be
5393 created by the caller to this function).
5395 For the case of dr_explicit_realign_optimized:
5396 The caller to this function uses the phi-result (msq) to create the
5397 realignment code inside the loop, and sets up the missing phi argument,
5398 as follows:
5399 loop:
5400 msq = phi (msq_init, lsq)
5401 lsq = *(floor(p')); # load in loop
5402 result = realign_load (msq, lsq, realignment_token);
5404 For the case of dr_explicit_realign:
5405 loop:
5406 msq = *(floor(p)); # load in loop
5407 p' = p + (VS-1);
5408 lsq = *(floor(p')); # load in loop
5409 result = realign_load (msq, lsq, realignment_token);
5411 Input:
5412 STMT - (scalar) load stmt to be vectorized. This load accesses
5413 a memory location that may be unaligned.
5414 GSI - place where new code is to be inserted.
5415 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5416 is used.
5418 Output:
5419 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5420 target hook, if defined.
5421 Return value - the result of the loop-header phi node. */
5423 tree
5424 vect_setup_realignment (gimple *stmt, gimple_stmt_iterator *gsi,
5425 tree *realignment_token,
5426 enum dr_alignment_support alignment_support_scheme,
5427 tree init_addr,
5428 struct loop **at_loop)
5430 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5431 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5432 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5433 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
5434 struct loop *loop = NULL;
5435 edge pe = NULL;
5436 tree scalar_dest = gimple_assign_lhs (stmt);
5437 tree vec_dest;
5438 gimple *inc;
5439 tree ptr;
5440 tree data_ref;
5441 basic_block new_bb;
5442 tree msq_init = NULL_TREE;
5443 tree new_temp;
5444 gphi *phi_stmt;
5445 tree msq = NULL_TREE;
5446 gimple_seq stmts = NULL;
5447 bool inv_p;
5448 bool compute_in_loop = false;
5449 bool nested_in_vect_loop = false;
5450 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
5451 struct loop *loop_for_initial_load = NULL;
5453 if (loop_vinfo)
5455 loop = LOOP_VINFO_LOOP (loop_vinfo);
5456 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
5459 gcc_assert (alignment_support_scheme == dr_explicit_realign
5460 || alignment_support_scheme == dr_explicit_realign_optimized);
5462 /* We need to generate three things:
5463 1. the misalignment computation
5464 2. the extra vector load (for the optimized realignment scheme).
5465 3. the phi node for the two vectors from which the realignment is
5466 done (for the optimized realignment scheme). */
5468 /* 1. Determine where to generate the misalignment computation.
5470 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5471 calculation will be generated by this function, outside the loop (in the
5472 preheader). Otherwise, INIT_ADDR had already been computed for us by the
5473 caller, inside the loop.
5475 Background: If the misalignment remains fixed throughout the iterations of
5476 the loop, then both realignment schemes are applicable, and also the
5477 misalignment computation can be done outside LOOP. This is because we are
5478 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5479 are a multiple of VS (the Vector Size), and therefore the misalignment in
5480 different vectorized LOOP iterations is always the same.
5481 The problem arises only if the memory access is in an inner-loop nested
5482 inside LOOP, which is now being vectorized using outer-loop vectorization.
5483 This is the only case when the misalignment of the memory access may not
5484 remain fixed throughout the iterations of the inner-loop (as explained in
5485 detail in vect_supportable_dr_alignment). In this case, not only is the
5486 optimized realignment scheme not applicable, but also the misalignment
5487 computation (and generation of the realignment token that is passed to
5488 REALIGN_LOAD) have to be done inside the loop.
5490 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5491 or not, which in turn determines if the misalignment is computed inside
5492 the inner-loop, or outside LOOP. */
5494 if (init_addr != NULL_TREE || !loop_vinfo)
5496 compute_in_loop = true;
5497 gcc_assert (alignment_support_scheme == dr_explicit_realign);
5501 /* 2. Determine where to generate the extra vector load.
5503 For the optimized realignment scheme, instead of generating two vector
5504 loads in each iteration, we generate a single extra vector load in the
5505 preheader of the loop, and in each iteration reuse the result of the
5506 vector load from the previous iteration. In case the memory access is in
5507 an inner-loop nested inside LOOP, which is now being vectorized using
5508 outer-loop vectorization, we need to determine whether this initial vector
5509 load should be generated at the preheader of the inner-loop, or can be
5510 generated at the preheader of LOOP. If the memory access has no evolution
5511 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5512 to be generated inside LOOP (in the preheader of the inner-loop). */
5514 if (nested_in_vect_loop)
5516 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5517 bool invariant_in_outerloop =
5518 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5519 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5521 else
5522 loop_for_initial_load = loop;
5523 if (at_loop)
5524 *at_loop = loop_for_initial_load;
5526 if (loop_for_initial_load)
5527 pe = loop_preheader_edge (loop_for_initial_load);
5529 /* 3. For the case of the optimized realignment, create the first vector
5530 load at the loop preheader. */
5532 if (alignment_support_scheme == dr_explicit_realign_optimized)
5534 /* Create msq_init = *(floor(p1)) in the loop preheader */
5535 gassign *new_stmt;
5537 gcc_assert (!compute_in_loop);
5538 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5539 ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
5540 NULL_TREE, &init_addr, NULL, &inc,
5541 true, &inv_p);
5542 if (TREE_CODE (ptr) == SSA_NAME)
5543 new_temp = copy_ssa_name (ptr);
5544 else
5545 new_temp = make_ssa_name (TREE_TYPE (ptr));
5546 unsigned int align = DR_TARGET_ALIGNMENT (dr);
5547 new_stmt = gimple_build_assign
5548 (new_temp, BIT_AND_EXPR, ptr,
5549 build_int_cst (TREE_TYPE (ptr), -(HOST_WIDE_INT) align));
5550 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5551 gcc_assert (!new_bb);
5552 data_ref
5553 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5554 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5555 vect_copy_ref_info (data_ref, DR_REF (dr));
5556 new_stmt = gimple_build_assign (vec_dest, data_ref);
5557 new_temp = make_ssa_name (vec_dest, new_stmt);
5558 gimple_assign_set_lhs (new_stmt, new_temp);
5559 if (pe)
5561 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5562 gcc_assert (!new_bb);
5564 else
5565 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5567 msq_init = gimple_assign_lhs (new_stmt);
5570 /* 4. Create realignment token using a target builtin, if available.
5571 It is done either inside the containing loop, or before LOOP (as
5572 determined above). */
5574 if (targetm.vectorize.builtin_mask_for_load)
5576 gcall *new_stmt;
5577 tree builtin_decl;
5579 /* Compute INIT_ADDR - the initial address accessed by this memref. */
5580 if (!init_addr)
5582 /* Generate the INIT_ADDR computation outside LOOP. */
5583 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
5584 NULL_TREE);
5585 if (loop)
5587 pe = loop_preheader_edge (loop);
5588 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5589 gcc_assert (!new_bb);
5591 else
5592 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5595 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5596 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5597 vec_dest =
5598 vect_create_destination_var (scalar_dest,
5599 gimple_call_return_type (new_stmt));
5600 new_temp = make_ssa_name (vec_dest, new_stmt);
5601 gimple_call_set_lhs (new_stmt, new_temp);
5603 if (compute_in_loop)
5604 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5605 else
5607 /* Generate the misalignment computation outside LOOP. */
5608 pe = loop_preheader_edge (loop);
5609 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5610 gcc_assert (!new_bb);
5613 *realignment_token = gimple_call_lhs (new_stmt);
5615 /* The result of the CALL_EXPR to this builtin is determined from
5616 the value of the parameter and no global variables are touched
5617 which makes the builtin a "const" function. Requiring the
5618 builtin to have the "const" attribute makes it unnecessary
5619 to call mark_call_clobbered. */
5620 gcc_assert (TREE_READONLY (builtin_decl));
5623 if (alignment_support_scheme == dr_explicit_realign)
5624 return msq;
5626 gcc_assert (!compute_in_loop);
5627 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5630 /* 5. Create msq = phi <msq_init, lsq> in loop */
5632 pe = loop_preheader_edge (containing_loop);
5633 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5634 msq = make_ssa_name (vec_dest);
5635 phi_stmt = create_phi_node (msq, containing_loop->header);
5636 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5638 return msq;
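#if 0
/* Editorial sketch (not part of the vectorizer, kept under "#if 0"):
   a scalar model of what REALIGN_LOAD computes in the schemes set up by
   vect_setup_realignment.  MSQ and LSQ are the two aligned "vector"
   loads of VS elements that straddle the unaligned address, and OFS is
   the misalignment in elements; the real realignment token is a
   target-specific mask, reduced here to that element offset.  */
static void
model_realign_load (const int *msq, const int *lsq, unsigned int vs,
		    unsigned int ofs, int *result)
{
  for (unsigned int i = 0; i < vs; i++)
    result[i] = ofs + i < vs ? msq[ofs + i] : lsq[ofs + i - vs];
}
#endif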
5642 /* Function vect_grouped_load_supported.
5644 COUNT is the size of the load group (the number of statements plus the
5645 number of gaps). SINGLE_ELEMENT_P is true if there is actually
5646 only one statement, with a gap of COUNT - 1.
5648 Returns true if a suitable permute exists. */
5650 bool
5651 vect_grouped_load_supported (tree vectype, bool single_element_p,
5652 unsigned HOST_WIDE_INT count)
5654 machine_mode mode = TYPE_MODE (vectype);
5656 /* If this is single-element interleaving with an element distance
5657 that leaves unused vector loads around, punt - we at least create
5658 very sub-optimal code in that case (and blow up memory,
5659 see PR65518). */
5660 if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype)))
5662 if (dump_enabled_p ())
5663 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5664 "single-element interleaving not supported "
5665 "for not adjacent vector loads\n");
5666 return false;
5669 /* vect_permute_load_chain requires the group size to be equal to 3 or
5670 be a power of two. */
5671 if (count != 3 && exact_log2 (count) == -1)
5673 if (dump_enabled_p ())
5674 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5675 "the size of the group of accesses"
5676 " is not a power of 2 or not equal to 3\n");
5677 return false;
5680 /* Check that the permutation is supported. */
5681 if (VECTOR_MODE_P (mode))
5683 unsigned int i, j;
5684 if (count == 3)
5686 unsigned int nelt;
5687 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5689 if (dump_enabled_p ())
5690 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5691 "cannot handle groups of 3 loads for"
5692 " variable-length vectors\n");
5693 return false;
5696 vec_perm_builder sel (nelt, nelt, 1);
5697 sel.quick_grow (nelt);
5698 vec_perm_indices indices;
5699 unsigned int k;
5700 for (k = 0; k < 3; k++)
5702 for (i = 0; i < nelt; i++)
5703 if (3 * i + k < 2 * nelt)
5704 sel[i] = 3 * i + k;
5705 else
5706 sel[i] = 0;
5707 indices.new_vector (sel, 2, nelt);
5708 if (!can_vec_perm_const_p (mode, indices))
5710 if (dump_enabled_p ())
5711 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5712 "shuffle of 3 loads is not supported by"
5713 " target\n");
5714 return false;
5716 for (i = 0, j = 0; i < nelt; i++)
5717 if (3 * i + k < 2 * nelt)
5718 sel[i] = i;
5719 else
5720 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5721 indices.new_vector (sel, 2, nelt);
5722 if (!can_vec_perm_const_p (mode, indices))
5724 if (dump_enabled_p ())
5725 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5726 "shuffle of 3 loads is not supported by"
5727 " target\n");
5728 return false;
5731 return true;
5733 else
5735 /* If length is not equal to 3 then only power of 2 is supported. */
5736 gcc_assert (pow2p_hwi (count));
5737 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5739 /* The encoding has a single stepped pattern. */
5740 vec_perm_builder sel (nelt, 1, 3);
5741 sel.quick_grow (3);
5742 for (i = 0; i < 3; i++)
5743 sel[i] = i * 2;
5744 vec_perm_indices indices (sel, 2, nelt);
5745 if (can_vec_perm_const_p (mode, indices))
5747 for (i = 0; i < 3; i++)
5748 sel[i] = i * 2 + 1;
5749 indices.new_vector (sel, 2, nelt);
5750 if (can_vec_perm_const_p (mode, indices))
5751 return true;
5756 if (dump_enabled_p ())
5757 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5758 "extract even/odd not supported by target\n");
5759 return false;
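#if 0
/* Editorial sketch (not part of the vectorizer, kept under "#if 0"):
   the two permutations checked above for COUNT == 3, written out for
   NELT == 8 and K == 0.  The first selector gathers every third element
   of vectors 0 and 1 (with don't-care zeros in the tail), the second
   keeps that partial result and completes it from vector 2, so the
   composition yields elements { 0 3 6 9 12 15 18 21 }.  */
static void
model_grouped_load_3 (const int v0[8], const int v1[8], const int v2[8],
		      int out[8])
{
  static const unsigned int sel_low[8] = { 0, 3, 6, 9, 12, 15, 0, 0 };
  static const unsigned int sel_high[8] = { 0, 1, 2, 3, 4, 5, 10, 13 };
  int tmp[8];
  for (unsigned int i = 0; i < 8; i++)
    tmp[i] = sel_low[i] < 8 ? v0[sel_low[i]] : v1[sel_low[i] - 8];
  for (unsigned int i = 0; i < 8; i++)
    out[i] = sel_high[i] < 8 ? tmp[sel_high[i]] : v2[sel_high[i] - 8];
}
#endif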
5762 /* Return TRUE if vec_{masked_}load_lanes is available for COUNT vectors of
5763 type VECTYPE. MASKED_P says whether the masked form is needed. */
5765 bool
5766 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5767 bool masked_p)
5769 if (masked_p)
5770 return vect_lanes_optab_supported_p ("vec_mask_load_lanes",
5771 vec_mask_load_lanes_optab,
5772 vectype, count);
5773 else
5774 return vect_lanes_optab_supported_p ("vec_load_lanes",
5775 vec_load_lanes_optab,
5776 vectype, count);
5779 /* Function vect_permute_load_chain.
5781 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5782 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5783 the input data correctly. Return the final references for loads in
5784 RESULT_CHAIN.
5786 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5787 The input is 4 vectors each containing 8 elements. We assign a number to each
5788 element, the input sequence is:
5790 1st vec: 0 1 2 3 4 5 6 7
5791 2nd vec: 8 9 10 11 12 13 14 15
5792 3rd vec: 16 17 18 19 20 21 22 23
5793 4th vec: 24 25 26 27 28 29 30 31
5795 The output sequence should be:
5797 1st vec: 0 4 8 12 16 20 24 28
5798 2nd vec: 1 5 9 13 17 21 25 29
5799 3rd vec: 2 6 10 14 18 22 26 30
5800 4th vec: 3 7 11 15 19 23 27 31
5802 i.e., the first output vector should contain the first elements of each
5803 interleaving group, etc.
5805 We use extract_even/odd instructions to create such output. The input of
5806 each extract_even/odd operation is two vectors
5807 1st vec 2nd vec
5808 0 1 2 3 4 5 6 7
5810 and the output is the vector of extracted even/odd elements. The output of
5811 extract_even will be: 0 2 4 6
5812 and of extract_odd: 1 3 5 7
5815 The permutation is done in log LENGTH stages. In each stage extract_even
5816 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5817 their order. In our example,
5819 E1: extract_even (1st vec, 2nd vec)
5820 E2: extract_odd (1st vec, 2nd vec)
5821 E3: extract_even (3rd vec, 4th vec)
5822 E4: extract_odd (3rd vec, 4th vec)
5824 The output for the first stage will be:
5826 E1: 0 2 4 6 8 10 12 14
5827 E2: 1 3 5 7 9 11 13 15
5828 E3: 16 18 20 22 24 26 28 30
5829 E4: 17 19 21 23 25 27 29 31
5831 In order to proceed and create the correct sequence for the next stage (or
5832 for the correct output, if the second stage is the last one, as in our
5833 example), we first put the output of extract_even operation and then the
5834 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5835 The input for the second stage is:
5837 1st vec (E1): 0 2 4 6 8 10 12 14
5838 2nd vec (E3): 16 18 20 22 24 26 28 30
5839 3rd vec (E2): 1 3 5 7 9 11 13 15
5840 4th vec (E4): 17 19 21 23 25 27 29 31
5842 The output of the second stage:
5844 E1: 0 4 8 12 16 20 24 28
5845 E2: 2 6 10 14 18 22 26 30
5846 E3: 1 5 9 13 17 21 25 29
5847 E4: 3 7 11 15 19 23 27 31
5849 And RESULT_CHAIN after reordering:
5851 1st vec (E1): 0 4 8 12 16 20 24 28
5852 2nd vec (E3): 1 5 9 13 17 21 25 29
5853 3rd vec (E2): 2 6 10 14 18 22 26 30
5854 4th vec (E4): 3 7 11 15 19 23 27 31. */
5856 static void
5857 vect_permute_load_chain (vec<tree> dr_chain,
5858 unsigned int length,
5859 gimple *stmt,
5860 gimple_stmt_iterator *gsi,
5861 vec<tree> *result_chain)
5863 tree data_ref, first_vect, second_vect;
5864 tree perm_mask_even, perm_mask_odd;
5865 tree perm3_mask_low, perm3_mask_high;
5866 gimple *perm_stmt;
5867 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5868 unsigned int i, j, log_length = exact_log2 (length);
5870 result_chain->quick_grow (length);
5871 memcpy (result_chain->address (), dr_chain.address (),
5872 length * sizeof (tree));
5874 if (length == 3)
5876 /* vect_grouped_load_supported ensures that this is constant. */
5877 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5878 unsigned int k;
5880 vec_perm_builder sel (nelt, nelt, 1);
5881 sel.quick_grow (nelt);
5882 vec_perm_indices indices;
5883 for (k = 0; k < 3; k++)
5885 for (i = 0; i < nelt; i++)
5886 if (3 * i + k < 2 * nelt)
5887 sel[i] = 3 * i + k;
5888 else
5889 sel[i] = 0;
5890 indices.new_vector (sel, 2, nelt);
5891 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5893 for (i = 0, j = 0; i < nelt; i++)
5894 if (3 * i + k < 2 * nelt)
5895 sel[i] = i;
5896 else
5897 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5898 indices.new_vector (sel, 2, nelt);
5899 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5901 first_vect = dr_chain[0];
5902 second_vect = dr_chain[1];
5904 /* Create interleaving stmt (low part of):
5905 low = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5906 ...}> */
5907 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5908 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5909 second_vect, perm3_mask_low);
5910 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5912 /* Create interleaving stmt (high part of):
5913 high = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5914 ...}> */
5915 first_vect = data_ref;
5916 second_vect = dr_chain[2];
5917 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5918 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5919 second_vect, perm3_mask_high);
5920 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5921 (*result_chain)[k] = data_ref;
5924 else
5926 /* If length is not equal to 3 then only power of 2 is supported. */
5927 gcc_assert (pow2p_hwi (length));
5929 /* The encoding has a single stepped pattern. */
5930 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5931 vec_perm_builder sel (nelt, 1, 3);
5932 sel.quick_grow (3);
5933 for (i = 0; i < 3; ++i)
5934 sel[i] = i * 2;
5935 vec_perm_indices indices (sel, 2, nelt);
5936 perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);
5938 for (i = 0; i < 3; ++i)
5939 sel[i] = i * 2 + 1;
5940 indices.new_vector (sel, 2, nelt);
5941 perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
5943 for (i = 0; i < log_length; i++)
5945 for (j = 0; j < length; j += 2)
5947 first_vect = dr_chain[j];
5948 second_vect = dr_chain[j+1];
5950 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5951 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5952 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5953 first_vect, second_vect,
5954 perm_mask_even);
5955 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5956 (*result_chain)[j/2] = data_ref;
5958 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5959 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5960 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5961 first_vect, second_vect,
5962 perm_mask_odd);
5963 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5964 (*result_chain)[j/2+length/2] = data_ref;
5966 memcpy (dr_chain.address (), result_chain->address (),
5967 length * sizeof (tree));
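#if 0
/* Editorial sketch (not part of the vectorizer, kept under "#if 0"):
   a scalar model of the extract_even/extract_odd stages described in
   the comment before vect_permute_load_chain, specialised to the
   example's LENGTH == 4 and NELT == 8 and operating on element ids
   only.  Starting from { 0..7, 8..15, 16..23, 24..31 } the two stages
   reproduce the RESULT_CHAIN rows shown above.  */
static void
model_permute_load_chain (int chain[4][8])
{
  int out[4][8];
  for (unsigned stage = 0; stage < 2; stage++)	/* log2 (LENGTH) stages.  */
    {
      for (unsigned j = 0; j < 4; j += 2)	/* consecutive pairs.  */
	for (unsigned i = 0; i < 4; i++)
	  {
	    /* Even elements of the concatenated pair go to slot J/2,
	       odd elements to slot J/2 + LENGTH/2, as in the function.  */
	    out[j / 2][i] = chain[j][2 * i];
	    out[j / 2][4 + i] = chain[j + 1][2 * i];
	    out[j / 2 + 2][i] = chain[j][2 * i + 1];
	    out[j / 2 + 2][4 + i] = chain[j + 1][2 * i + 1];
	  }
      memcpy (chain, out, sizeof (out));
    }
}
#endif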
5972 /* Function vect_shift_permute_load_chain.
5974 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
5975 a sequence of stmts to reorder the input data accordingly.
5976 Return the final references for loads in RESULT_CHAIN.
5977 Return true if successful, false otherwise.
5979 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5980 The input is 3 vectors each containing 8 elements. We assign a
5981 number to each element, the input sequence is:
5983 1st vec: 0 1 2 3 4 5 6 7
5984 2nd vec: 8 9 10 11 12 13 14 15
5985 3rd vec: 16 17 18 19 20 21 22 23
5987 The output sequence should be:
5989 1st vec: 0 3 6 9 12 15 18 21
5990 2nd vec: 1 4 7 10 13 16 19 22
5991 3rd vec: 2 5 8 11 14 17 20 23
5993 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
5995 First we shuffle all 3 vectors to get correct elements order:
5997 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
5998 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
5999 3rd vec: (16 19 22) (17 20 23) (18 21)
6001 Next we unite and shift vector 3 times:
6003 1st step:
6004 shift right by 6 the concatenation of:
6005 "1st vec" and "2nd vec"
6006 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
6007 "2nd vec" and "3rd vec"
6008 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
6009 "3rd vec" and "1st vec"
6010 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
6011 | New vectors |
6013 So that now new vectors are:
6015 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
6016 2nd vec: (10 13) (16 19 22) (17 20 23)
6017 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
6019 2nd step:
6020 shift right by 5 the concatenation of:
6021 "1st vec" and "3rd vec"
6022 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
6023 "2nd vec" and "1st vec"
6024 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
6025 "3rd vec" and "2nd vec"
6026 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
6027 | New vectors |
6029 So that now new vectors are:
6031 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
6032 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
6033 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
6035 3rd step:
6036 shift right by 5 the concatenation of:
6037 "1st vec" and "1st vec"
6038 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
6039 shift right by 3 the concatenation of:
6040 "2nd vec" and "2nd vec"
6041 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
6042 | New vectors |
6044 So that now all vectors are READY:
6045 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
6046 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
6047 3rd vec: ( 1 4 7) (10 13) (16 19 22)
6049 This algorithm is faster than the one in vect_permute_load_chain if:
6050 1. "shift of a concatenation" is faster than general permutation.
6051 This is usually so.
6052 2. The TARGET machine can't execute vector instructions in parallel.
6053 This is because each step of the algorithm depends on the previous one.
6054 The algorithm in vect_permute_load_chain is much more parallel.
6056 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
6059 static bool
6060 vect_shift_permute_load_chain (vec<tree> dr_chain,
6061 unsigned int length,
6062 gimple *stmt,
6063 gimple_stmt_iterator *gsi,
6064 vec<tree> *result_chain)
6066 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
6067 tree perm2_mask1, perm2_mask2, perm3_mask;
6068 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
6069 gimple *perm_stmt;
6071 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
6072 unsigned int i;
6073 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6074 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6076 unsigned HOST_WIDE_INT nelt, vf;
6077 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt)
6078 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
6079 /* Not supported for variable-length vectors. */
6080 return false;
6082 vec_perm_builder sel (nelt, nelt, 1);
6083 sel.quick_grow (nelt);
6085 result_chain->quick_grow (length);
6086 memcpy (result_chain->address (), dr_chain.address (),
6087 length * sizeof (tree));
6089 if (pow2p_hwi (length) && vf > 4)
6091 unsigned int j, log_length = exact_log2 (length);
6092 for (i = 0; i < nelt / 2; ++i)
6093 sel[i] = i * 2;
6094 for (i = 0; i < nelt / 2; ++i)
6095 sel[nelt / 2 + i] = i * 2 + 1;
6096 vec_perm_indices indices (sel, 2, nelt);
6097 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6099 if (dump_enabled_p ())
6100 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6101 "shuffle of 2 fields structure is not \
6102 supported by target\n");
6103 return false;
6105 perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);
6107 for (i = 0; i < nelt / 2; ++i)
6108 sel[i] = i * 2 + 1;
6109 for (i = 0; i < nelt / 2; ++i)
6110 sel[nelt / 2 + i] = i * 2;
6111 indices.new_vector (sel, 2, nelt);
6112 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6114 if (dump_enabled_p ())
6115 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6116 "shuffle of 2 fields structure is not \
6117 supported by target\n");
6118 return false;
6120 perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);
6122 /* Generating permutation constant to shift all elements.
6123 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
6124 for (i = 0; i < nelt; i++)
6125 sel[i] = nelt / 2 + i;
6126 indices.new_vector (sel, 2, nelt);
6127 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6129 if (dump_enabled_p ())
6130 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6131 "shift permutation is not supported by target\n");
6132 return false;
6134 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6136 /* Generating permutation constant to select elements from the two vectors.
6137 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
6138 for (i = 0; i < nelt / 2; i++)
6139 sel[i] = i;
6140 for (i = nelt / 2; i < nelt; i++)
6141 sel[i] = nelt + i;
6142 indices.new_vector (sel, 2, nelt);
6143 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6145 if (dump_enabled_p ())
6146 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6147 "select is not supported by target\n");
6148 return false;
6150 select_mask = vect_gen_perm_mask_checked (vectype, indices);
6152 for (i = 0; i < log_length; i++)
6154 for (j = 0; j < length; j += 2)
6156 first_vect = dr_chain[j];
6157 second_vect = dr_chain[j + 1];
6159 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6160 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6161 first_vect, first_vect,
6162 perm2_mask1);
6163 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6164 vect[0] = data_ref;
6166 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6167 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6168 second_vect, second_vect,
6169 perm2_mask2);
6170 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6171 vect[1] = data_ref;
6173 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
6174 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6175 vect[0], vect[1], shift1_mask);
6176 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6177 (*result_chain)[j/2 + length/2] = data_ref;
6179 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
6180 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6181 vect[0], vect[1], select_mask);
6182 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6183 (*result_chain)[j/2] = data_ref;
6185 memcpy (dr_chain.address (), result_chain->address (),
6186 length * sizeof (tree));
6188 return true;
6190 if (length == 3 && vf > 2)
6192 unsigned int k = 0, l = 0;
6194 /* Generating permutation constant to get all elements in right order.
6195 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
6196 for (i = 0; i < nelt; i++)
6198 if (3 * k + (l % 3) >= nelt)
6200 k = 0;
6201 l += (3 - (nelt % 3));
6203 sel[i] = 3 * k + (l % 3);
6204 k++;
6206 vec_perm_indices indices (sel, 2, nelt);
6207 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6209 if (dump_enabled_p ())
6210 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6211 "shuffle of 3 fields structure is not \
6212 supported by target\n");
6213 return false;
6215 perm3_mask = vect_gen_perm_mask_checked (vectype, indices);
6217 /* Generating permutation constant to shift all elements.
6218 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
6219 for (i = 0; i < nelt; i++)
6220 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
6221 indices.new_vector (sel, 2, nelt);
6222 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6224 if (dump_enabled_p ())
6225 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6226 "shift permutation is not supported by target\n");
6227 return false;
6229 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6231 /* Generating permutation constant to shift all elements.
6232 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6233 for (i = 0; i < nelt; i++)
6234 sel[i] = 2 * (nelt / 3) + 1 + i;
6235 indices.new_vector (sel, 2, nelt);
6236 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6238 if (dump_enabled_p ())
6239 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6240 "shift permutation is not supported by target\n");
6241 return false;
6243 shift2_mask = vect_gen_perm_mask_checked (vectype, indices);
6245 /* Generating permutation constant to shift all elements.
6246 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
6247 for (i = 0; i < nelt; i++)
6248 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
6249 indices.new_vector (sel, 2, nelt);
6250 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6252 if (dump_enabled_p ())
6253 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6254 "shift permutation is not supported by target\n");
6255 return false;
6257 shift3_mask = vect_gen_perm_mask_checked (vectype, indices);
6259 /* Generating permutation constant to shift all elements.
6260 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6261 for (i = 0; i < nelt; i++)
6262 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
6263 indices.new_vector (sel, 2, nelt);
6264 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6266 if (dump_enabled_p ())
6267 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6268 "shift permutation is not supported by target\n");
6269 return false;
6271 shift4_mask = vect_gen_perm_mask_checked (vectype, indices);
6273 for (k = 0; k < 3; k++)
6275 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
6276 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6277 dr_chain[k], dr_chain[k],
6278 perm3_mask);
6279 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6280 vect[k] = data_ref;
6283 for (k = 0; k < 3; k++)
6285 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
6286 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6287 vect[k % 3], vect[(k + 1) % 3],
6288 shift1_mask);
6289 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6290 vect_shift[k] = data_ref;
6293 for (k = 0; k < 3; k++)
6295 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
6296 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6297 vect_shift[(4 - k) % 3],
6298 vect_shift[(3 - k) % 3],
6299 shift2_mask);
6300 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6301 vect[k] = data_ref;
6304 (*result_chain)[3 - (nelt % 3)] = vect[2];
6306 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
6307 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
6308 vect[0], shift3_mask);
6309 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6310 (*result_chain)[nelt % 3] = data_ref;
6312 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
6313 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
6314 vect[1], shift4_mask);
6315 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6316 (*result_chain)[0] = data_ref;
6317 return true;
6319 return false;
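#if 0
/* Editorial sketch (not part of the vectorizer, kept under "#if 0"):
   the first shuffle step of the LENGTH == 3 case above, for the
   example's NELT == 8.  Applying the self-permutation { 0 3 6 1 4 7 2 5 }
   built by the loop above groups each vector's elements by their
   position modulo 3, e.g. { 0..7 } becomes (0 3 6) (1 4 7) (2 5); the
   subsequent shifts of concatenations then redistribute those groups.  */
static void
model_shift_permute_shuffle (const int in[8], int out[8])
{
  static const unsigned int sel[8] = { 0, 3, 6, 1, 4, 7, 2, 5 };
  for (unsigned int i = 0; i < 8; i++)
    out[i] = in[sel[i]];
}
#endif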
6322 /* Function vect_transform_grouped_load.
6324 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
6325 to perform their permutation and ascribe the resulting vectorized statements to
6326 the scalar statements.
6329 void
6330 vect_transform_grouped_load (gimple *stmt, vec<tree> dr_chain, int size,
6331 gimple_stmt_iterator *gsi)
6333 machine_mode mode;
6334 vec<tree> result_chain = vNULL;
6336 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
6337 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
6338 vectors, that are ready for vector computation. */
6339 result_chain.create (size);
6341 /* If the reassociation width for the vector type is 2 or greater, the target
6342 machine can execute 2 or more vector instructions in parallel. Otherwise
6343 try to get the chain for the loads group using vect_shift_permute_load_chain. */
6344 mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
6345 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
6346 || pow2p_hwi (size)
6347 || !vect_shift_permute_load_chain (dr_chain, size, stmt,
6348 gsi, &result_chain))
6349 vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
6350 vect_record_grouped_load_vectors (stmt, result_chain);
6351 result_chain.release ();
6354 /* RESULT_CHAIN contains the output of a group of grouped loads that were
6355 generated as part of the vectorization of STMT. Assign the statement
6356 for each vector to the associated scalar statement. */
6358 void
6359 vect_record_grouped_load_vectors (gimple *stmt, vec<tree> result_chain)
6361 gimple *first_stmt = DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
6362 gimple *next_stmt, *new_stmt;
6363 unsigned int i, gap_count;
6364 tree tmp_data_ref;
6366 /* Put a permuted data-ref in the VECTORIZED_STMT field.
6367 Since we scan the chain starting from its first node, their order
6368 corresponds to the order of data-refs in RESULT_CHAIN.
6369 next_stmt = first_stmt;
6370 gap_count = 1;
6371 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
6373 if (!next_stmt)
6374 break;
6376 /* Skip the gaps. Loads created for the gaps will be removed by dead
6377 code elimination pass later. No need to check for the first stmt in
6378 the group, since it always exists.
6379 DR_GROUP_GAP is the number of steps in elements from the previous
6380 access (if there is no gap DR_GROUP_GAP is 1). We skip loads that
6381 correspond to the gaps. */
6382 if (next_stmt != first_stmt
6383 && gap_count < DR_GROUP_GAP (vinfo_for_stmt (next_stmt)))
6385 gap_count++;
6386 continue;
6389 while (next_stmt)
6391 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
6392 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
6393 copies, and we put the new vector statement in the first available
6394 RELATED_STMT. */
6395 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
6396 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
6397 else
6399 if (!DR_GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
6401 gimple *prev_stmt =
6402 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
6403 gimple *rel_stmt =
6404 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
6405 while (rel_stmt)
6407 prev_stmt = rel_stmt;
6408 rel_stmt =
6409 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
6412 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
6413 new_stmt;
6417 next_stmt = DR_GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
6418 gap_count = 1;
6419 /* If NEXT_STMT accesses the same DR as the previous statement,
6420 put the same TMP_DATA_REF as its vectorized statement; otherwise
6421 get the next data-ref from RESULT_CHAIN. */
6422 if (!next_stmt || !DR_GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
6423 break;
6428 /* Function vect_can_force_dr_alignment_p.
6430 Returns whether the alignment of a DECL can be forced to be aligned
6431 on an ALIGNMENT-bit boundary. */
6433 bool
6434 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
6436 if (!VAR_P (decl))
6437 return false;
6439 if (decl_in_symtab_p (decl)
6440 && !symtab_node::get (decl)->can_increase_alignment_p ())
6441 return false;
6443 if (TREE_STATIC (decl))
6444 return (alignment <= MAX_OFILE_ALIGNMENT);
6445 else
6446 return (alignment <= MAX_STACK_ALIGNMENT);
6450 /* Return whether the data reference DR is supported with respect to its
6451 alignment.
6452 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even if
6453 it is aligned, i.e., check if it is possible to vectorize it with different
6454 alignment. */
6456 enum dr_alignment_support
6457 vect_supportable_dr_alignment (struct data_reference *dr,
6458 bool check_aligned_accesses)
6460 gimple *stmt = vect_dr_stmt (dr);
6461 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6462 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6463 machine_mode mode = TYPE_MODE (vectype);
6464 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6465 struct loop *vect_loop = NULL;
6466 bool nested_in_vect_loop = false;
6468 if (aligned_access_p (dr) && !check_aligned_accesses)
6469 return dr_aligned;
6471 /* For now assume all conditional loads/stores support unaligned
6472 access without any special code. */
6473 if (is_gimple_call (stmt)
6474 && gimple_call_internal_p (stmt)
6475 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
6476 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
6477 return dr_unaligned_supported;
6479 if (loop_vinfo)
6481 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
6482 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
6485 /* Possibly unaligned access. */
6487 /* We can choose between using the implicit realignment scheme (generating
6488 a misaligned_move stmt) and the explicit realignment scheme (generating
6489 aligned loads with a REALIGN_LOAD). There are two variants to the
6490 explicit realignment scheme: optimized, and unoptimized.
6491 We can optimize the realignment only if the step between consecutive
6492 vector loads is equal to the vector size. Since the vector memory
6493 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
6494 is guaranteed that the misalignment amount remains the same throughout the
6495 execution of the vectorized loop. Therefore, we can create the
6496 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
6497 at the loop preheader.
6499 However, in the case of outer-loop vectorization, when vectorizing a
6500 memory access in the inner-loop nested within the LOOP that is now being
6501 vectorized, while it is guaranteed that the misalignment of the
6502 vectorized memory access will remain the same in different outer-loop
6503 iterations, it is *not* guaranteed that it will remain the same throughout
6504 the execution of the inner-loop. This is because the inner-loop advances
6505 with the original scalar step (and not in steps of VS). If the inner-loop
6506 step happens to be a multiple of VS, then the misalignment remains fixed
6507 and we can use the optimized realignment scheme. For example:
6509 for (i=0; i<N; i++)
6510 for (j=0; j<M; j++)
6511 s += a[i+j];
6513 When vectorizing the i-loop in the above example, the step between
6514 consecutive vector loads is 1, and so the misalignment does not remain
6515 fixed across the execution of the inner-loop, and the realignment cannot
6516 be optimized (as illustrated in the following pseudo vectorized loop):
6518 for (i=0; i<N; i+=4)
6519 for (j=0; j<M; j++){
6520 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
6521 // when j is {0,1,2,3,4,5,6,7,...} respectively.
6522 // (assuming that we start from an aligned address).
6525 We therefore have to use the unoptimized realignment scheme:
6527 for (i=0; i<N; i+=4)
6528 for (j=k; j<M; j+=4)
6529 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
6530 // that the misalignment of the initial address is
6531 // 0).
6533 The loop can then be vectorized as follows:
6535 for (k=0; k<4; k++){
6536 rt = get_realignment_token (&vp[k]);
6537 for (i=0; i<N; i+=4){
6538 v1 = vp[i+k];
6539 for (j=k; j<M; j+=4){
6540 v2 = vp[i+j+VS-1];
6541 va = REALIGN_LOAD <v1,v2,rt>;
6542 vs += va;
6543 v1 = v2;
6546 } */
6548 if (DR_IS_READ (dr))
6550 bool is_packed = false;
6551 tree type = (TREE_TYPE (DR_REF (dr)));
6553 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
6554 && (!targetm.vectorize.builtin_mask_for_load
6555 || targetm.vectorize.builtin_mask_for_load ()))
6557 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6559 /* If we are doing SLP then the accesses need not have the
6560 same alignment, instead it depends on the SLP group size. */
6561 if (loop_vinfo
6562 && STMT_SLP_TYPE (stmt_info)
6563 && !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6564 * DR_GROUP_SIZE (vinfo_for_stmt
6565 (DR_GROUP_FIRST_ELEMENT (stmt_info))),
6566 TYPE_VECTOR_SUBPARTS (vectype)))
6568 else if (!loop_vinfo
6569 || (nested_in_vect_loop
6570 && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
6571 GET_MODE_SIZE (TYPE_MODE (vectype)))))
6572 return dr_explicit_realign;
6573 else
6574 return dr_explicit_realign_optimized;
6576 if (!known_alignment_for_access_p (dr))
6577 is_packed = not_size_aligned (DR_REF (dr));
6579 if (targetm.vectorize.support_vector_misalignment
6580 (mode, type, DR_MISALIGNMENT (dr), is_packed))
6581 /* Can't software pipeline the loads, but can at least do them. */
6582 return dr_unaligned_supported;
6584 else
6586 bool is_packed = false;
6587 tree type = (TREE_TYPE (DR_REF (dr)));
6589 if (!known_alignment_for_access_p (dr))
6590 is_packed = not_size_aligned (DR_REF (dr));
6592 if (targetm.vectorize.support_vector_misalignment
6593 (mode, type, DR_MISALIGNMENT (dr), is_packed))
6594 return dr_unaligned_supported;
6597 /* Unsupported. */
6598 return dr_unaligned_unsupported;
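#if 0
/* Editorial sketch (not part of the vectorizer, kept under "#if 0"):
   the misalignment pattern from the comment above.  With an aligned
   base and VS == 4 elements per vector, the misalignment of &vp[i + j]
   in the inner loop is (i + j) % VS, so it cycles 0,1,2,3,... as J
   advances and the realignment token cannot be hoisted to the loop
   preheader; only when the inner-loop step is a multiple of VS does it
   stay fixed.  */
static unsigned int
model_inner_loop_misalignment (unsigned int i, unsigned int j)
{
  const unsigned int vs = 4;	/* Assumed vector size in elements.  */
  return (i + j) % vs;
}
#endif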