LWG 3035. std::allocator's constructors should be constexpr
[official-gcc.git] / gcc / tree-vect-data-refs.c
blob3eb67c93dff07e18f6ea98805f5a06920385c960
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2018 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "memmodel.h"
32 #include "tm_p.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "cgraph.h"
36 #include "dumpfile.h"
37 #include "alias.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "tree-eh.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop.h"
47 #include "cfgloop.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
50 #include "expr.h"
51 #include "builtins.h"
52 #include "params.h"
53 #include "tree-cfg.h"
54 #include "tree-hash-traits.h"
55 #include "vec-perm-indices.h"
56 #include "internal-fn.h"
58 /* Return true if load- or store-lanes optab OPTAB is implemented for
59 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
61 static bool
62 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
63 tree vectype, unsigned HOST_WIDE_INT count)
65 machine_mode mode, array_mode;
66 bool limit_p;
68 mode = TYPE_MODE (vectype);
69 if (!targetm.array_mode (mode, count).exists (&array_mode))
71 poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
72 limit_p = !targetm.array_mode_supported_p (mode, count);
73 if (!int_mode_for_size (bits, limit_p).exists (&array_mode))
75 if (dump_enabled_p ())
76 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
77 "no array mode for %s["
78 HOST_WIDE_INT_PRINT_DEC "]\n",
79 GET_MODE_NAME (mode), count);
80 return false;
84 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
86 if (dump_enabled_p ())
87 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
88 "cannot use %s<%s><%s>\n", name,
89 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
90 return false;
93 if (dump_enabled_p ())
94 dump_printf_loc (MSG_NOTE, vect_location,
95 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
96 GET_MODE_NAME (mode));
98 return true;
102 /* Return the smallest scalar part of STMT.
103 This is used to determine the vectype of the stmt. We generally set the
104 vectype according to the type of the result (lhs). For stmts whose
105 result-type is different than the type of the arguments (e.g., demotion,
106 promotion), vectype will be reset appropriately (later). Note that we have
107 to visit the smallest datatype in this function, because that determines the
108 VF. If the smallest datatype in the loop is present only as the rhs of a
109 promotion operation - we'd miss it.
110 Such a case, where a variable of this datatype does not appear in the lhs
111 anywhere in the loop, can only occur if it's an invariant: e.g.:
112 'int_x = (int) short_inv', which we'd expect to have been optimized away by
113 invariant motion. However, we cannot rely on invariant motion to always
114 take invariants out of the loop, and so in the case of promotion we also
115 have to check the rhs.
116 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
117 types. */
119 tree
120 vect_get_smallest_scalar_type (gimple *stmt, HOST_WIDE_INT *lhs_size_unit,
121 HOST_WIDE_INT *rhs_size_unit)
123 tree scalar_type = gimple_expr_type (stmt);
124 HOST_WIDE_INT lhs, rhs;
126 /* During the analysis phase, this function is called on arbitrary
127 statements that might not have scalar results. */
128 if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
129 return scalar_type;
131 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
133 if (is_gimple_assign (stmt)
134 && (gimple_assign_cast_p (stmt)
135 || gimple_assign_rhs_code (stmt) == DOT_PROD_EXPR
136 || gimple_assign_rhs_code (stmt) == WIDEN_SUM_EXPR
137 || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
138 || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
139 || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
141 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
143 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
144 if (rhs < lhs)
145 scalar_type = rhs_type;
148 *lhs_size_unit = lhs;
149 *rhs_size_unit = rhs;
150 return scalar_type;
154 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
155 tested at run-time. Return TRUE if DDR was successfully inserted.
156 Return false if versioning is not supported. */
158 static bool
159 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
161 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
163 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
164 return false;
166 if (!runtime_alias_check_p (ddr, loop,
167 optimize_loop_nest_for_speed_p (loop)))
168 return false;
170 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
171 return true;
174 /* Record that loop LOOP_VINFO needs to check that VALUE is nonzero. */
176 static void
177 vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value)
179 vec<tree> checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo);
180 for (unsigned int i = 0; i < checks.length(); ++i)
181 if (checks[i] == value)
182 return;
184 if (dump_enabled_p ())
186 dump_printf_loc (MSG_NOTE, vect_location, "need run-time check that ");
187 dump_generic_expr (MSG_NOTE, TDF_SLIM, value);
188 dump_printf (MSG_NOTE, " is nonzero\n");
190 LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value);
193 /* Return true if we know that the order of vectorized STMT_A and
194 vectorized STMT_B will be the same as the order of STMT_A and STMT_B.
195 At least one of the statements is a write. */
197 static bool
198 vect_preserves_scalar_order_p (gimple *stmt_a, gimple *stmt_b)
200 stmt_vec_info stmtinfo_a = vinfo_for_stmt (stmt_a);
201 stmt_vec_info stmtinfo_b = vinfo_for_stmt (stmt_b);
203 /* Single statements are always kept in their original order. */
204 if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
205 && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
206 return true;
208 /* STMT_A and STMT_B belong to overlapping groups. All loads in a
209 group are emitted at the position of the first scalar load and all
210 stores in a group are emitted at the position of the last scalar store.
211 Thus writes will happen no earlier than their current position
212 (but could happen later) while reads will happen no later than their
213 current position (but could happen earlier). Reordering is therefore
214 only possible if the first access is a write. */
215 if (is_pattern_stmt_p (stmtinfo_a))
216 stmt_a = STMT_VINFO_RELATED_STMT (stmtinfo_a);
217 if (is_pattern_stmt_p (stmtinfo_b))
218 stmt_b = STMT_VINFO_RELATED_STMT (stmtinfo_b);
219 gimple *earlier_stmt = get_earlier_stmt (stmt_a, stmt_b);
220 return !DR_IS_WRITE (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt)));
223 /* A subroutine of vect_analyze_data_ref_dependence. Handle
224 DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
225 distances. These distances are conservatively correct but they don't
226 reflect a guaranteed dependence.
228 Return true if this function does all the work necessary to avoid
229 an alias or false if the caller should use the dependence distances
230 to limit the vectorization factor in the usual way. LOOP_DEPTH is
231 the depth of the loop described by LOOP_VINFO and the other arguments
232 are as for vect_analyze_data_ref_dependence. */
234 static bool
235 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
236 loop_vec_info loop_vinfo,
237 int loop_depth, unsigned int *max_vf)
239 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
240 lambda_vector dist_v;
241 unsigned int i;
242 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
244 int dist = dist_v[loop_depth];
245 if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
247 /* If the user asserted safelen >= DIST consecutive iterations
248 can be executed concurrently, assume independence.
250 ??? An alternative would be to add the alias check even
251 in this case, and vectorize the fallback loop with the
252 maximum VF set to safelen. However, if the user has
253 explicitly given a length, it's less likely that that
254 would be a win. */
255 if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
257 if ((unsigned int) loop->safelen < *max_vf)
258 *max_vf = loop->safelen;
259 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
260 continue;
263 /* For dependence distances of 2 or more, we have the option
264 of limiting VF or checking for an alias at runtime.
265 Prefer to check at runtime if we can, to avoid limiting
266 the VF unnecessarily when the bases are in fact independent.
268 Note that the alias checks will be removed if the VF ends up
269 being small enough. */
270 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
273 return true;
277 /* Function vect_analyze_data_ref_dependence.
279 Return TRUE if there (might) exist a dependence between a memory-reference
280 DRA and a memory-reference DRB. When versioning for alias may check a
281 dependence at run-time, return FALSE. Adjust *MAX_VF according to
282 the data dependence. */
284 static bool
285 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
286 loop_vec_info loop_vinfo,
287 unsigned int *max_vf)
289 unsigned int i;
290 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
291 struct data_reference *dra = DDR_A (ddr);
292 struct data_reference *drb = DDR_B (ddr);
293 stmt_vec_info stmtinfo_a = vinfo_for_stmt (vect_dr_stmt (dra));
294 stmt_vec_info stmtinfo_b = vinfo_for_stmt (vect_dr_stmt (drb));
295 lambda_vector dist_v;
296 unsigned int loop_depth;
298 /* In loop analysis all data references should be vectorizable. */
299 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
300 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
301 gcc_unreachable ();
303 /* Independent data accesses. */
304 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
305 return false;
307 if (dra == drb
308 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
309 return false;
311 /* We do not have to consider dependences between accesses that belong
312 to the same group, unless the stride could be smaller than the
313 group size. */
314 if (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
315 && (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
316 == DR_GROUP_FIRST_ELEMENT (stmtinfo_b))
317 && !STMT_VINFO_STRIDED_P (stmtinfo_a))
318 return false;
320 /* Even if we have an anti-dependence then, as the vectorized loop covers at
321 least two scalar iterations, there is always also a true dependence.
322 As the vectorizer does not re-order loads and stores we can ignore
323 the anti-dependence if TBAA can disambiguate both DRs similar to the
324 case with known negative distance anti-dependences (positive
325 distance anti-dependences would violate TBAA constraints). */
326 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
327 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
328 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
329 get_alias_set (DR_REF (drb))))
330 return false;
332 /* Unknown data dependence. */
333 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
335 /* If user asserted safelen consecutive iterations can be
336 executed concurrently, assume independence. */
337 if (loop->safelen >= 2)
339 if ((unsigned int) loop->safelen < *max_vf)
340 *max_vf = loop->safelen;
341 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
342 return false;
345 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
346 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
348 if (dump_enabled_p ())
350 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
351 "versioning for alias not supported for: "
352 "can't determine dependence between ");
353 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
354 DR_REF (dra));
355 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
356 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
357 DR_REF (drb));
358 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
360 return true;
363 if (dump_enabled_p ())
365 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
366 "versioning for alias required: "
367 "can't determine dependence between ");
368 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
369 DR_REF (dra));
370 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
371 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
372 DR_REF (drb));
373 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
376 /* Add to list of ddrs that need to be tested at run-time. */
377 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
380 /* Known data dependence. */
381 if (DDR_NUM_DIST_VECTS (ddr) == 0)
383 /* If user asserted safelen consecutive iterations can be
384 executed concurrently, assume independence. */
385 if (loop->safelen >= 2)
387 if ((unsigned int) loop->safelen < *max_vf)
388 *max_vf = loop->safelen;
389 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
390 return false;
393 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
394 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
396 if (dump_enabled_p ())
398 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
399 "versioning for alias not supported for: "
400 "bad dist vector for ");
401 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
402 DR_REF (dra));
403 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
404 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
405 DR_REF (drb));
406 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
408 return true;
411 if (dump_enabled_p ())
413 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
414 "versioning for alias required: "
415 "bad dist vector for ");
416 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
417 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
418 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
419 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
421 /* Add to list of ddrs that need to be tested at run-time. */
422 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
425 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
427 if (DDR_COULD_BE_INDEPENDENT_P (ddr)
428 && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
429 loop_depth, max_vf))
430 return false;
432 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
434 int dist = dist_v[loop_depth];
436 if (dump_enabled_p ())
437 dump_printf_loc (MSG_NOTE, vect_location,
438 "dependence distance = %d.\n", dist);
440 if (dist == 0)
442 if (dump_enabled_p ())
444 dump_printf_loc (MSG_NOTE, vect_location,
445 "dependence distance == 0 between ");
446 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
447 dump_printf (MSG_NOTE, " and ");
448 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
449 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
452 /* When we perform grouped accesses and perform implicit CSE
453 by detecting equal accesses and doing disambiguation with
454 runtime alias tests like for
455 .. = a[i];
456 .. = a[i+1];
457 a[i] = ..;
458 a[i+1] = ..;
459 *p = ..;
460 .. = a[i];
461 .. = a[i+1];
462 where we will end up loading { a[i], a[i+1] } once, make
463 sure that inserting group loads before the first load and
464 stores after the last store will do the right thing.
465 Similar for groups like
466 a[i] = ...;
467 ... = a[i];
468 a[i+1] = ...;
469 where loads from the group interleave with the store. */
470 if (!vect_preserves_scalar_order_p (vect_dr_stmt(dra),
471 vect_dr_stmt (drb)))
473 if (dump_enabled_p ())
474 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
475 "READ_WRITE dependence in interleaving.\n");
476 return true;
479 if (loop->safelen < 2)
481 tree indicator = dr_zero_step_indicator (dra);
482 if (TREE_CODE (indicator) != INTEGER_CST)
483 vect_check_nonzero_value (loop_vinfo, indicator);
484 else if (integer_zerop (indicator))
486 if (dump_enabled_p ())
487 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
488 "access also has a zero step\n");
489 return true;
492 continue;
495 if (dist > 0 && DDR_REVERSED_P (ddr))
497 /* If DDR_REVERSED_P the order of the data-refs in DDR was
498 reversed (to make distance vector positive), and the actual
499 distance is negative. */
500 if (dump_enabled_p ())
501 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
502 "dependence distance negative.\n");
503 /* Record a negative dependence distance to later limit the
504 amount of stmt copying / unrolling we can perform.
505 Only need to handle read-after-write dependence. */
506 if (DR_IS_READ (drb)
507 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
508 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
509 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
510 continue;
513 unsigned int abs_dist = abs (dist);
514 if (abs_dist >= 2 && abs_dist < *max_vf)
516 /* The dependence distance requires reduction of the maximal
517 vectorization factor. */
518 *max_vf = abs (dist);
519 if (dump_enabled_p ())
520 dump_printf_loc (MSG_NOTE, vect_location,
521 "adjusting maximal vectorization factor to %i\n",
522 *max_vf);
525 if (abs_dist >= *max_vf)
527 /* Dependence distance does not create dependence, as far as
528 vectorization is concerned, in this case. */
529 if (dump_enabled_p ())
530 dump_printf_loc (MSG_NOTE, vect_location,
531 "dependence distance >= VF.\n");
532 continue;
535 if (dump_enabled_p ())
537 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
538 "not vectorized, possible dependence "
539 "between data-refs ");
540 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
541 dump_printf (MSG_NOTE, " and ");
542 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
543 dump_printf (MSG_NOTE, "\n");
546 return true;
549 return false;
552 /* Function vect_analyze_data_ref_dependences.
554 Examine all the data references in the loop, and make sure there do not
555 exist any data dependences between them. Set *MAX_VF according to
556 the maximum vectorization factor the data dependences allow. */
558 bool
559 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
560 unsigned int *max_vf)
562 unsigned int i;
563 struct data_dependence_relation *ddr;
565 if (dump_enabled_p ())
566 dump_printf_loc (MSG_NOTE, vect_location,
567 "=== vect_analyze_data_ref_dependences ===\n");
569 LOOP_VINFO_DDRS (loop_vinfo)
570 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
571 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
572 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
573 /* We need read-read dependences to compute STMT_VINFO_SAME_ALIGN_REFS. */
574 if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
575 &LOOP_VINFO_DDRS (loop_vinfo),
576 LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
577 return false;
579 /* For epilogues we either have no aliases or alias versioning
580 was applied to original loop. Therefore we may just get max_vf
581 using VF of original loop. */
582 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
583 *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
584 else
585 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
586 if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
587 return false;
589 return true;
593 /* Function vect_slp_analyze_data_ref_dependence.
595 Return TRUE if there (might) exist a dependence between a memory-reference
596 DRA and a memory-reference DRB. When versioning for alias may check a
597 dependence at run-time, return FALSE. Adjust *MAX_VF according to
598 the data dependence. */
600 static bool
601 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
603 struct data_reference *dra = DDR_A (ddr);
604 struct data_reference *drb = DDR_B (ddr);
606 /* We need to check dependences of statements marked as unvectorizable
607 as well, they still can prohibit vectorization. */
609 /* Independent data accesses. */
610 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
611 return false;
613 if (dra == drb)
614 return false;
616 /* Read-read is OK. */
617 if (DR_IS_READ (dra) && DR_IS_READ (drb))
618 return false;
620 /* If dra and drb are part of the same interleaving chain consider
621 them independent. */
622 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (vect_dr_stmt (dra)))
623 && (DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (vect_dr_stmt (dra)))
624 == DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (vect_dr_stmt (drb)))))
625 return false;
627 /* Unknown data dependence. */
628 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
630 if (dump_enabled_p ())
632 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
633 "can't determine dependence between ");
634 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
635 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
636 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
637 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
640 else if (dump_enabled_p ())
642 dump_printf_loc (MSG_NOTE, vect_location,
643 "determined dependence between ");
644 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
645 dump_printf (MSG_NOTE, " and ");
646 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
647 dump_printf (MSG_NOTE, "\n");
650 return true;
654 /* Analyze dependences involved in the transform of SLP NODE. STORES
655 contain the vector of scalar stores of this instance if we are
656 disambiguating the loads. */
658 static bool
659 vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
660 vec<gimple *> stores, gimple *last_store)
662 /* This walks over all stmts involved in the SLP load/store done
663 in NODE verifying we can sink them up to the last stmt in the
664 group. */
665 gimple *last_access = vect_find_last_scalar_stmt_in_slp (node);
666 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
668 gimple *access = SLP_TREE_SCALAR_STMTS (node)[k];
669 if (access == last_access)
670 continue;
671 data_reference *dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (access));
672 ao_ref ref;
673 bool ref_initialized_p = false;
674 for (gimple_stmt_iterator gsi = gsi_for_stmt (access);
675 gsi_stmt (gsi) != last_access; gsi_next (&gsi))
677 gimple *stmt = gsi_stmt (gsi);
678 if (! gimple_vuse (stmt)
679 || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
680 continue;
682 /* If we couldn't record a (single) data reference for this
683 stmt we have to resort to the alias oracle. */
684 data_reference *dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
685 if (!dr_b)
687 /* We are moving a store or sinking a load - this means
688 we cannot use TBAA for disambiguation. */
689 if (!ref_initialized_p)
690 ao_ref_init (&ref, DR_REF (dr_a));
691 if (stmt_may_clobber_ref_p_1 (stmt, &ref, false)
692 || ref_maybe_used_by_stmt_p (stmt, &ref, false))
693 return false;
694 continue;
697 bool dependent = false;
698 /* If we run into a store of this same instance (we've just
699 marked those) then delay dependence checking until we run
700 into the last store because this is where it will have
701 been sunk to (and we verify if we can do that as well). */
702 if (gimple_visited_p (stmt))
704 if (stmt != last_store)
705 continue;
706 unsigned i;
707 gimple *store;
708 FOR_EACH_VEC_ELT (stores, i, store)
710 data_reference *store_dr
711 = STMT_VINFO_DATA_REF (vinfo_for_stmt (store));
712 ddr_p ddr = initialize_data_dependence_relation
713 (dr_a, store_dr, vNULL);
714 dependent = vect_slp_analyze_data_ref_dependence (ddr);
715 free_dependence_relation (ddr);
716 if (dependent)
717 break;
720 else
722 ddr_p ddr = initialize_data_dependence_relation (dr_a,
723 dr_b, vNULL);
724 dependent = vect_slp_analyze_data_ref_dependence (ddr);
725 free_dependence_relation (ddr);
727 if (dependent)
728 return false;
731 return true;
735 /* Function vect_analyze_data_ref_dependences.
737 Examine all the data references in the basic-block, and make sure there
738 do not exist any data dependences between them. Set *MAX_VF according to
739 the maximum vectorization factor the data dependences allow. */
741 bool
742 vect_slp_analyze_instance_dependence (slp_instance instance)
744 if (dump_enabled_p ())
745 dump_printf_loc (MSG_NOTE, vect_location,
746 "=== vect_slp_analyze_instance_dependence ===\n");
748 /* The stores of this instance are at the root of the SLP tree. */
749 slp_tree store = SLP_INSTANCE_TREE (instance);
750 if (! STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (store)[0])))
751 store = NULL;
753 /* Verify we can sink stores to the vectorized stmt insert location. */
754 gimple *last_store = NULL;
755 if (store)
757 if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
758 return false;
760 /* Mark stores in this instance and remember the last one. */
761 last_store = vect_find_last_scalar_stmt_in_slp (store);
762 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
763 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], true);
766 bool res = true;
768 /* Verify we can sink loads to the vectorized stmt insert location,
769 special-casing stores of this instance. */
770 slp_tree load;
771 unsigned int i;
772 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
773 if (! vect_slp_analyze_node_dependences (instance, load,
774 store
775 ? SLP_TREE_SCALAR_STMTS (store)
776 : vNULL, last_store))
778 res = false;
779 break;
782 /* Unset the visited flag. */
783 if (store)
784 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
785 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], false);
787 return res;
790 /* Record in VINFO the base alignment guarantee given by DRB. STMT is
791 the statement that contains DRB, which is useful for recording in the
792 dump file. */
794 static void
795 vect_record_base_alignment (vec_info *vinfo, gimple *stmt,
796 innermost_loop_behavior *drb)
798 bool existed;
799 innermost_loop_behavior *&entry
800 = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
801 if (!existed || entry->base_alignment < drb->base_alignment)
803 entry = drb;
804 if (dump_enabled_p ())
806 dump_printf_loc (MSG_NOTE, vect_location,
807 "recording new base alignment for ");
808 dump_generic_expr (MSG_NOTE, TDF_SLIM, drb->base_address);
809 dump_printf (MSG_NOTE, "\n");
810 dump_printf_loc (MSG_NOTE, vect_location,
811 " alignment: %d\n", drb->base_alignment);
812 dump_printf_loc (MSG_NOTE, vect_location,
813 " misalignment: %d\n", drb->base_misalignment);
814 dump_printf_loc (MSG_NOTE, vect_location,
815 " based on: ");
816 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
821 /* If the region we're going to vectorize is reached, all unconditional
822 data references occur at least once. We can therefore pool the base
823 alignment guarantees from each unconditional reference. Do this by
824 going through all the data references in VINFO and checking whether
825 the containing statement makes the reference unconditionally. If so,
826 record the alignment of the base address in VINFO so that it can be
827 used for all other references with the same base. */
829 void
830 vect_record_base_alignments (vec_info *vinfo)
832 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
833 struct loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
834 data_reference *dr;
835 unsigned int i;
836 FOR_EACH_VEC_ELT (vinfo->datarefs, i, dr)
838 gimple *stmt = vect_dr_stmt (dr);
839 if (!DR_IS_CONDITIONAL_IN_STMT (dr)
840 && STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (stmt)))
842 vect_record_base_alignment (vinfo, stmt, &DR_INNERMOST (dr));
844 /* If DR is nested in the loop that is being vectorized, we can also
845 record the alignment of the base wrt the outer loop. */
846 if (loop && nested_in_vect_loop_p (loop, stmt))
848 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
849 vect_record_base_alignment
850 (vinfo, stmt, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
856 /* Return the target alignment for the vectorized form of DR. */
858 static unsigned int
859 vect_calculate_target_alignment (struct data_reference *dr)
861 gimple *stmt = vect_dr_stmt (dr);
862 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
863 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
864 return targetm.vectorize.preferred_vector_alignment (vectype);
867 /* Function vect_compute_data_ref_alignment
869 Compute the misalignment of the data reference DR.
871 Output:
872 1. If during the misalignment computation it is found that the data reference
873 cannot be vectorized then false is returned.
874 2. DR_MISALIGNMENT (DR) is defined.
876 FOR NOW: No analysis is actually performed. Misalignment is calculated
877 only for trivial cases. TODO. */
879 static bool
880 vect_compute_data_ref_alignment (struct data_reference *dr)
882 gimple *stmt = vect_dr_stmt (dr);
883 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
884 vec_base_alignments *base_alignments = &stmt_info->vinfo->base_alignments;
885 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
886 struct loop *loop = NULL;
887 tree ref = DR_REF (dr);
888 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
890 if (dump_enabled_p ())
891 dump_printf_loc (MSG_NOTE, vect_location,
892 "vect_compute_data_ref_alignment:\n");
894 if (loop_vinfo)
895 loop = LOOP_VINFO_LOOP (loop_vinfo);
897 /* Initialize misalignment to unknown. */
898 SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
900 innermost_loop_behavior *drb = vect_dr_behavior (dr);
901 bool step_preserves_misalignment_p;
903 unsigned HOST_WIDE_INT vector_alignment
904 = vect_calculate_target_alignment (dr) / BITS_PER_UNIT;
905 DR_TARGET_ALIGNMENT (dr) = vector_alignment;
907 /* No step for BB vectorization. */
908 if (!loop)
910 gcc_assert (integer_zerop (drb->step));
911 step_preserves_misalignment_p = true;
914 /* In case the dataref is in an inner-loop of the loop that is being
915 vectorized (LOOP), we use the base and misalignment information
916 relative to the outer-loop (LOOP). This is ok only if the misalignment
917 stays the same throughout the execution of the inner-loop, which is why
918 we have to check that the stride of the dataref in the inner-loop evenly
919 divides by the vector alignment. */
920 else if (nested_in_vect_loop_p (loop, stmt))
922 step_preserves_misalignment_p
923 = (DR_STEP_ALIGNMENT (dr) % vector_alignment) == 0;
925 if (dump_enabled_p ())
927 if (step_preserves_misalignment_p)
928 dump_printf_loc (MSG_NOTE, vect_location,
929 "inner step divides the vector alignment.\n");
930 else
931 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
932 "inner step doesn't divide the vector"
933 " alignment.\n");
937 /* Similarly we can only use base and misalignment information relative to
938 an innermost loop if the misalignment stays the same throughout the
939 execution of the loop. As above, this is the case if the stride of
940 the dataref evenly divides by the alignment. */
941 else
943 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
944 step_preserves_misalignment_p
945 = multiple_p (DR_STEP_ALIGNMENT (dr) * vf, vector_alignment);
947 if (!step_preserves_misalignment_p && dump_enabled_p ())
948 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
949 "step doesn't divide the vector alignment.\n");
952 unsigned int base_alignment = drb->base_alignment;
953 unsigned int base_misalignment = drb->base_misalignment;
955 /* Calculate the maximum of the pooled base address alignment and the
956 alignment that we can compute for DR itself. */
957 innermost_loop_behavior **entry = base_alignments->get (drb->base_address);
958 if (entry && base_alignment < (*entry)->base_alignment)
960 base_alignment = (*entry)->base_alignment;
961 base_misalignment = (*entry)->base_misalignment;
964 if (drb->offset_alignment < vector_alignment
965 || !step_preserves_misalignment_p
966 /* We need to know whether the step wrt the vectorized loop is
967 negative when computing the starting misalignment below. */
968 || TREE_CODE (drb->step) != INTEGER_CST)
970 if (dump_enabled_p ())
972 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
973 "Unknown alignment for access: ");
974 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
975 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
977 return true;
980 if (base_alignment < vector_alignment)
982 unsigned int max_alignment;
983 tree base = get_base_for_alignment (drb->base_address, &max_alignment);
984 if (max_alignment < vector_alignment
985 || !vect_can_force_dr_alignment_p (base,
986 vector_alignment * BITS_PER_UNIT))
988 if (dump_enabled_p ())
990 dump_printf_loc (MSG_NOTE, vect_location,
991 "can't force alignment of ref: ");
992 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
993 dump_printf (MSG_NOTE, "\n");
995 return true;
998 /* Force the alignment of the decl.
999 NOTE: This is the only change to the code we make during
1000 the analysis phase, before deciding to vectorize the loop. */
1001 if (dump_enabled_p ())
1003 dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
1004 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
1005 dump_printf (MSG_NOTE, "\n");
1008 DR_VECT_AUX (dr)->base_decl = base;
1009 DR_VECT_AUX (dr)->base_misaligned = true;
1010 base_misalignment = 0;
1012 poly_int64 misalignment
1013 = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();
1015 /* If this is a backward running DR then first access in the larger
1016 vectype actually is N-1 elements before the address in the DR.
1017 Adjust misalign accordingly. */
1018 if (tree_int_cst_sgn (drb->step) < 0)
1019 /* PLUS because STEP is negative. */
1020 misalignment += ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1021 * TREE_INT_CST_LOW (drb->step));
1023 unsigned int const_misalignment;
1024 if (!known_misalignment (misalignment, vector_alignment,
1025 &const_misalignment))
1027 if (dump_enabled_p ())
1029 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1030 "Non-constant misalignment for access: ");
1031 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
1032 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1034 return true;
1037 SET_DR_MISALIGNMENT (dr, const_misalignment);
1039 if (dump_enabled_p ())
1041 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1042 "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
1043 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
1044 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1047 return true;
1050 /* Function vect_update_misalignment_for_peel.
1051 Sets DR's misalignment
1052 - to 0 if it has the same alignment as DR_PEEL,
1053 - to the misalignment computed using NPEEL if DR's salignment is known,
1054 - to -1 (unknown) otherwise.
1056 DR - the data reference whose misalignment is to be adjusted.
1057 DR_PEEL - the data reference whose misalignment is being made
1058 zero in the vector loop by the peel.
1059 NPEEL - the number of iterations in the peel loop if the misalignment
1060 of DR_PEEL is known at compile time. */
1062 static void
1063 vect_update_misalignment_for_peel (struct data_reference *dr,
1064 struct data_reference *dr_peel, int npeel)
1066 unsigned int i;
1067 vec<dr_p> same_aligned_drs;
1068 struct data_reference *current_dr;
1069 int dr_size = vect_get_scalar_dr_size (dr);
1070 int dr_peel_size = vect_get_scalar_dr_size (dr_peel);
1071 stmt_vec_info stmt_info = vinfo_for_stmt (vect_dr_stmt (dr));
1072 stmt_vec_info peel_stmt_info = vinfo_for_stmt (vect_dr_stmt (dr_peel));
1074 /* For interleaved data accesses the step in the loop must be multiplied by
1075 the size of the interleaving group. */
1076 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1077 dr_size *= DR_GROUP_SIZE (vinfo_for_stmt (DR_GROUP_FIRST_ELEMENT (stmt_info)));
1078 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
1079 dr_peel_size *= DR_GROUP_SIZE (peel_stmt_info);
1081 /* It can be assumed that the data refs with the same alignment as dr_peel
1082 are aligned in the vector loop. */
1083 same_aligned_drs
1084 = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (vect_dr_stmt (dr_peel)));
1085 FOR_EACH_VEC_ELT (same_aligned_drs, i, current_dr)
1087 if (current_dr != dr)
1088 continue;
1089 gcc_assert (!known_alignment_for_access_p (dr)
1090 || !known_alignment_for_access_p (dr_peel)
1091 || (DR_MISALIGNMENT (dr) / dr_size
1092 == DR_MISALIGNMENT (dr_peel) / dr_peel_size));
1093 SET_DR_MISALIGNMENT (dr, 0);
1094 return;
1097 if (known_alignment_for_access_p (dr)
1098 && known_alignment_for_access_p (dr_peel))
1100 bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
1101 int misal = DR_MISALIGNMENT (dr);
1102 misal += negative ? -npeel * dr_size : npeel * dr_size;
1103 misal &= DR_TARGET_ALIGNMENT (dr) - 1;
1104 SET_DR_MISALIGNMENT (dr, misal);
1105 return;
1108 if (dump_enabled_p ())
1109 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1110 "to unknown (-1).\n");
1111 SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
1115 /* Function verify_data_ref_alignment
1117 Return TRUE if DR can be handled with respect to alignment. */
1119 static bool
1120 verify_data_ref_alignment (data_reference_p dr)
1122 enum dr_alignment_support supportable_dr_alignment
1123 = vect_supportable_dr_alignment (dr, false);
1124 if (!supportable_dr_alignment)
1126 if (dump_enabled_p ())
1128 if (DR_IS_READ (dr))
1129 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1130 "not vectorized: unsupported unaligned load.");
1131 else
1132 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1133 "not vectorized: unsupported unaligned "
1134 "store.");
1136 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
1137 DR_REF (dr));
1138 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1140 return false;
1143 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
1144 dump_printf_loc (MSG_NOTE, vect_location,
1145 "Vectorizing an unaligned access.\n");
1147 return true;
1150 /* Function vect_verify_datarefs_alignment
1152 Return TRUE if all data references in the loop can be
1153 handled with respect to alignment. */
1155 bool
1156 vect_verify_datarefs_alignment (loop_vec_info vinfo)
1158 vec<data_reference_p> datarefs = vinfo->datarefs;
1159 struct data_reference *dr;
1160 unsigned int i;
1162 FOR_EACH_VEC_ELT (datarefs, i, dr)
1164 gimple *stmt = vect_dr_stmt (dr);
1165 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1167 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1168 continue;
1170 /* For interleaving, only the alignment of the first access matters. */
1171 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1172 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1173 continue;
1175 /* Strided accesses perform only component accesses, alignment is
1176 irrelevant for them. */
1177 if (STMT_VINFO_STRIDED_P (stmt_info)
1178 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1179 continue;
1181 if (! verify_data_ref_alignment (dr))
1182 return false;
1185 return true;
1188 /* Given an memory reference EXP return whether its alignment is less
1189 than its size. */
1191 static bool
1192 not_size_aligned (tree exp)
1194 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1195 return true;
1197 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1198 > get_object_alignment (exp));
1201 /* Function vector_alignment_reachable_p
1203 Return true if vector alignment for DR is reachable by peeling
1204 a few loop iterations. Return false otherwise. */
1206 static bool
1207 vector_alignment_reachable_p (struct data_reference *dr)
1209 gimple *stmt = vect_dr_stmt (dr);
1210 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1211 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1213 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1215 /* For interleaved access we peel only if number of iterations in
1216 the prolog loop ({VF - misalignment}), is a multiple of the
1217 number of the interleaved accesses. */
1218 int elem_size, mis_in_elements;
1220 /* FORNOW: handle only known alignment. */
1221 if (!known_alignment_for_access_p (dr))
1222 return false;
1224 poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1225 poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
1226 elem_size = vector_element_size (vector_size, nelements);
1227 mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
1229 if (!multiple_p (nelements - mis_in_elements, DR_GROUP_SIZE (stmt_info)))
1230 return false;
1233 /* If misalignment is known at the compile time then allow peeling
1234 only if natural alignment is reachable through peeling. */
1235 if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
1237 HOST_WIDE_INT elmsize =
1238 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1239 if (dump_enabled_p ())
1241 dump_printf_loc (MSG_NOTE, vect_location,
1242 "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1243 dump_printf (MSG_NOTE,
1244 ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
1246 if (DR_MISALIGNMENT (dr) % elmsize)
1248 if (dump_enabled_p ())
1249 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1250 "data size does not divide the misalignment.\n");
1251 return false;
1255 if (!known_alignment_for_access_p (dr))
1257 tree type = TREE_TYPE (DR_REF (dr));
1258 bool is_packed = not_size_aligned (DR_REF (dr));
1259 if (dump_enabled_p ())
1260 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1261 "Unknown misalignment, %snaturally aligned\n",
1262 is_packed ? "not " : "");
1263 return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1266 return true;
1270 /* Calculate the cost of the memory access represented by DR. */
1272 static void
1273 vect_get_data_access_cost (struct data_reference *dr,
1274 unsigned int *inside_cost,
1275 unsigned int *outside_cost,
1276 stmt_vector_for_cost *body_cost_vec,
1277 stmt_vector_for_cost *prologue_cost_vec)
1279 gimple *stmt = vect_dr_stmt (dr);
1280 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1281 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1282 int ncopies;
1284 if (PURE_SLP_STMT (stmt_info))
1285 ncopies = 1;
1286 else
1287 ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1289 if (DR_IS_READ (dr))
1290 vect_get_load_cost (stmt_info, ncopies, true, inside_cost, outside_cost,
1291 prologue_cost_vec, body_cost_vec, false);
1292 else
1293 vect_get_store_cost (stmt_info, ncopies, inside_cost, body_cost_vec);
1295 if (dump_enabled_p ())
1296 dump_printf_loc (MSG_NOTE, vect_location,
1297 "vect_get_data_access_cost: inside_cost = %d, "
1298 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1302 typedef struct _vect_peel_info
1304 struct data_reference *dr;
1305 int npeel;
1306 unsigned int count;
1307 } *vect_peel_info;
1309 typedef struct _vect_peel_extended_info
1311 struct _vect_peel_info peel_info;
1312 unsigned int inside_cost;
1313 unsigned int outside_cost;
1314 } *vect_peel_extended_info;
1317 /* Peeling hashtable helpers. */
1319 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1321 static inline hashval_t hash (const _vect_peel_info *);
1322 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1325 inline hashval_t
1326 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1328 return (hashval_t) peel_info->npeel;
1331 inline bool
1332 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1334 return (a->npeel == b->npeel);
1338 /* Insert DR into peeling hash table with NPEEL as key. */
1340 static void
1341 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1342 loop_vec_info loop_vinfo, struct data_reference *dr,
1343 int npeel)
1345 struct _vect_peel_info elem, *slot;
1346 _vect_peel_info **new_slot;
1347 bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1349 elem.npeel = npeel;
1350 slot = peeling_htab->find (&elem);
1351 if (slot)
1352 slot->count++;
1353 else
1355 slot = XNEW (struct _vect_peel_info);
1356 slot->npeel = npeel;
1357 slot->dr = dr;
1358 slot->count = 1;
1359 new_slot = peeling_htab->find_slot (slot, INSERT);
1360 *new_slot = slot;
1363 if (!supportable_dr_alignment
1364 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1365 slot->count += VECT_MAX_COST;
1369 /* Traverse peeling hash table to find peeling option that aligns maximum
1370 number of data accesses. */
1373 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1374 _vect_peel_extended_info *max)
1376 vect_peel_info elem = *slot;
1378 if (elem->count > max->peel_info.count
1379 || (elem->count == max->peel_info.count
1380 && max->peel_info.npeel > elem->npeel))
1382 max->peel_info.npeel = elem->npeel;
1383 max->peel_info.count = elem->count;
1384 max->peel_info.dr = elem->dr;
1387 return 1;
1390 /* Get the costs of peeling NPEEL iterations checking data access costs
1391 for all data refs. If UNKNOWN_MISALIGNMENT is true, we assume DR0's
1392 misalignment will be zero after peeling. */
1394 static void
1395 vect_get_peeling_costs_all_drs (vec<data_reference_p> datarefs,
1396 struct data_reference *dr0,
1397 unsigned int *inside_cost,
1398 unsigned int *outside_cost,
1399 stmt_vector_for_cost *body_cost_vec,
1400 stmt_vector_for_cost *prologue_cost_vec,
1401 unsigned int npeel,
1402 bool unknown_misalignment)
1404 unsigned i;
1405 data_reference *dr;
1407 FOR_EACH_VEC_ELT (datarefs, i, dr)
1409 gimple *stmt = vect_dr_stmt (dr);
1410 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1411 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1412 continue;
1414 /* For interleaving, only the alignment of the first access
1415 matters. */
1416 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1417 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1418 continue;
1420 /* Strided accesses perform only component accesses, alignment is
1421 irrelevant for them. */
1422 if (STMT_VINFO_STRIDED_P (stmt_info)
1423 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1424 continue;
1426 int save_misalignment;
1427 save_misalignment = DR_MISALIGNMENT (dr);
1428 if (npeel == 0)
1430 else if (unknown_misalignment && dr == dr0)
1431 SET_DR_MISALIGNMENT (dr, 0);
1432 else
1433 vect_update_misalignment_for_peel (dr, dr0, npeel);
1434 vect_get_data_access_cost (dr, inside_cost, outside_cost,
1435 body_cost_vec, prologue_cost_vec);
1436 SET_DR_MISALIGNMENT (dr, save_misalignment);
1440 /* Traverse peeling hash table and calculate cost for each peeling option.
1441 Find the one with the lowest cost. */
1444 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1445 _vect_peel_extended_info *min)
1447 vect_peel_info elem = *slot;
1448 int dummy;
1449 unsigned int inside_cost = 0, outside_cost = 0;
1450 gimple *stmt = vect_dr_stmt (elem->dr);
1451 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1452 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1453 stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1454 epilogue_cost_vec;
1456 prologue_cost_vec.create (2);
1457 body_cost_vec.create (2);
1458 epilogue_cost_vec.create (2);
1460 vect_get_peeling_costs_all_drs (LOOP_VINFO_DATAREFS (loop_vinfo),
1461 elem->dr, &inside_cost, &outside_cost,
1462 &body_cost_vec, &prologue_cost_vec,
1463 elem->npeel, false);
1465 body_cost_vec.release ();
1467 outside_cost += vect_get_known_peeling_cost
1468 (loop_vinfo, elem->npeel, &dummy,
1469 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1470 &prologue_cost_vec, &epilogue_cost_vec);
1472 /* Prologue and epilogue costs are added to the target model later.
1473 These costs depend only on the scalar iteration cost, the
1474 number of peeling iterations finally chosen, and the number of
1475 misaligned statements. So discard the information found here. */
1476 prologue_cost_vec.release ();
1477 epilogue_cost_vec.release ();
1479 if (inside_cost < min->inside_cost
1480 || (inside_cost == min->inside_cost
1481 && outside_cost < min->outside_cost))
1483 min->inside_cost = inside_cost;
1484 min->outside_cost = outside_cost;
1485 min->peel_info.dr = elem->dr;
1486 min->peel_info.npeel = elem->npeel;
1487 min->peel_info.count = elem->count;
1490 return 1;
1494 /* Choose best peeling option by traversing peeling hash table and either
1495 choosing an option with the lowest cost (if cost model is enabled) or the
1496 option that aligns as many accesses as possible. */
1498 static struct _vect_peel_extended_info
1499 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1500 loop_vec_info loop_vinfo)
1502 struct _vect_peel_extended_info res;
1504 res.peel_info.dr = NULL;
1506 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1508 res.inside_cost = INT_MAX;
1509 res.outside_cost = INT_MAX;
1510 peeling_htab->traverse <_vect_peel_extended_info *,
1511 vect_peeling_hash_get_lowest_cost> (&res);
1513 else
1515 res.peel_info.count = 0;
1516 peeling_htab->traverse <_vect_peel_extended_info *,
1517 vect_peeling_hash_get_most_frequent> (&res);
1518 res.inside_cost = 0;
1519 res.outside_cost = 0;
1522 return res;
1525 /* Return true if the new peeling NPEEL is supported. */
1527 static bool
1528 vect_peeling_supportable (loop_vec_info loop_vinfo, struct data_reference *dr0,
1529 unsigned npeel)
1531 unsigned i;
1532 struct data_reference *dr = NULL;
1533 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1534 gimple *stmt;
1535 stmt_vec_info stmt_info;
1536 enum dr_alignment_support supportable_dr_alignment;
1538 /* Ensure that all data refs can be vectorized after the peel. */
1539 FOR_EACH_VEC_ELT (datarefs, i, dr)
1541 int save_misalignment;
1543 if (dr == dr0)
1544 continue;
1546 stmt = vect_dr_stmt (dr);
1547 stmt_info = vinfo_for_stmt (stmt);
1548 /* For interleaving, only the alignment of the first access
1549 matters. */
1550 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1551 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1552 continue;
1554 /* Strided accesses perform only component accesses, alignment is
1555 irrelevant for them. */
1556 if (STMT_VINFO_STRIDED_P (stmt_info)
1557 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1558 continue;
1560 save_misalignment = DR_MISALIGNMENT (dr);
1561 vect_update_misalignment_for_peel (dr, dr0, npeel);
1562 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1563 SET_DR_MISALIGNMENT (dr, save_misalignment);
1565 if (!supportable_dr_alignment)
1566 return false;
1569 return true;
1572 /* Function vect_enhance_data_refs_alignment
1574 This pass will use loop versioning and loop peeling in order to enhance
1575 the alignment of data references in the loop.
1577 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1578 original loop is to be vectorized. Any other loops that are created by
1579 the transformations performed in this pass - are not supposed to be
1580 vectorized. This restriction will be relaxed.
1582 This pass will require a cost model to guide it whether to apply peeling
1583 or versioning or a combination of the two. For example, the scheme that
1584 intel uses when given a loop with several memory accesses, is as follows:
1585 choose one memory access ('p') which alignment you want to force by doing
1586 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1587 other accesses are not necessarily aligned, or (2) use loop versioning to
1588 generate one loop in which all accesses are aligned, and another loop in
1589 which only 'p' is necessarily aligned.
1591 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1592 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Ximmin Tian, International
1593 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1595 Devising a cost model is the most critical aspect of this work. It will
1596 guide us on which access to peel for, whether to use loop versioning, how
1597 many versions to create, etc. The cost model will probably consist of
1598 generic considerations as well as target specific considerations (on
1599 powerpc for example, misaligned stores are more painful than misaligned
1600 loads).
1602 Here are the general steps involved in alignment enhancements:
1604 -- original loop, before alignment analysis:
1605 for (i=0; i<N; i++){
1606 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1607 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1610 -- After vect_compute_data_refs_alignment:
1611 for (i=0; i<N; i++){
1612 x = q[i]; # DR_MISALIGNMENT(q) = 3
1613 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1616 -- Possibility 1: we do loop versioning:
1617 if (p is aligned) {
1618 for (i=0; i<N; i++){ # loop 1A
1619 x = q[i]; # DR_MISALIGNMENT(q) = 3
1620 p[i] = y; # DR_MISALIGNMENT(p) = 0
1623 else {
1624 for (i=0; i<N; i++){ # loop 1B
1625 x = q[i]; # DR_MISALIGNMENT(q) = 3
1626 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1630 -- Possibility 2: we do loop peeling:
1631 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1632 x = q[i];
1633 p[i] = y;
1635 for (i = 3; i < N; i++){ # loop 2A
1636 x = q[i]; # DR_MISALIGNMENT(q) = 0
1637 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1640 -- Possibility 3: combination of loop peeling and versioning:
1641 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1642 x = q[i];
1643 p[i] = y;
1645 if (p is aligned) {
1646 for (i = 3; i<N; i++){ # loop 3A
1647 x = q[i]; # DR_MISALIGNMENT(q) = 0
1648 p[i] = y; # DR_MISALIGNMENT(p) = 0
1651 else {
1652 for (i = 3; i<N; i++){ # loop 3B
1653 x = q[i]; # DR_MISALIGNMENT(q) = 0
1654 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1658 These loops are later passed to loop_transform to be vectorized. The
1659 vectorizer will use the alignment information to guide the transformation
1660 (whether to generate regular loads/stores, or with special handling for
1661 misalignment). */
1663 bool
1664 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1666 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1667 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1668 enum dr_alignment_support supportable_dr_alignment;
1669 struct data_reference *dr0 = NULL, *first_store = NULL;
1670 struct data_reference *dr;
1671 unsigned int i, j;
1672 bool do_peeling = false;
1673 bool do_versioning = false;
1674 bool stat;
1675 gimple *stmt;
1676 stmt_vec_info stmt_info;
1677 unsigned int npeel = 0;
1678 bool one_misalignment_known = false;
1679 bool one_misalignment_unknown = false;
1680 bool one_dr_unsupportable = false;
1681 struct data_reference *unsupportable_dr = NULL;
1682 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1683 unsigned possible_npeel_number = 1;
1684 tree vectype;
1685 unsigned int mis, same_align_drs_max = 0;
1686 hash_table<peel_info_hasher> peeling_htab (1);
1688 if (dump_enabled_p ())
1689 dump_printf_loc (MSG_NOTE, vect_location,
1690 "=== vect_enhance_data_refs_alignment ===\n");
1692 /* Reset data so we can safely be called multiple times. */
1693 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1694 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1696 /* While cost model enhancements are expected in the future, the high level
1697 view of the code at this time is as follows:
1699 A) If there is a misaligned access then see if peeling to align
1700 this access can make all data references satisfy
1701 vect_supportable_dr_alignment. If so, update data structures
1702 as needed and return true.
1704 B) If peeling wasn't possible and there is a data reference with an
1705 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1706 then see if loop versioning checks can be used to make all data
1707 references satisfy vect_supportable_dr_alignment. If so, update
1708 data structures as needed and return true.
1710 C) If neither peeling nor versioning were successful then return false if
1711 any data reference does not satisfy vect_supportable_dr_alignment.
1713 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1715 Note, Possibility 3 above (which is peeling and versioning together) is not
1716 being done at this time. */
1718 /* (1) Peeling to force alignment. */
1720 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1721 Considerations:
1722 + How many accesses will become aligned due to the peeling
1723 - How many accesses will become unaligned due to the peeling,
1724 and the cost of misaligned accesses.
1725 - The cost of peeling (the extra runtime checks, the increase
1726 in code size). */
1728 FOR_EACH_VEC_ELT (datarefs, i, dr)
1730 stmt = vect_dr_stmt (dr);
1731 stmt_info = vinfo_for_stmt (stmt);
1733 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1734 continue;
1736 /* For interleaving, only the alignment of the first access
1737 matters. */
1738 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1739 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1740 continue;
1742 /* For invariant accesses there is nothing to enhance. */
1743 if (integer_zerop (DR_STEP (dr)))
1744 continue;
1746 /* Strided accesses perform only component accesses, alignment is
1747 irrelevant for them. */
1748 if (STMT_VINFO_STRIDED_P (stmt_info)
1749 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1750 continue;
1752 supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1753 do_peeling = vector_alignment_reachable_p (dr);
1754 if (do_peeling)
1756 if (known_alignment_for_access_p (dr))
1758 unsigned int npeel_tmp = 0;
1759 bool negative = tree_int_cst_compare (DR_STEP (dr),
1760 size_zero_node) < 0;
1762 vectype = STMT_VINFO_VECTYPE (stmt_info);
1763 unsigned int target_align = DR_TARGET_ALIGNMENT (dr);
1764 unsigned int dr_size = vect_get_scalar_dr_size (dr);
1765 mis = (negative ? DR_MISALIGNMENT (dr) : -DR_MISALIGNMENT (dr));
1766 if (DR_MISALIGNMENT (dr) != 0)
1767 npeel_tmp = (mis & (target_align - 1)) / dr_size;
1769 /* For multiple types, it is possible that the bigger type access
1770 will have more than one peeling option. E.g., a loop with two
1771 types: one of size (vector size / 4), and the other one of
1772 size (vector size / 8). Vectorization factor will 8. If both
1773 accesses are misaligned by 3, the first one needs one scalar
1774 iteration to be aligned, and the second one needs 5. But the
1775 first one will be aligned also by peeling 5 scalar
1776 iterations, and in that case both accesses will be aligned.
1777 Hence, except for the immediate peeling amount, we also want
1778 to try to add full vector size, while we don't exceed
1779 vectorization factor.
1780 We do this automatically for cost model, since we calculate
1781 cost for every peeling option. */
1782 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1784 poly_uint64 nscalars = (STMT_SLP_TYPE (stmt_info)
1785 ? vf * DR_GROUP_SIZE (stmt_info) : vf);
1786 possible_npeel_number
1787 = vect_get_num_vectors (nscalars, vectype);
1789 /* NPEEL_TMP is 0 when there is no misalignment, but also
1790 allow peeling NELEMENTS. */
1791 if (DR_MISALIGNMENT (dr) == 0)
1792 possible_npeel_number++;
1795 /* Save info about DR in the hash table. Also include peeling
1796 amounts according to the explanation above. */
1797 for (j = 0; j < possible_npeel_number; j++)
1799 vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1800 dr, npeel_tmp);
1801 npeel_tmp += target_align / dr_size;
1804 one_misalignment_known = true;
1806 else
1808 /* If we don't know any misalignment values, we prefer
1809 peeling for data-ref that has the maximum number of data-refs
1810 with the same alignment, unless the target prefers to align
1811 stores over load. */
1812 unsigned same_align_drs
1813 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1814 if (!dr0
1815 || same_align_drs_max < same_align_drs)
1817 same_align_drs_max = same_align_drs;
1818 dr0 = dr;
1820 /* For data-refs with the same number of related
1821 accesses prefer the one where the misalign
1822 computation will be invariant in the outermost loop. */
1823 else if (same_align_drs_max == same_align_drs)
1825 struct loop *ivloop0, *ivloop;
1826 ivloop0 = outermost_invariant_loop_for_expr
1827 (loop, DR_BASE_ADDRESS (dr0));
1828 ivloop = outermost_invariant_loop_for_expr
1829 (loop, DR_BASE_ADDRESS (dr));
1830 if ((ivloop && !ivloop0)
1831 || (ivloop && ivloop0
1832 && flow_loop_nested_p (ivloop, ivloop0)))
1833 dr0 = dr;
1836 one_misalignment_unknown = true;
1838 /* Check for data refs with unsupportable alignment that
1839 can be peeled. */
1840 if (!supportable_dr_alignment)
1842 one_dr_unsupportable = true;
1843 unsupportable_dr = dr;
1846 if (!first_store && DR_IS_WRITE (dr))
1847 first_store = dr;
1850 else
1852 if (!aligned_access_p (dr))
1854 if (dump_enabled_p ())
1855 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1856 "vector alignment may not be reachable\n");
1857 break;
1862 /* Check if we can possibly peel the loop. */
1863 if (!vect_can_advance_ivs_p (loop_vinfo)
1864 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1865 || loop->inner)
1866 do_peeling = false;
1868 struct _vect_peel_extended_info peel_for_known_alignment;
1869 struct _vect_peel_extended_info peel_for_unknown_alignment;
1870 struct _vect_peel_extended_info best_peel;
1872 peel_for_unknown_alignment.inside_cost = INT_MAX;
1873 peel_for_unknown_alignment.outside_cost = INT_MAX;
1874 peel_for_unknown_alignment.peel_info.count = 0;
1876 if (do_peeling
1877 && one_misalignment_unknown)
1879 /* Check if the target requires to prefer stores over loads, i.e., if
1880 misaligned stores are more expensive than misaligned loads (taking
1881 drs with same alignment into account). */
1882 unsigned int load_inside_cost = 0;
1883 unsigned int load_outside_cost = 0;
1884 unsigned int store_inside_cost = 0;
1885 unsigned int store_outside_cost = 0;
1886 unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;
1888 stmt_vector_for_cost dummy;
1889 dummy.create (2);
1890 vect_get_peeling_costs_all_drs (datarefs, dr0,
1891 &load_inside_cost,
1892 &load_outside_cost,
1893 &dummy, &dummy, estimated_npeels, true);
1894 dummy.release ();
1896 if (first_store)
1898 dummy.create (2);
1899 vect_get_peeling_costs_all_drs (datarefs, first_store,
1900 &store_inside_cost,
1901 &store_outside_cost,
1902 &dummy, &dummy,
1903 estimated_npeels, true);
1904 dummy.release ();
1906 else
1908 store_inside_cost = INT_MAX;
1909 store_outside_cost = INT_MAX;
1912 if (load_inside_cost > store_inside_cost
1913 || (load_inside_cost == store_inside_cost
1914 && load_outside_cost > store_outside_cost))
1916 dr0 = first_store;
1917 peel_for_unknown_alignment.inside_cost = store_inside_cost;
1918 peel_for_unknown_alignment.outside_cost = store_outside_cost;
1920 else
1922 peel_for_unknown_alignment.inside_cost = load_inside_cost;
1923 peel_for_unknown_alignment.outside_cost = load_outside_cost;
1926 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1927 prologue_cost_vec.create (2);
1928 epilogue_cost_vec.create (2);
1930 int dummy2;
1931 peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
1932 (loop_vinfo, estimated_npeels, &dummy2,
1933 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1934 &prologue_cost_vec, &epilogue_cost_vec);
1936 prologue_cost_vec.release ();
1937 epilogue_cost_vec.release ();
1939 peel_for_unknown_alignment.peel_info.count = 1
1940 + STMT_VINFO_SAME_ALIGN_REFS
1941 (vinfo_for_stmt (vect_dr_stmt (dr0))).length ();
1944 peel_for_unknown_alignment.peel_info.npeel = 0;
1945 peel_for_unknown_alignment.peel_info.dr = dr0;
1947 best_peel = peel_for_unknown_alignment;
1949 peel_for_known_alignment.inside_cost = INT_MAX;
1950 peel_for_known_alignment.outside_cost = INT_MAX;
1951 peel_for_known_alignment.peel_info.count = 0;
1952 peel_for_known_alignment.peel_info.dr = NULL;
1954 if (do_peeling && one_misalignment_known)
1956 /* Peeling is possible, but there is no data access that is not supported
1957 unless aligned. So we try to choose the best possible peeling from
1958 the hash table. */
1959 peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
1960 (&peeling_htab, loop_vinfo);
1963 /* Compare costs of peeling for known and unknown alignment. */
1964 if (peel_for_known_alignment.peel_info.dr != NULL
1965 && peel_for_unknown_alignment.inside_cost
1966 >= peel_for_known_alignment.inside_cost)
1968 best_peel = peel_for_known_alignment;
1970 /* If the best peeling for known alignment has NPEEL == 0, perform no
1971 peeling at all except if there is an unsupportable dr that we can
1972 align. */
1973 if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
1974 do_peeling = false;
1977 /* If there is an unsupportable data ref, prefer this over all choices so far
1978 since we'd have to discard a chosen peeling except when it accidentally
1979 aligned the unsupportable data ref. */
1980 if (one_dr_unsupportable)
1981 dr0 = unsupportable_dr;
1982 else if (do_peeling)
1984 /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
1985 TODO: Use nopeel_outside_cost or get rid of it? */
1986 unsigned nopeel_inside_cost = 0;
1987 unsigned nopeel_outside_cost = 0;
1989 stmt_vector_for_cost dummy;
1990 dummy.create (2);
1991 vect_get_peeling_costs_all_drs (datarefs, NULL, &nopeel_inside_cost,
1992 &nopeel_outside_cost, &dummy, &dummy,
1993 0, false);
1994 dummy.release ();
1996 /* Add epilogue costs. As we do not peel for alignment here, no prologue
1997 costs will be recorded. */
1998 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1999 prologue_cost_vec.create (2);
2000 epilogue_cost_vec.create (2);
2002 int dummy2;
2003 nopeel_outside_cost += vect_get_known_peeling_cost
2004 (loop_vinfo, 0, &dummy2,
2005 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2006 &prologue_cost_vec, &epilogue_cost_vec);
2008 prologue_cost_vec.release ();
2009 epilogue_cost_vec.release ();
2011 npeel = best_peel.peel_info.npeel;
2012 dr0 = best_peel.peel_info.dr;
2014 /* If no peeling is not more expensive than the best peeling we
2015 have so far, don't perform any peeling. */
2016 if (nopeel_inside_cost <= best_peel.inside_cost)
2017 do_peeling = false;
2020 if (do_peeling)
2022 stmt = vect_dr_stmt (dr0);
2023 stmt_info = vinfo_for_stmt (stmt);
2024 vectype = STMT_VINFO_VECTYPE (stmt_info);
2026 if (known_alignment_for_access_p (dr0))
2028 bool negative = tree_int_cst_compare (DR_STEP (dr0),
2029 size_zero_node) < 0;
2030 if (!npeel)
2032 /* Since it's known at compile time, compute the number of
2033 iterations in the peeled loop (the peeling factor) for use in
2034 updating DR_MISALIGNMENT values. The peeling factor is the
2035 vectorization factor minus the misalignment as an element
2036 count. */
2037 mis = negative ? DR_MISALIGNMENT (dr0) : -DR_MISALIGNMENT (dr0);
2038 unsigned int target_align = DR_TARGET_ALIGNMENT (dr0);
2039 npeel = ((mis & (target_align - 1))
2040 / vect_get_scalar_dr_size (dr0));
2043 /* For interleaved data access every iteration accesses all the
2044 members of the group, therefore we divide the number of iterations
2045 by the group size. */
2046 stmt_info = vinfo_for_stmt (vect_dr_stmt (dr0));
2047 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2048 npeel /= DR_GROUP_SIZE (stmt_info);
2050 if (dump_enabled_p ())
2051 dump_printf_loc (MSG_NOTE, vect_location,
2052 "Try peeling by %d\n", npeel);
2055 /* Ensure that all datarefs can be vectorized after the peel. */
2056 if (!vect_peeling_supportable (loop_vinfo, dr0, npeel))
2057 do_peeling = false;
2059 /* Check if all datarefs are supportable and log. */
2060 if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
2062 stat = vect_verify_datarefs_alignment (loop_vinfo);
2063 if (!stat)
2064 do_peeling = false;
2065 else
2066 return stat;
2069 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
2070 if (do_peeling)
2072 unsigned max_allowed_peel
2073 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
2074 if (max_allowed_peel != (unsigned)-1)
2076 unsigned max_peel = npeel;
2077 if (max_peel == 0)
2079 unsigned int target_align = DR_TARGET_ALIGNMENT (dr0);
2080 max_peel = target_align / vect_get_scalar_dr_size (dr0) - 1;
2082 if (max_peel > max_allowed_peel)
2084 do_peeling = false;
2085 if (dump_enabled_p ())
2086 dump_printf_loc (MSG_NOTE, vect_location,
2087 "Disable peeling, max peels reached: %d\n", max_peel);
2092 /* Cost model #2 - if peeling may result in a remaining loop not
2093 iterating enough to be vectorized then do not peel. Since this
2094 is a cost heuristic rather than a correctness decision, use the
2095 most likely runtime value for variable vectorization factors. */
2096 if (do_peeling
2097 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2099 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2100 unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
2101 if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
2102 < assumed_vf + max_peel)
2103 do_peeling = false;
2106 if (do_peeling)
2108 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2109 If the misalignment of DR_i is identical to that of dr0 then set
2110 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
2111 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2112 by the peeling factor times the element size of DR_i (MOD the
2113 vectorization factor times the size). Otherwise, the
2114 misalignment of DR_i must be set to unknown. */
2115 FOR_EACH_VEC_ELT (datarefs, i, dr)
2116 if (dr != dr0)
2118 /* Strided accesses perform only component accesses, alignment
2119 is irrelevant for them. */
2120 stmt_info = vinfo_for_stmt (vect_dr_stmt (dr));
2121 if (STMT_VINFO_STRIDED_P (stmt_info)
2122 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2123 continue;
2125 vect_update_misalignment_for_peel (dr, dr0, npeel);
2128 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
2129 if (npeel)
2130 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2131 else
2132 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2133 = DR_MISALIGNMENT (dr0);
2134 SET_DR_MISALIGNMENT (dr0, 0);
2135 if (dump_enabled_p ())
2137 dump_printf_loc (MSG_NOTE, vect_location,
2138 "Alignment of access forced using peeling.\n");
2139 dump_printf_loc (MSG_NOTE, vect_location,
2140 "Peeling for alignment will be applied.\n");
2143 /* The inside-loop cost will be accounted for in vectorizable_load
2144 and vectorizable_store correctly with adjusted alignments.
2145 Drop the body_cst_vec on the floor here. */
2146 stat = vect_verify_datarefs_alignment (loop_vinfo);
2147 gcc_assert (stat);
2148 return stat;
2152 /* (2) Versioning to force alignment. */
2154 /* Try versioning if:
2155 1) optimize loop for speed
2156 2) there is at least one unsupported misaligned data ref with an unknown
2157 misalignment, and
2158 3) all misaligned data refs with a known misalignment are supported, and
2159 4) the number of runtime alignment checks is within reason. */
2161 do_versioning =
2162 optimize_loop_nest_for_speed_p (loop)
2163 && (!loop->inner); /* FORNOW */
2165 if (do_versioning)
2167 FOR_EACH_VEC_ELT (datarefs, i, dr)
2169 stmt = vect_dr_stmt (dr);
2170 stmt_info = vinfo_for_stmt (stmt);
2172 /* For interleaving, only the alignment of the first access
2173 matters. */
2174 if (aligned_access_p (dr)
2175 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2176 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt))
2177 continue;
2179 if (STMT_VINFO_STRIDED_P (stmt_info))
2181 /* Strided loads perform only component accesses, alignment is
2182 irrelevant for them. */
2183 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
2184 continue;
2185 do_versioning = false;
2186 break;
2189 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
2191 if (!supportable_dr_alignment)
2193 gimple *stmt;
2194 int mask;
2195 tree vectype;
2197 if (known_alignment_for_access_p (dr)
2198 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2199 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
2201 do_versioning = false;
2202 break;
2205 stmt = vect_dr_stmt (dr);
2206 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
2207 gcc_assert (vectype);
2209 /* At present we don't support versioning for alignment
2210 with variable VF, since there's no guarantee that the
2211 VF is a power of two. We could relax this if we added
2212 a way of enforcing a power-of-two size. */
2213 unsigned HOST_WIDE_INT size;
2214 if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
2216 do_versioning = false;
2217 break;
2220 /* The rightmost bits of an aligned address must be zeros.
2221 Construct the mask needed for this test. For example,
2222 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2223 mask must be 15 = 0xf. */
2224 mask = size - 1;
2226 /* FORNOW: use the same mask to test all potentially unaligned
2227 references in the loop. The vectorizer currently supports
2228 a single vector size, see the reference to
2229 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
2230 vectorization factor is computed. */
2231 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
2232 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
2233 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2234 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
2235 vect_dr_stmt (dr));
2239 /* Versioning requires at least one misaligned data reference. */
2240 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2241 do_versioning = false;
2242 else if (!do_versioning)
2243 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2246 if (do_versioning)
2248 vec<gimple *> may_misalign_stmts
2249 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2250 gimple *stmt;
2252 /* It can now be assumed that the data references in the statements
2253 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2254 of the loop being vectorized. */
2255 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
2257 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2258 dr = STMT_VINFO_DATA_REF (stmt_info);
2259 SET_DR_MISALIGNMENT (dr, 0);
2260 if (dump_enabled_p ())
2261 dump_printf_loc (MSG_NOTE, vect_location,
2262 "Alignment of access forced using versioning.\n");
2265 if (dump_enabled_p ())
2266 dump_printf_loc (MSG_NOTE, vect_location,
2267 "Versioning for alignment will be applied.\n");
2269 /* Peeling and versioning can't be done together at this time. */
2270 gcc_assert (! (do_peeling && do_versioning));
2272 stat = vect_verify_datarefs_alignment (loop_vinfo);
2273 gcc_assert (stat);
2274 return stat;
2277 /* This point is reached if neither peeling nor versioning is being done. */
2278 gcc_assert (! (do_peeling || do_versioning));
2280 stat = vect_verify_datarefs_alignment (loop_vinfo);
2281 return stat;
2285 /* Function vect_find_same_alignment_drs.
2287 Update group and alignment relations according to the chosen
2288 vectorization factor. */
2290 static void
2291 vect_find_same_alignment_drs (struct data_dependence_relation *ddr)
2293 struct data_reference *dra = DDR_A (ddr);
2294 struct data_reference *drb = DDR_B (ddr);
2295 stmt_vec_info stmtinfo_a = vinfo_for_stmt (vect_dr_stmt (dra));
2296 stmt_vec_info stmtinfo_b = vinfo_for_stmt (vect_dr_stmt (drb));
2298 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
2299 return;
2301 if (dra == drb)
2302 return;
2304 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)
2305 || !operand_equal_p (DR_OFFSET (dra), DR_OFFSET (drb), 0)
2306 || !operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2307 return;
2309 /* Two references with distance zero have the same alignment. */
2310 poly_offset_int diff = (wi::to_poly_offset (DR_INIT (dra))
2311 - wi::to_poly_offset (DR_INIT (drb)));
2312 if (maybe_ne (diff, 0))
2314 /* Get the wider of the two alignments. */
2315 unsigned int align_a = (vect_calculate_target_alignment (dra)
2316 / BITS_PER_UNIT);
2317 unsigned int align_b = (vect_calculate_target_alignment (drb)
2318 / BITS_PER_UNIT);
2319 unsigned int max_align = MAX (align_a, align_b);
2321 /* Require the gap to be a multiple of the larger vector alignment. */
2322 if (!multiple_p (diff, max_align))
2323 return;
2326 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2327 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
2328 if (dump_enabled_p ())
2330 dump_printf_loc (MSG_NOTE, vect_location,
2331 "accesses have the same alignment: ");
2332 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2333 dump_printf (MSG_NOTE, " and ");
2334 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2335 dump_printf (MSG_NOTE, "\n");
2340 /* Function vect_analyze_data_refs_alignment
2342 Analyze the alignment of the data-references in the loop.
2343 Return FALSE if a data reference is found that cannot be vectorized. */
2345 bool
2346 vect_analyze_data_refs_alignment (loop_vec_info vinfo)
2348 if (dump_enabled_p ())
2349 dump_printf_loc (MSG_NOTE, vect_location,
2350 "=== vect_analyze_data_refs_alignment ===\n");
2352 /* Mark groups of data references with same alignment using
2353 data dependence information. */
2354 vec<ddr_p> ddrs = vinfo->ddrs;
2355 struct data_dependence_relation *ddr;
2356 unsigned int i;
2358 FOR_EACH_VEC_ELT (ddrs, i, ddr)
2359 vect_find_same_alignment_drs (ddr);
2361 vec<data_reference_p> datarefs = vinfo->datarefs;
2362 struct data_reference *dr;
2364 vect_record_base_alignments (vinfo);
2365 FOR_EACH_VEC_ELT (datarefs, i, dr)
2367 stmt_vec_info stmt_info = vinfo_for_stmt (vect_dr_stmt (dr));
2368 if (STMT_VINFO_VECTORIZABLE (stmt_info)
2369 && !vect_compute_data_ref_alignment (dr))
2371 /* Strided accesses perform only component accesses, misalignment
2372 information is irrelevant for them. */
2373 if (STMT_VINFO_STRIDED_P (stmt_info)
2374 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2375 continue;
2377 if (dump_enabled_p ())
2378 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2379 "not vectorized: can't calculate alignment "
2380 "for data ref.\n");
2382 return false;
2386 return true;
2390 /* Analyze alignment of DRs of stmts in NODE. */
2392 static bool
2393 vect_slp_analyze_and_verify_node_alignment (slp_tree node)
2395 /* We vectorize from the first scalar stmt in the node unless
2396 the node is permuted in which case we start from the first
2397 element in the group. */
2398 gimple *first_stmt = SLP_TREE_SCALAR_STMTS (node)[0];
2399 data_reference_p first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2400 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2401 first_stmt = DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_stmt));
2403 data_reference_p dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2404 if (! vect_compute_data_ref_alignment (dr)
2405 /* For creating the data-ref pointer we need alignment of the
2406 first element anyway. */
2407 || (dr != first_dr
2408 && ! vect_compute_data_ref_alignment (first_dr))
2409 || ! verify_data_ref_alignment (dr))
2411 if (dump_enabled_p ())
2412 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2413 "not vectorized: bad data alignment in basic "
2414 "block.\n");
2415 return false;
2418 return true;
2421 /* Function vect_slp_analyze_instance_alignment
2423 Analyze the alignment of the data-references in the SLP instance.
2424 Return FALSE if a data reference is found that cannot be vectorized. */
2426 bool
2427 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
2429 if (dump_enabled_p ())
2430 dump_printf_loc (MSG_NOTE, vect_location,
2431 "=== vect_slp_analyze_and_verify_instance_alignment ===\n");
2433 slp_tree node;
2434 unsigned i;
2435 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2436 if (! vect_slp_analyze_and_verify_node_alignment (node))
2437 return false;
2439 node = SLP_INSTANCE_TREE (instance);
2440 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]))
2441 && ! vect_slp_analyze_and_verify_node_alignment
2442 (SLP_INSTANCE_TREE (instance)))
2443 return false;
2445 return true;
2449 /* Analyze groups of accesses: check that DR belongs to a group of
2450 accesses of legal size, step, etc. Detect gaps, single element
2451 interleaving, and other special cases. Set grouped access info.
2452 Collect groups of strided stores for further use in SLP analysis.
2453 Worker for vect_analyze_group_access. */
2455 static bool
2456 vect_analyze_group_access_1 (struct data_reference *dr)
2458 tree step = DR_STEP (dr);
2459 tree scalar_type = TREE_TYPE (DR_REF (dr));
2460 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2461 gimple *stmt = vect_dr_stmt (dr);
2462 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2463 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2464 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2465 HOST_WIDE_INT dr_step = -1;
2466 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2467 bool slp_impossible = false;
2469 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2470 size of the interleaving group (including gaps). */
2471 if (tree_fits_shwi_p (step))
2473 dr_step = tree_to_shwi (step);
2474 /* Check that STEP is a multiple of type size. Otherwise there is
2475 a non-element-sized gap at the end of the group which we
2476 cannot represent in DR_GROUP_GAP or DR_GROUP_SIZE.
2477 ??? As we can handle non-constant step fine here we should
2478 simply remove uses of DR_GROUP_GAP between the last and first
2479 element and instead rely on DR_STEP. DR_GROUP_SIZE then would
2480 simply not include that gap. */
2481 if ((dr_step % type_size) != 0)
2483 if (dump_enabled_p ())
2485 dump_printf_loc (MSG_NOTE, vect_location,
2486 "Step ");
2487 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2488 dump_printf (MSG_NOTE,
2489 " is not a multiple of the element size for ");
2490 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2491 dump_printf (MSG_NOTE, "\n");
2493 return false;
2495 groupsize = absu_hwi (dr_step) / type_size;
2497 else
2498 groupsize = 0;
2500 /* Not consecutive access is possible only if it is a part of interleaving. */
2501 if (!DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2503 /* Check if it this DR is a part of interleaving, and is a single
2504 element of the group that is accessed in the loop. */
2506 /* Gaps are supported only for loads. STEP must be a multiple of the type
2507 size. */
2508 if (DR_IS_READ (dr)
2509 && (dr_step % type_size) == 0
2510 && groupsize > 0)
2512 DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2513 DR_GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2514 DR_GROUP_GAP (stmt_info) = groupsize - 1;
2515 if (dump_enabled_p ())
2517 dump_printf_loc (MSG_NOTE, vect_location,
2518 "Detected single element interleaving ");
2519 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2520 dump_printf (MSG_NOTE, " step ");
2521 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2522 dump_printf (MSG_NOTE, "\n");
2525 return true;
2528 if (dump_enabled_p ())
2530 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2531 "not consecutive access ");
2532 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2535 if (bb_vinfo)
2537 /* Mark the statement as unvectorizable. */
2538 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (vect_dr_stmt (dr))) = false;
2539 return true;
2542 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2543 STMT_VINFO_STRIDED_P (stmt_info) = true;
2544 return true;
2547 if (DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2549 /* First stmt in the interleaving chain. Check the chain. */
2550 gimple *next = DR_GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2551 struct data_reference *data_ref = dr;
2552 unsigned int count = 1;
2553 tree prev_init = DR_INIT (data_ref);
2554 gimple *prev = stmt;
2555 HOST_WIDE_INT diff, gaps = 0;
2557 /* By construction, all group members have INTEGER_CST DR_INITs. */
2558 while (next)
2560 /* Skip same data-refs. In case that two or more stmts share
2561 data-ref (supported only for loads), we vectorize only the first
2562 stmt, and the rest get their vectorized loads from the first
2563 one. */
2564 if (!tree_int_cst_compare (DR_INIT (data_ref),
2565 DR_INIT (STMT_VINFO_DATA_REF (
2566 vinfo_for_stmt (next)))))
2568 if (DR_IS_WRITE (data_ref))
2570 if (dump_enabled_p ())
2571 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2572 "Two store stmts share the same dr.\n");
2573 return false;
2576 if (dump_enabled_p ())
2577 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2578 "Two or more load stmts share the same dr.\n");
2580 /* For load use the same data-ref load. */
2581 DR_GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2583 prev = next;
2584 next = DR_GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2585 continue;
2588 prev = next;
2589 data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2591 /* All group members have the same STEP by construction. */
2592 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2594 /* Check that the distance between two accesses is equal to the type
2595 size. Otherwise, we have gaps. */
2596 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2597 - TREE_INT_CST_LOW (prev_init)) / type_size;
2598 if (diff != 1)
2600 /* FORNOW: SLP of accesses with gaps is not supported. */
2601 slp_impossible = true;
2602 if (DR_IS_WRITE (data_ref))
2604 if (dump_enabled_p ())
2605 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2606 "interleaved store with gaps\n");
2607 return false;
2610 gaps += diff - 1;
2613 last_accessed_element += diff;
2615 /* Store the gap from the previous member of the group. If there is no
2616 gap in the access, DR_GROUP_GAP is always 1. */
2617 DR_GROUP_GAP (vinfo_for_stmt (next)) = diff;
2619 prev_init = DR_INIT (data_ref);
2620 next = DR_GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2621 /* Count the number of data-refs in the chain. */
2622 count++;
2625 if (groupsize == 0)
2626 groupsize = count + gaps;
2628 /* This could be UINT_MAX but as we are generating code in a very
2629 inefficient way we have to cap earlier. See PR78699 for example. */
2630 if (groupsize > 4096)
2632 if (dump_enabled_p ())
2633 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2634 "group is too large\n");
2635 return false;
2638 /* Check that the size of the interleaving is equal to count for stores,
2639 i.e., that there are no gaps. */
2640 if (groupsize != count
2641 && !DR_IS_READ (dr))
2643 if (dump_enabled_p ())
2644 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2645 "interleaved store with gaps\n");
2646 return false;
2649 /* If there is a gap after the last load in the group it is the
2650 difference between the groupsize and the last accessed
2651 element.
2652 When there is no gap, this difference should be 0. */
2653 DR_GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - last_accessed_element;
2655 DR_GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2656 if (dump_enabled_p ())
2658 dump_printf_loc (MSG_NOTE, vect_location,
2659 "Detected interleaving ");
2660 if (DR_IS_READ (dr))
2661 dump_printf (MSG_NOTE, "load ");
2662 else
2663 dump_printf (MSG_NOTE, "store ");
2664 dump_printf (MSG_NOTE, "of size %u starting with ",
2665 (unsigned)groupsize);
2666 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
2667 if (DR_GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
2668 dump_printf_loc (MSG_NOTE, vect_location,
2669 "There is a gap of %u elements after the group\n",
2670 DR_GROUP_GAP (vinfo_for_stmt (stmt)));
2673 /* SLP: create an SLP data structure for every interleaving group of
2674 stores for further analysis in vect_analyse_slp. */
2675 if (DR_IS_WRITE (dr) && !slp_impossible)
2677 if (loop_vinfo)
2678 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2679 if (bb_vinfo)
2680 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2684 return true;
2687 /* Analyze groups of accesses: check that DR belongs to a group of
2688 accesses of legal size, step, etc. Detect gaps, single element
2689 interleaving, and other special cases. Set grouped access info.
2690 Collect groups of strided stores for further use in SLP analysis. */
2692 static bool
2693 vect_analyze_group_access (struct data_reference *dr)
2695 if (!vect_analyze_group_access_1 (dr))
2697 /* Dissolve the group if present. */
2698 gimple *next;
2699 gimple *stmt = DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (vect_dr_stmt (dr)));
2700 while (stmt)
2702 stmt_vec_info vinfo = vinfo_for_stmt (stmt);
2703 next = DR_GROUP_NEXT_ELEMENT (vinfo);
2704 DR_GROUP_FIRST_ELEMENT (vinfo) = NULL;
2705 DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2706 stmt = next;
2708 return false;
2710 return true;
2713 /* Analyze the access pattern of the data-reference DR.
2714 In case of non-consecutive accesses call vect_analyze_group_access() to
2715 analyze groups of accesses. */
2717 static bool
2718 vect_analyze_data_ref_access (struct data_reference *dr)
2720 tree step = DR_STEP (dr);
2721 tree scalar_type = TREE_TYPE (DR_REF (dr));
2722 gimple *stmt = vect_dr_stmt (dr);
2723 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2724 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2725 struct loop *loop = NULL;
2727 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
2728 return true;
2730 if (loop_vinfo)
2731 loop = LOOP_VINFO_LOOP (loop_vinfo);
2733 if (loop_vinfo && !step)
2735 if (dump_enabled_p ())
2736 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2737 "bad data-ref access in loop\n");
2738 return false;
2741 /* Allow loads with zero step in inner-loop vectorization. */
2742 if (loop_vinfo && integer_zerop (step))
2744 DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2745 if (!nested_in_vect_loop_p (loop, stmt))
2746 return DR_IS_READ (dr);
2747 /* Allow references with zero step for outer loops marked
2748 with pragma omp simd only - it guarantees absence of
2749 loop-carried dependencies between inner loop iterations. */
2750 if (loop->safelen < 2)
2752 if (dump_enabled_p ())
2753 dump_printf_loc (MSG_NOTE, vect_location,
2754 "zero step in inner loop of nest\n");
2755 return false;
2759 if (loop && nested_in_vect_loop_p (loop, stmt))
2761 /* Interleaved accesses are not yet supported within outer-loop
2762 vectorization for references in the inner-loop. */
2763 DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2765 /* For the rest of the analysis we use the outer-loop step. */
2766 step = STMT_VINFO_DR_STEP (stmt_info);
2767 if (integer_zerop (step))
2769 if (dump_enabled_p ())
2770 dump_printf_loc (MSG_NOTE, vect_location,
2771 "zero step in outer loop.\n");
2772 return DR_IS_READ (dr);
2776 /* Consecutive? */
2777 if (TREE_CODE (step) == INTEGER_CST)
2779 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2780 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2781 || (dr_step < 0
2782 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2784 /* Mark that it is not interleaving. */
2785 DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2786 return true;
2790 if (loop && nested_in_vect_loop_p (loop, stmt))
2792 if (dump_enabled_p ())
2793 dump_printf_loc (MSG_NOTE, vect_location,
2794 "grouped access in outer loop.\n");
2795 return false;
2799 /* Assume this is a DR handled by non-constant strided load case. */
2800 if (TREE_CODE (step) != INTEGER_CST)
2801 return (STMT_VINFO_STRIDED_P (stmt_info)
2802 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2803 || vect_analyze_group_access (dr)));
2805 /* Not consecutive access - check if it's a part of interleaving group. */
2806 return vect_analyze_group_access (dr);
2809 /* Compare two data-references DRA and DRB to group them into chunks
2810 suitable for grouping. */
2812 static int
2813 dr_group_sort_cmp (const void *dra_, const void *drb_)
2815 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2816 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2817 int cmp;
2819 /* Stabilize sort. */
2820 if (dra == drb)
2821 return 0;
2823 /* DRs in different loops never belong to the same group. */
2824 loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father;
2825 loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father;
2826 if (loopa != loopb)
2827 return loopa->num < loopb->num ? -1 : 1;
2829 /* Ordering of DRs according to base. */
2830 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2831 DR_BASE_ADDRESS (drb));
2832 if (cmp != 0)
2833 return cmp;
2835 /* And according to DR_OFFSET. */
2836 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2837 if (cmp != 0)
2838 return cmp;
2840 /* Put reads before writes. */
2841 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2842 return DR_IS_READ (dra) ? -1 : 1;
2844 /* Then sort after access size. */
2845 cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2846 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2847 if (cmp != 0)
2848 return cmp;
2850 /* And after step. */
2851 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2852 if (cmp != 0)
2853 return cmp;
2855 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
2856 cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
2857 if (cmp == 0)
2858 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2859 return cmp;
2862 /* If OP is the result of a conversion, return the unconverted value,
2863 otherwise return null. */
2865 static tree
2866 strip_conversion (tree op)
2868 if (TREE_CODE (op) != SSA_NAME)
2869 return NULL_TREE;
2870 gimple *stmt = SSA_NAME_DEF_STMT (op);
2871 if (!is_gimple_assign (stmt)
2872 || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt)))
2873 return NULL_TREE;
2874 return gimple_assign_rhs1 (stmt);
2877 /* Return true if vectorizable_* routines can handle statements STMT1
2878 and STMT2 being in a single group. */
2880 static bool
2881 can_group_stmts_p (gimple *stmt1, gimple *stmt2)
2883 if (gimple_assign_single_p (stmt1))
2884 return gimple_assign_single_p (stmt2);
2886 if (is_gimple_call (stmt1) && gimple_call_internal_p (stmt1))
2888 /* Check for two masked loads or two masked stores. */
2889 if (!is_gimple_call (stmt2) || !gimple_call_internal_p (stmt2))
2890 return false;
2891 internal_fn ifn = gimple_call_internal_fn (stmt1);
2892 if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE)
2893 return false;
2894 if (ifn != gimple_call_internal_fn (stmt2))
2895 return false;
2897 /* Check that the masks are the same. Cope with casts of masks,
2898 like those created by build_mask_conversion. */
2899 tree mask1 = gimple_call_arg (stmt1, 2);
2900 tree mask2 = gimple_call_arg (stmt2, 2);
2901 if (!operand_equal_p (mask1, mask2, 0))
2903 mask1 = strip_conversion (mask1);
2904 if (!mask1)
2905 return false;
2906 mask2 = strip_conversion (mask2);
2907 if (!mask2)
2908 return false;
2909 if (!operand_equal_p (mask1, mask2, 0))
2910 return false;
2912 return true;
2915 return false;
2918 /* Function vect_analyze_data_ref_accesses.
2920 Analyze the access pattern of all the data references in the loop.
2922 FORNOW: the only access pattern that is considered vectorizable is a
2923 simple step 1 (consecutive) access.
2925 FORNOW: handle only arrays and pointer accesses. */
2927 bool
2928 vect_analyze_data_ref_accesses (vec_info *vinfo)
2930 unsigned int i;
2931 vec<data_reference_p> datarefs = vinfo->datarefs;
2932 struct data_reference *dr;
2934 if (dump_enabled_p ())
2935 dump_printf_loc (MSG_NOTE, vect_location,
2936 "=== vect_analyze_data_ref_accesses ===\n");
2938 if (datarefs.is_empty ())
2939 return true;
2941 /* Sort the array of datarefs to make building the interleaving chains
2942 linear. Don't modify the original vector's order, it is needed for
2943 determining what dependencies are reversed. */
2944 vec<data_reference_p> datarefs_copy = datarefs.copy ();
2945 datarefs_copy.qsort (dr_group_sort_cmp);
2947 /* Build the interleaving chains. */
2948 for (i = 0; i < datarefs_copy.length () - 1;)
2950 data_reference_p dra = datarefs_copy[i];
2951 stmt_vec_info stmtinfo_a = vinfo_for_stmt (vect_dr_stmt (dra));
2952 stmt_vec_info lastinfo = NULL;
2953 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
2954 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a))
2956 ++i;
2957 continue;
2959 for (i = i + 1; i < datarefs_copy.length (); ++i)
2961 data_reference_p drb = datarefs_copy[i];
2962 stmt_vec_info stmtinfo_b = vinfo_for_stmt (vect_dr_stmt (drb));
2963 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b)
2964 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
2965 break;
2967 /* ??? Imperfect sorting (non-compatible types, non-modulo
2968 accesses, same accesses) can lead to a group to be artificially
2969 split here as we don't just skip over those. If it really
2970 matters we can push those to a worklist and re-iterate
2971 over them. The we can just skip ahead to the next DR here. */
2973 /* DRs in a different loop should not be put into the same
2974 interleaving group. */
2975 if (gimple_bb (DR_STMT (dra))->loop_father
2976 != gimple_bb (DR_STMT (drb))->loop_father)
2977 break;
2979 /* Check that the data-refs have same first location (except init)
2980 and they are both either store or load (not load and store,
2981 not masked loads or stores). */
2982 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2983 || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2984 DR_BASE_ADDRESS (drb)) != 0
2985 || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
2986 || !can_group_stmts_p (vect_dr_stmt (dra), vect_dr_stmt (drb)))
2987 break;
2989 /* Check that the data-refs have the same constant size. */
2990 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2991 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2992 if (!tree_fits_uhwi_p (sza)
2993 || !tree_fits_uhwi_p (szb)
2994 || !tree_int_cst_equal (sza, szb))
2995 break;
2997 /* Check that the data-refs have the same step. */
2998 if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
2999 break;
3001 /* Check the types are compatible.
3002 ??? We don't distinguish this during sorting. */
3003 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
3004 TREE_TYPE (DR_REF (drb))))
3005 break;
3007 /* Check that the DR_INITs are compile-time constants. */
3008 if (TREE_CODE (DR_INIT (dra)) != INTEGER_CST
3009 || TREE_CODE (DR_INIT (drb)) != INTEGER_CST)
3010 break;
3012 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
3013 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
3014 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
3015 HOST_WIDE_INT init_prev
3016 = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]));
3017 gcc_assert (init_a <= init_b
3018 && init_a <= init_prev
3019 && init_prev <= init_b);
3021 /* Do not place the same access in the interleaving chain twice. */
3022 if (init_b == init_prev)
3024 gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]))
3025 < gimple_uid (DR_STMT (drb)));
3026 /* ??? For now we simply "drop" the later reference which is
3027 otherwise the same rather than finishing off this group.
3028 In the end we'd want to re-process duplicates forming
3029 multiple groups from the refs, likely by just collecting
3030 all candidates (including duplicates and split points
3031 below) in a vector and then process them together. */
3032 continue;
3035 /* If init_b == init_a + the size of the type * k, we have an
3036 interleaving, and DRA is accessed before DRB. */
3037 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
3038 if (type_size_a == 0
3039 || (init_b - init_a) % type_size_a != 0)
3040 break;
3042 /* If we have a store, the accesses are adjacent. This splits
3043 groups into chunks we support (we don't support vectorization
3044 of stores with gaps). */
3045 if (!DR_IS_READ (dra) && init_b - init_prev != type_size_a)
3046 break;
3048 /* If the step (if not zero or non-constant) is greater than the
3049 difference between data-refs' inits this splits groups into
3050 suitable sizes. */
3051 if (tree_fits_shwi_p (DR_STEP (dra)))
3053 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
3054 if (step != 0 && step <= (init_b - init_a))
3055 break;
3058 if (dump_enabled_p ())
3060 dump_printf_loc (MSG_NOTE, vect_location,
3061 "Detected interleaving ");
3062 if (DR_IS_READ (dra))
3063 dump_printf (MSG_NOTE, "load ");
3064 else
3065 dump_printf (MSG_NOTE, "store ");
3066 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
3067 dump_printf (MSG_NOTE, " and ");
3068 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
3069 dump_printf (MSG_NOTE, "\n");
3072 /* Link the found element into the group list. */
3073 if (!DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3075 DR_GROUP_FIRST_ELEMENT (stmtinfo_a) = vect_dr_stmt (dra);
3076 lastinfo = stmtinfo_a;
3078 DR_GROUP_FIRST_ELEMENT (stmtinfo_b) = vect_dr_stmt (dra);
3079 DR_GROUP_NEXT_ELEMENT (lastinfo) = vect_dr_stmt (drb);
3080 lastinfo = stmtinfo_b;
3084 FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
3085 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (vect_dr_stmt (dr)))
3086 && !vect_analyze_data_ref_access (dr))
3088 if (dump_enabled_p ())
3089 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3090 "not vectorized: complicated access pattern.\n");
3092 if (is_a <bb_vec_info> (vinfo))
3094 /* Mark the statement as not vectorizable. */
3095 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (vect_dr_stmt (dr))) = false;
3096 continue;
3098 else
3100 datarefs_copy.release ();
3101 return false;
3105 datarefs_copy.release ();
3106 return true;
3109 /* Function vect_vfa_segment_size.
3111 Input:
3112 DR: The data reference.
3113 LENGTH_FACTOR: segment length to consider.
3115 Return a value suitable for the dr_with_seg_len::seg_len field.
3116 This is the "distance travelled" by the pointer from the first
3117 iteration in the segment to the last. Note that it does not include
3118 the size of the access; in effect it only describes the first byte. */
3120 static tree
3121 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
3123 length_factor = size_binop (MINUS_EXPR,
3124 fold_convert (sizetype, length_factor),
3125 size_one_node);
3126 return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr)),
3127 length_factor);
3130 /* Return a value that, when added to abs (vect_vfa_segment_size (dr)),
3131 gives the worst-case number of bytes covered by the segment. */
3133 static unsigned HOST_WIDE_INT
3134 vect_vfa_access_size (data_reference *dr)
3136 stmt_vec_info stmt_vinfo = vinfo_for_stmt (vect_dr_stmt (dr));
3137 tree ref_type = TREE_TYPE (DR_REF (dr));
3138 unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type));
3139 unsigned HOST_WIDE_INT access_size = ref_size;
3140 if (DR_GROUP_FIRST_ELEMENT (stmt_vinfo))
3142 gcc_assert (DR_GROUP_FIRST_ELEMENT (stmt_vinfo) == vect_dr_stmt (dr));
3143 access_size *= DR_GROUP_SIZE (stmt_vinfo) - DR_GROUP_GAP (stmt_vinfo);
3145 if (STMT_VINFO_VEC_STMT (stmt_vinfo)
3146 && (vect_supportable_dr_alignment (dr, false)
3147 == dr_explicit_realign_optimized))
3149 /* We might access a full vector's worth. */
3150 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
3151 access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size;
3153 return access_size;
3156 /* Get the minimum alignment for all the scalar accesses that DR describes. */
3158 static unsigned int
3159 vect_vfa_align (const data_reference *dr)
3161 return TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr)));
3164 /* Function vect_no_alias_p.
3166 Given data references A and B with equal base and offset, see whether
3167 the alias relation can be decided at compilation time. Return 1 if
3168 it can and the references alias, 0 if it can and the references do
3169 not alias, and -1 if we cannot decide at compile time. SEGMENT_LENGTH_A,
3170 SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
3171 of dr_with_seg_len::{seg_len,access_size} for A and B. */
3173 static int
3174 vect_compile_time_alias (struct data_reference *a, struct data_reference *b,
3175 tree segment_length_a, tree segment_length_b,
3176 unsigned HOST_WIDE_INT access_size_a,
3177 unsigned HOST_WIDE_INT access_size_b)
3179 poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a));
3180 poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b));
3181 poly_uint64 const_length_a;
3182 poly_uint64 const_length_b;
3184 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3185 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3186 [a, a+12) */
3187 if (tree_int_cst_compare (DR_STEP (a), size_zero_node) < 0)
3189 const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
3190 offset_a = (offset_a + access_size_a) - const_length_a;
3192 else
3193 const_length_a = tree_to_poly_uint64 (segment_length_a);
3194 if (tree_int_cst_compare (DR_STEP (b), size_zero_node) < 0)
3196 const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
3197 offset_b = (offset_b + access_size_b) - const_length_b;
3199 else
3200 const_length_b = tree_to_poly_uint64 (segment_length_b);
3202 const_length_a += access_size_a;
3203 const_length_b += access_size_b;
3205 if (ranges_known_overlap_p (offset_a, const_length_a,
3206 offset_b, const_length_b))
3207 return 1;
3209 if (!ranges_maybe_overlap_p (offset_a, const_length_a,
3210 offset_b, const_length_b))
3211 return 0;
3213 return -1;
3216 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3217 in DDR is >= VF. */
3219 static bool
3220 dependence_distance_ge_vf (data_dependence_relation *ddr,
3221 unsigned int loop_depth, poly_uint64 vf)
3223 if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3224 || DDR_NUM_DIST_VECTS (ddr) == 0)
3225 return false;
3227 /* If the dependence is exact, we should have limited the VF instead. */
3228 gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3230 unsigned int i;
3231 lambda_vector dist_v;
3232 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3234 HOST_WIDE_INT dist = dist_v[loop_depth];
3235 if (dist != 0
3236 && !(dist > 0 && DDR_REVERSED_P (ddr))
3237 && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
3238 return false;
3241 if (dump_enabled_p ())
3243 dump_printf_loc (MSG_NOTE, vect_location,
3244 "dependence distance between ");
3245 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
3246 dump_printf (MSG_NOTE, " and ");
3247 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
3248 dump_printf (MSG_NOTE, " is >= VF\n");
3251 return true;
3254 /* Dump LOWER_BOUND using flags DUMP_KIND. Dumps are known to be enabled. */
3256 static void
3257 dump_lower_bound (dump_flags_t dump_kind, const vec_lower_bound &lower_bound)
3259 dump_printf (dump_kind, "%s (", lower_bound.unsigned_p ? "unsigned" : "abs");
3260 dump_generic_expr (dump_kind, TDF_SLIM, lower_bound.expr);
3261 dump_printf (dump_kind, ") >= ");
3262 dump_dec (dump_kind, lower_bound.min_value);
3265 /* Record that the vectorized loop requires the vec_lower_bound described
3266 by EXPR, UNSIGNED_P and MIN_VALUE. */
3268 static void
3269 vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p,
3270 poly_uint64 min_value)
3272 vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3273 for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3274 if (operand_equal_p (lower_bounds[i].expr, expr, 0))
3276 unsigned_p &= lower_bounds[i].unsigned_p;
3277 min_value = upper_bound (lower_bounds[i].min_value, min_value);
3278 if (lower_bounds[i].unsigned_p != unsigned_p
3279 || maybe_lt (lower_bounds[i].min_value, min_value))
3281 lower_bounds[i].unsigned_p = unsigned_p;
3282 lower_bounds[i].min_value = min_value;
3283 if (dump_enabled_p ())
3285 dump_printf_loc (MSG_NOTE, vect_location,
3286 "updating run-time check to ");
3287 dump_lower_bound (MSG_NOTE, lower_bounds[i]);
3288 dump_printf (MSG_NOTE, "\n");
3291 return;
3294 vec_lower_bound lower_bound (expr, unsigned_p, min_value);
3295 if (dump_enabled_p ())
3297 dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that ");
3298 dump_lower_bound (MSG_NOTE, lower_bound);
3299 dump_printf (MSG_NOTE, "\n");
3301 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (lower_bound);
3304 /* Return true if it's unlikely that the step of the vectorized form of DR
3305 will span fewer than GAP bytes. */
3307 static bool
3308 vect_small_gap_p (loop_vec_info loop_vinfo, data_reference *dr, poly_int64 gap)
3310 stmt_vec_info stmt_info = vinfo_for_stmt (vect_dr_stmt (dr));
3311 HOST_WIDE_INT count
3312 = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
3313 if (DR_GROUP_FIRST_ELEMENT (stmt_info))
3314 count *= DR_GROUP_SIZE (vinfo_for_stmt (DR_GROUP_FIRST_ELEMENT (stmt_info)));
3315 return estimated_poly_value (gap) <= count * vect_get_scalar_dr_size (dr);
3318 /* Return true if we know that there is no alias between DR_A and DR_B
3319 when abs (DR_STEP (DR_A)) >= N for some N. When returning true, set
3320 *LOWER_BOUND_OUT to this N. */
3322 static bool
3323 vectorizable_with_step_bound_p (data_reference *dr_a, data_reference *dr_b,
3324 poly_uint64 *lower_bound_out)
3326 /* Check that there is a constant gap of known sign between DR_A
3327 and DR_B. */
3328 poly_int64 init_a, init_b;
3329 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0)
3330 || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0)
3331 || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)
3332 || !poly_int_tree_p (DR_INIT (dr_a), &init_a)
3333 || !poly_int_tree_p (DR_INIT (dr_b), &init_b)
3334 || !ordered_p (init_a, init_b))
3335 return false;
3337 /* Sort DR_A and DR_B by the address they access. */
3338 if (maybe_lt (init_b, init_a))
3340 std::swap (init_a, init_b);
3341 std::swap (dr_a, dr_b);
3344 /* If the two accesses could be dependent within a scalar iteration,
3345 make sure that we'd retain their order. */
3346 if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_a), init_b)
3347 && !vect_preserves_scalar_order_p (vect_dr_stmt (dr_a),
3348 vect_dr_stmt (dr_b)))
3349 return false;
3351 /* There is no alias if abs (DR_STEP) is greater than or equal to
3352 the bytes spanned by the combination of the two accesses. */
3353 *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_b) - init_a;
3354 return true;
3357 /* Function vect_prune_runtime_alias_test_list.
3359 Prune a list of ddrs to be tested at run-time by versioning for alias.
3360 Merge several alias checks into one if possible.
3361 Return FALSE if resulting list of ddrs is longer then allowed by
3362 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
3364 bool
3365 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3367 typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3368 hash_set <tree_pair_hash> compared_objects;
3370 vec<ddr_p> may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3371 vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3372 = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3373 vec<vec_object_pair> &check_unequal_addrs
3374 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3375 poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3376 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3378 ddr_p ddr;
3379 unsigned int i;
3380 tree length_factor;
3382 if (dump_enabled_p ())
3383 dump_printf_loc (MSG_NOTE, vect_location,
3384 "=== vect_prune_runtime_alias_test_list ===\n");
3386 /* Step values are irrelevant for aliasing if the number of vector
3387 iterations is equal to the number of scalar iterations (which can
3388 happen for fully-SLP loops). */
3389 bool ignore_step_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U);
3391 if (!ignore_step_p)
3393 /* Convert the checks for nonzero steps into bound tests. */
3394 tree value;
3395 FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value)
3396 vect_check_lower_bound (loop_vinfo, value, true, 1);
3399 if (may_alias_ddrs.is_empty ())
3400 return true;
3402 comp_alias_ddrs.create (may_alias_ddrs.length ());
3404 unsigned int loop_depth
3405 = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3406 LOOP_VINFO_LOOP_NEST (loop_vinfo));
3408 /* First, we collect all data ref pairs for aliasing checks. */
3409 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3411 int comp_res;
3412 poly_uint64 lower_bound;
3413 struct data_reference *dr_a, *dr_b;
3414 gimple *dr_group_first_a, *dr_group_first_b;
3415 tree segment_length_a, segment_length_b;
3416 unsigned HOST_WIDE_INT access_size_a, access_size_b;
3417 unsigned int align_a, align_b;
3418 gimple *stmt_a, *stmt_b;
3420 /* Ignore the alias if the VF we chose ended up being no greater
3421 than the dependence distance. */
3422 if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3423 continue;
3425 if (DDR_OBJECT_A (ddr))
3427 vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3428 if (!compared_objects.add (new_pair))
3430 if (dump_enabled_p ())
3432 dump_printf_loc (MSG_NOTE, vect_location, "checking that ");
3433 dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.first);
3434 dump_printf (MSG_NOTE, " and ");
3435 dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.second);
3436 dump_printf (MSG_NOTE, " have different addresses\n");
3438 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3440 continue;
3443 dr_a = DDR_A (ddr);
3444 stmt_a = vect_dr_stmt (DDR_A (ddr));
3446 dr_b = DDR_B (ddr);
3447 stmt_b = vect_dr_stmt (DDR_B (ddr));
3449 /* Skip the pair if inter-iteration dependencies are irrelevant
3450 and intra-iteration dependencies are guaranteed to be honored. */
3451 if (ignore_step_p
3452 && (vect_preserves_scalar_order_p (stmt_a, stmt_b)
3453 || vectorizable_with_step_bound_p (dr_a, dr_b, &lower_bound)))
3455 if (dump_enabled_p ())
3457 dump_printf_loc (MSG_NOTE, vect_location,
3458 "no need for alias check between ");
3459 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
3460 dump_printf (MSG_NOTE, " and ");
3461 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
3462 dump_printf (MSG_NOTE, " when VF is 1\n");
3464 continue;
3467 /* See whether we can handle the alias using a bounds check on
3468 the step, and whether that's likely to be the best approach.
3469 (It might not be, for example, if the minimum step is much larger
3470 than the number of bytes handled by one vector iteration.) */
3471 if (!ignore_step_p
3472 && TREE_CODE (DR_STEP (dr_a)) != INTEGER_CST
3473 && vectorizable_with_step_bound_p (dr_a, dr_b, &lower_bound)
3474 && (vect_small_gap_p (loop_vinfo, dr_a, lower_bound)
3475 || vect_small_gap_p (loop_vinfo, dr_b, lower_bound)))
3477 bool unsigned_p = dr_known_forward_stride_p (dr_a);
3478 if (dump_enabled_p ())
3480 dump_printf_loc (MSG_NOTE, vect_location, "no alias between ");
3481 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
3482 dump_printf (MSG_NOTE, " and ");
3483 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
3484 dump_printf (MSG_NOTE, " when the step ");
3485 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_STEP (dr_a));
3486 dump_printf (MSG_NOTE, " is outside ");
3487 if (unsigned_p)
3488 dump_printf (MSG_NOTE, "[0");
3489 else
3491 dump_printf (MSG_NOTE, "(");
3492 dump_dec (MSG_NOTE, poly_int64 (-lower_bound));
3494 dump_printf (MSG_NOTE, ", ");
3495 dump_dec (MSG_NOTE, lower_bound);
3496 dump_printf (MSG_NOTE, ")\n");
3498 vect_check_lower_bound (loop_vinfo, DR_STEP (dr_a), unsigned_p,
3499 lower_bound);
3500 continue;
3503 dr_group_first_a = DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
3504 if (dr_group_first_a)
3506 stmt_a = dr_group_first_a;
3507 dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
3510 dr_group_first_b = DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
3511 if (dr_group_first_b)
3513 stmt_b = dr_group_first_b;
3514 dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
3517 if (ignore_step_p)
3519 segment_length_a = size_zero_node;
3520 segment_length_b = size_zero_node;
3522 else
3524 if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
3525 length_factor = scalar_loop_iters;
3526 else
3527 length_factor = size_int (vect_factor);
3528 segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
3529 segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
3531 access_size_a = vect_vfa_access_size (dr_a);
3532 access_size_b = vect_vfa_access_size (dr_b);
3533 align_a = vect_vfa_align (dr_a);
3534 align_b = vect_vfa_align (dr_b);
3536 comp_res = data_ref_compare_tree (DR_BASE_ADDRESS (dr_a),
3537 DR_BASE_ADDRESS (dr_b));
3538 if (comp_res == 0)
3539 comp_res = data_ref_compare_tree (DR_OFFSET (dr_a),
3540 DR_OFFSET (dr_b));
3542 /* See whether the alias is known at compilation time. */
3543 if (comp_res == 0
3544 && TREE_CODE (DR_STEP (dr_a)) == INTEGER_CST
3545 && TREE_CODE (DR_STEP (dr_b)) == INTEGER_CST
3546 && poly_int_tree_p (segment_length_a)
3547 && poly_int_tree_p (segment_length_b))
3549 int res = vect_compile_time_alias (dr_a, dr_b,
3550 segment_length_a,
3551 segment_length_b,
3552 access_size_a,
3553 access_size_b);
3554 if (res >= 0 && dump_enabled_p ())
3556 dump_printf_loc (MSG_NOTE, vect_location,
3557 "can tell at compile time that ");
3558 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
3559 dump_printf (MSG_NOTE, " and ");
3560 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
3561 if (res == 0)
3562 dump_printf (MSG_NOTE, " do not alias\n");
3563 else
3564 dump_printf (MSG_NOTE, " alias\n");
3567 if (res == 0)
3568 continue;
3570 if (res == 1)
3572 if (dump_enabled_p ())
3573 dump_printf_loc (MSG_NOTE, vect_location,
3574 "not vectorized: compilation time alias.\n");
3575 return false;
3579 dr_with_seg_len_pair_t dr_with_seg_len_pair
3580 (dr_with_seg_len (dr_a, segment_length_a, access_size_a, align_a),
3581 dr_with_seg_len (dr_b, segment_length_b, access_size_b, align_b));
3583 /* Canonicalize pairs by sorting the two DR members. */
3584 if (comp_res > 0)
3585 std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
3587 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3590 prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);
3592 unsigned int count = (comp_alias_ddrs.length ()
3593 + check_unequal_addrs.length ());
3595 dump_printf_loc (MSG_NOTE, vect_location,
3596 "improved number of alias checks from %d to %d\n",
3597 may_alias_ddrs.length (), count);
3598 if ((int) count > PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
3600 if (dump_enabled_p ())
3601 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3602 "number of versioning for alias "
3603 "run-time tests exceeds %d "
3604 "(--param vect-max-version-for-alias-checks)\n",
3605 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
3606 return false;
3609 return true;
3612 /* Check whether we can use an internal function for a gather load
3613 or scatter store. READ_P is true for loads and false for stores.
3614 MASKED_P is true if the load or store is conditional. MEMORY_TYPE is
3615 the type of the memory elements being loaded or stored. OFFSET_BITS
3616 is the number of bits in each scalar offset and OFFSET_SIGN is the
3617 sign of the offset. SCALE is the amount by which the offset should
3618 be multiplied *after* it has been converted to address width.
3620 Return true if the function is supported, storing the function
3621 id in *IFN_OUT and the type of a vector element in *ELEMENT_TYPE_OUT. */
3623 bool
3624 vect_gather_scatter_fn_p (bool read_p, bool masked_p, tree vectype,
3625 tree memory_type, unsigned int offset_bits,
3626 signop offset_sign, int scale,
3627 internal_fn *ifn_out, tree *element_type_out)
3629 unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
3630 unsigned int element_bits = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (vectype)));
3631 if (offset_bits > element_bits)
3632 /* Internal functions require the offset to be the same width as
3633 the vector elements. We can extend narrower offsets, but it isn't
3634 safe to truncate wider offsets. */
3635 return false;
3637 if (element_bits != memory_bits)
3638 /* For now the vector elements must be the same width as the
3639 memory elements. */
3640 return false;
3642 /* Work out which function we need. */
3643 internal_fn ifn;
3644 if (read_p)
3645 ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
3646 else
3647 ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
3649 /* Test whether the target supports this combination. */
3650 if (!internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
3651 offset_sign, scale))
3652 return false;
3654 *ifn_out = ifn;
3655 *element_type_out = TREE_TYPE (vectype);
3656 return true;
3659 /* CALL is a call to an internal gather load or scatter store function.
3660 Describe the operation in INFO. */
3662 static void
3663 vect_describe_gather_scatter_call (gcall *call, gather_scatter_info *info)
3665 stmt_vec_info stmt_info = vinfo_for_stmt (call);
3666 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3667 data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3669 info->ifn = gimple_call_internal_fn (call);
3670 info->decl = NULL_TREE;
3671 info->base = gimple_call_arg (call, 0);
3672 info->offset = gimple_call_arg (call, 1);
3673 info->offset_dt = vect_unknown_def_type;
3674 info->offset_vectype = NULL_TREE;
3675 info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
3676 info->element_type = TREE_TYPE (vectype);
3677 info->memory_type = TREE_TYPE (DR_REF (dr));
3680 /* Return true if a non-affine read or write in STMT is suitable for a
3681 gather load or scatter store. Describe the operation in *INFO if so. */
3683 bool
3684 vect_check_gather_scatter (gimple *stmt, loop_vec_info loop_vinfo,
3685 gather_scatter_info *info)
3687 HOST_WIDE_INT scale = 1;
3688 poly_int64 pbitpos, pbitsize;
3689 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3690 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3691 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3692 tree offtype = NULL_TREE;
3693 tree decl = NULL_TREE, base, off;
3694 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3695 tree memory_type = TREE_TYPE (DR_REF (dr));
3696 machine_mode pmode;
3697 int punsignedp, reversep, pvolatilep = 0;
3698 internal_fn ifn;
3699 tree element_type;
3700 bool masked_p = false;
3702 /* See whether this is already a call to a gather/scatter internal function.
3703 If not, see whether it's a masked load or store. */
3704 gcall *call = dyn_cast <gcall *> (stmt);
3705 if (call && gimple_call_internal_p (call))
3707 ifn = gimple_call_internal_fn (stmt);
3708 if (internal_gather_scatter_fn_p (ifn))
3710 vect_describe_gather_scatter_call (call, info);
3711 return true;
3713 masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE);
3716 /* True if we should aim to use internal functions rather than
3717 built-in functions. */
3718 bool use_ifn_p = (DR_IS_READ (dr)
3719 ? supports_vec_gather_load_p ()
3720 : supports_vec_scatter_store_p ());
3722 base = DR_REF (dr);
3723 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3724 see if we can use the def stmt of the address. */
3725 if (masked_p
3726 && TREE_CODE (base) == MEM_REF
3727 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3728 && integer_zerop (TREE_OPERAND (base, 1))
3729 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3731 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3732 if (is_gimple_assign (def_stmt)
3733 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3734 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3737 /* The gather and scatter builtins need address of the form
3738 loop_invariant + vector * {1, 2, 4, 8}
3740 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3741 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3742 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3743 multiplications and additions in it. To get a vector, we need
3744 a single SSA_NAME that will be defined in the loop and will
3745 contain everything that is not loop invariant and that can be
3746 vectorized. The following code attempts to find such a preexistng
3747 SSA_NAME OFF and put the loop invariants into a tree BASE
3748 that can be gimplified before the loop. */
3749 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3750 &punsignedp, &reversep, &pvolatilep);
3751 gcc_assert (base && !reversep);
3752 poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
3754 if (TREE_CODE (base) == MEM_REF)
3756 if (!integer_zerop (TREE_OPERAND (base, 1)))
3758 if (off == NULL_TREE)
3759 off = wide_int_to_tree (sizetype, mem_ref_offset (base));
3760 else
3761 off = size_binop (PLUS_EXPR, off,
3762 fold_convert (sizetype, TREE_OPERAND (base, 1)));
3764 base = TREE_OPERAND (base, 0);
3766 else
3767 base = build_fold_addr_expr (base);
3769 if (off == NULL_TREE)
3770 off = size_zero_node;
3772 /* If base is not loop invariant, either off is 0, then we start with just
3773 the constant offset in the loop invariant BASE and continue with base
3774 as OFF, otherwise give up.
3775 We could handle that case by gimplifying the addition of base + off
3776 into some SSA_NAME and use that as off, but for now punt. */
3777 if (!expr_invariant_in_loop_p (loop, base))
3779 if (!integer_zerop (off))
3780 return false;
3781 off = base;
3782 base = size_int (pbytepos);
3784 /* Otherwise put base + constant offset into the loop invariant BASE
3785 and continue with OFF. */
3786 else
3788 base = fold_convert (sizetype, base);
3789 base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
3792 /* OFF at this point may be either a SSA_NAME or some tree expression
3793 from get_inner_reference. Try to peel off loop invariants from it
3794 into BASE as long as possible. */
3795 STRIP_NOPS (off);
3796 while (offtype == NULL_TREE)
3798 enum tree_code code;
3799 tree op0, op1, add = NULL_TREE;
3801 if (TREE_CODE (off) == SSA_NAME)
3803 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3805 if (expr_invariant_in_loop_p (loop, off))
3806 return false;
3808 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3809 break;
3811 op0 = gimple_assign_rhs1 (def_stmt);
3812 code = gimple_assign_rhs_code (def_stmt);
3813 op1 = gimple_assign_rhs2 (def_stmt);
3815 else
3817 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3818 return false;
3819 code = TREE_CODE (off);
3820 extract_ops_from_tree (off, &code, &op0, &op1);
3822 switch (code)
3824 case POINTER_PLUS_EXPR:
3825 case PLUS_EXPR:
3826 if (expr_invariant_in_loop_p (loop, op0))
3828 add = op0;
3829 off = op1;
3830 do_add:
3831 add = fold_convert (sizetype, add);
3832 if (scale != 1)
3833 add = size_binop (MULT_EXPR, add, size_int (scale));
3834 base = size_binop (PLUS_EXPR, base, add);
3835 continue;
3837 if (expr_invariant_in_loop_p (loop, op1))
3839 add = op1;
3840 off = op0;
3841 goto do_add;
3843 break;
3844 case MINUS_EXPR:
3845 if (expr_invariant_in_loop_p (loop, op1))
3847 add = fold_convert (sizetype, op1);
3848 add = size_binop (MINUS_EXPR, size_zero_node, add);
3849 off = op0;
3850 goto do_add;
3852 break;
3853 case MULT_EXPR:
3854 if (scale == 1 && tree_fits_shwi_p (op1))
3856 int new_scale = tree_to_shwi (op1);
3857 /* Only treat this as a scaling operation if the target
3858 supports it. */
3859 if (use_ifn_p
3860 && !vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p,
3861 vectype, memory_type, 1,
3862 TYPE_SIGN (TREE_TYPE (op0)),
3863 new_scale, &ifn,
3864 &element_type))
3865 break;
3866 scale = new_scale;
3867 off = op0;
3868 continue;
3870 break;
3871 case SSA_NAME:
3872 off = op0;
3873 continue;
3874 CASE_CONVERT:
3875 if (!POINTER_TYPE_P (TREE_TYPE (op0))
3876 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3877 break;
3878 if (TYPE_PRECISION (TREE_TYPE (op0))
3879 == TYPE_PRECISION (TREE_TYPE (off)))
3881 off = op0;
3882 continue;
3885 /* The internal functions need the offset to be the same width
3886 as the elements of VECTYPE. Don't include operations that
3887 cast the offset from that width to a different width. */
3888 if (use_ifn_p
3889 && (int_size_in_bytes (TREE_TYPE (vectype))
3890 == int_size_in_bytes (TREE_TYPE (off))))
3891 break;
3893 if (TYPE_PRECISION (TREE_TYPE (op0))
3894 < TYPE_PRECISION (TREE_TYPE (off)))
3896 off = op0;
3897 offtype = TREE_TYPE (off);
3898 STRIP_NOPS (off);
3899 continue;
3901 break;
3902 default:
3903 break;
3905 break;
3908 /* If at the end OFF still isn't a SSA_NAME or isn't
3909 defined in the loop, punt. */
3910 if (TREE_CODE (off) != SSA_NAME
3911 || expr_invariant_in_loop_p (loop, off))
3912 return false;
3914 if (offtype == NULL_TREE)
3915 offtype = TREE_TYPE (off);
3917 if (use_ifn_p)
3919 if (!vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p, vectype,
3920 memory_type, TYPE_PRECISION (offtype),
3921 TYPE_SIGN (offtype), scale, &ifn,
3922 &element_type))
3923 return false;
3925 else
3927 if (DR_IS_READ (dr))
3929 if (targetm.vectorize.builtin_gather)
3930 decl = targetm.vectorize.builtin_gather (vectype, offtype, scale);
3932 else
3934 if (targetm.vectorize.builtin_scatter)
3935 decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale);
3938 if (!decl)
3939 return false;
3941 ifn = IFN_LAST;
3942 element_type = TREE_TYPE (vectype);
3945 info->ifn = ifn;
3946 info->decl = decl;
3947 info->base = base;
3948 info->offset = off;
3949 info->offset_dt = vect_unknown_def_type;
3950 info->offset_vectype = NULL_TREE;
3951 info->scale = scale;
3952 info->element_type = element_type;
3953 info->memory_type = memory_type;
3954 return true;
3957 /* Find the data references in STMT, analyze them with respect to LOOP and
3958 append them to DATAREFS. Return false if datarefs in this stmt cannot
3959 be handled. */
3961 bool
3962 vect_find_stmt_data_reference (loop_p loop, gimple *stmt,
3963 vec<data_reference_p> *datarefs)
3965 /* We can ignore clobbers for dataref analysis - they are removed during
3966 loop vectorization and BB vectorization checks dependences with a
3967 stmt walk. */
3968 if (gimple_clobber_p (stmt))
3969 return true;
3971 if (gimple_has_volatile_ops (stmt))
3973 if (dump_enabled_p ())
3975 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3976 "not vectorized: volatile type ");
3977 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3979 return false;
3982 if (stmt_can_throw_internal (stmt))
3984 if (dump_enabled_p ())
3986 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3987 "not vectorized: statement can throw an "
3988 "exception ");
3989 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3991 return false;
3994 auto_vec<data_reference_p, 2> refs;
3995 if (!find_data_references_in_stmt (loop, stmt, &refs))
3996 return false;
3998 if (refs.is_empty ())
3999 return true;
4001 if (refs.length () > 1)
4003 if (dump_enabled_p ())
4005 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4006 "not vectorized: more than one data ref "
4007 "in stmt: ");
4008 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4010 return false;
4013 if (gcall *call = dyn_cast <gcall *> (stmt))
4014 if (!gimple_call_internal_p (call)
4015 || (gimple_call_internal_fn (call) != IFN_MASK_LOAD
4016 && gimple_call_internal_fn (call) != IFN_MASK_STORE))
4018 if (dump_enabled_p ())
4020 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4021 "not vectorized: dr in a call ");
4022 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4024 return false;
4027 data_reference_p dr = refs.pop ();
4028 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
4029 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
4031 if (dump_enabled_p ())
4033 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4034 "not vectorized: statement is bitfield "
4035 "access ");
4036 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4038 return false;
4041 if (DR_BASE_ADDRESS (dr)
4042 && TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
4044 if (dump_enabled_p ())
4045 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4046 "not vectorized: base addr of dr is a "
4047 "constant\n");
4048 return false;
4051 datarefs->safe_push (dr);
4052 return true;
4055 /* Function vect_analyze_data_refs.
4057 Find all the data references in the loop or basic block.
4059 The general structure of the analysis of data refs in the vectorizer is as
4060 follows:
4061 1- vect_analyze_data_refs(loop/bb): call
4062 compute_data_dependences_for_loop/bb to find and analyze all data-refs
4063 in the loop/bb and their dependences.
4064 2- vect_analyze_dependences(): apply dependence testing using ddrs.
4065 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
4066 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
4070 bool
4071 vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf)
4073 struct loop *loop = NULL;
4074 unsigned int i;
4075 struct data_reference *dr;
4076 tree scalar_type;
4078 if (dump_enabled_p ())
4079 dump_printf_loc (MSG_NOTE, vect_location,
4080 "=== vect_analyze_data_refs ===\n");
4082 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4083 loop = LOOP_VINFO_LOOP (loop_vinfo);
4085 /* Go through the data-refs, check that the analysis succeeded. Update
4086 pointer from stmt_vec_info struct to DR and vectype. */
4088 vec<data_reference_p> datarefs = vinfo->datarefs;
4089 FOR_EACH_VEC_ELT (datarefs, i, dr)
4091 gimple *stmt;
4092 stmt_vec_info stmt_info;
4093 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
4094 bool simd_lane_access = false;
4095 poly_uint64 vf;
4097 gcc_assert (DR_REF (dr));
4098 stmt = vect_dr_stmt (dr);
4099 stmt_info = vinfo_for_stmt (stmt);
4101 /* Check that analysis of the data-ref succeeded. */
4102 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
4103 || !DR_STEP (dr))
4105 bool maybe_gather
4106 = DR_IS_READ (dr)
4107 && !TREE_THIS_VOLATILE (DR_REF (dr))
4108 && (targetm.vectorize.builtin_gather != NULL
4109 || supports_vec_gather_load_p ());
4110 bool maybe_scatter
4111 = DR_IS_WRITE (dr)
4112 && !TREE_THIS_VOLATILE (DR_REF (dr))
4113 && (targetm.vectorize.builtin_scatter != NULL
4114 || supports_vec_scatter_store_p ());
4115 bool maybe_simd_lane_access
4116 = is_a <loop_vec_info> (vinfo) && loop->simduid;
4118 /* If target supports vector gather loads or scatter stores, or if
4119 this might be a SIMD lane access, see if they can't be used. */
4120 if (is_a <loop_vec_info> (vinfo)
4121 && (maybe_gather || maybe_scatter || maybe_simd_lane_access)
4122 && !nested_in_vect_loop_p (loop, stmt))
4124 struct data_reference *newdr
4125 = create_data_ref (NULL, loop_containing_stmt (stmt),
4126 DR_REF (dr), stmt, !maybe_scatter,
4127 DR_IS_CONDITIONAL_IN_STMT (dr));
4128 gcc_assert (newdr != NULL && DR_REF (newdr));
4129 if (DR_BASE_ADDRESS (newdr)
4130 && DR_OFFSET (newdr)
4131 && DR_INIT (newdr)
4132 && DR_STEP (newdr)
4133 && integer_zerop (DR_STEP (newdr)))
4135 if (maybe_simd_lane_access)
4137 tree off = DR_OFFSET (newdr);
4138 STRIP_NOPS (off);
4139 if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
4140 && TREE_CODE (off) == MULT_EXPR
4141 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
4143 tree step = TREE_OPERAND (off, 1);
4144 off = TREE_OPERAND (off, 0);
4145 STRIP_NOPS (off);
4146 if (CONVERT_EXPR_P (off)
4147 && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
4148 0)))
4149 < TYPE_PRECISION (TREE_TYPE (off)))
4150 off = TREE_OPERAND (off, 0);
4151 if (TREE_CODE (off) == SSA_NAME)
4153 gimple *def = SSA_NAME_DEF_STMT (off);
4154 tree reft = TREE_TYPE (DR_REF (newdr));
4155 if (is_gimple_call (def)
4156 && gimple_call_internal_p (def)
4157 && (gimple_call_internal_fn (def)
4158 == IFN_GOMP_SIMD_LANE))
4160 tree arg = gimple_call_arg (def, 0);
4161 gcc_assert (TREE_CODE (arg) == SSA_NAME);
4162 arg = SSA_NAME_VAR (arg);
4163 if (arg == loop->simduid
4164 /* For now. */
4165 && tree_int_cst_equal
4166 (TYPE_SIZE_UNIT (reft),
4167 step))
4169 DR_OFFSET (newdr) = ssize_int (0);
4170 DR_STEP (newdr) = step;
4171 DR_OFFSET_ALIGNMENT (newdr)
4172 = BIGGEST_ALIGNMENT;
4173 DR_STEP_ALIGNMENT (newdr)
4174 = highest_pow2_factor (step);
4175 dr = newdr;
4176 simd_lane_access = true;
4182 if (!simd_lane_access && (maybe_gather || maybe_scatter))
4184 dr = newdr;
4185 if (maybe_gather)
4186 gatherscatter = GATHER;
4187 else
4188 gatherscatter = SCATTER;
4191 if (gatherscatter == SG_NONE && !simd_lane_access)
4192 free_data_ref (newdr);
4195 if (gatherscatter == SG_NONE && !simd_lane_access)
4197 if (dump_enabled_p ())
4199 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4200 "not vectorized: data ref analysis "
4201 "failed ");
4202 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4204 if (is_a <bb_vec_info> (vinfo))
4206 /* In BB vectorization the ref can still participate
4207 in dependence analysis, we just can't vectorize it. */
4208 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4209 continue;
4211 return false;
4215 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == ADDR_EXPR
4216 && VAR_P (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0))
4217 && DECL_NONALIASED (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0)))
4219 if (dump_enabled_p ())
4221 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4222 "not vectorized: base object not addressable "
4223 "for stmt: ");
4224 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4226 if (is_a <bb_vec_info> (vinfo))
4228 /* In BB vectorization the ref can still participate
4229 in dependence analysis, we just can't vectorize it. */
4230 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4231 continue;
4233 return false;
4236 if (is_a <loop_vec_info> (vinfo)
4237 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
4239 if (nested_in_vect_loop_p (loop, stmt))
4241 if (dump_enabled_p ())
4243 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4244 "not vectorized: not suitable for strided "
4245 "load ");
4246 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4248 return false;
4250 STMT_VINFO_STRIDED_P (stmt_info) = true;
4253 /* Update DR field in stmt_vec_info struct. */
4255 /* If the dataref is in an inner-loop of the loop that is considered for
4256 for vectorization, we also want to analyze the access relative to
4257 the outer-loop (DR contains information only relative to the
4258 inner-most enclosing loop). We do that by building a reference to the
4259 first location accessed by the inner-loop, and analyze it relative to
4260 the outer-loop. */
4261 if (loop && nested_in_vect_loop_p (loop, stmt))
4263 /* Build a reference to the first location accessed by the
4264 inner loop: *(BASE + INIT + OFFSET). By construction,
4265 this address must be invariant in the inner loop, so we
4266 can consider it as being used in the outer loop. */
4267 tree base = unshare_expr (DR_BASE_ADDRESS (dr));
4268 tree offset = unshare_expr (DR_OFFSET (dr));
4269 tree init = unshare_expr (DR_INIT (dr));
4270 tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
4271 init, offset);
4272 tree init_addr = fold_build_pointer_plus (base, init_offset);
4273 tree init_ref = build_fold_indirect_ref (init_addr);
4275 if (dump_enabled_p ())
4277 dump_printf_loc (MSG_NOTE, vect_location,
4278 "analyze in outer loop: ");
4279 dump_generic_expr (MSG_NOTE, TDF_SLIM, init_ref);
4280 dump_printf (MSG_NOTE, "\n");
4283 if (!dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
4284 init_ref, loop))
4285 /* dr_analyze_innermost already explained the failure. */
4286 return false;
4288 if (dump_enabled_p ())
4290 dump_printf_loc (MSG_NOTE, vect_location,
4291 "\touter base_address: ");
4292 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4293 STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
4294 dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
4295 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4296 STMT_VINFO_DR_OFFSET (stmt_info));
4297 dump_printf (MSG_NOTE,
4298 "\n\touter constant offset from base address: ");
4299 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4300 STMT_VINFO_DR_INIT (stmt_info));
4301 dump_printf (MSG_NOTE, "\n\touter step: ");
4302 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4303 STMT_VINFO_DR_STEP (stmt_info));
4304 dump_printf (MSG_NOTE, "\n\touter base alignment: %d\n",
4305 STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info));
4306 dump_printf (MSG_NOTE, "\n\touter base misalignment: %d\n",
4307 STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info));
4308 dump_printf (MSG_NOTE, "\n\touter offset alignment: %d\n",
4309 STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info));
4310 dump_printf (MSG_NOTE, "\n\touter step alignment: %d\n",
4311 STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
4315 gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
4316 STMT_VINFO_DATA_REF (stmt_info) = dr;
4317 if (simd_lane_access)
4319 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
4320 free_data_ref (datarefs[i]);
4321 datarefs[i] = dr;
4324 /* Set vectype for STMT. */
4325 scalar_type = TREE_TYPE (DR_REF (dr));
4326 STMT_VINFO_VECTYPE (stmt_info)
4327 = get_vectype_for_scalar_type (scalar_type);
4328 if (!STMT_VINFO_VECTYPE (stmt_info))
4330 if (dump_enabled_p ())
4332 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4333 "not vectorized: no vectype for stmt: ");
4334 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4335 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
4336 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
4337 scalar_type);
4338 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
4341 if (is_a <bb_vec_info> (vinfo))
4343 /* No vector type is fine, the ref can still participate
4344 in dependence analysis, we just can't vectorize it. */
4345 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4346 continue;
4349 if (gatherscatter != SG_NONE || simd_lane_access)
4351 STMT_VINFO_DATA_REF (stmt_info) = NULL;
4352 if (gatherscatter != SG_NONE)
4353 free_data_ref (dr);
4355 return false;
4357 else
4359 if (dump_enabled_p ())
4361 dump_printf_loc (MSG_NOTE, vect_location,
4362 "got vectype for stmt: ");
4363 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
4364 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4365 STMT_VINFO_VECTYPE (stmt_info));
4366 dump_printf (MSG_NOTE, "\n");
4370 /* Adjust the minimal vectorization factor according to the
4371 vector type. */
4372 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
4373 *min_vf = upper_bound (*min_vf, vf);
4375 if (gatherscatter != SG_NONE)
4377 gather_scatter_info gs_info;
4378 if (!vect_check_gather_scatter (stmt, as_a <loop_vec_info> (vinfo),
4379 &gs_info)
4380 || !get_vectype_for_scalar_type (TREE_TYPE (gs_info.offset)))
4382 STMT_VINFO_DATA_REF (stmt_info) = NULL;
4383 free_data_ref (dr);
4384 if (dump_enabled_p ())
4386 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4387 (gatherscatter == GATHER) ?
4388 "not vectorized: not suitable for gather "
4389 "load " :
4390 "not vectorized: not suitable for scatter "
4391 "store ");
4392 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4394 return false;
4397 free_data_ref (datarefs[i]);
4398 datarefs[i] = dr;
4399 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
4403 /* We used to stop processing and prune the list here. Verify we no
4404 longer need to. */
4405 gcc_assert (i == datarefs.length ());
4407 return true;
4411 /* Function vect_get_new_vect_var.
4413 Returns a name for a new variable. The current naming scheme appends the
4414 prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
4415 the name of vectorizer generated variables, and appends that to NAME if
4416 provided. */
4418 tree
4419 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
4421 const char *prefix;
4422 tree new_vect_var;
4424 switch (var_kind)
4426 case vect_simple_var:
4427 prefix = "vect";
4428 break;
4429 case vect_scalar_var:
4430 prefix = "stmp";
4431 break;
4432 case vect_mask_var:
4433 prefix = "mask";
4434 break;
4435 case vect_pointer_var:
4436 prefix = "vectp";
4437 break;
4438 default:
4439 gcc_unreachable ();
4442 if (name)
4444 char* tmp = concat (prefix, "_", name, NULL);
4445 new_vect_var = create_tmp_reg (type, tmp);
4446 free (tmp);
4448 else
4449 new_vect_var = create_tmp_reg (type, prefix);
4451 return new_vect_var;
4454 /* Like vect_get_new_vect_var but return an SSA name. */
4456 tree
4457 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
4459 const char *prefix;
4460 tree new_vect_var;
4462 switch (var_kind)
4464 case vect_simple_var:
4465 prefix = "vect";
4466 break;
4467 case vect_scalar_var:
4468 prefix = "stmp";
4469 break;
4470 case vect_pointer_var:
4471 prefix = "vectp";
4472 break;
4473 default:
4474 gcc_unreachable ();
4477 if (name)
4479 char* tmp = concat (prefix, "_", name, NULL);
4480 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4481 free (tmp);
4483 else
4484 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4486 return new_vect_var;
4489 /* Duplicate ptr info and set alignment/misaligment on NAME from DR. */
4491 static void
4492 vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr)
4494 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr));
4495 int misalign = DR_MISALIGNMENT (dr);
4496 if (misalign == DR_MISALIGNMENT_UNKNOWN)
4497 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4498 else
4499 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name),
4500 DR_TARGET_ALIGNMENT (dr), misalign);
4503 /* Function vect_create_addr_base_for_vector_ref.
4505 Create an expression that computes the address of the first memory location
4506 that will be accessed for a data reference.
4508 Input:
4509 STMT: The statement containing the data reference.
4510 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4511 OFFSET: Optional. If supplied, it is be added to the initial address.
4512 LOOP: Specify relative to which loop-nest should the address be computed.
4513 For example, when the dataref is in an inner-loop nested in an
4514 outer-loop that is now being vectorized, LOOP can be either the
4515 outer-loop, or the inner-loop. The first memory location accessed
4516 by the following dataref ('in' points to short):
4518 for (i=0; i<N; i++)
4519 for (j=0; j<M; j++)
4520 s += in[i+j]
4522 is as follows:
4523 if LOOP=i_loop: &in (relative to i_loop)
4524 if LOOP=j_loop: &in+i*2B (relative to j_loop)
4525 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
4526 initial address. Unlike OFFSET, which is number of elements to
4527 be added, BYTE_OFFSET is measured in bytes.
4529 Output:
4530 1. Return an SSA_NAME whose value is the address of the memory location of
4531 the first vector of the data reference.
4532 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4533 these statement(s) which define the returned SSA_NAME.
4535 FORNOW: We are only handling array accesses with step 1. */
4537 tree
4538 vect_create_addr_base_for_vector_ref (gimple *stmt,
4539 gimple_seq *new_stmt_list,
4540 tree offset,
4541 tree byte_offset)
4543 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4544 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4545 const char *base_name;
4546 tree addr_base;
4547 tree dest;
4548 gimple_seq seq = NULL;
4549 tree vect_ptr_type;
4550 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
4551 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4552 innermost_loop_behavior *drb = vect_dr_behavior (dr);
4554 tree data_ref_base = unshare_expr (drb->base_address);
4555 tree base_offset = unshare_expr (drb->offset);
4556 tree init = unshare_expr (drb->init);
4558 if (loop_vinfo)
4559 base_name = get_name (data_ref_base);
4560 else
4562 base_offset = ssize_int (0);
4563 init = ssize_int (0);
4564 base_name = get_name (DR_REF (dr));
4567 /* Create base_offset */
4568 base_offset = size_binop (PLUS_EXPR,
4569 fold_convert (sizetype, base_offset),
4570 fold_convert (sizetype, init));
4572 if (offset)
4574 offset = fold_build2 (MULT_EXPR, sizetype,
4575 fold_convert (sizetype, offset), step);
4576 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4577 base_offset, offset);
4579 if (byte_offset)
4581 byte_offset = fold_convert (sizetype, byte_offset);
4582 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4583 base_offset, byte_offset);
4586 /* base + base_offset */
4587 if (loop_vinfo)
4588 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4589 else
4591 addr_base = build1 (ADDR_EXPR,
4592 build_pointer_type (TREE_TYPE (DR_REF (dr))),
4593 unshare_expr (DR_REF (dr)));
4596 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4597 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4598 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4599 gimple_seq_add_seq (new_stmt_list, seq);
4601 if (DR_PTR_INFO (dr)
4602 && TREE_CODE (addr_base) == SSA_NAME
4603 && !SSA_NAME_PTR_INFO (addr_base))
4605 vect_duplicate_ssa_name_ptr_info (addr_base, dr);
4606 if (offset || byte_offset)
4607 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4610 if (dump_enabled_p ())
4612 dump_printf_loc (MSG_NOTE, vect_location, "created ");
4613 dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
4614 dump_printf (MSG_NOTE, "\n");
4617 return addr_base;
4621 /* Function vect_create_data_ref_ptr.
4623 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4624 location accessed in the loop by STMT, along with the def-use update
4625 chain to appropriately advance the pointer through the loop iterations.
4626 Also set aliasing information for the pointer. This pointer is used by
4627 the callers to this function to create a memory reference expression for
4628 vector load/store access.
4630 Input:
4631 1. STMT: a stmt that references memory. Expected to be of the form
4632 GIMPLE_ASSIGN <name, data-ref> or
4633 GIMPLE_ASSIGN <data-ref, name>.
4634 2. AGGR_TYPE: the type of the reference, which should be either a vector
4635 or an array.
4636 3. AT_LOOP: the loop where the vector memref is to be created.
4637 4. OFFSET (optional): an offset to be added to the initial address accessed
4638 by the data-ref in STMT.
4639 5. BSI: location where the new stmts are to be placed if there is no loop
4640 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4641 pointing to the initial address.
4642 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4643 to the initial address accessed by the data-ref in STMT. This is
4644 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4645 in bytes.
4646 8. IV_STEP (optional, defaults to NULL): the amount that should be added
4647 to the IV during each iteration of the loop. NULL says to move
4648 by one copy of AGGR_TYPE up or down, depending on the step of the
4649 data reference.
4651 Output:
4652 1. Declare a new ptr to vector_type, and have it point to the base of the
4653 data reference (initial addressed accessed by the data reference).
4654 For example, for vector of type V8HI, the following code is generated:
4656 v8hi *ap;
4657 ap = (v8hi *)initial_address;
4659 if OFFSET is not supplied:
4660 initial_address = &a[init];
4661 if OFFSET is supplied:
4662 initial_address = &a[init + OFFSET];
4663 if BYTE_OFFSET is supplied:
4664 initial_address = &a[init] + BYTE_OFFSET;
4666 Return the initial_address in INITIAL_ADDRESS.
4668 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4669 update the pointer in each iteration of the loop.
4671 Return the increment stmt that updates the pointer in PTR_INCR.
4673 3. Set INV_P to true if the access pattern of the data reference in the
4674 vectorized loop is invariant. Set it to false otherwise.
4676 4. Return the pointer. */
4678 tree
4679 vect_create_data_ref_ptr (gimple *stmt, tree aggr_type, struct loop *at_loop,
4680 tree offset, tree *initial_address,
4681 gimple_stmt_iterator *gsi, gimple **ptr_incr,
4682 bool only_init, bool *inv_p, tree byte_offset,
4683 tree iv_step)
4685 const char *base_name;
4686 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4687 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4688 struct loop *loop = NULL;
4689 bool nested_in_vect_loop = false;
4690 struct loop *containing_loop = NULL;
4691 tree aggr_ptr_type;
4692 tree aggr_ptr;
4693 tree new_temp;
4694 gimple_seq new_stmt_list = NULL;
4695 edge pe = NULL;
4696 basic_block new_bb;
4697 tree aggr_ptr_init;
4698 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4699 tree aptr;
4700 gimple_stmt_iterator incr_gsi;
4701 bool insert_after;
4702 tree indx_before_incr, indx_after_incr;
4703 gimple *incr;
4704 tree step;
4705 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4707 gcc_assert (iv_step != NULL_TREE
4708 || TREE_CODE (aggr_type) == ARRAY_TYPE
4709 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4711 if (loop_vinfo)
4713 loop = LOOP_VINFO_LOOP (loop_vinfo);
4714 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4715 containing_loop = (gimple_bb (stmt))->loop_father;
4716 pe = loop_preheader_edge (loop);
4718 else
4720 gcc_assert (bb_vinfo);
4721 only_init = true;
4722 *ptr_incr = NULL;
4725 /* Check the step (evolution) of the load in LOOP, and record
4726 whether it's invariant. */
4727 step = vect_dr_behavior (dr)->step;
4728 if (integer_zerop (step))
4729 *inv_p = true;
4730 else
4731 *inv_p = false;
4733 /* Create an expression for the first address accessed by this load
4734 in LOOP. */
4735 base_name = get_name (DR_BASE_ADDRESS (dr));
4737 if (dump_enabled_p ())
4739 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4740 dump_printf_loc (MSG_NOTE, vect_location,
4741 "create %s-pointer variable to type: ",
4742 get_tree_code_name (TREE_CODE (aggr_type)));
4743 dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4744 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4745 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4746 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4747 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4748 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4749 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4750 else
4751 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4752 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4753 dump_printf (MSG_NOTE, "\n");
4756 /* (1) Create the new aggregate-pointer variable.
4757 Vector and array types inherit the alias set of their component
4758 type by default so we need to use a ref-all pointer if the data
4759 reference does not conflict with the created aggregated data
4760 reference because it is not addressable. */
4761 bool need_ref_all = false;
4762 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4763 get_alias_set (DR_REF (dr))))
4764 need_ref_all = true;
4765 /* Likewise for any of the data references in the stmt group. */
4766 else if (DR_GROUP_SIZE (stmt_info) > 1)
4768 gimple *orig_stmt = DR_GROUP_FIRST_ELEMENT (stmt_info);
4771 stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
4772 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4773 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4774 get_alias_set (DR_REF (sdr))))
4776 need_ref_all = true;
4777 break;
4779 orig_stmt = DR_GROUP_NEXT_ELEMENT (sinfo);
4781 while (orig_stmt);
4783 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4784 need_ref_all);
4785 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4788 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4789 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4790 def-use update cycles for the pointer: one relative to the outer-loop
4791 (LOOP), which is what steps (3) and (4) below do. The other is relative
4792 to the inner-loop (which is the inner-most loop containing the dataref),
4793 and this is done be step (5) below.
4795 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4796 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4797 redundant. Steps (3),(4) create the following:
4799 vp0 = &base_addr;
4800 LOOP: vp1 = phi(vp0,vp2)
4803 vp2 = vp1 + step
4804 goto LOOP
4806 If there is an inner-loop nested in loop, then step (5) will also be
4807 applied, and an additional update in the inner-loop will be created:
4809 vp0 = &base_addr;
4810 LOOP: vp1 = phi(vp0,vp2)
4812 inner: vp3 = phi(vp1,vp4)
4813 vp4 = vp3 + inner_step
4814 if () goto inner
4816 vp2 = vp1 + step
4817 if () goto LOOP */
4819 /* (2) Calculate the initial address of the aggregate-pointer, and set
4820 the aggregate-pointer to point to it before the loop. */
4822 /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader. */
4824 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
4825 offset, byte_offset);
4826 if (new_stmt_list)
4828 if (pe)
4830 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4831 gcc_assert (!new_bb);
4833 else
4834 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4837 *initial_address = new_temp;
4838 aggr_ptr_init = new_temp;
4840 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4841 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4842 inner-loop nested in LOOP (during outer-loop vectorization). */
4844 /* No update in loop is required. */
4845 if (only_init && (!loop_vinfo || at_loop == loop))
4846 aptr = aggr_ptr_init;
4847 else
4849 if (iv_step == NULL_TREE)
4851 /* The step of the aggregate pointer is the type size. */
4852 iv_step = TYPE_SIZE_UNIT (aggr_type);
4853 /* One exception to the above is when the scalar step of the load in
4854 LOOP is zero. In this case the step here is also zero. */
4855 if (*inv_p)
4856 iv_step = size_zero_node;
4857 else if (tree_int_cst_sgn (step) == -1)
4858 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4861 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4863 create_iv (aggr_ptr_init,
4864 fold_convert (aggr_ptr_type, iv_step),
4865 aggr_ptr, loop, &incr_gsi, insert_after,
4866 &indx_before_incr, &indx_after_incr);
4867 incr = gsi_stmt (incr_gsi);
4868 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4870 /* Copy the points-to information if it exists. */
4871 if (DR_PTR_INFO (dr))
4873 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr);
4874 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr);
4876 if (ptr_incr)
4877 *ptr_incr = incr;
4879 aptr = indx_before_incr;
4882 if (!nested_in_vect_loop || only_init)
4883 return aptr;
4886 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4887 nested in LOOP, if exists. */
4889 gcc_assert (nested_in_vect_loop);
4890 if (!only_init)
4892 standard_iv_increment_position (containing_loop, &incr_gsi,
4893 &insert_after);
4894 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4895 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4896 &indx_after_incr);
4897 incr = gsi_stmt (incr_gsi);
4898 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4900 /* Copy the points-to information if it exists. */
4901 if (DR_PTR_INFO (dr))
4903 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr);
4904 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr);
4906 if (ptr_incr)
4907 *ptr_incr = incr;
4909 return indx_before_incr;
4911 else
4912 gcc_unreachable ();
4916 /* Function bump_vector_ptr
4918 Increment a pointer (to a vector type) by vector-size. If requested,
4919 i.e. if PTR-INCR is given, then also connect the new increment stmt
4920 to the existing def-use update-chain of the pointer, by modifying
4921 the PTR_INCR as illustrated below:
4923 The pointer def-use update-chain before this function:
4924 DATAREF_PTR = phi (p_0, p_2)
4925 ....
4926 PTR_INCR: p_2 = DATAREF_PTR + step
4928 The pointer def-use update-chain after this function:
4929 DATAREF_PTR = phi (p_0, p_2)
4930 ....
4931 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4932 ....
4933 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4935 Input:
4936 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4937 in the loop.
4938 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4939 the loop. The increment amount across iterations is expected
4940 to be vector_size.
4941 BSI - location where the new update stmt is to be placed.
4942 STMT - the original scalar memory-access stmt that is being vectorized.
4943 BUMP - optional. The offset by which to bump the pointer. If not given,
4944 the offset is assumed to be vector_size.
4946 Output: Return NEW_DATAREF_PTR as illustrated above.
4950 tree
4951 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4952 gimple *stmt, tree bump)
4954 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4955 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4956 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4957 tree update = TYPE_SIZE_UNIT (vectype);
4958 gassign *incr_stmt;
4959 ssa_op_iter iter;
4960 use_operand_p use_p;
4961 tree new_dataref_ptr;
4963 if (bump)
4964 update = bump;
4966 if (TREE_CODE (dataref_ptr) == SSA_NAME)
4967 new_dataref_ptr = copy_ssa_name (dataref_ptr);
4968 else
4969 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4970 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4971 dataref_ptr, update);
4972 vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4974 /* Copy the points-to information if it exists. */
4975 if (DR_PTR_INFO (dr))
4977 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4978 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4981 if (!ptr_incr)
4982 return new_dataref_ptr;
4984 /* Update the vector-pointer's cross-iteration increment. */
4985 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4987 tree use = USE_FROM_PTR (use_p);
4989 if (use == dataref_ptr)
4990 SET_USE (use_p, new_dataref_ptr);
4991 else
4992 gcc_assert (operand_equal_p (use, update, 0));
4995 return new_dataref_ptr;
4999 /* Copy memory reference info such as base/clique from the SRC reference
5000 to the DEST MEM_REF. */
5002 void
5003 vect_copy_ref_info (tree dest, tree src)
5005 if (TREE_CODE (dest) != MEM_REF)
5006 return;
5008 tree src_base = src;
5009 while (handled_component_p (src_base))
5010 src_base = TREE_OPERAND (src_base, 0);
5011 if (TREE_CODE (src_base) != MEM_REF
5012 && TREE_CODE (src_base) != TARGET_MEM_REF)
5013 return;
5015 MR_DEPENDENCE_CLIQUE (dest) = MR_DEPENDENCE_CLIQUE (src_base);
5016 MR_DEPENDENCE_BASE (dest) = MR_DEPENDENCE_BASE (src_base);
5020 /* Function vect_create_destination_var.
5022 Create a new temporary of type VECTYPE. */
5024 tree
5025 vect_create_destination_var (tree scalar_dest, tree vectype)
5027 tree vec_dest;
5028 const char *name;
5029 char *new_name;
5030 tree type;
5031 enum vect_var_kind kind;
5033 kind = vectype
5034 ? VECTOR_BOOLEAN_TYPE_P (vectype)
5035 ? vect_mask_var
5036 : vect_simple_var
5037 : vect_scalar_var;
5038 type = vectype ? vectype : TREE_TYPE (scalar_dest);
5040 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
5042 name = get_name (scalar_dest);
5043 if (name)
5044 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
5045 else
5046 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
5047 vec_dest = vect_get_new_vect_var (type, kind, new_name);
5048 free (new_name);
5050 return vec_dest;
5053 /* Function vect_grouped_store_supported.
5055 Returns TRUE if interleave high and interleave low permutations
5056 are supported, and FALSE otherwise. */
5058 bool
5059 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
5061 machine_mode mode = TYPE_MODE (vectype);
5063 /* vect_permute_store_chain requires the group size to be equal to 3 or
5064 be a power of two. */
5065 if (count != 3 && exact_log2 (count) == -1)
5067 if (dump_enabled_p ())
5068 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5069 "the size of the group of accesses"
5070 " is not a power of 2 or not eqaul to 3\n");
5071 return false;
5074 /* Check that the permutation is supported. */
5075 if (VECTOR_MODE_P (mode))
5077 unsigned int i;
5078 if (count == 3)
5080 unsigned int j0 = 0, j1 = 0, j2 = 0;
5081 unsigned int i, j;
5083 unsigned int nelt;
5084 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5086 if (dump_enabled_p ())
5087 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5088 "cannot handle groups of 3 stores for"
5089 " variable-length vectors\n");
5090 return false;
5093 vec_perm_builder sel (nelt, nelt, 1);
5094 sel.quick_grow (nelt);
5095 vec_perm_indices indices;
5096 for (j = 0; j < 3; j++)
5098 int nelt0 = ((3 - j) * nelt) % 3;
5099 int nelt1 = ((3 - j) * nelt + 1) % 3;
5100 int nelt2 = ((3 - j) * nelt + 2) % 3;
5101 for (i = 0; i < nelt; i++)
5103 if (3 * i + nelt0 < nelt)
5104 sel[3 * i + nelt0] = j0++;
5105 if (3 * i + nelt1 < nelt)
5106 sel[3 * i + nelt1] = nelt + j1++;
5107 if (3 * i + nelt2 < nelt)
5108 sel[3 * i + nelt2] = 0;
5110 indices.new_vector (sel, 2, nelt);
5111 if (!can_vec_perm_const_p (mode, indices))
5113 if (dump_enabled_p ())
5114 dump_printf (MSG_MISSED_OPTIMIZATION,
5115 "permutation op not supported by target.\n");
5116 return false;
5119 for (i = 0; i < nelt; i++)
5121 if (3 * i + nelt0 < nelt)
5122 sel[3 * i + nelt0] = 3 * i + nelt0;
5123 if (3 * i + nelt1 < nelt)
5124 sel[3 * i + nelt1] = 3 * i + nelt1;
5125 if (3 * i + nelt2 < nelt)
5126 sel[3 * i + nelt2] = nelt + j2++;
5128 indices.new_vector (sel, 2, nelt);
5129 if (!can_vec_perm_const_p (mode, indices))
5131 if (dump_enabled_p ())
5132 dump_printf (MSG_MISSED_OPTIMIZATION,
5133 "permutation op not supported by target.\n");
5134 return false;
5137 return true;
5139 else
5141 /* If length is not equal to 3 then only power of 2 is supported. */
5142 gcc_assert (pow2p_hwi (count));
5143 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5145 /* The encoding has 2 interleaved stepped patterns. */
5146 vec_perm_builder sel (nelt, 2, 3);
5147 sel.quick_grow (6);
5148 for (i = 0; i < 3; i++)
5150 sel[i * 2] = i;
5151 sel[i * 2 + 1] = i + nelt;
5153 vec_perm_indices indices (sel, 2, nelt);
5154 if (can_vec_perm_const_p (mode, indices))
5156 for (i = 0; i < 6; i++)
5157 sel[i] += exact_div (nelt, 2);
5158 indices.new_vector (sel, 2, nelt);
5159 if (can_vec_perm_const_p (mode, indices))
5160 return true;
5165 if (dump_enabled_p ())
5166 dump_printf (MSG_MISSED_OPTIMIZATION,
5167 "permutaion op not supported by target.\n");
5168 return false;
5172 /* Return TRUE if vec_{mask_}store_lanes is available for COUNT vectors of
5173 type VECTYPE. MASKED_P says whether the masked form is needed. */
5175 bool
5176 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5177 bool masked_p)
5179 if (masked_p)
5180 return vect_lanes_optab_supported_p ("vec_mask_store_lanes",
5181 vec_mask_store_lanes_optab,
5182 vectype, count);
5183 else
5184 return vect_lanes_optab_supported_p ("vec_store_lanes",
5185 vec_store_lanes_optab,
5186 vectype, count);
5190 /* Function vect_permute_store_chain.
5192 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
5193 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
5194 the data correctly for the stores. Return the final references for stores
5195 in RESULT_CHAIN.
5197 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5198 The input is 4 vectors each containing 8 elements. We assign a number to
5199 each element, the input sequence is:
5201 1st vec: 0 1 2 3 4 5 6 7
5202 2nd vec: 8 9 10 11 12 13 14 15
5203 3rd vec: 16 17 18 19 20 21 22 23
5204 4th vec: 24 25 26 27 28 29 30 31
5206 The output sequence should be:
5208 1st vec: 0 8 16 24 1 9 17 25
5209 2nd vec: 2 10 18 26 3 11 19 27
5210 3rd vec: 4 12 20 28 5 13 21 30
5211 4th vec: 6 14 22 30 7 15 23 31
5213 i.e., we interleave the contents of the four vectors in their order.
5215 We use interleave_high/low instructions to create such output. The input of
5216 each interleave_high/low operation is two vectors:
5217 1st vec 2nd vec
5218 0 1 2 3 4 5 6 7
5219 the even elements of the result vector are obtained left-to-right from the
5220 high/low elements of the first vector. The odd elements of the result are
5221 obtained left-to-right from the high/low elements of the second vector.
5222 The output of interleave_high will be: 0 4 1 5
5223 and of interleave_low: 2 6 3 7
5226 The permutation is done in log LENGTH stages. In each stage interleave_high
5227 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5228 where the first argument is taken from the first half of DR_CHAIN and the
5229 second argument from it's second half.
5230 In our example,
5232 I1: interleave_high (1st vec, 3rd vec)
5233 I2: interleave_low (1st vec, 3rd vec)
5234 I3: interleave_high (2nd vec, 4th vec)
5235 I4: interleave_low (2nd vec, 4th vec)
5237 The output for the first stage is:
5239 I1: 0 16 1 17 2 18 3 19
5240 I2: 4 20 5 21 6 22 7 23
5241 I3: 8 24 9 25 10 26 11 27
5242 I4: 12 28 13 29 14 30 15 31
5244 The output of the second stage, i.e. the final result is:
5246 I1: 0 8 16 24 1 9 17 25
5247 I2: 2 10 18 26 3 11 19 27
5248 I3: 4 12 20 28 5 13 21 30
5249 I4: 6 14 22 30 7 15 23 31. */
5251 void
5252 vect_permute_store_chain (vec<tree> dr_chain,
5253 unsigned int length,
5254 gimple *stmt,
5255 gimple_stmt_iterator *gsi,
5256 vec<tree> *result_chain)
5258 tree vect1, vect2, high, low;
5259 gimple *perm_stmt;
5260 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5261 tree perm_mask_low, perm_mask_high;
5262 tree data_ref;
5263 tree perm3_mask_low, perm3_mask_high;
5264 unsigned int i, j, n, log_length = exact_log2 (length);
5266 result_chain->quick_grow (length);
5267 memcpy (result_chain->address (), dr_chain.address (),
5268 length * sizeof (tree));
5270 if (length == 3)
5272 /* vect_grouped_store_supported ensures that this is constant. */
5273 unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5274 unsigned int j0 = 0, j1 = 0, j2 = 0;
5276 vec_perm_builder sel (nelt, nelt, 1);
5277 sel.quick_grow (nelt);
5278 vec_perm_indices indices;
5279 for (j = 0; j < 3; j++)
5281 int nelt0 = ((3 - j) * nelt) % 3;
5282 int nelt1 = ((3 - j) * nelt + 1) % 3;
5283 int nelt2 = ((3 - j) * nelt + 2) % 3;
5285 for (i = 0; i < nelt; i++)
5287 if (3 * i + nelt0 < nelt)
5288 sel[3 * i + nelt0] = j0++;
5289 if (3 * i + nelt1 < nelt)
5290 sel[3 * i + nelt1] = nelt + j1++;
5291 if (3 * i + nelt2 < nelt)
5292 sel[3 * i + nelt2] = 0;
5294 indices.new_vector (sel, 2, nelt);
5295 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5297 for (i = 0; i < nelt; i++)
5299 if (3 * i + nelt0 < nelt)
5300 sel[3 * i + nelt0] = 3 * i + nelt0;
5301 if (3 * i + nelt1 < nelt)
5302 sel[3 * i + nelt1] = 3 * i + nelt1;
5303 if (3 * i + nelt2 < nelt)
5304 sel[3 * i + nelt2] = nelt + j2++;
5306 indices.new_vector (sel, 2, nelt);
5307 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5309 vect1 = dr_chain[0];
5310 vect2 = dr_chain[1];
5312 /* Create interleaving stmt:
5313 low = VEC_PERM_EXPR <vect1, vect2,
5314 {j, nelt, *, j + 1, nelt + j + 1, *,
5315 j + 2, nelt + j + 2, *, ...}> */
5316 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5317 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5318 vect2, perm3_mask_low);
5319 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5321 vect1 = data_ref;
5322 vect2 = dr_chain[2];
5323 /* Create interleaving stmt:
5324 low = VEC_PERM_EXPR <vect1, vect2,
5325 {0, 1, nelt + j, 3, 4, nelt + j + 1,
5326 6, 7, nelt + j + 2, ...}> */
5327 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5328 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5329 vect2, perm3_mask_high);
5330 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5331 (*result_chain)[j] = data_ref;
5334 else
5336 /* If length is not equal to 3 then only power of 2 is supported. */
5337 gcc_assert (pow2p_hwi (length));
5339 /* The encoding has 2 interleaved stepped patterns. */
5340 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5341 vec_perm_builder sel (nelt, 2, 3);
5342 sel.quick_grow (6);
5343 for (i = 0; i < 3; i++)
5345 sel[i * 2] = i;
5346 sel[i * 2 + 1] = i + nelt;
5348 vec_perm_indices indices (sel, 2, nelt);
5349 perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5351 for (i = 0; i < 6; i++)
5352 sel[i] += exact_div (nelt, 2);
5353 indices.new_vector (sel, 2, nelt);
5354 perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5356 for (i = 0, n = log_length; i < n; i++)
5358 for (j = 0; j < length/2; j++)
5360 vect1 = dr_chain[j];
5361 vect2 = dr_chain[j+length/2];
5363 /* Create interleaving stmt:
5364 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
5365 ...}> */
5366 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
5367 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
5368 vect2, perm_mask_high);
5369 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5370 (*result_chain)[2*j] = high;
5372 /* Create interleaving stmt:
5373 low = VEC_PERM_EXPR <vect1, vect2,
5374 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
5375 ...}> */
5376 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
5377 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
5378 vect2, perm_mask_low);
5379 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5380 (*result_chain)[2*j+1] = low;
5382 memcpy (dr_chain.address (), result_chain->address (),
5383 length * sizeof (tree));
5388 /* Function vect_setup_realignment
5390 This function is called when vectorizing an unaligned load using
5391 the dr_explicit_realign[_optimized] scheme.
5392 This function generates the following code at the loop prolog:
5394 p = initial_addr;
5395 x msq_init = *(floor(p)); # prolog load
5396 realignment_token = call target_builtin;
5397 loop:
5398 x msq = phi (msq_init, ---)
5400 The stmts marked with x are generated only for the case of
5401 dr_explicit_realign_optimized.
5403 The code above sets up a new (vector) pointer, pointing to the first
5404 location accessed by STMT, and a "floor-aligned" load using that pointer.
5405 It also generates code to compute the "realignment-token" (if the relevant
5406 target hook was defined), and creates a phi-node at the loop-header bb
5407 whose arguments are the result of the prolog-load (created by this
5408 function) and the result of a load that takes place in the loop (to be
5409 created by the caller to this function).
5411 For the case of dr_explicit_realign_optimized:
5412 The caller to this function uses the phi-result (msq) to create the
5413 realignment code inside the loop, and sets up the missing phi argument,
5414 as follows:
5415 loop:
5416 msq = phi (msq_init, lsq)
5417 lsq = *(floor(p')); # load in loop
5418 result = realign_load (msq, lsq, realignment_token);
5420 For the case of dr_explicit_realign:
5421 loop:
5422 msq = *(floor(p)); # load in loop
5423 p' = p + (VS-1);
5424 lsq = *(floor(p')); # load in loop
5425 result = realign_load (msq, lsq, realignment_token);
5427 Input:
5428 STMT - (scalar) load stmt to be vectorized. This load accesses
5429 a memory location that may be unaligned.
5430 BSI - place where new code is to be inserted.
5431 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5432 is used.
5434 Output:
5435 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5436 target hook, if defined.
5437 Return value - the result of the loop-header phi node. */
5439 tree
5440 vect_setup_realignment (gimple *stmt, gimple_stmt_iterator *gsi,
5441 tree *realignment_token,
5442 enum dr_alignment_support alignment_support_scheme,
5443 tree init_addr,
5444 struct loop **at_loop)
5446 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5447 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5448 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5449 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
5450 struct loop *loop = NULL;
5451 edge pe = NULL;
5452 tree scalar_dest = gimple_assign_lhs (stmt);
5453 tree vec_dest;
5454 gimple *inc;
5455 tree ptr;
5456 tree data_ref;
5457 basic_block new_bb;
5458 tree msq_init = NULL_TREE;
5459 tree new_temp;
5460 gphi *phi_stmt;
5461 tree msq = NULL_TREE;
5462 gimple_seq stmts = NULL;
5463 bool inv_p;
5464 bool compute_in_loop = false;
5465 bool nested_in_vect_loop = false;
5466 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
5467 struct loop *loop_for_initial_load = NULL;
5469 if (loop_vinfo)
5471 loop = LOOP_VINFO_LOOP (loop_vinfo);
5472 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
5475 gcc_assert (alignment_support_scheme == dr_explicit_realign
5476 || alignment_support_scheme == dr_explicit_realign_optimized);
5478 /* We need to generate three things:
5479 1. the misalignment computation
5480 2. the extra vector load (for the optimized realignment scheme).
5481 3. the phi node for the two vectors from which the realignment is
5482 done (for the optimized realignment scheme). */
5484 /* 1. Determine where to generate the misalignment computation.
5486 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5487 calculation will be generated by this function, outside the loop (in the
5488 preheader). Otherwise, INIT_ADDR had already been computed for us by the
5489 caller, inside the loop.
5491 Background: If the misalignment remains fixed throughout the iterations of
5492 the loop, then both realignment schemes are applicable, and also the
5493 misalignment computation can be done outside LOOP. This is because we are
5494 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5495 are a multiple of VS (the Vector Size), and therefore the misalignment in
5496 different vectorized LOOP iterations is always the same.
5497 The problem arises only if the memory access is in an inner-loop nested
5498 inside LOOP, which is now being vectorized using outer-loop vectorization.
5499 This is the only case when the misalignment of the memory access may not
5500 remain fixed throughout the iterations of the inner-loop (as explained in
5501 detail in vect_supportable_dr_alignment). In this case, not only is the
5502 optimized realignment scheme not applicable, but also the misalignment
5503 computation (and generation of the realignment token that is passed to
5504 REALIGN_LOAD) have to be done inside the loop.
5506 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5507 or not, which in turn determines if the misalignment is computed inside
5508 the inner-loop, or outside LOOP. */
5510 if (init_addr != NULL_TREE || !loop_vinfo)
5512 compute_in_loop = true;
5513 gcc_assert (alignment_support_scheme == dr_explicit_realign);
5517 /* 2. Determine where to generate the extra vector load.
5519 For the optimized realignment scheme, instead of generating two vector
5520 loads in each iteration, we generate a single extra vector load in the
5521 preheader of the loop, and in each iteration reuse the result of the
5522 vector load from the previous iteration. In case the memory access is in
5523 an inner-loop nested inside LOOP, which is now being vectorized using
5524 outer-loop vectorization, we need to determine whether this initial vector
5525 load should be generated at the preheader of the inner-loop, or can be
5526 generated at the preheader of LOOP. If the memory access has no evolution
5527 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5528 to be generated inside LOOP (in the preheader of the inner-loop). */
5530 if (nested_in_vect_loop)
5532 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5533 bool invariant_in_outerloop =
5534 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5535 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5537 else
5538 loop_for_initial_load = loop;
5539 if (at_loop)
5540 *at_loop = loop_for_initial_load;
5542 if (loop_for_initial_load)
5543 pe = loop_preheader_edge (loop_for_initial_load);
5545 /* 3. For the case of the optimized realignment, create the first vector
5546 load at the loop preheader. */
5548 if (alignment_support_scheme == dr_explicit_realign_optimized)
5550 /* Create msq_init = *(floor(p1)) in the loop preheader */
5551 gassign *new_stmt;
5553 gcc_assert (!compute_in_loop);
5554 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5555 ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
5556 NULL_TREE, &init_addr, NULL, &inc,
5557 true, &inv_p);
5558 if (TREE_CODE (ptr) == SSA_NAME)
5559 new_temp = copy_ssa_name (ptr);
5560 else
5561 new_temp = make_ssa_name (TREE_TYPE (ptr));
5562 unsigned int align = DR_TARGET_ALIGNMENT (dr);
5563 new_stmt = gimple_build_assign
5564 (new_temp, BIT_AND_EXPR, ptr,
5565 build_int_cst (TREE_TYPE (ptr), -(HOST_WIDE_INT) align));
5566 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5567 gcc_assert (!new_bb);
5568 data_ref
5569 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5570 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5571 vect_copy_ref_info (data_ref, DR_REF (dr));
5572 new_stmt = gimple_build_assign (vec_dest, data_ref);
5573 new_temp = make_ssa_name (vec_dest, new_stmt);
5574 gimple_assign_set_lhs (new_stmt, new_temp);
5575 if (pe)
5577 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5578 gcc_assert (!new_bb);
5580 else
5581 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5583 msq_init = gimple_assign_lhs (new_stmt);
5586 /* 4. Create realignment token using a target builtin, if available.
5587 It is done either inside the containing loop, or before LOOP (as
5588 determined above). */
5590 if (targetm.vectorize.builtin_mask_for_load)
5592 gcall *new_stmt;
5593 tree builtin_decl;
5595 /* Compute INIT_ADDR - the initial addressed accessed by this memref. */
5596 if (!init_addr)
5598 /* Generate the INIT_ADDR computation outside LOOP. */
5599 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
5600 NULL_TREE);
5601 if (loop)
5603 pe = loop_preheader_edge (loop);
5604 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5605 gcc_assert (!new_bb);
5607 else
5608 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5611 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5612 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5613 vec_dest =
5614 vect_create_destination_var (scalar_dest,
5615 gimple_call_return_type (new_stmt));
5616 new_temp = make_ssa_name (vec_dest, new_stmt);
5617 gimple_call_set_lhs (new_stmt, new_temp);
5619 if (compute_in_loop)
5620 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5621 else
5623 /* Generate the misalignment computation outside LOOP. */
5624 pe = loop_preheader_edge (loop);
5625 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5626 gcc_assert (!new_bb);
5629 *realignment_token = gimple_call_lhs (new_stmt);
5631 /* The result of the CALL_EXPR to this builtin is determined from
5632 the value of the parameter and no global variables are touched
5633 which makes the builtin a "const" function. Requiring the
5634 builtin to have the "const" attribute makes it unnecessary
5635 to call mark_call_clobbered. */
5636 gcc_assert (TREE_READONLY (builtin_decl));
5639 if (alignment_support_scheme == dr_explicit_realign)
5640 return msq;
5642 gcc_assert (!compute_in_loop);
5643 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5646 /* 5. Create msq = phi <msq_init, lsq> in loop */
5648 pe = loop_preheader_edge (containing_loop);
5649 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5650 msq = make_ssa_name (vec_dest);
5651 phi_stmt = create_phi_node (msq, containing_loop->header);
5652 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5654 return msq;
5658 /* Function vect_grouped_load_supported.
5660 COUNT is the size of the load group (the number of statements plus the
5661 number of gaps). SINGLE_ELEMENT_P is true if there is actually
5662 only one statement, with a gap of COUNT - 1.
5664 Returns true if a suitable permute exists. */
5666 bool
5667 vect_grouped_load_supported (tree vectype, bool single_element_p,
5668 unsigned HOST_WIDE_INT count)
5670 machine_mode mode = TYPE_MODE (vectype);
5672 /* If this is single-element interleaving with an element distance
5673 that leaves unused vector loads around punt - we at least create
5674 very sub-optimal code in that case (and blow up memory,
5675 see PR65518). */
5676 if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype)))
5678 if (dump_enabled_p ())
5679 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5680 "single-element interleaving not supported "
5681 "for not adjacent vector loads\n");
5682 return false;
5685 /* vect_permute_load_chain requires the group size to be equal to 3 or
5686 be a power of two. */
5687 if (count != 3 && exact_log2 (count) == -1)
5689 if (dump_enabled_p ())
5690 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5691 "the size of the group of accesses"
5692 " is not a power of 2 or not equal to 3\n");
5693 return false;
5696 /* Check that the permutation is supported. */
5697 if (VECTOR_MODE_P (mode))
5699 unsigned int i, j;
5700 if (count == 3)
5702 unsigned int nelt;
5703 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5705 if (dump_enabled_p ())
5706 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5707 "cannot handle groups of 3 loads for"
5708 " variable-length vectors\n");
5709 return false;
5712 vec_perm_builder sel (nelt, nelt, 1);
5713 sel.quick_grow (nelt);
5714 vec_perm_indices indices;
5715 unsigned int k;
5716 for (k = 0; k < 3; k++)
5718 for (i = 0; i < nelt; i++)
5719 if (3 * i + k < 2 * nelt)
5720 sel[i] = 3 * i + k;
5721 else
5722 sel[i] = 0;
5723 indices.new_vector (sel, 2, nelt);
5724 if (!can_vec_perm_const_p (mode, indices))
5726 if (dump_enabled_p ())
5727 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5728 "shuffle of 3 loads is not supported by"
5729 " target\n");
5730 return false;
5732 for (i = 0, j = 0; i < nelt; i++)
5733 if (3 * i + k < 2 * nelt)
5734 sel[i] = i;
5735 else
5736 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5737 indices.new_vector (sel, 2, nelt);
5738 if (!can_vec_perm_const_p (mode, indices))
5740 if (dump_enabled_p ())
5741 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5742 "shuffle of 3 loads is not supported by"
5743 " target\n");
5744 return false;
5747 return true;
5749 else
5751 /* If length is not equal to 3 then only power of 2 is supported. */
5752 gcc_assert (pow2p_hwi (count));
5753 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5755 /* The encoding has a single stepped pattern. */
5756 vec_perm_builder sel (nelt, 1, 3);
5757 sel.quick_grow (3);
5758 for (i = 0; i < 3; i++)
5759 sel[i] = i * 2;
5760 vec_perm_indices indices (sel, 2, nelt);
5761 if (can_vec_perm_const_p (mode, indices))
5763 for (i = 0; i < 3; i++)
5764 sel[i] = i * 2 + 1;
5765 indices.new_vector (sel, 2, nelt);
5766 if (can_vec_perm_const_p (mode, indices))
5767 return true;
5772 if (dump_enabled_p ())
5773 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5774 "extract even/odd not supported by target\n");
5775 return false;
5778 /* Return TRUE if vec_{masked_}load_lanes is available for COUNT vectors of
5779 type VECTYPE. MASKED_P says whether the masked form is needed. */
5781 bool
5782 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5783 bool masked_p)
5785 if (masked_p)
5786 return vect_lanes_optab_supported_p ("vec_mask_load_lanes",
5787 vec_mask_load_lanes_optab,
5788 vectype, count);
5789 else
5790 return vect_lanes_optab_supported_p ("vec_load_lanes",
5791 vec_load_lanes_optab,
5792 vectype, count);
5795 /* Function vect_permute_load_chain.
5797 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5798 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5799 the input data correctly. Return the final references for loads in
5800 RESULT_CHAIN.
5802 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5803 The input is 4 vectors each containing 8 elements. We assign a number to each
5804 element, the input sequence is:
5806 1st vec: 0 1 2 3 4 5 6 7
5807 2nd vec: 8 9 10 11 12 13 14 15
5808 3rd vec: 16 17 18 19 20 21 22 23
5809 4th vec: 24 25 26 27 28 29 30 31
5811 The output sequence should be:
5813 1st vec: 0 4 8 12 16 20 24 28
5814 2nd vec: 1 5 9 13 17 21 25 29
5815 3rd vec: 2 6 10 14 18 22 26 30
5816 4th vec: 3 7 11 15 19 23 27 31
5818 i.e., the first output vector should contain the first elements of each
5819 interleaving group, etc.
5821 We use extract_even/odd instructions to create such output. The input of
5822 each extract_even/odd operation is two vectors
5823 1st vec 2nd vec
5824 0 1 2 3 4 5 6 7
5826 and the output is the vector of extracted even/odd elements. The output of
5827 extract_even will be: 0 2 4 6
5828 and of extract_odd: 1 3 5 7
5831 The permutation is done in log LENGTH stages. In each stage extract_even
5832 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5833 their order. In our example,
5835 E1: extract_even (1st vec, 2nd vec)
5836 E2: extract_odd (1st vec, 2nd vec)
5837 E3: extract_even (3rd vec, 4th vec)
5838 E4: extract_odd (3rd vec, 4th vec)
5840 The output for the first stage will be:
5842 E1: 0 2 4 6 8 10 12 14
5843 E2: 1 3 5 7 9 11 13 15
5844 E3: 16 18 20 22 24 26 28 30
5845 E4: 17 19 21 23 25 27 29 31
5847 In order to proceed and create the correct sequence for the next stage (or
5848 for the correct output, if the second stage is the last one, as in our
5849 example), we first put the output of extract_even operation and then the
5850 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5851 The input for the second stage is:
5853 1st vec (E1): 0 2 4 6 8 10 12 14
5854 2nd vec (E3): 16 18 20 22 24 26 28 30
5855 3rd vec (E2): 1 3 5 7 9 11 13 15
5856 4th vec (E4): 17 19 21 23 25 27 29 31
5858 The output of the second stage:
5860 E1: 0 4 8 12 16 20 24 28
5861 E2: 2 6 10 14 18 22 26 30
5862 E3: 1 5 9 13 17 21 25 29
5863 E4: 3 7 11 15 19 23 27 31
5865 And RESULT_CHAIN after reordering:
5867 1st vec (E1): 0 4 8 12 16 20 24 28
5868 2nd vec (E3): 1 5 9 13 17 21 25 29
5869 3rd vec (E2): 2 6 10 14 18 22 26 30
5870 4th vec (E4): 3 7 11 15 19 23 27 31. */
5872 static void
5873 vect_permute_load_chain (vec<tree> dr_chain,
5874 unsigned int length,
5875 gimple *stmt,
5876 gimple_stmt_iterator *gsi,
5877 vec<tree> *result_chain)
5879 tree data_ref, first_vect, second_vect;
5880 tree perm_mask_even, perm_mask_odd;
5881 tree perm3_mask_low, perm3_mask_high;
5882 gimple *perm_stmt;
5883 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5884 unsigned int i, j, log_length = exact_log2 (length);
5886 result_chain->quick_grow (length);
5887 memcpy (result_chain->address (), dr_chain.address (),
5888 length * sizeof (tree));
5890 if (length == 3)
5892 /* vect_grouped_load_supported ensures that this is constant. */
5893 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5894 unsigned int k;
5896 vec_perm_builder sel (nelt, nelt, 1);
5897 sel.quick_grow (nelt);
5898 vec_perm_indices indices;
5899 for (k = 0; k < 3; k++)
5901 for (i = 0; i < nelt; i++)
5902 if (3 * i + k < 2 * nelt)
5903 sel[i] = 3 * i + k;
5904 else
5905 sel[i] = 0;
5906 indices.new_vector (sel, 2, nelt);
5907 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5909 for (i = 0, j = 0; i < nelt; i++)
5910 if (3 * i + k < 2 * nelt)
5911 sel[i] = i;
5912 else
5913 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5914 indices.new_vector (sel, 2, nelt);
5915 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5917 first_vect = dr_chain[0];
5918 second_vect = dr_chain[1];
5920 /* Create interleaving stmt (low part of):
5921 low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5922 ...}> */
5923 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5924 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5925 second_vect, perm3_mask_low);
5926 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5928 /* Create interleaving stmt (high part of):
5929 high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5930 ...}> */
5931 first_vect = data_ref;
5932 second_vect = dr_chain[2];
5933 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5934 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5935 second_vect, perm3_mask_high);
5936 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5937 (*result_chain)[k] = data_ref;
5940 else
5942 /* If length is not equal to 3 then only power of 2 is supported. */
5943 gcc_assert (pow2p_hwi (length));
5945 /* The encoding has a single stepped pattern. */
5946 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5947 vec_perm_builder sel (nelt, 1, 3);
5948 sel.quick_grow (3);
5949 for (i = 0; i < 3; ++i)
5950 sel[i] = i * 2;
5951 vec_perm_indices indices (sel, 2, nelt);
5952 perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);
5954 for (i = 0; i < 3; ++i)
5955 sel[i] = i * 2 + 1;
5956 indices.new_vector (sel, 2, nelt);
5957 perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
5959 for (i = 0; i < log_length; i++)
5961 for (j = 0; j < length; j += 2)
5963 first_vect = dr_chain[j];
5964 second_vect = dr_chain[j+1];
5966 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5967 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5968 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5969 first_vect, second_vect,
5970 perm_mask_even);
5971 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5972 (*result_chain)[j/2] = data_ref;
5974 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5975 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5976 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5977 first_vect, second_vect,
5978 perm_mask_odd);
5979 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5980 (*result_chain)[j/2+length/2] = data_ref;
5982 memcpy (dr_chain.address (), result_chain->address (),
5983 length * sizeof (tree));
5988 /* Function vect_shift_permute_load_chain.
5990 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
5991 sequence of stmts to reorder the input data accordingly.
5992 Return the final references for loads in RESULT_CHAIN.
5993 Return true if successed, false otherwise.
5995 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5996 The input is 3 vectors each containing 8 elements. We assign a
5997 number to each element, the input sequence is:
5999 1st vec: 0 1 2 3 4 5 6 7
6000 2nd vec: 8 9 10 11 12 13 14 15
6001 3rd vec: 16 17 18 19 20 21 22 23
6003 The output sequence should be:
6005 1st vec: 0 3 6 9 12 15 18 21
6006 2nd vec: 1 4 7 10 13 16 19 22
6007 3rd vec: 2 5 8 11 14 17 20 23
6009 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
6011 First we shuffle all 3 vectors to get correct elements order:
6013 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
6014 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
6015 3rd vec: (16 19 22) (17 20 23) (18 21)
6017 Next we unite and shift vector 3 times:
6019 1st step:
6020 shift right by 6 the concatenation of:
6021 "1st vec" and "2nd vec"
6022 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
6023 "2nd vec" and "3rd vec"
6024 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
6025 "3rd vec" and "1st vec"
6026 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
6027 | New vectors |
6029 So that now new vectors are:
6031 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
6032 2nd vec: (10 13) (16 19 22) (17 20 23)
6033 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
6035 2nd step:
6036 shift right by 5 the concatenation of:
6037 "1st vec" and "3rd vec"
6038 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
6039 "2nd vec" and "1st vec"
6040 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
6041 "3rd vec" and "2nd vec"
6042 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
6043 | New vectors |
6045 So that now new vectors are:
6047 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
6048 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
6049 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
6051 3rd step:
6052 shift right by 5 the concatenation of:
6053 "1st vec" and "1st vec"
6054 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
6055 shift right by 3 the concatenation of:
6056 "2nd vec" and "2nd vec"
6057 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
6058 | New vectors |
6060 So that now all vectors are READY:
6061 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
6062 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
6063 3rd vec: ( 1 4 7) (10 13) (16 19 22)
6065 This algorithm is faster than one in vect_permute_load_chain if:
6066 1. "shift of a concatination" is faster than general permutation.
6067 This is usually so.
6068 2. The TARGET machine can't execute vector instructions in parallel.
6069 This is because each step of the algorithm depends on previous.
6070 The algorithm in vect_permute_load_chain is much more parallel.
6072 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
6075 static bool
6076 vect_shift_permute_load_chain (vec<tree> dr_chain,
6077 unsigned int length,
6078 gimple *stmt,
6079 gimple_stmt_iterator *gsi,
6080 vec<tree> *result_chain)
6082 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
6083 tree perm2_mask1, perm2_mask2, perm3_mask;
6084 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
6085 gimple *perm_stmt;
6087 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
6088 unsigned int i;
6089 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6090 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6092 unsigned HOST_WIDE_INT nelt, vf;
6093 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt)
6094 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
6095 /* Not supported for variable-length vectors. */
6096 return false;
6098 vec_perm_builder sel (nelt, nelt, 1);
6099 sel.quick_grow (nelt);
6101 result_chain->quick_grow (length);
6102 memcpy (result_chain->address (), dr_chain.address (),
6103 length * sizeof (tree));
6105 if (pow2p_hwi (length) && vf > 4)
6107 unsigned int j, log_length = exact_log2 (length);
6108 for (i = 0; i < nelt / 2; ++i)
6109 sel[i] = i * 2;
6110 for (i = 0; i < nelt / 2; ++i)
6111 sel[nelt / 2 + i] = i * 2 + 1;
6112 vec_perm_indices indices (sel, 2, nelt);
6113 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6115 if (dump_enabled_p ())
6116 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6117 "shuffle of 2 fields structure is not \
6118 supported by target\n");
6119 return false;
6121 perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);
6123 for (i = 0; i < nelt / 2; ++i)
6124 sel[i] = i * 2 + 1;
6125 for (i = 0; i < nelt / 2; ++i)
6126 sel[nelt / 2 + i] = i * 2;
6127 indices.new_vector (sel, 2, nelt);
6128 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6130 if (dump_enabled_p ())
6131 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6132 "shuffle of 2 fields structure is not \
6133 supported by target\n");
6134 return false;
6136 perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);
6138 /* Generating permutation constant to shift all elements.
6139 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
6140 for (i = 0; i < nelt; i++)
6141 sel[i] = nelt / 2 + i;
6142 indices.new_vector (sel, 2, nelt);
6143 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6145 if (dump_enabled_p ())
6146 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6147 "shift permutation is not supported by target\n");
6148 return false;
6150 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6152 /* Generating permutation constant to select vector from 2.
6153 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
6154 for (i = 0; i < nelt / 2; i++)
6155 sel[i] = i;
6156 for (i = nelt / 2; i < nelt; i++)
6157 sel[i] = nelt + i;
6158 indices.new_vector (sel, 2, nelt);
6159 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6161 if (dump_enabled_p ())
6162 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6163 "select is not supported by target\n");
6164 return false;
6166 select_mask = vect_gen_perm_mask_checked (vectype, indices);
6168 for (i = 0; i < log_length; i++)
6170 for (j = 0; j < length; j += 2)
6172 first_vect = dr_chain[j];
6173 second_vect = dr_chain[j + 1];
6175 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6176 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6177 first_vect, first_vect,
6178 perm2_mask1);
6179 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6180 vect[0] = data_ref;
6182 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6183 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6184 second_vect, second_vect,
6185 perm2_mask2);
6186 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6187 vect[1] = data_ref;
6189 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
6190 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6191 vect[0], vect[1], shift1_mask);
6192 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6193 (*result_chain)[j/2 + length/2] = data_ref;
6195 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
6196 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6197 vect[0], vect[1], select_mask);
6198 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6199 (*result_chain)[j/2] = data_ref;
6201 memcpy (dr_chain.address (), result_chain->address (),
6202 length * sizeof (tree));
6204 return true;
6206 if (length == 3 && vf > 2)
6208 unsigned int k = 0, l = 0;
6210 /* Generating permutation constant to get all elements in rigth order.
6211 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
6212 for (i = 0; i < nelt; i++)
6214 if (3 * k + (l % 3) >= nelt)
6216 k = 0;
6217 l += (3 - (nelt % 3));
6219 sel[i] = 3 * k + (l % 3);
6220 k++;
6222 vec_perm_indices indices (sel, 2, nelt);
6223 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6225 if (dump_enabled_p ())
6226 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6227 "shuffle of 3 fields structure is not \
6228 supported by target\n");
6229 return false;
6231 perm3_mask = vect_gen_perm_mask_checked (vectype, indices);
6233 /* Generating permutation constant to shift all elements.
6234 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
6235 for (i = 0; i < nelt; i++)
6236 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
6237 indices.new_vector (sel, 2, nelt);
6238 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6240 if (dump_enabled_p ())
6241 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6242 "shift permutation is not supported by target\n");
6243 return false;
6245 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6247 /* Generating permutation constant to shift all elements.
6248 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6249 for (i = 0; i < nelt; i++)
6250 sel[i] = 2 * (nelt / 3) + 1 + i;
6251 indices.new_vector (sel, 2, nelt);
6252 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6254 if (dump_enabled_p ())
6255 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6256 "shift permutation is not supported by target\n");
6257 return false;
6259 shift2_mask = vect_gen_perm_mask_checked (vectype, indices);
6261 /* Generating permutation constant to shift all elements.
6262 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
6263 for (i = 0; i < nelt; i++)
6264 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
6265 indices.new_vector (sel, 2, nelt);
6266 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6268 if (dump_enabled_p ())
6269 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6270 "shift permutation is not supported by target\n");
6271 return false;
6273 shift3_mask = vect_gen_perm_mask_checked (vectype, indices);
6275 /* Generating permutation constant to shift all elements.
6276 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6277 for (i = 0; i < nelt; i++)
6278 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
6279 indices.new_vector (sel, 2, nelt);
6280 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6282 if (dump_enabled_p ())
6283 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6284 "shift permutation is not supported by target\n");
6285 return false;
6287 shift4_mask = vect_gen_perm_mask_checked (vectype, indices);
6289 for (k = 0; k < 3; k++)
6291 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
6292 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6293 dr_chain[k], dr_chain[k],
6294 perm3_mask);
6295 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6296 vect[k] = data_ref;
6299 for (k = 0; k < 3; k++)
6301 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
6302 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6303 vect[k % 3], vect[(k + 1) % 3],
6304 shift1_mask);
6305 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6306 vect_shift[k] = data_ref;
6309 for (k = 0; k < 3; k++)
6311 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
6312 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6313 vect_shift[(4 - k) % 3],
6314 vect_shift[(3 - k) % 3],
6315 shift2_mask);
6316 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6317 vect[k] = data_ref;
6320 (*result_chain)[3 - (nelt % 3)] = vect[2];
6322 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
6323 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
6324 vect[0], shift3_mask);
6325 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6326 (*result_chain)[nelt % 3] = data_ref;
6328 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
6329 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
6330 vect[1], shift4_mask);
6331 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6332 (*result_chain)[0] = data_ref;
6333 return true;
6335 return false;
6338 /* Function vect_transform_grouped_load.
6340 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
6341 to perform their permutation and ascribe the result vectorized statements to
6342 the scalar statements.
6345 void
6346 vect_transform_grouped_load (gimple *stmt, vec<tree> dr_chain, int size,
6347 gimple_stmt_iterator *gsi)
6349 machine_mode mode;
6350 vec<tree> result_chain = vNULL;
6352 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
6353 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
6354 vectors, that are ready for vector computation. */
6355 result_chain.create (size);
6357 /* If reassociation width for vector type is 2 or greater target machine can
6358 execute 2 or more vector instructions in parallel. Otherwise try to
6359 get chain for loads group using vect_shift_permute_load_chain. */
6360 mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
6361 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
6362 || pow2p_hwi (size)
6363 || !vect_shift_permute_load_chain (dr_chain, size, stmt,
6364 gsi, &result_chain))
6365 vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
6366 vect_record_grouped_load_vectors (stmt, result_chain);
6367 result_chain.release ();
6370 /* RESULT_CHAIN contains the output of a group of grouped loads that were
6371 generated as part of the vectorization of STMT. Assign the statement
6372 for each vector to the associated scalar statement. */
6374 void
6375 vect_record_grouped_load_vectors (gimple *stmt, vec<tree> result_chain)
6377 gimple *first_stmt = DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
6378 gimple *next_stmt, *new_stmt;
6379 unsigned int i, gap_count;
6380 tree tmp_data_ref;
6382 /* Put a permuted data-ref in the VECTORIZED_STMT field.
6383 Since we scan the chain starting from it's first node, their order
6384 corresponds the order of data-refs in RESULT_CHAIN. */
6385 next_stmt = first_stmt;
6386 gap_count = 1;
6387 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
6389 if (!next_stmt)
6390 break;
6392 /* Skip the gaps. Loads created for the gaps will be removed by dead
6393 code elimination pass later. No need to check for the first stmt in
6394 the group, since it always exists.
6395 DR_GROUP_GAP is the number of steps in elements from the previous
6396 access (if there is no gap DR_GROUP_GAP is 1). We skip loads that
6397 correspond to the gaps. */
6398 if (next_stmt != first_stmt
6399 && gap_count < DR_GROUP_GAP (vinfo_for_stmt (next_stmt)))
6401 gap_count++;
6402 continue;
6405 while (next_stmt)
6407 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
6408 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
6409 copies, and we put the new vector statement in the first available
6410 RELATED_STMT. */
6411 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
6412 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
6413 else
6415 if (!DR_GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
6417 gimple *prev_stmt =
6418 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
6419 gimple *rel_stmt =
6420 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
6421 while (rel_stmt)
6423 prev_stmt = rel_stmt;
6424 rel_stmt =
6425 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
6428 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
6429 new_stmt;
6433 next_stmt = DR_GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
6434 gap_count = 1;
6435 /* If NEXT_STMT accesses the same DR as the previous statement,
6436 put the same TMP_DATA_REF as its vectorized statement; otherwise
6437 get the next data-ref from RESULT_CHAIN. */
6438 if (!next_stmt || !DR_GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
6439 break;
6444 /* Function vect_force_dr_alignment_p.
6446 Returns whether the alignment of a DECL can be forced to be aligned
6447 on ALIGNMENT bit boundary. */
6449 bool
6450 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
6452 if (!VAR_P (decl))
6453 return false;
6455 if (decl_in_symtab_p (decl)
6456 && !symtab_node::get (decl)->can_increase_alignment_p ())
6457 return false;
6459 if (TREE_STATIC (decl))
6460 return (alignment <= MAX_OFILE_ALIGNMENT);
6461 else
6462 return (alignment <= MAX_STACK_ALIGNMENT);
6466 /* Return whether the data reference DR is supported with respect to its
6467 alignment.
6468 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
6469 it is aligned, i.e., check if it is possible to vectorize it with different
6470 alignment. */
6472 enum dr_alignment_support
6473 vect_supportable_dr_alignment (struct data_reference *dr,
6474 bool check_aligned_accesses)
6476 gimple *stmt = vect_dr_stmt (dr);
6477 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6478 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6479 machine_mode mode = TYPE_MODE (vectype);
6480 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6481 struct loop *vect_loop = NULL;
6482 bool nested_in_vect_loop = false;
6484 if (aligned_access_p (dr) && !check_aligned_accesses)
6485 return dr_aligned;
6487 /* For now assume all conditional loads/stores support unaligned
6488 access without any special code. */
6489 if (is_gimple_call (stmt)
6490 && gimple_call_internal_p (stmt)
6491 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
6492 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
6493 return dr_unaligned_supported;
6495 if (loop_vinfo)
6497 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
6498 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
6501 /* Possibly unaligned access. */
6503 /* We can choose between using the implicit realignment scheme (generating
6504 a misaligned_move stmt) and the explicit realignment scheme (generating
6505 aligned loads with a REALIGN_LOAD). There are two variants to the
6506 explicit realignment scheme: optimized, and unoptimized.
6507 We can optimize the realignment only if the step between consecutive
6508 vector loads is equal to the vector size. Since the vector memory
6509 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
6510 is guaranteed that the misalignment amount remains the same throughout the
6511 execution of the vectorized loop. Therefore, we can create the
6512 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
6513 at the loop preheader.
6515 However, in the case of outer-loop vectorization, when vectorizing a
6516 memory access in the inner-loop nested within the LOOP that is now being
6517 vectorized, while it is guaranteed that the misalignment of the
6518 vectorized memory access will remain the same in different outer-loop
6519 iterations, it is *not* guaranteed that is will remain the same throughout
6520 the execution of the inner-loop. This is because the inner-loop advances
6521 with the original scalar step (and not in steps of VS). If the inner-loop
6522 step happens to be a multiple of VS, then the misalignment remains fixed
6523 and we can use the optimized realignment scheme. For example:
6525 for (i=0; i<N; i++)
6526 for (j=0; j<M; j++)
6527 s += a[i+j];
6529 When vectorizing the i-loop in the above example, the step between
6530 consecutive vector loads is 1, and so the misalignment does not remain
6531 fixed across the execution of the inner-loop, and the realignment cannot
6532 be optimized (as illustrated in the following pseudo vectorized loop):
6534 for (i=0; i<N; i+=4)
6535 for (j=0; j<M; j++){
6536 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
6537 // when j is {0,1,2,3,4,5,6,7,...} respectively.
6538 // (assuming that we start from an aligned address).
6541 We therefore have to use the unoptimized realignment scheme:
6543 for (i=0; i<N; i+=4)
6544 for (j=k; j<M; j+=4)
6545 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
6546 // that the misalignment of the initial address is
6547 // 0).
6549 The loop can then be vectorized as follows:
6551 for (k=0; k<4; k++){
6552 rt = get_realignment_token (&vp[k]);
6553 for (i=0; i<N; i+=4){
6554 v1 = vp[i+k];
6555 for (j=k; j<M; j+=4){
6556 v2 = vp[i+j+VS-1];
6557 va = REALIGN_LOAD <v1,v2,rt>;
6558 vs += va;
6559 v1 = v2;
6562 } */
6564 if (DR_IS_READ (dr))
6566 bool is_packed = false;
6567 tree type = (TREE_TYPE (DR_REF (dr)));
6569 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
6570 && (!targetm.vectorize.builtin_mask_for_load
6571 || targetm.vectorize.builtin_mask_for_load ()))
6573 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6575 /* If we are doing SLP then the accesses need not have the
6576 same alignment, instead it depends on the SLP group size. */
6577 if (loop_vinfo
6578 && STMT_SLP_TYPE (stmt_info)
6579 && !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6580 * DR_GROUP_SIZE (vinfo_for_stmt
6581 (DR_GROUP_FIRST_ELEMENT (stmt_info))),
6582 TYPE_VECTOR_SUBPARTS (vectype)))
6584 else if (!loop_vinfo
6585 || (nested_in_vect_loop
6586 && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
6587 GET_MODE_SIZE (TYPE_MODE (vectype)))))
6588 return dr_explicit_realign;
6589 else
6590 return dr_explicit_realign_optimized;
6592 if (!known_alignment_for_access_p (dr))
6593 is_packed = not_size_aligned (DR_REF (dr));
6595 if (targetm.vectorize.support_vector_misalignment
6596 (mode, type, DR_MISALIGNMENT (dr), is_packed))
6597 /* Can't software pipeline the loads, but can at least do them. */
6598 return dr_unaligned_supported;
6600 else
6602 bool is_packed = false;
6603 tree type = (TREE_TYPE (DR_REF (dr)));
6605 if (!known_alignment_for_access_p (dr))
6606 is_packed = not_size_aligned (DR_REF (dr));
6608 if (targetm.vectorize.support_vector_misalignment
6609 (mode, type, DR_MISALIGNMENT (dr), is_packed))
6610 return dr_unaligned_supported;
6613 /* Unsupported. */
6614 return dr_unaligned_unsupported;