gcc/tree-vect-data-refs.c
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2018 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "memmodel.h"
32 #include "tm_p.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "cgraph.h"
36 #include "dumpfile.h"
37 #include "alias.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "tree-eh.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop.h"
47 #include "cfgloop.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
50 #include "expr.h"
51 #include "builtins.h"
52 #include "params.h"
53 #include "tree-cfg.h"
54 #include "tree-hash-traits.h"
55 #include "vec-perm-indices.h"
56 #include "internal-fn.h"
58 /* Return true if load- or store-lanes optab OPTAB is implemented for
59 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
61 static bool
62 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
63 tree vectype, unsigned HOST_WIDE_INT count)
65 machine_mode mode, array_mode;
66 bool limit_p;
68 mode = TYPE_MODE (vectype);
69 if (!targetm.array_mode (mode, count).exists (&array_mode))
71 poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
72 limit_p = !targetm.array_mode_supported_p (mode, count);
73 if (!int_mode_for_size (bits, limit_p).exists (&array_mode))
75 if (dump_enabled_p ())
76 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
77 "no array mode for %s["
78 HOST_WIDE_INT_PRINT_DEC "]\n",
79 GET_MODE_NAME (mode), count);
80 return false;
84 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
86 if (dump_enabled_p ())
87 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
88 "cannot use %s<%s><%s>\n", name,
89 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
90 return false;
93 if (dump_enabled_p ())
94 dump_printf_loc (MSG_NOTE, vect_location,
95 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
96 GET_MODE_NAME (mode));
98 return true;
102 /* Return the smallest scalar part of STMT.
103 This is used to determine the vectype of the stmt. We generally set the
104 vectype according to the type of the result (lhs). For stmts whose
105 result-type is different than the type of the arguments (e.g., demotion,
106 promotion), vectype will be reset appropriately (later). Note that we have
107 to visit the smallest datatype in this function, because that determines the
108 VF. If the smallest datatype in the loop is present only as the rhs of a
109 promotion operation - we'd miss it.
110 Such a case, where a variable of this datatype does not appear in the lhs
111 anywhere in the loop, can only occur if it's an invariant: e.g.:
112 'int_x = (int) short_inv', which we'd expect to have been optimized away by
113 invariant motion. However, we cannot rely on invariant motion to always
114 take invariants out of the loop, and so in the case of promotion we also
115 have to check the rhs.
116 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
117 types. */
119 tree
120 vect_get_smallest_scalar_type (gimple *stmt, HOST_WIDE_INT *lhs_size_unit,
121 HOST_WIDE_INT *rhs_size_unit)
123 tree scalar_type = gimple_expr_type (stmt);
124 HOST_WIDE_INT lhs, rhs;
126 /* During the analysis phase, this function is called on arbitrary
127 statements that might not have scalar results. */
128 if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
129 return scalar_type;
131 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
133 if (is_gimple_assign (stmt)
134 && (gimple_assign_cast_p (stmt)
135 || gimple_assign_rhs_code (stmt) == DOT_PROD_EXPR
136 || gimple_assign_rhs_code (stmt) == WIDEN_SUM_EXPR
137 || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
138 || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
139 || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
141 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
143 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
144 if (rhs < lhs)
145 scalar_type = rhs_type;
148 *lhs_size_unit = lhs;
149 *rhs_size_unit = rhs;
150 return scalar_type;
154 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
155 tested at run-time. Return TRUE if DDR was successfully inserted.
156 Return false if versioning is not supported. */
158 static bool
159 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
161 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
163 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
164 return false;
166 if (!runtime_alias_check_p (ddr, loop,
167 optimize_loop_nest_for_speed_p (loop)))
168 return false;
170 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
171 return true;
174 /* Record that loop LOOP_VINFO needs to check that VALUE is nonzero. */
176 static void
177 vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value)
179 vec<tree> checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo);
180 for (unsigned int i = 0; i < checks.length(); ++i)
181 if (checks[i] == value)
182 return;
184 if (dump_enabled_p ())
186 dump_printf_loc (MSG_NOTE, vect_location, "need run-time check that ");
187 dump_generic_expr (MSG_NOTE, TDF_SLIM, value);
188 dump_printf (MSG_NOTE, " is nonzero\n");
190 LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value);
193 /* Return true if we know that the order of vectorized STMT_A and
194 vectorized STMT_B will be the same as the order of STMT_A and STMT_B.
195 At least one of the statements is a write. */
197 static bool
198 vect_preserves_scalar_order_p (gimple *stmt_a, gimple *stmt_b)
200 stmt_vec_info stmtinfo_a = vinfo_for_stmt (stmt_a);
201 stmt_vec_info stmtinfo_b = vinfo_for_stmt (stmt_b);
203 /* Single statements are always kept in their original order. */
204 if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
205 && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
206 return true;
208 /* STMT_A and STMT_B belong to overlapping groups. All loads in a
209 group are emitted at the position of the first scalar load and all
210 stores in a group are emitted at the position of the last scalar store.
211 Thus writes will happen no earlier than their current position
212 (but could happen later) while reads will happen no later than their
213 current position (but could happen earlier). Reordering is therefore
214 only possible if the first access is a write. */
215 if (is_pattern_stmt_p (stmtinfo_a))
216 stmt_a = STMT_VINFO_RELATED_STMT (stmtinfo_a);
217 if (is_pattern_stmt_p (stmtinfo_b))
218 stmt_b = STMT_VINFO_RELATED_STMT (stmtinfo_b);
219 gimple *earlier_stmt = get_earlier_stmt (stmt_a, stmt_b);
220 return !DR_IS_WRITE (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt)));
223 /* A subroutine of vect_analyze_data_ref_dependence. Handle
224 DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
225 distances. These distances are conservatively correct but they don't
226 reflect a guaranteed dependence.
228 Return true if this function does all the work necessary to avoid
229 an alias or false if the caller should use the dependence distances
230 to limit the vectorization factor in the usual way. LOOP_DEPTH is
231 the depth of the loop described by LOOP_VINFO and the other arguments
232 are as for vect_analyze_data_ref_dependence. */
234 static bool
235 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
236 loop_vec_info loop_vinfo,
237 int loop_depth, unsigned int *max_vf)
239 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
240 lambda_vector dist_v;
241 unsigned int i;
242 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
244 int dist = dist_v[loop_depth];
245 if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
247 /* If the user asserted safelen >= DIST consecutive iterations
248 can be executed concurrently, assume independence.
250 ??? An alternative would be to add the alias check even
251 in this case, and vectorize the fallback loop with the
252 maximum VF set to safelen. However, if the user has
253 explicitly given a length, it's less likely that that
254 would be a win. */
255 if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
257 if ((unsigned int) loop->safelen < *max_vf)
258 *max_vf = loop->safelen;
259 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
260 continue;
263 /* For dependence distances of 2 or more, we have the option
264 of limiting VF or checking for an alias at runtime.
265 Prefer to check at runtime if we can, to avoid limiting
266 the VF unnecessarily when the bases are in fact independent.
268 Note that the alias checks will be removed if the VF ends up
269 being small enough. */
270 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
273 return true;
277 /* Function vect_analyze_data_ref_dependence.
279 Return TRUE if there (might) exist a dependence between a memory-reference
280 DRA and a memory-reference DRB. When versioning for alias may check a
281 dependence at run-time, return FALSE. Adjust *MAX_VF according to
282 the data dependence. */
284 static bool
285 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
286 loop_vec_info loop_vinfo,
287 unsigned int *max_vf)
289 unsigned int i;
290 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
291 struct data_reference *dra = DDR_A (ddr);
292 struct data_reference *drb = DDR_B (ddr);
293 stmt_vec_info stmtinfo_a = vinfo_for_stmt (vect_dr_stmt (dra));
294 stmt_vec_info stmtinfo_b = vinfo_for_stmt (vect_dr_stmt (drb));
295 lambda_vector dist_v;
296 unsigned int loop_depth;
298 /* In loop analysis all data references should be vectorizable. */
299 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
300 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
301 gcc_unreachable ();
303 /* Independent data accesses. */
304 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
305 return false;
307 if (dra == drb
308 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
309 return false;
311 /* We do not have to consider dependences between accesses that belong
312 to the same group, unless the stride could be smaller than the
313 group size. */
314 if (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
315 && (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
316 == DR_GROUP_FIRST_ELEMENT (stmtinfo_b))
317 && !STMT_VINFO_STRIDED_P (stmtinfo_a))
318 return false;
320 /* Even if we have an anti-dependence then, as the vectorized loop covers at
321 least two scalar iterations, there is always also a true dependence.
322 As the vectorizer does not re-order loads and stores we can ignore
323 the anti-dependence if TBAA can disambiguate both DRs similar to the
324 case with known negative distance anti-dependences (positive
325 distance anti-dependences would violate TBAA constraints). */
326 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
327 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
328 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
329 get_alias_set (DR_REF (drb))))
330 return false;
332 /* Unknown data dependence. */
333 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
335 /* If user asserted safelen consecutive iterations can be
336 executed concurrently, assume independence. */
337 if (loop->safelen >= 2)
339 if ((unsigned int) loop->safelen < *max_vf)
340 *max_vf = loop->safelen;
341 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
342 return false;
345 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
346 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
348 if (dump_enabled_p ())
350 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
351 "versioning for alias not supported for: "
352 "can't determine dependence between ");
353 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
354 DR_REF (dra));
355 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
356 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
357 DR_REF (drb));
358 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
360 return true;
363 if (dump_enabled_p ())
365 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
366 "versioning for alias required: "
367 "can't determine dependence between ");
368 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
369 DR_REF (dra));
370 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
371 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
372 DR_REF (drb));
373 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
376 /* Add to list of ddrs that need to be tested at run-time. */
377 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
380 /* Known data dependence. */
381 if (DDR_NUM_DIST_VECTS (ddr) == 0)
383 /* If user asserted safelen consecutive iterations can be
384 executed concurrently, assume independence. */
385 if (loop->safelen >= 2)
387 if ((unsigned int) loop->safelen < *max_vf)
388 *max_vf = loop->safelen;
389 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
390 return false;
393 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
394 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
396 if (dump_enabled_p ())
398 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
399 "versioning for alias not supported for: "
400 "bad dist vector for ");
401 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
402 DR_REF (dra));
403 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
404 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
405 DR_REF (drb));
406 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
408 return true;
411 if (dump_enabled_p ())
413 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
414 "versioning for alias required: "
415 "bad dist vector for ");
416 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
417 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
418 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
419 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
421 /* Add to list of ddrs that need to be tested at run-time. */
422 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
425 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
427 if (DDR_COULD_BE_INDEPENDENT_P (ddr)
428 && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
429 loop_depth, max_vf))
430 return false;
432 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
434 int dist = dist_v[loop_depth];
436 if (dump_enabled_p ())
437 dump_printf_loc (MSG_NOTE, vect_location,
438 "dependence distance = %d.\n", dist);
440 if (dist == 0)
442 if (dump_enabled_p ())
444 dump_printf_loc (MSG_NOTE, vect_location,
445 "dependence distance == 0 between ");
446 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
447 dump_printf (MSG_NOTE, " and ");
448 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
449 dump_printf (MSG_NOTE, "\n");
452 /* When we perform grouped accesses and perform implicit CSE
453 by detecting equal accesses and doing disambiguation with
454 runtime alias tests like for
455 .. = a[i];
456 .. = a[i+1];
457 a[i] = ..;
458 a[i+1] = ..;
459 *p = ..;
460 .. = a[i];
461 .. = a[i+1];
462 where we will end up loading { a[i], a[i+1] } once, make
463 sure that inserting group loads before the first load and
464 stores after the last store will do the right thing.
465 Similar for groups like
466 a[i] = ...;
467 ... = a[i];
468 a[i+1] = ...;
469 where loads from the group interleave with the store. */
470 if (!vect_preserves_scalar_order_p (vect_dr_stmt (dra),
471 vect_dr_stmt (drb)))
473 if (dump_enabled_p ())
474 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
475 "READ_WRITE dependence in interleaving.\n");
476 return true;
479 if (loop->safelen < 2)
481 tree indicator = dr_zero_step_indicator (dra);
482 if (TREE_CODE (indicator) != INTEGER_CST)
483 vect_check_nonzero_value (loop_vinfo, indicator);
484 else if (integer_zerop (indicator))
486 if (dump_enabled_p ())
487 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
488 "access also has a zero step\n");
489 return true;
492 continue;
495 if (dist > 0 && DDR_REVERSED_P (ddr))
497 /* If DDR_REVERSED_P the order of the data-refs in DDR was
498 reversed (to make distance vector positive), and the actual
499 distance is negative. */
500 if (dump_enabled_p ())
501 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
502 "dependence distance negative.\n");
503 /* Record a negative dependence distance to later limit the
504 amount of stmt copying / unrolling we can perform.
505 Only need to handle read-after-write dependence. */
506 if (DR_IS_READ (drb)
507 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
508 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
509 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
510 continue;
513 unsigned int abs_dist = abs (dist);
514 if (abs_dist >= 2 && abs_dist < *max_vf)
516 /* The dependence distance requires reduction of the maximal
517 vectorization factor. */
518 *max_vf = abs (dist);
519 if (dump_enabled_p ())
520 dump_printf_loc (MSG_NOTE, vect_location,
521 "adjusting maximal vectorization factor to %i\n",
522 *max_vf);
525 if (abs_dist >= *max_vf)
527 /* Dependence distance does not create dependence, as far as
528 vectorization is concerned, in this case. */
529 if (dump_enabled_p ())
530 dump_printf_loc (MSG_NOTE, vect_location,
531 "dependence distance >= VF.\n");
532 continue;
535 if (dump_enabled_p ())
537 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
538 "not vectorized, possible dependence "
539 "between data-refs ");
540 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
541 dump_printf (MSG_NOTE, " and ");
542 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
543 dump_printf (MSG_NOTE, "\n");
546 return true;
549 return false;
552 /* Function vect_analyze_data_ref_dependences.
554 Examine all the data references in the loop, and make sure there do not
555 exist any data dependences between them. Set *MAX_VF according to
556 the maximum vectorization factor the data dependences allow. */
558 bool
559 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
560 unsigned int *max_vf)
562 unsigned int i;
563 struct data_dependence_relation *ddr;
565 DUMP_VECT_SCOPE ("vect_analyze_data_ref_dependences");
567 LOOP_VINFO_DDRS (loop_vinfo)
568 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
569 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
570 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
571 /* We need read-read dependences to compute STMT_VINFO_SAME_ALIGN_REFS. */
572 if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
573 &LOOP_VINFO_DDRS (loop_vinfo),
574 LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
575 return false;
577 /* For epilogues we either have no aliases or alias versioning
578 was applied to original loop. Therefore we may just get max_vf
579 using VF of original loop. */
580 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
581 *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
582 else
583 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
584 if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
585 return false;
587 return true;
591 /* Function vect_slp_analyze_data_ref_dependence.
593 Return TRUE if there (might) exist a dependence between a memory-reference
594 DRA and a memory-reference DRB. When versioning for alias may check a
595 dependence at run-time, return FALSE. */
598 static bool
599 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
601 struct data_reference *dra = DDR_A (ddr);
602 struct data_reference *drb = DDR_B (ddr);
604 /* We need to check dependences of statements marked as unvectorizable
605 as well, they still can prohibit vectorization. */
607 /* Independent data accesses. */
608 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
609 return false;
611 if (dra == drb)
612 return false;
614 /* Read-read is OK. */
615 if (DR_IS_READ (dra) && DR_IS_READ (drb))
616 return false;
618 /* If dra and drb are part of the same interleaving chain consider
619 them independent. */
620 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (vect_dr_stmt (dra)))
621 && (DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (vect_dr_stmt (dra)))
622 == DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (vect_dr_stmt (drb)))))
623 return false;
625 /* Unknown data dependence. */
626 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
628 if (dump_enabled_p ())
630 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
631 "can't determine dependence between ");
632 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
633 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
634 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
635 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
638 else if (dump_enabled_p ())
640 dump_printf_loc (MSG_NOTE, vect_location,
641 "determined dependence between ");
642 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
643 dump_printf (MSG_NOTE, " and ");
644 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
645 dump_printf (MSG_NOTE, "\n");
648 return true;
652 /* Analyze dependences involved in the transform of SLP NODE. STORES
653 contain the vector of scalar stores of this instance if we are
654 disambiguating the loads. */
656 static bool
657 vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
658 vec<gimple *> stores, gimple *last_store)
660 /* This walks over all stmts involved in the SLP load/store done
661 in NODE verifying we can sink them up to the last stmt in the
662 group. */
663 gimple *last_access = vect_find_last_scalar_stmt_in_slp (node);
664 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
666 gimple *access = SLP_TREE_SCALAR_STMTS (node)[k];
667 if (access == last_access)
668 continue;
669 data_reference *dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (access));
670 ao_ref ref;
671 bool ref_initialized_p = false;
672 for (gimple_stmt_iterator gsi = gsi_for_stmt (access);
673 gsi_stmt (gsi) != last_access; gsi_next (&gsi))
675 gimple *stmt = gsi_stmt (gsi);
676 if (! gimple_vuse (stmt)
677 || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
678 continue;
680 /* If we couldn't record a (single) data reference for this
681 stmt we have to resort to the alias oracle. */
682 data_reference *dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
683 if (!dr_b)
685 /* We are moving a store or sinking a load - this means
686 we cannot use TBAA for disambiguation. */
687 if (!ref_initialized_p)
688 ao_ref_init (&ref, DR_REF (dr_a));
689 if (stmt_may_clobber_ref_p_1 (stmt, &ref, false)
690 || ref_maybe_used_by_stmt_p (stmt, &ref, false))
691 return false;
692 continue;
695 bool dependent = false;
696 /* If we run into a store of this same instance (we've just
697 marked those) then delay dependence checking until we run
698 into the last store because this is where it will have
699 been sunk to (and we verify if we can do that as well). */
700 if (gimple_visited_p (stmt))
702 if (stmt != last_store)
703 continue;
704 unsigned i;
705 gimple *store;
706 FOR_EACH_VEC_ELT (stores, i, store)
708 data_reference *store_dr
709 = STMT_VINFO_DATA_REF (vinfo_for_stmt (store));
710 ddr_p ddr = initialize_data_dependence_relation
711 (dr_a, store_dr, vNULL);
712 dependent = vect_slp_analyze_data_ref_dependence (ddr);
713 free_dependence_relation (ddr);
714 if (dependent)
715 break;
718 else
720 ddr_p ddr = initialize_data_dependence_relation (dr_a,
721 dr_b, vNULL);
722 dependent = vect_slp_analyze_data_ref_dependence (ddr);
723 free_dependence_relation (ddr);
725 if (dependent)
726 return false;
729 return true;
733 /* Function vect_slp_analyze_instance_dependence.
735 Examine all the data references in the basic-block, and make sure there
736 do not exist any data dependences between them. */
739 bool
740 vect_slp_analyze_instance_dependence (slp_instance instance)
742 DUMP_VECT_SCOPE ("vect_slp_analyze_instance_dependence");
744 /* The stores of this instance are at the root of the SLP tree. */
745 slp_tree store = SLP_INSTANCE_TREE (instance);
746 if (! STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (store)[0])))
747 store = NULL;
749 /* Verify we can sink stores to the vectorized stmt insert location. */
750 gimple *last_store = NULL;
751 if (store)
753 if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
754 return false;
756 /* Mark stores in this instance and remember the last one. */
757 last_store = vect_find_last_scalar_stmt_in_slp (store);
758 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
759 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], true);
762 bool res = true;
764 /* Verify we can sink loads to the vectorized stmt insert location,
765 special-casing stores of this instance. */
766 slp_tree load;
767 unsigned int i;
768 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
769 if (! vect_slp_analyze_node_dependences (instance, load,
770 store
771 ? SLP_TREE_SCALAR_STMTS (store)
772 : vNULL, last_store))
774 res = false;
775 break;
778 /* Unset the visited flag. */
779 if (store)
780 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
781 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], false);
783 return res;
786 /* Record in VINFO the base alignment guarantee given by DRB. STMT is
787 the statement that contains DRB, which is useful for recording in the
788 dump file. */
790 static void
791 vect_record_base_alignment (vec_info *vinfo, gimple *stmt,
792 innermost_loop_behavior *drb)
794 bool existed;
795 innermost_loop_behavior *&entry
796 = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
797 if (!existed || entry->base_alignment < drb->base_alignment)
799 entry = drb;
800 if (dump_enabled_p ())
802 dump_printf_loc (MSG_NOTE, vect_location,
803 "recording new base alignment for ");
804 dump_generic_expr (MSG_NOTE, TDF_SLIM, drb->base_address);
805 dump_printf (MSG_NOTE, "\n");
806 dump_printf_loc (MSG_NOTE, vect_location,
807 " alignment: %d\n", drb->base_alignment);
808 dump_printf_loc (MSG_NOTE, vect_location,
809 " misalignment: %d\n", drb->base_misalignment);
810 dump_printf_loc (MSG_NOTE, vect_location,
811 " based on: ");
812 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
817 /* If the region we're going to vectorize is reached, all unconditional
818 data references occur at least once. We can therefore pool the base
819 alignment guarantees from each unconditional reference. Do this by
820 going through all the data references in VINFO and checking whether
821 the containing statement makes the reference unconditionally. If so,
822 record the alignment of the base address in VINFO so that it can be
823 used for all other references with the same base. */
825 void
826 vect_record_base_alignments (vec_info *vinfo)
828 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
829 struct loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
830 data_reference *dr;
831 unsigned int i;
832 FOR_EACH_VEC_ELT (vinfo->datarefs, i, dr)
834 gimple *stmt = vect_dr_stmt (dr);
835 if (!DR_IS_CONDITIONAL_IN_STMT (dr)
836 && STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (stmt)))
838 vect_record_base_alignment (vinfo, stmt, &DR_INNERMOST (dr));
840 /* If DR is nested in the loop that is being vectorized, we can also
841 record the alignment of the base wrt the outer loop. */
842 if (loop && nested_in_vect_loop_p (loop, stmt))
844 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
845 vect_record_base_alignment
846 (vinfo, stmt, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
852 /* Return the target alignment for the vectorized form of DR. */
854 static unsigned int
855 vect_calculate_target_alignment (struct data_reference *dr)
857 gimple *stmt = vect_dr_stmt (dr);
858 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
859 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
860 return targetm.vectorize.preferred_vector_alignment (vectype);
863 /* Function vect_compute_data_ref_alignment
865 Compute the misalignment of the data reference DR.
867 Output:
868 1. If during the misalignment computation it is found that the data reference
869 cannot be vectorized then false is returned.
870 2. DR_MISALIGNMENT (DR) is defined.
872 FOR NOW: No analysis is actually performed. Misalignment is calculated
873 only for trivial cases. TODO. */
875 static bool
876 vect_compute_data_ref_alignment (struct data_reference *dr)
878 gimple *stmt = vect_dr_stmt (dr);
879 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
880 vec_base_alignments *base_alignments = &stmt_info->vinfo->base_alignments;
881 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
882 struct loop *loop = NULL;
883 tree ref = DR_REF (dr);
884 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
886 if (dump_enabled_p ())
887 dump_printf_loc (MSG_NOTE, vect_location,
888 "vect_compute_data_ref_alignment:\n");
890 if (loop_vinfo)
891 loop = LOOP_VINFO_LOOP (loop_vinfo);
893 /* Initialize misalignment to unknown. */
894 SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
896 innermost_loop_behavior *drb = vect_dr_behavior (dr);
897 bool step_preserves_misalignment_p;
899 unsigned HOST_WIDE_INT vector_alignment
900 = vect_calculate_target_alignment (dr) / BITS_PER_UNIT;
901 DR_TARGET_ALIGNMENT (dr) = vector_alignment;
903 /* No step for BB vectorization. */
904 if (!loop)
906 gcc_assert (integer_zerop (drb->step));
907 step_preserves_misalignment_p = true;
910 /* In case the dataref is in an inner-loop of the loop that is being
911 vectorized (LOOP), we use the base and misalignment information
912 relative to the outer-loop (LOOP). This is ok only if the misalignment
913 stays the same throughout the execution of the inner-loop, which is why
914 we have to check that the stride of the dataref in the inner-loop evenly
915 divides by the vector alignment. */
916 else if (nested_in_vect_loop_p (loop, stmt))
918 step_preserves_misalignment_p
919 = (DR_STEP_ALIGNMENT (dr) % vector_alignment) == 0;
921 if (dump_enabled_p ())
923 if (step_preserves_misalignment_p)
924 dump_printf_loc (MSG_NOTE, vect_location,
925 "inner step divides the vector alignment.\n");
926 else
927 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
928 "inner step doesn't divide the vector"
929 " alignment.\n");
933 /* Similarly we can only use base and misalignment information relative to
934 an innermost loop if the misalignment stays the same throughout the
935 execution of the loop. As above, this is the case if the stride of
936 the dataref evenly divides by the alignment. */
937 else
939 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
940 step_preserves_misalignment_p
941 = multiple_p (DR_STEP_ALIGNMENT (dr) * vf, vector_alignment);
943 if (!step_preserves_misalignment_p && dump_enabled_p ())
944 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
945 "step doesn't divide the vector alignment.\n");
948 unsigned int base_alignment = drb->base_alignment;
949 unsigned int base_misalignment = drb->base_misalignment;
951 /* Calculate the maximum of the pooled base address alignment and the
952 alignment that we can compute for DR itself. */
953 innermost_loop_behavior **entry = base_alignments->get (drb->base_address);
954 if (entry && base_alignment < (*entry)->base_alignment)
956 base_alignment = (*entry)->base_alignment;
957 base_misalignment = (*entry)->base_misalignment;
960 if (drb->offset_alignment < vector_alignment
961 || !step_preserves_misalignment_p
962 /* We need to know whether the step wrt the vectorized loop is
963 negative when computing the starting misalignment below. */
964 || TREE_CODE (drb->step) != INTEGER_CST)
966 if (dump_enabled_p ())
968 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
969 "Unknown alignment for access: ");
970 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
971 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
973 return true;
976 if (base_alignment < vector_alignment)
978 unsigned int max_alignment;
979 tree base = get_base_for_alignment (drb->base_address, &max_alignment);
980 if (max_alignment < vector_alignment
981 || !vect_can_force_dr_alignment_p (base,
982 vector_alignment * BITS_PER_UNIT))
984 if (dump_enabled_p ())
986 dump_printf_loc (MSG_NOTE, vect_location,
987 "can't force alignment of ref: ");
988 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
989 dump_printf (MSG_NOTE, "\n");
991 return true;
994 /* Force the alignment of the decl.
995 NOTE: This is the only change to the code we make during
996 the analysis phase, before deciding to vectorize the loop. */
997 if (dump_enabled_p ())
999 dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
1000 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
1001 dump_printf (MSG_NOTE, "\n");
1004 DR_VECT_AUX (dr)->base_decl = base;
1005 DR_VECT_AUX (dr)->base_misaligned = true;
1006 base_misalignment = 0;
1008 poly_int64 misalignment
1009 = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();
1011 /* If this is a backward running DR then first access in the larger
1012 vectype actually is N-1 elements before the address in the DR.
1013 Adjust misalign accordingly. */
1014 if (tree_int_cst_sgn (drb->step) < 0)
1015 /* PLUS because STEP is negative. */
1016 misalignment += ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1017 * TREE_INT_CST_LOW (drb->step));
1019 unsigned int const_misalignment;
1020 if (!known_misalignment (misalignment, vector_alignment,
1021 &const_misalignment))
1023 if (dump_enabled_p ())
1025 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1026 "Non-constant misalignment for access: ");
1027 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
1028 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1030 return true;
1033 SET_DR_MISALIGNMENT (dr, const_misalignment);
1035 if (dump_enabled_p ())
1037 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1038 "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
1039 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
1040 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1043 return true;
1046 /* Function vect_update_misalignment_for_peel.
1047 Sets DR's misalignment
1048 - to 0 if it has the same alignment as DR_PEEL,
1049 - to the misalignment computed using NPEEL if DR's misalignment is known,
1050 - to -1 (unknown) otherwise.
1052 DR - the data reference whose misalignment is to be adjusted.
1053 DR_PEEL - the data reference whose misalignment is being made
1054 zero in the vector loop by the peel.
1055 NPEEL - the number of iterations in the peel loop if the misalignment
1056 of DR_PEEL is known at compile time. */
1058 static void
1059 vect_update_misalignment_for_peel (struct data_reference *dr,
1060 struct data_reference *dr_peel, int npeel)
1062 unsigned int i;
1063 vec<dr_p> same_aligned_drs;
1064 struct data_reference *current_dr;
1065 int dr_size = vect_get_scalar_dr_size (dr);
1066 int dr_peel_size = vect_get_scalar_dr_size (dr_peel);
1067 stmt_vec_info stmt_info = vinfo_for_stmt (vect_dr_stmt (dr));
1068 stmt_vec_info peel_stmt_info = vinfo_for_stmt (vect_dr_stmt (dr_peel));
1070 /* For interleaved data accesses the step in the loop must be multiplied by
1071 the size of the interleaving group. */
1072 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1073 dr_size *= DR_GROUP_SIZE (vinfo_for_stmt (DR_GROUP_FIRST_ELEMENT (stmt_info)));
1074 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
1075 dr_peel_size *= DR_GROUP_SIZE (peel_stmt_info);
1077 /* It can be assumed that the data refs with the same alignment as dr_peel
1078 are aligned in the vector loop. */
1079 same_aligned_drs
1080 = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (vect_dr_stmt (dr_peel)));
1081 FOR_EACH_VEC_ELT (same_aligned_drs, i, current_dr)
1083 if (current_dr != dr)
1084 continue;
1085 gcc_assert (!known_alignment_for_access_p (dr)
1086 || !known_alignment_for_access_p (dr_peel)
1087 || (DR_MISALIGNMENT (dr) / dr_size
1088 == DR_MISALIGNMENT (dr_peel) / dr_peel_size));
1089 SET_DR_MISALIGNMENT (dr, 0);
1090 return;
1093 if (known_alignment_for_access_p (dr)
1094 && known_alignment_for_access_p (dr_peel))
1096 bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
1097 int misal = DR_MISALIGNMENT (dr);
1098 misal += negative ? -npeel * dr_size : npeel * dr_size;
1099 misal &= DR_TARGET_ALIGNMENT (dr) - 1;
1100 SET_DR_MISALIGNMENT (dr, misal);
1101 return;
1104 if (dump_enabled_p ())
1105 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1106 "to unknown (-1).\n");
1107 SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
1111 /* Function verify_data_ref_alignment
1113 Return TRUE if DR can be handled with respect to alignment. */
1115 static bool
1116 verify_data_ref_alignment (data_reference_p dr)
1118 enum dr_alignment_support supportable_dr_alignment
1119 = vect_supportable_dr_alignment (dr, false);
1120 if (!supportable_dr_alignment)
1122 if (dump_enabled_p ())
1124 if (DR_IS_READ (dr))
1125 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1126 "not vectorized: unsupported unaligned load.");
1127 else
1128 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1129 "not vectorized: unsupported unaligned "
1130 "store.");
1132 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
1133 DR_REF (dr));
1134 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1136 return false;
1139 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
1140 dump_printf_loc (MSG_NOTE, vect_location,
1141 "Vectorizing an unaligned access.\n");
1143 return true;
1146 /* Function vect_verify_datarefs_alignment
1148 Return TRUE if all data references in the loop can be
1149 handled with respect to alignment. */
1151 bool
1152 vect_verify_datarefs_alignment (loop_vec_info vinfo)
1154 vec<data_reference_p> datarefs = vinfo->datarefs;
1155 struct data_reference *dr;
1156 unsigned int i;
1158 FOR_EACH_VEC_ELT (datarefs, i, dr)
1160 gimple *stmt = vect_dr_stmt (dr);
1161 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1163 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1164 continue;
1166 /* For interleaving, only the alignment of the first access matters. */
1167 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1168 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1169 continue;
1171 /* Strided accesses perform only component accesses, alignment is
1172 irrelevant for them. */
1173 if (STMT_VINFO_STRIDED_P (stmt_info)
1174 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1175 continue;
1177 if (! verify_data_ref_alignment (dr))
1178 return false;
1181 return true;
1184 /* Given a memory reference EXP return whether its alignment is less
1185 than its size. */
1187 static bool
1188 not_size_aligned (tree exp)
1190 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1191 return true;
1193 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1194 > get_object_alignment (exp));
1197 /* Function vector_alignment_reachable_p
1199 Return true if vector alignment for DR is reachable by peeling
1200 a few loop iterations. Return false otherwise. */
1202 static bool
1203 vector_alignment_reachable_p (struct data_reference *dr)
1205 gimple *stmt = vect_dr_stmt (dr);
1206 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1207 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1209 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1211 /* For interleaved access we peel only if number of iterations in
1212 the prolog loop ({VF - misalignment}), is a multiple of the
1213 number of the interleaved accesses. */
1214 int elem_size, mis_in_elements;
1216 /* FORNOW: handle only known alignment. */
1217 if (!known_alignment_for_access_p (dr))
1218 return false;
1220 poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1221 poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
1222 elem_size = vector_element_size (vector_size, nelements);
1223 mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
1225 if (!multiple_p (nelements - mis_in_elements, DR_GROUP_SIZE (stmt_info)))
1226 return false;
1229 /* If misalignment is known at the compile time then allow peeling
1230 only if natural alignment is reachable through peeling. */
1231 if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
1233 HOST_WIDE_INT elmsize =
1234 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1235 if (dump_enabled_p ())
1237 dump_printf_loc (MSG_NOTE, vect_location,
1238 "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1239 dump_printf (MSG_NOTE,
1240 ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
1242 if (DR_MISALIGNMENT (dr) % elmsize)
1244 if (dump_enabled_p ())
1245 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1246 "data size does not divide the misalignment.\n");
1247 return false;
1251 if (!known_alignment_for_access_p (dr))
1253 tree type = TREE_TYPE (DR_REF (dr));
1254 bool is_packed = not_size_aligned (DR_REF (dr));
1255 if (dump_enabled_p ())
1256 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1257 "Unknown misalignment, %snaturally aligned\n",
1258 is_packed ? "not " : "");
1259 return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1262 return true;
1266 /* Calculate the cost of the memory access represented by DR. */
1268 static void
1269 vect_get_data_access_cost (struct data_reference *dr,
1270 unsigned int *inside_cost,
1271 unsigned int *outside_cost,
1272 stmt_vector_for_cost *body_cost_vec,
1273 stmt_vector_for_cost *prologue_cost_vec)
1275 gimple *stmt = vect_dr_stmt (dr);
1276 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1277 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1278 int ncopies;
1280 if (PURE_SLP_STMT (stmt_info))
1281 ncopies = 1;
1282 else
1283 ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1285 if (DR_IS_READ (dr))
1286 vect_get_load_cost (stmt_info, ncopies, true, inside_cost, outside_cost,
1287 prologue_cost_vec, body_cost_vec, false);
1288 else
1289 vect_get_store_cost (stmt_info, ncopies, inside_cost, body_cost_vec);
1291 if (dump_enabled_p ())
1292 dump_printf_loc (MSG_NOTE, vect_location,
1293 "vect_get_data_access_cost: inside_cost = %d, "
1294 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1298 typedef struct _vect_peel_info
1300 struct data_reference *dr;
1301 int npeel;
1302 unsigned int count;
1303 } *vect_peel_info;
1305 typedef struct _vect_peel_extended_info
1307 struct _vect_peel_info peel_info;
1308 unsigned int inside_cost;
1309 unsigned int outside_cost;
1310 } *vect_peel_extended_info;
1313 /* Peeling hashtable helpers. */
1315 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1317 static inline hashval_t hash (const _vect_peel_info *);
1318 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1321 inline hashval_t
1322 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1324 return (hashval_t) peel_info->npeel;
1327 inline bool
1328 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1330 return (a->npeel == b->npeel);
1334 /* Insert DR into peeling hash table with NPEEL as key. */
1336 static void
1337 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1338 loop_vec_info loop_vinfo, struct data_reference *dr,
1339 int npeel)
1341 struct _vect_peel_info elem, *slot;
1342 _vect_peel_info **new_slot;
1343 bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1345 elem.npeel = npeel;
1346 slot = peeling_htab->find (&elem);
1347 if (slot)
1348 slot->count++;
1349 else
1351 slot = XNEW (struct _vect_peel_info);
1352 slot->npeel = npeel;
1353 slot->dr = dr;
1354 slot->count = 1;
1355 new_slot = peeling_htab->find_slot (slot, INSERT);
1356 *new_slot = slot;
1359 if (!supportable_dr_alignment
1360 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1361 slot->count += VECT_MAX_COST;
1365 /* Traverse peeling hash table to find peeling option that aligns maximum
1366 number of data accesses. */
1369 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1370 _vect_peel_extended_info *max)
1372 vect_peel_info elem = *slot;
1374 if (elem->count > max->peel_info.count
1375 || (elem->count == max->peel_info.count
1376 && max->peel_info.npeel > elem->npeel))
1378 max->peel_info.npeel = elem->npeel;
1379 max->peel_info.count = elem->count;
1380 max->peel_info.dr = elem->dr;
1383 return 1;
1386 /* Get the costs of peeling NPEEL iterations checking data access costs
1387 for all data refs. If UNKNOWN_MISALIGNMENT is true, we assume DR0's
1388 misalignment will be zero after peeling. */
1390 static void
1391 vect_get_peeling_costs_all_drs (vec<data_reference_p> datarefs,
1392 struct data_reference *dr0,
1393 unsigned int *inside_cost,
1394 unsigned int *outside_cost,
1395 stmt_vector_for_cost *body_cost_vec,
1396 stmt_vector_for_cost *prologue_cost_vec,
1397 unsigned int npeel,
1398 bool unknown_misalignment)
1400 unsigned i;
1401 data_reference *dr;
1403 FOR_EACH_VEC_ELT (datarefs, i, dr)
1405 gimple *stmt = vect_dr_stmt (dr);
1406 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1407 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1408 continue;
1410 /* For interleaving, only the alignment of the first access
1411 matters. */
1412 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1413 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1414 continue;
1416 /* Strided accesses perform only component accesses, alignment is
1417 irrelevant for them. */
1418 if (STMT_VINFO_STRIDED_P (stmt_info)
1419 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1420 continue;
1422 int save_misalignment;
1423 save_misalignment = DR_MISALIGNMENT (dr);
1424 if (npeel == 0)
1426 else if (unknown_misalignment && dr == dr0)
1427 SET_DR_MISALIGNMENT (dr, 0);
1428 else
1429 vect_update_misalignment_for_peel (dr, dr0, npeel);
1430 vect_get_data_access_cost (dr, inside_cost, outside_cost,
1431 body_cost_vec, prologue_cost_vec);
1432 SET_DR_MISALIGNMENT (dr, save_misalignment);
1436 /* Traverse peeling hash table and calculate cost for each peeling option.
1437 Find the one with the lowest cost. */
1440 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1441 _vect_peel_extended_info *min)
1443 vect_peel_info elem = *slot;
1444 int dummy;
1445 unsigned int inside_cost = 0, outside_cost = 0;
1446 gimple *stmt = vect_dr_stmt (elem->dr);
1447 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1448 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1449 stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1450 epilogue_cost_vec;
1452 prologue_cost_vec.create (2);
1453 body_cost_vec.create (2);
1454 epilogue_cost_vec.create (2);
1456 vect_get_peeling_costs_all_drs (LOOP_VINFO_DATAREFS (loop_vinfo),
1457 elem->dr, &inside_cost, &outside_cost,
1458 &body_cost_vec, &prologue_cost_vec,
1459 elem->npeel, false);
1461 body_cost_vec.release ();
1463 outside_cost += vect_get_known_peeling_cost
1464 (loop_vinfo, elem->npeel, &dummy,
1465 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1466 &prologue_cost_vec, &epilogue_cost_vec);
1468 /* Prologue and epilogue costs are added to the target model later.
1469 These costs depend only on the scalar iteration cost, the
1470 number of peeling iterations finally chosen, and the number of
1471 misaligned statements. So discard the information found here. */
1472 prologue_cost_vec.release ();
1473 epilogue_cost_vec.release ();
1475 if (inside_cost < min->inside_cost
1476 || (inside_cost == min->inside_cost
1477 && outside_cost < min->outside_cost))
1479 min->inside_cost = inside_cost;
1480 min->outside_cost = outside_cost;
1481 min->peel_info.dr = elem->dr;
1482 min->peel_info.npeel = elem->npeel;
1483 min->peel_info.count = elem->count;
1486 return 1;
1490 /* Choose best peeling option by traversing peeling hash table and either
1491 choosing an option with the lowest cost (if cost model is enabled) or the
1492 option that aligns as many accesses as possible. */
1494 static struct _vect_peel_extended_info
1495 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1496 loop_vec_info loop_vinfo)
1498 struct _vect_peel_extended_info res;
1500 res.peel_info.dr = NULL;
1502 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1504 res.inside_cost = INT_MAX;
1505 res.outside_cost = INT_MAX;
1506 peeling_htab->traverse <_vect_peel_extended_info *,
1507 vect_peeling_hash_get_lowest_cost> (&res);
1509 else
1511 res.peel_info.count = 0;
1512 peeling_htab->traverse <_vect_peel_extended_info *,
1513 vect_peeling_hash_get_most_frequent> (&res);
1514 res.inside_cost = 0;
1515 res.outside_cost = 0;
1518 return res;
1521 /* Return true if the new peeling NPEEL is supported. */
1523 static bool
1524 vect_peeling_supportable (loop_vec_info loop_vinfo, struct data_reference *dr0,
1525 unsigned npeel)
1527 unsigned i;
1528 struct data_reference *dr = NULL;
1529 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1530 gimple *stmt;
1531 stmt_vec_info stmt_info;
1532 enum dr_alignment_support supportable_dr_alignment;
1534 /* Ensure that all data refs can be vectorized after the peel. */
1535 FOR_EACH_VEC_ELT (datarefs, i, dr)
1537 int save_misalignment;
1539 if (dr == dr0)
1540 continue;
1542 stmt = vect_dr_stmt (dr);
1543 stmt_info = vinfo_for_stmt (stmt);
1544 /* For interleaving, only the alignment of the first access
1545 matters. */
1546 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1547 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1548 continue;
1550 /* Strided accesses perform only component accesses, alignment is
1551 irrelevant for them. */
1552 if (STMT_VINFO_STRIDED_P (stmt_info)
1553 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1554 continue;
1556 save_misalignment = DR_MISALIGNMENT (dr);
1557 vect_update_misalignment_for_peel (dr, dr0, npeel);
1558 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1559 SET_DR_MISALIGNMENT (dr, save_misalignment);
1561 if (!supportable_dr_alignment)
1562 return false;
1565 return true;
1568 /* Function vect_enhance_data_refs_alignment
1570 This pass will use loop versioning and loop peeling in order to enhance
1571 the alignment of data references in the loop.
1573 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1574 original loop is to be vectorized. Any other loops that are created by
1575 the transformations performed in this pass - are not supposed to be
1576 vectorized. This restriction will be relaxed.
1578 This pass will require a cost model to guide it whether to apply peeling
1579 or versioning or a combination of the two. For example, the scheme that
1580 Intel uses when given a loop with several memory accesses, is as follows:
1581 choose one memory access ('p') whose alignment you want to force by doing
1582 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1583 other accesses are not necessarily aligned, or (2) use loop versioning to
1584 generate one loop in which all accesses are aligned, and another loop in
1585 which only 'p' is necessarily aligned.
1587 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1588 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1589 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1591 Devising a cost model is the most critical aspect of this work. It will
1592 guide us on which access to peel for, whether to use loop versioning, how
1593 many versions to create, etc. The cost model will probably consist of
1594 generic considerations as well as target specific considerations (on
1595 powerpc for example, misaligned stores are more painful than misaligned
1596 loads).
1598 Here are the general steps involved in alignment enhancements:
1600 -- original loop, before alignment analysis:
1601 for (i=0; i<N; i++){
1602 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1603 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1606 -- After vect_compute_data_refs_alignment:
1607 for (i=0; i<N; i++){
1608 x = q[i]; # DR_MISALIGNMENT(q) = 3
1609 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1612 -- Possibility 1: we do loop versioning:
1613 if (p is aligned) {
1614 for (i=0; i<N; i++){ # loop 1A
1615 x = q[i]; # DR_MISALIGNMENT(q) = 3
1616 p[i] = y; # DR_MISALIGNMENT(p) = 0
1619 else {
1620 for (i=0; i<N; i++){ # loop 1B
1621 x = q[i]; # DR_MISALIGNMENT(q) = 3
1622 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1626 -- Possibility 2: we do loop peeling:
1627 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1628 x = q[i];
1629 p[i] = y;
1631 for (i = 3; i < N; i++){ # loop 2A
1632 x = q[i]; # DR_MISALIGNMENT(q) = 0
1633 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1636 -- Possibility 3: combination of loop peeling and versioning:
1637 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1638 x = q[i];
1639 p[i] = y;
1641 if (p is aligned) {
1642 for (i = 3; i<N; i++){ # loop 3A
1643 x = q[i]; # DR_MISALIGNMENT(q) = 0
1644 p[i] = y; # DR_MISALIGNMENT(p) = 0
1647 else {
1648 for (i = 3; i<N; i++){ # loop 3B
1649 x = q[i]; # DR_MISALIGNMENT(q) = 0
1650 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1654 These loops are later passed to loop_transform to be vectorized. The
1655 vectorizer will use the alignment information to guide the transformation
1656 (whether to generate regular loads/stores, or with special handling for
1657 misalignment). */
1659 bool
1660 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1662 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1663 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1664 enum dr_alignment_support supportable_dr_alignment;
1665 struct data_reference *dr0 = NULL, *first_store = NULL;
1666 struct data_reference *dr;
1667 unsigned int i, j;
1668 bool do_peeling = false;
1669 bool do_versioning = false;
1670 bool stat;
1671 gimple *stmt;
1672 stmt_vec_info stmt_info;
1673 unsigned int npeel = 0;
1674 bool one_misalignment_known = false;
1675 bool one_misalignment_unknown = false;
1676 bool one_dr_unsupportable = false;
1677 struct data_reference *unsupportable_dr = NULL;
1678 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1679 unsigned possible_npeel_number = 1;
1680 tree vectype;
1681 unsigned int mis, same_align_drs_max = 0;
1682 hash_table<peel_info_hasher> peeling_htab (1);
1684 DUMP_VECT_SCOPE ("vect_enhance_data_refs_alignment");
1686 /* Reset data so we can safely be called multiple times. */
1687 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1688 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1690 /* While cost model enhancements are expected in the future, the high level
1691 view of the code at this time is as follows:
1693 A) If there is a misaligned access then see if peeling to align
1694 this access can make all data references satisfy
1695 vect_supportable_dr_alignment. If so, update data structures
1696 as needed and return true.
1698 B) If peeling wasn't possible and there is a data reference with an
1699 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1700 then see if loop versioning checks can be used to make all data
1701 references satisfy vect_supportable_dr_alignment. If so, update
1702 data structures as needed and return true.
1704 C) If neither peeling nor versioning were successful then return false if
1705 any data reference does not satisfy vect_supportable_dr_alignment.
1707 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1709 Note, Possibility 3 above (which is peeling and versioning together) is not
1710 being done at this time. */
1712 /* (1) Peeling to force alignment. */
1714 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1715 Considerations:
1716 + How many accesses will become aligned due to the peeling
1717 - How many accesses will become unaligned due to the peeling,
1718 and the cost of misaligned accesses.
1719 - The cost of peeling (the extra runtime checks, the increase
1720 in code size). */
1722 FOR_EACH_VEC_ELT (datarefs, i, dr)
1724 stmt = vect_dr_stmt (dr);
1725 stmt_info = vinfo_for_stmt (stmt);
1727 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1728 continue;
1730 /* For interleaving, only the alignment of the first access
1731 matters. */
1732 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1733 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1734 continue;
1736 /* For invariant accesses there is nothing to enhance. */
1737 if (integer_zerop (DR_STEP (dr)))
1738 continue;
1740 /* Strided accesses perform only component accesses, alignment is
1741 irrelevant for them. */
1742 if (STMT_VINFO_STRIDED_P (stmt_info)
1743 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1744 continue;
1746 supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1747 do_peeling = vector_alignment_reachable_p (dr);
1748 if (do_peeling)
1750 if (known_alignment_for_access_p (dr))
1752 unsigned int npeel_tmp = 0;
1753 bool negative = tree_int_cst_compare (DR_STEP (dr),
1754 size_zero_node) < 0;
1756 vectype = STMT_VINFO_VECTYPE (stmt_info);
1757 unsigned int target_align = DR_TARGET_ALIGNMENT (dr);
1758 unsigned int dr_size = vect_get_scalar_dr_size (dr);
1759 mis = (negative ? DR_MISALIGNMENT (dr) : -DR_MISALIGNMENT (dr));
1760 if (DR_MISALIGNMENT (dr) != 0)
1761 npeel_tmp = (mis & (target_align - 1)) / dr_size;
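/* For illustration: with a forward step, DR_MISALIGNMENT (dr) == 4,
   target_align == 16 and dr_size == 4, MIS is (unsigned) -4, so
   (mis & 15) == 12 and NPEEL_TMP == 3; peeling three scalar iterations
   advances the access by 12 bytes onto a 16-byte boundary.  */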
1763 /* For multiple types, it is possible that the bigger type access
1764 will have more than one peeling option. E.g., a loop with two
1765 types: one of size (vector size / 4), and the other one of
1766 size (vector size / 8). The vectorization factor will be 8. If both
1767 accesses are misaligned by 3, the first one needs one scalar
1768 iteration to be aligned, and the second one needs 5. But the
1769 first one will be aligned also by peeling 5 scalar
1770 iterations, and in that case both accesses will be aligned.
1771 Hence, except for the immediate peeling amount, we also want
1772 to try to add full vector size, while we don't exceed
1773 vectorization factor.
1774 We do this automatically for the cost model, since we calculate the
1775 cost for every peeling option. */
1776 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1778 poly_uint64 nscalars = (STMT_SLP_TYPE (stmt_info)
1779 ? vf * DR_GROUP_SIZE (stmt_info) : vf);
1780 possible_npeel_number
1781 = vect_get_num_vectors (nscalars, vectype);
1783 /* NPEEL_TMP is 0 when there is no misalignment, but also
1784 allow peeling NELEMENTS. */
1785 if (DR_MISALIGNMENT (dr) == 0)
1786 possible_npeel_number++;
1789 /* Save info about DR in the hash table. Also include peeling
1790 amounts according to the explanation above. */
1791 for (j = 0; j < possible_npeel_number; j++)
1793 vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1794 dr, npeel_tmp);
1795 npeel_tmp += target_align / dr_size;
1798 one_misalignment_known = true;
1800 else
1802 /* If we don't know any misalignment values, we prefer
1803 peeling for the data-ref that has the maximum number of data-refs
1804 with the same alignment, unless the target prefers to align
1805 stores over loads. */
1806 unsigned same_align_drs
1807 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1808 if (!dr0
1809 || same_align_drs_max < same_align_drs)
1811 same_align_drs_max = same_align_drs;
1812 dr0 = dr;
1814 /* For data-refs with the same number of related
1815 accesses, prefer the one where the misalignment
1816 computation will be invariant in the outermost loop. */
1817 else if (same_align_drs_max == same_align_drs)
1819 struct loop *ivloop0, *ivloop;
1820 ivloop0 = outermost_invariant_loop_for_expr
1821 (loop, DR_BASE_ADDRESS (dr0));
1822 ivloop = outermost_invariant_loop_for_expr
1823 (loop, DR_BASE_ADDRESS (dr));
1824 if ((ivloop && !ivloop0)
1825 || (ivloop && ivloop0
1826 && flow_loop_nested_p (ivloop, ivloop0)))
1827 dr0 = dr;
1830 one_misalignment_unknown = true;
1832 /* Check for data refs with unsupportable alignment that
1833 can be peeled. */
1834 if (!supportable_dr_alignment)
1836 one_dr_unsupportable = true;
1837 unsupportable_dr = dr;
1840 if (!first_store && DR_IS_WRITE (dr))
1841 first_store = dr;
1844 else
1846 if (!aligned_access_p (dr))
1848 if (dump_enabled_p ())
1849 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1850 "vector alignment may not be reachable\n");
1851 break;
1856 /* Check if we can possibly peel the loop. */
1857 if (!vect_can_advance_ivs_p (loop_vinfo)
1858 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1859 || loop->inner)
1860 do_peeling = false;
1862 struct _vect_peel_extended_info peel_for_known_alignment;
1863 struct _vect_peel_extended_info peel_for_unknown_alignment;
1864 struct _vect_peel_extended_info best_peel;
1866 peel_for_unknown_alignment.inside_cost = INT_MAX;
1867 peel_for_unknown_alignment.outside_cost = INT_MAX;
1868 peel_for_unknown_alignment.peel_info.count = 0;
1870 if (do_peeling
1871 && one_misalignment_unknown)
1873 /* Check if the target requires us to prefer stores over loads, i.e., if
1874 misaligned stores are more expensive than misaligned loads (taking
1875 drs with same alignment into account). */
1876 unsigned int load_inside_cost = 0;
1877 unsigned int load_outside_cost = 0;
1878 unsigned int store_inside_cost = 0;
1879 unsigned int store_outside_cost = 0;
1880 unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;
1882 stmt_vector_for_cost dummy;
1883 dummy.create (2);
1884 vect_get_peeling_costs_all_drs (datarefs, dr0,
1885 &load_inside_cost,
1886 &load_outside_cost,
1887 &dummy, &dummy, estimated_npeels, true);
1888 dummy.release ();
1890 if (first_store)
1892 dummy.create (2);
1893 vect_get_peeling_costs_all_drs (datarefs, first_store,
1894 &store_inside_cost,
1895 &store_outside_cost,
1896 &dummy, &dummy,
1897 estimated_npeels, true);
1898 dummy.release ();
1900 else
1902 store_inside_cost = INT_MAX;
1903 store_outside_cost = INT_MAX;
1906 if (load_inside_cost > store_inside_cost
1907 || (load_inside_cost == store_inside_cost
1908 && load_outside_cost > store_outside_cost))
1910 dr0 = first_store;
1911 peel_for_unknown_alignment.inside_cost = store_inside_cost;
1912 peel_for_unknown_alignment.outside_cost = store_outside_cost;
1914 else
1916 peel_for_unknown_alignment.inside_cost = load_inside_cost;
1917 peel_for_unknown_alignment.outside_cost = load_outside_cost;
1920 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1921 prologue_cost_vec.create (2);
1922 epilogue_cost_vec.create (2);
1924 int dummy2;
1925 peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
1926 (loop_vinfo, estimated_npeels, &dummy2,
1927 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1928 &prologue_cost_vec, &epilogue_cost_vec);
1930 prologue_cost_vec.release ();
1931 epilogue_cost_vec.release ();
1933 peel_for_unknown_alignment.peel_info.count = 1
1934 + STMT_VINFO_SAME_ALIGN_REFS
1935 (vinfo_for_stmt (vect_dr_stmt (dr0))).length ();
1938 peel_for_unknown_alignment.peel_info.npeel = 0;
1939 peel_for_unknown_alignment.peel_info.dr = dr0;
1941 best_peel = peel_for_unknown_alignment;
1943 peel_for_known_alignment.inside_cost = INT_MAX;
1944 peel_for_known_alignment.outside_cost = INT_MAX;
1945 peel_for_known_alignment.peel_info.count = 0;
1946 peel_for_known_alignment.peel_info.dr = NULL;
1948 if (do_peeling && one_misalignment_known)
1950 /* Peeling is possible, and no data access requires alignment in order
1951 to be supported. So we try to choose the best possible peeling from
1952 the hash table. */
1953 peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
1954 (&peeling_htab, loop_vinfo);
1957 /* Compare costs of peeling for known and unknown alignment. */
1958 if (peel_for_known_alignment.peel_info.dr != NULL
1959 && peel_for_unknown_alignment.inside_cost
1960 >= peel_for_known_alignment.inside_cost)
1962 best_peel = peel_for_known_alignment;
1964 /* If the best peeling for known alignment has NPEEL == 0, perform no
1965 peeling at all except if there is an unsupportable dr that we can
1966 align. */
1967 if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
1968 do_peeling = false;
1971 /* If there is an unsupportable data ref, prefer it over all choices made
1972 so far, since any chosen peeling would have to be discarded unless it
1973 happened to align the unsupportable data ref as well. */
1974 if (one_dr_unsupportable)
1975 dr0 = unsupportable_dr;
1976 else if (do_peeling)
1978 /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
1979 TODO: Use nopeel_outside_cost or get rid of it? */
1980 unsigned nopeel_inside_cost = 0;
1981 unsigned nopeel_outside_cost = 0;
1983 stmt_vector_for_cost dummy;
1984 dummy.create (2);
1985 vect_get_peeling_costs_all_drs (datarefs, NULL, &nopeel_inside_cost,
1986 &nopeel_outside_cost, &dummy, &dummy,
1987 0, false);
1988 dummy.release ();
1990 /* Add epilogue costs. As we do not peel for alignment here, no prologue
1991 costs will be recorded. */
1992 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1993 prologue_cost_vec.create (2);
1994 epilogue_cost_vec.create (2);
1996 int dummy2;
1997 nopeel_outside_cost += vect_get_known_peeling_cost
1998 (loop_vinfo, 0, &dummy2,
1999 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2000 &prologue_cost_vec, &epilogue_cost_vec);
2002 prologue_cost_vec.release ();
2003 epilogue_cost_vec.release ();
2005 npeel = best_peel.peel_info.npeel;
2006 dr0 = best_peel.peel_info.dr;
2008 /* If not peeling is no more expensive than the best peeling found
2009 so far, don't perform any peeling. */
2010 if (nopeel_inside_cost <= best_peel.inside_cost)
2011 do_peeling = false;
2014 if (do_peeling)
2016 stmt = vect_dr_stmt (dr0);
2017 stmt_info = vinfo_for_stmt (stmt);
2018 vectype = STMT_VINFO_VECTYPE (stmt_info);
2020 if (known_alignment_for_access_p (dr0))
2022 bool negative = tree_int_cst_compare (DR_STEP (dr0),
2023 size_zero_node) < 0;
2024 if (!npeel)
2026 /* Since it's known at compile time, compute the number of
2027 iterations in the peeled loop (the peeling factor) for use in
2028 updating DR_MISALIGNMENT values. The peeling factor is the
2029 vectorization factor minus the misalignment as an element
2030 count. */
2031 mis = negative ? DR_MISALIGNMENT (dr0) : -DR_MISALIGNMENT (dr0);
2032 unsigned int target_align = DR_TARGET_ALIGNMENT (dr0);
2033 npeel = ((mis & (target_align - 1))
2034 / vect_get_scalar_dr_size (dr0));
2037 /* For interleaved data access every iteration accesses all the
2038 members of the group, therefore we divide the number of iterations
2039 by the group size. */
2040 stmt_info = vinfo_for_stmt (vect_dr_stmt (dr0));
2041 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2042 npeel /= DR_GROUP_SIZE (stmt_info);
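/* For example, with 4-byte elements interleaved in a group of two and a
   misalignment of 8 bytes, the element count computed above is 2, which
   corresponds to peeling a single loop iteration.  */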
2044 if (dump_enabled_p ())
2045 dump_printf_loc (MSG_NOTE, vect_location,
2046 "Try peeling by %d\n", npeel);
2049 /* Ensure that all datarefs can be vectorized after the peel. */
2050 if (!vect_peeling_supportable (loop_vinfo, dr0, npeel))
2051 do_peeling = false;
2053 /* Check if all datarefs are supportable and log. */
2054 if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
2056 stat = vect_verify_datarefs_alignment (loop_vinfo);
2057 if (!stat)
2058 do_peeling = false;
2059 else
2060 return stat;
2063 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
2064 if (do_peeling)
2066 unsigned max_allowed_peel
2067 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
2068 if (max_allowed_peel != (unsigned)-1)
2070 unsigned max_peel = npeel;
2071 if (max_peel == 0)
2073 unsigned int target_align = DR_TARGET_ALIGNMENT (dr0);
2074 max_peel = target_align / vect_get_scalar_dr_size (dr0) - 1;
2076 if (max_peel > max_allowed_peel)
2078 do_peeling = false;
2079 if (dump_enabled_p ())
2080 dump_printf_loc (MSG_NOTE, vect_location,
2081 "Disable peeling, max peels reached: %d\n", max_peel);
2086 /* Cost model #2 - if peeling may result in a remaining loop not
2087 iterating enough to be vectorized then do not peel. Since this
2088 is a cost heuristic rather than a correctness decision, use the
2089 most likely runtime value for variable vectorization factors. */
2090 if (do_peeling
2091 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2093 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2094 unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
2095 if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
2096 < assumed_vf + max_peel)
2097 do_peeling = false;
2100 if (do_peeling)
2102 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2103 If the misalignment of DR_i is identical to that of dr0 then set
2104 DR_MISALIGNMENT (DR_i) to zero. If the misalignments of DR_i and
2105 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2106 by the peeling factor times the element size of DR_i (MOD the
2107 vectorization factor times the size). Otherwise, the
2108 misalignment of DR_i must be set to unknown. */
2109 FOR_EACH_VEC_ELT (datarefs, i, dr)
2110 if (dr != dr0)
2112 /* Strided accesses perform only component accesses, alignment
2113 is irrelevant for them. */
2114 stmt_info = vinfo_for_stmt (vect_dr_stmt (dr));
2115 if (STMT_VINFO_STRIDED_P (stmt_info)
2116 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2117 continue;
2119 vect_update_misalignment_for_peel (dr, dr0, npeel);
2122 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
2123 if (npeel)
2124 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2125 else
2126 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2127 = DR_MISALIGNMENT (dr0);
2128 SET_DR_MISALIGNMENT (dr0, 0);
2129 if (dump_enabled_p ())
2131 dump_printf_loc (MSG_NOTE, vect_location,
2132 "Alignment of access forced using peeling.\n");
2133 dump_printf_loc (MSG_NOTE, vect_location,
2134 "Peeling for alignment will be applied.\n");
2137 /* The inside-loop cost will be accounted for in vectorizable_load
2138 and vectorizable_store correctly with adjusted alignments.
2139 Drop the body_cst_vec on the floor here. */
2140 stat = vect_verify_datarefs_alignment (loop_vinfo);
2141 gcc_assert (stat);
2142 return stat;
2146 /* (2) Versioning to force alignment. */
2148 /* Try versioning if:
2149 1) optimize loop for speed
2150 2) there is at least one unsupported misaligned data ref with an unknown
2151 misalignment, and
2152 3) all misaligned data refs with a known misalignment are supported, and
2153 4) the number of runtime alignment checks is within reason. */
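/* Roughly speaking (a sketch, not the exact code emitted elsewhere), the
   versioned condition built later from LOOP_VINFO_MAY_MISALIGN_STMTS and
   LOOP_VINFO_PTR_MASK tests something like

       if (((addr_1 | addr_2 | ...) & ptr_mask) == 0)
         ... vectorized loop ...
       else
         ... scalar loop ...

   which is why a single mask is recorded for all potentially misaligned
   references below.  */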
2155 do_versioning =
2156 optimize_loop_nest_for_speed_p (loop)
2157 && (!loop->inner); /* FORNOW */
2159 if (do_versioning)
2161 FOR_EACH_VEC_ELT (datarefs, i, dr)
2163 stmt = vect_dr_stmt (dr);
2164 stmt_info = vinfo_for_stmt (stmt);
2166 /* For interleaving, only the alignment of the first access
2167 matters. */
2168 if (aligned_access_p (dr)
2169 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2170 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt))
2171 continue;
2173 if (STMT_VINFO_STRIDED_P (stmt_info))
2175 /* Strided loads perform only component accesses, alignment is
2176 irrelevant for them. */
2177 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
2178 continue;
2179 do_versioning = false;
2180 break;
2183 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
2185 if (!supportable_dr_alignment)
2187 gimple *stmt;
2188 int mask;
2189 tree vectype;
2191 if (known_alignment_for_access_p (dr)
2192 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2193 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
2195 do_versioning = false;
2196 break;
2199 stmt = vect_dr_stmt (dr);
2200 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
2201 gcc_assert (vectype);
2203 /* At present we don't support versioning for alignment
2204 with variable VF, since there's no guarantee that the
2205 VF is a power of two. We could relax this if we added
2206 a way of enforcing a power-of-two size. */
2207 unsigned HOST_WIDE_INT size;
2208 if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
2210 do_versioning = false;
2211 break;
2214 /* The rightmost bits of an aligned address must be zeros.
2215 Construct the mask needed for this test. For example,
2216 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2217 mask must be 15 = 0xf. */
2218 mask = size - 1;
2220 /* FORNOW: use the same mask to test all potentially unaligned
2221 references in the loop. The vectorizer currently supports
2222 a single vector size, see the reference to
2223 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
2224 vectorization factor is computed. */
2225 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
2226 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
2227 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2228 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
2229 vect_dr_stmt (dr));
2233 /* Versioning requires at least one misaligned data reference. */
2234 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2235 do_versioning = false;
2236 else if (!do_versioning)
2237 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2240 if (do_versioning)
2242 vec<gimple *> may_misalign_stmts
2243 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2244 gimple *stmt;
2246 /* It can now be assumed that the data references in the statements
2247 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2248 of the loop being vectorized. */
2249 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
2251 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2252 dr = STMT_VINFO_DATA_REF (stmt_info);
2253 SET_DR_MISALIGNMENT (dr, 0);
2254 if (dump_enabled_p ())
2255 dump_printf_loc (MSG_NOTE, vect_location,
2256 "Alignment of access forced using versioning.\n");
2259 if (dump_enabled_p ())
2260 dump_printf_loc (MSG_NOTE, vect_location,
2261 "Versioning for alignment will be applied.\n");
2263 /* Peeling and versioning can't be done together at this time. */
2264 gcc_assert (! (do_peeling && do_versioning));
2266 stat = vect_verify_datarefs_alignment (loop_vinfo);
2267 gcc_assert (stat);
2268 return stat;
2271 /* This point is reached if neither peeling nor versioning is being done. */
2272 gcc_assert (! (do_peeling || do_versioning));
2274 stat = vect_verify_datarefs_alignment (loop_vinfo);
2275 return stat;
2279 /* Function vect_find_same_alignment_drs.
2281 Update group and alignment relations according to the chosen
2282 vectorization factor. */
2284 static void
2285 vect_find_same_alignment_drs (struct data_dependence_relation *ddr)
2287 struct data_reference *dra = DDR_A (ddr);
2288 struct data_reference *drb = DDR_B (ddr);
2289 stmt_vec_info stmtinfo_a = vinfo_for_stmt (vect_dr_stmt (dra));
2290 stmt_vec_info stmtinfo_b = vinfo_for_stmt (vect_dr_stmt (drb));
2292 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
2293 return;
2295 if (dra == drb)
2296 return;
2298 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)
2299 || !operand_equal_p (DR_OFFSET (dra), DR_OFFSET (drb), 0)
2300 || !operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2301 return;
2303 /* Two references with distance zero have the same alignment. */
2304 poly_offset_int diff = (wi::to_poly_offset (DR_INIT (dra))
2305 - wi::to_poly_offset (DR_INIT (drb)));
2306 if (maybe_ne (diff, 0))
2308 /* Get the wider of the two alignments. */
2309 unsigned int align_a = (vect_calculate_target_alignment (dra)
2310 / BITS_PER_UNIT);
2311 unsigned int align_b = (vect_calculate_target_alignment (drb)
2312 / BITS_PER_UNIT);
2313 unsigned int max_align = MAX (align_a, align_b);
2315 /* Require the gap to be a multiple of the larger vector alignment. */
2316 if (!multiple_p (diff, max_align))
2317 return;
2320 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2321 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
2322 if (dump_enabled_p ())
2324 dump_printf_loc (MSG_NOTE, vect_location,
2325 "accesses have the same alignment: ");
2326 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2327 dump_printf (MSG_NOTE, " and ");
2328 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2329 dump_printf (MSG_NOTE, "\n");
2334 /* Function vect_analyze_data_refs_alignment
2336 Analyze the alignment of the data-references in the loop.
2337 Return FALSE if a data reference is found that cannot be vectorized. */
2339 bool
2340 vect_analyze_data_refs_alignment (loop_vec_info vinfo)
2342 DUMP_VECT_SCOPE ("vect_analyze_data_refs_alignment");
2344 /* Mark groups of data references with same alignment using
2345 data dependence information. */
2346 vec<ddr_p> ddrs = vinfo->ddrs;
2347 struct data_dependence_relation *ddr;
2348 unsigned int i;
2350 FOR_EACH_VEC_ELT (ddrs, i, ddr)
2351 vect_find_same_alignment_drs (ddr);
2353 vec<data_reference_p> datarefs = vinfo->datarefs;
2354 struct data_reference *dr;
2356 vect_record_base_alignments (vinfo);
2357 FOR_EACH_VEC_ELT (datarefs, i, dr)
2359 stmt_vec_info stmt_info = vinfo_for_stmt (vect_dr_stmt (dr));
2360 if (STMT_VINFO_VECTORIZABLE (stmt_info)
2361 && !vect_compute_data_ref_alignment (dr))
2363 /* Strided accesses perform only component accesses, misalignment
2364 information is irrelevant for them. */
2365 if (STMT_VINFO_STRIDED_P (stmt_info)
2366 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2367 continue;
2369 if (dump_enabled_p ())
2370 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2371 "not vectorized: can't calculate alignment "
2372 "for data ref.\n");
2374 return false;
2378 return true;
2382 /* Analyze alignment of DRs of stmts in NODE. */
2384 static bool
2385 vect_slp_analyze_and_verify_node_alignment (slp_tree node)
2387 /* We vectorize from the first scalar stmt in the node unless
2388 the node is permuted in which case we start from the first
2389 element in the group. */
2390 gimple *first_stmt = SLP_TREE_SCALAR_STMTS (node)[0];
2391 data_reference_p first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2392 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2393 first_stmt = DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_stmt));
2395 data_reference_p dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2396 if (! vect_compute_data_ref_alignment (dr)
2397 /* For creating the data-ref pointer we need alignment of the
2398 first element anyway. */
2399 || (dr != first_dr
2400 && ! vect_compute_data_ref_alignment (first_dr))
2401 || ! verify_data_ref_alignment (dr))
2403 if (dump_enabled_p ())
2404 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2405 "not vectorized: bad data alignment in basic "
2406 "block.\n");
2407 return false;
2410 return true;
2413 /* Function vect_slp_analyze_and_verify_instance_alignment
2415 Analyze the alignment of the data-references in the SLP instance.
2416 Return FALSE if a data reference is found that cannot be vectorized. */
2418 bool
2419 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
2421 DUMP_VECT_SCOPE ("vect_slp_analyze_and_verify_instance_alignment");
2423 slp_tree node;
2424 unsigned i;
2425 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2426 if (! vect_slp_analyze_and_verify_node_alignment (node))
2427 return false;
2429 node = SLP_INSTANCE_TREE (instance);
2430 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]))
2431 && ! vect_slp_analyze_and_verify_node_alignment
2432 (SLP_INSTANCE_TREE (instance)))
2433 return false;
2435 return true;
2439 /* Analyze groups of accesses: check that DR belongs to a group of
2440 accesses of legal size, step, etc. Detect gaps, single element
2441 interleaving, and other special cases. Set grouped access info.
2442 Collect groups of strided stores for further use in SLP analysis.
2443 Worker for vect_analyze_group_access. */
2445 static bool
2446 vect_analyze_group_access_1 (struct data_reference *dr)
2448 tree step = DR_STEP (dr);
2449 tree scalar_type = TREE_TYPE (DR_REF (dr));
2450 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2451 gimple *stmt = vect_dr_stmt (dr);
2452 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2453 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2454 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2455 HOST_WIDE_INT dr_step = -1;
2456 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2457 bool slp_impossible = false;
2459 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2460 size of the interleaving group (including gaps). */
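/* For example, a step of 16 bytes over 4-byte elements gives a group size
   of 4, whether or not all four slots are actually accessed.  */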
2461 if (tree_fits_shwi_p (step))
2463 dr_step = tree_to_shwi (step);
2464 /* Check that STEP is a multiple of type size. Otherwise there is
2465 a non-element-sized gap at the end of the group which we
2466 cannot represent in DR_GROUP_GAP or DR_GROUP_SIZE.
2467 ??? As we can handle non-constant step fine here we should
2468 simply remove uses of DR_GROUP_GAP between the last and first
2469 element and instead rely on DR_STEP. DR_GROUP_SIZE then would
2470 simply not include that gap. */
2471 if ((dr_step % type_size) != 0)
2473 if (dump_enabled_p ())
2475 dump_printf_loc (MSG_NOTE, vect_location,
2476 "Step ");
2477 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2478 dump_printf (MSG_NOTE,
2479 " is not a multiple of the element size for ");
2480 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2481 dump_printf (MSG_NOTE, "\n");
2483 return false;
2485 groupsize = absu_hwi (dr_step) / type_size;
2487 else
2488 groupsize = 0;
2490 /* A non-consecutive access is possible only if it is part of an interleaving group. */
2491 if (!DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2493 /* Check if this DR is part of an interleaving group, and is a single
2494 element of the group that is accessed in the loop. */
2496 /* Gaps are supported only for loads. STEP must be a multiple of the type
2497 size. */
2498 if (DR_IS_READ (dr)
2499 && (dr_step % type_size) == 0
2500 && groupsize > 0)
2502 DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2503 DR_GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2504 DR_GROUP_GAP (stmt_info) = groupsize - 1;
2505 if (dump_enabled_p ())
2507 dump_printf_loc (MSG_NOTE, vect_location,
2508 "Detected single element interleaving ");
2509 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2510 dump_printf (MSG_NOTE, " step ");
2511 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2512 dump_printf (MSG_NOTE, "\n");
2515 return true;
2518 if (dump_enabled_p ())
2520 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2521 "not consecutive access ");
2522 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2525 if (bb_vinfo)
2527 /* Mark the statement as unvectorizable. */
2528 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (vect_dr_stmt (dr))) = false;
2529 return true;
2532 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2533 STMT_VINFO_STRIDED_P (stmt_info) = true;
2534 return true;
2537 if (DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2539 /* First stmt in the interleaving chain. Check the chain. */
2540 gimple *next = DR_GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2541 struct data_reference *data_ref = dr;
2542 unsigned int count = 1;
2543 tree prev_init = DR_INIT (data_ref);
2544 gimple *prev = stmt;
2545 HOST_WIDE_INT diff, gaps = 0;
2547 /* By construction, all group members have INTEGER_CST DR_INITs. */
2548 while (next)
2550 /* Skip same data-refs. In case two or more stmts share a
2551 data-ref (supported only for loads), we vectorize only the first
2552 stmt, and the rest get their vectorized loads from the first
2553 one. */
2554 if (!tree_int_cst_compare (DR_INIT (data_ref),
2555 DR_INIT (STMT_VINFO_DATA_REF (
2556 vinfo_for_stmt (next)))))
2558 if (DR_IS_WRITE (data_ref))
2560 if (dump_enabled_p ())
2561 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2562 "Two store stmts share the same dr.\n");
2563 return false;
2566 if (dump_enabled_p ())
2567 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2568 "Two or more load stmts share the same dr.\n");
2570 /* For load use the same data-ref load. */
2571 DR_GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2573 prev = next;
2574 next = DR_GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2575 continue;
2578 prev = next;
2579 data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2581 /* All group members have the same STEP by construction. */
2582 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2584 /* Check that the distance between two accesses is equal to the type
2585 size. Otherwise, we have gaps. */
2586 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2587 - TREE_INT_CST_LOW (prev_init)) / type_size;
2588 if (diff != 1)
2590 /* FORNOW: SLP of accesses with gaps is not supported. */
2591 slp_impossible = true;
2592 if (DR_IS_WRITE (data_ref))
2594 if (dump_enabled_p ())
2595 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2596 "interleaved store with gaps\n");
2597 return false;
2600 gaps += diff - 1;
2603 last_accessed_element += diff;
2605 /* Store the gap from the previous member of the group. If there is no
2606 gap in the access, DR_GROUP_GAP is always 1. */
2607 DR_GROUP_GAP (vinfo_for_stmt (next)) = diff;
2609 prev_init = DR_INIT (data_ref);
2610 next = DR_GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2611 /* Count the number of data-refs in the chain. */
2612 count++;
2615 if (groupsize == 0)
2616 groupsize = count + gaps;
2618 /* This could be UINT_MAX but as we are generating code in a very
2619 inefficient way we have to cap earlier. See PR78699 for example. */
2620 if (groupsize > 4096)
2622 if (dump_enabled_p ())
2623 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2624 "group is too large\n");
2625 return false;
2628 /* Check that the size of the interleaving is equal to count for stores,
2629 i.e., that there are no gaps. */
2630 if (groupsize != count
2631 && !DR_IS_READ (dr))
2633 if (dump_enabled_p ())
2634 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2635 "interleaved store with gaps\n");
2636 return false;
2639 /* If there is a gap after the last load in the group it is the
2640 difference between the groupsize and the last accessed
2641 element.
2642 When there is no gap, this difference should be 0. */
2643 DR_GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - last_accessed_element;
2645 DR_GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2646 if (dump_enabled_p ())
2648 dump_printf_loc (MSG_NOTE, vect_location,
2649 "Detected interleaving ");
2650 if (DR_IS_READ (dr))
2651 dump_printf (MSG_NOTE, "load ");
2652 else
2653 dump_printf (MSG_NOTE, "store ");
2654 dump_printf (MSG_NOTE, "of size %u starting with ",
2655 (unsigned)groupsize);
2656 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
2657 if (DR_GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
2658 dump_printf_loc (MSG_NOTE, vect_location,
2659 "There is a gap of %u elements after the group\n",
2660 DR_GROUP_GAP (vinfo_for_stmt (stmt)));
2663 /* SLP: create an SLP data structure for every interleaving group of
2664 stores for further analysis in vect_analyse_slp. */
2665 if (DR_IS_WRITE (dr) && !slp_impossible)
2667 if (loop_vinfo)
2668 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2669 if (bb_vinfo)
2670 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2674 return true;
2677 /* Analyze groups of accesses: check that DR belongs to a group of
2678 accesses of legal size, step, etc. Detect gaps, single element
2679 interleaving, and other special cases. Set grouped access info.
2680 Collect groups of strided stores for further use in SLP analysis. */
2682 static bool
2683 vect_analyze_group_access (struct data_reference *dr)
2685 if (!vect_analyze_group_access_1 (dr))
2687 /* Dissolve the group if present. */
2688 gimple *next;
2689 gimple *stmt = DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (vect_dr_stmt (dr)));
2690 while (stmt)
2692 stmt_vec_info vinfo = vinfo_for_stmt (stmt);
2693 next = DR_GROUP_NEXT_ELEMENT (vinfo);
2694 DR_GROUP_FIRST_ELEMENT (vinfo) = NULL;
2695 DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2696 stmt = next;
2698 return false;
2700 return true;
2703 /* Analyze the access pattern of the data-reference DR.
2704 In case of non-consecutive accesses call vect_analyze_group_access() to
2705 analyze groups of accesses. */
2707 static bool
2708 vect_analyze_data_ref_access (struct data_reference *dr)
2710 tree step = DR_STEP (dr);
2711 tree scalar_type = TREE_TYPE (DR_REF (dr));
2712 gimple *stmt = vect_dr_stmt (dr);
2713 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2714 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2715 struct loop *loop = NULL;
2717 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
2718 return true;
2720 if (loop_vinfo)
2721 loop = LOOP_VINFO_LOOP (loop_vinfo);
2723 if (loop_vinfo && !step)
2725 if (dump_enabled_p ())
2726 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2727 "bad data-ref access in loop\n");
2728 return false;
2731 /* Allow loads with zero step in inner-loop vectorization. */
2732 if (loop_vinfo && integer_zerop (step))
2734 DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2735 if (!nested_in_vect_loop_p (loop, stmt))
2736 return DR_IS_READ (dr);
2737 /* Allow references with zero step for outer loops marked
2738 with pragma omp simd only - it guarantees absence of
2739 loop-carried dependencies between inner loop iterations. */
2740 if (loop->safelen < 2)
2742 if (dump_enabled_p ())
2743 dump_printf_loc (MSG_NOTE, vect_location,
2744 "zero step in inner loop of nest\n");
2745 return false;
2749 if (loop && nested_in_vect_loop_p (loop, stmt))
2751 /* Interleaved accesses are not yet supported within outer-loop
2752 vectorization for references in the inner-loop. */
2753 DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2755 /* For the rest of the analysis we use the outer-loop step. */
2756 step = STMT_VINFO_DR_STEP (stmt_info);
2757 if (integer_zerop (step))
2759 if (dump_enabled_p ())
2760 dump_printf_loc (MSG_NOTE, vect_location,
2761 "zero step in outer loop.\n");
2762 return DR_IS_READ (dr);
2766 /* Consecutive? */
2767 if (TREE_CODE (step) == INTEGER_CST)
2769 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2770 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2771 || (dr_step < 0
2772 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2774 /* Mark that it is not interleaving. */
2775 DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2776 return true;
2780 if (loop && nested_in_vect_loop_p (loop, stmt))
2782 if (dump_enabled_p ())
2783 dump_printf_loc (MSG_NOTE, vect_location,
2784 "grouped access in outer loop.\n");
2785 return false;
2789 /* Assume this is a DR handled by non-constant strided load case. */
2790 if (TREE_CODE (step) != INTEGER_CST)
2791 return (STMT_VINFO_STRIDED_P (stmt_info)
2792 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2793 || vect_analyze_group_access (dr)));
2795 /* Not a consecutive access - check if it is part of an interleaving group. */
2796 return vect_analyze_group_access (dr);
2799 /* Compare two data-references DRA and DRB to sort related accesses next
2800 to each other, in chunks suitable for grouping. */
2802 static int
2803 dr_group_sort_cmp (const void *dra_, const void *drb_)
2805 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2806 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2807 int cmp;
2809 /* Stabilize sort. */
2810 if (dra == drb)
2811 return 0;
2813 /* DRs in different loops never belong to the same group. */
2814 loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father;
2815 loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father;
2816 if (loopa != loopb)
2817 return loopa->num < loopb->num ? -1 : 1;
2819 /* Ordering of DRs according to base. */
2820 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2821 DR_BASE_ADDRESS (drb));
2822 if (cmp != 0)
2823 return cmp;
2825 /* And according to DR_OFFSET. */
2826 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2827 if (cmp != 0)
2828 return cmp;
2830 /* Put reads before writes. */
2831 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2832 return DR_IS_READ (dra) ? -1 : 1;
2834 /* Then sort after access size. */
2835 cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2836 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2837 if (cmp != 0)
2838 return cmp;
2840 /* And after step. */
2841 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2842 if (cmp != 0)
2843 return cmp;
2845 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
2846 cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
2847 if (cmp == 0)
2848 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2849 return cmp;
2852 /* If OP is the result of a conversion, return the unconverted value,
2853 otherwise return null. */
2855 static tree
2856 strip_conversion (tree op)
2858 if (TREE_CODE (op) != SSA_NAME)
2859 return NULL_TREE;
2860 gimple *stmt = SSA_NAME_DEF_STMT (op);
2861 if (!is_gimple_assign (stmt)
2862 || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt)))
2863 return NULL_TREE;
2864 return gimple_assign_rhs1 (stmt);
2867 /* Return true if vectorizable_* routines can handle statements STMT1
2868 and STMT2 being in a single group. */
2870 static bool
2871 can_group_stmts_p (gimple *stmt1, gimple *stmt2)
2873 if (gimple_assign_single_p (stmt1))
2874 return gimple_assign_single_p (stmt2);
2876 if (is_gimple_call (stmt1) && gimple_call_internal_p (stmt1))
2878 /* Check for two masked loads or two masked stores. */
2879 if (!is_gimple_call (stmt2) || !gimple_call_internal_p (stmt2))
2880 return false;
2881 internal_fn ifn = gimple_call_internal_fn (stmt1);
2882 if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE)
2883 return false;
2884 if (ifn != gimple_call_internal_fn (stmt2))
2885 return false;
2887 /* Check that the masks are the same. Cope with casts of masks,
2888 like those created by build_mask_conversion. */
2889 tree mask1 = gimple_call_arg (stmt1, 2);
2890 tree mask2 = gimple_call_arg (stmt2, 2);
2891 if (!operand_equal_p (mask1, mask2, 0))
2893 mask1 = strip_conversion (mask1);
2894 if (!mask1)
2895 return false;
2896 mask2 = strip_conversion (mask2);
2897 if (!mask2)
2898 return false;
2899 if (!operand_equal_p (mask1, mask2, 0))
2900 return false;
2902 return true;
2905 return false;
2908 /* Function vect_analyze_data_ref_accesses.
2910 Analyze the access pattern of all the data references in the loop.
2912 FORNOW: the only access pattern that is considered vectorizable is a
2913 simple step 1 (consecutive) access.
2915 FORNOW: handle only arrays and pointer accesses. */
2917 bool
2918 vect_analyze_data_ref_accesses (vec_info *vinfo)
2920 unsigned int i;
2921 vec<data_reference_p> datarefs = vinfo->datarefs;
2922 struct data_reference *dr;
2924 DUMP_VECT_SCOPE ("vect_analyze_data_ref_accesses");
2926 if (datarefs.is_empty ())
2927 return true;
2929 /* Sort the array of datarefs to make building the interleaving chains
2930 linear. Don't modify the original vector's order; it is needed for
2931 determining what dependencies are reversed. */
2932 vec<data_reference_p> datarefs_copy = datarefs.copy ();
2933 datarefs_copy.qsort (dr_group_sort_cmp);
2935 /* Build the interleaving chains. */
2936 for (i = 0; i < datarefs_copy.length () - 1;)
2938 data_reference_p dra = datarefs_copy[i];
2939 stmt_vec_info stmtinfo_a = vinfo_for_stmt (vect_dr_stmt (dra));
2940 stmt_vec_info lastinfo = NULL;
2941 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
2942 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a))
2944 ++i;
2945 continue;
2947 for (i = i + 1; i < datarefs_copy.length (); ++i)
2949 data_reference_p drb = datarefs_copy[i];
2950 stmt_vec_info stmtinfo_b = vinfo_for_stmt (vect_dr_stmt (drb));
2951 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b)
2952 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
2953 break;
2955 /* ??? Imperfect sorting (non-compatible types, non-modulo
2956 accesses, same accesses) can lead to a group being artificially
2957 split here as we don't just skip over those. If it really
2958 matters we can push those to a worklist and re-iterate
2959 over them. Then we can just skip ahead to the next DR here. */
2961 /* DRs in a different loop should not be put into the same
2962 interleaving group. */
2963 if (gimple_bb (DR_STMT (dra))->loop_father
2964 != gimple_bb (DR_STMT (drb))->loop_father)
2965 break;
2967 /* Check that the data-refs have same first location (except init)
2968 and they are both either store or load (not load and store,
2969 not masked loads or stores). */
2970 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2971 || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2972 DR_BASE_ADDRESS (drb)) != 0
2973 || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
2974 || !can_group_stmts_p (vect_dr_stmt (dra), vect_dr_stmt (drb)))
2975 break;
2977 /* Check that the data-refs have the same constant size. */
2978 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2979 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2980 if (!tree_fits_uhwi_p (sza)
2981 || !tree_fits_uhwi_p (szb)
2982 || !tree_int_cst_equal (sza, szb))
2983 break;
2985 /* Check that the data-refs have the same step. */
2986 if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
2987 break;
2989 /* Check the types are compatible.
2990 ??? We don't distinguish this during sorting. */
2991 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2992 TREE_TYPE (DR_REF (drb))))
2993 break;
2995 /* Check that the DR_INITs are compile-time constants. */
2996 if (TREE_CODE (DR_INIT (dra)) != INTEGER_CST
2997 || TREE_CODE (DR_INIT (drb)) != INTEGER_CST)
2998 break;
3000 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
3001 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
3002 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
3003 HOST_WIDE_INT init_prev
3004 = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]));
3005 gcc_assert (init_a <= init_b
3006 && init_a <= init_prev
3007 && init_prev <= init_b);
3009 /* Do not place the same access in the interleaving chain twice. */
3010 if (init_b == init_prev)
3012 gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]))
3013 < gimple_uid (DR_STMT (drb)));
3014 /* ??? For now we simply "drop" the later reference which is
3015 otherwise the same rather than finishing off this group.
3016 In the end we'd want to re-process duplicates forming
3017 multiple groups from the refs, likely by just collecting
3018 all candidates (including duplicates and split points
3019 below) in a vector and then process them together. */
3020 continue;
3023 /* If init_b == init_a + the size of the type * k, we have an
3024 interleaving, and DRA is accessed before DRB. */
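/* E.g. for int accesses a[2*i] and a[2*i+1], init_a == 0, init_b == 4 and
   the element size is 4, so the two refs are chained into one interleaving
   group with DRA first.  */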
3025 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
3026 if (type_size_a == 0
3027 || (init_b - init_a) % type_size_a != 0)
3028 break;
3030 /* If we have a store, the accesses must be adjacent. This splits
3031 groups into chunks we support (we don't support vectorization
3032 of stores with gaps). */
3033 if (!DR_IS_READ (dra) && init_b - init_prev != type_size_a)
3034 break;
3036 /* Require the step (when constant and nonzero) to be greater than the
3037 difference between the data-refs' inits; this splits groups into
3038 suitable sizes. */
3039 if (tree_fits_shwi_p (DR_STEP (dra)))
3041 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
3042 if (step != 0 && step <= (init_b - init_a))
3043 break;
3046 if (dump_enabled_p ())
3048 dump_printf_loc (MSG_NOTE, vect_location,
3049 "Detected interleaving ");
3050 if (DR_IS_READ (dra))
3051 dump_printf (MSG_NOTE, "load ");
3052 else
3053 dump_printf (MSG_NOTE, "store ");
3054 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
3055 dump_printf (MSG_NOTE, " and ");
3056 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
3057 dump_printf (MSG_NOTE, "\n");
3060 /* Link the found element into the group list. */
3061 if (!DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3063 DR_GROUP_FIRST_ELEMENT (stmtinfo_a) = vect_dr_stmt (dra);
3064 lastinfo = stmtinfo_a;
3066 DR_GROUP_FIRST_ELEMENT (stmtinfo_b) = vect_dr_stmt (dra);
3067 DR_GROUP_NEXT_ELEMENT (lastinfo) = vect_dr_stmt (drb);
3068 lastinfo = stmtinfo_b;
3072 FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
3073 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (vect_dr_stmt (dr)))
3074 && !vect_analyze_data_ref_access (dr))
3076 if (dump_enabled_p ())
3077 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3078 "not vectorized: complicated access pattern.\n");
3080 if (is_a <bb_vec_info> (vinfo))
3082 /* Mark the statement as not vectorizable. */
3083 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (vect_dr_stmt (dr))) = false;
3084 continue;
3086 else
3088 datarefs_copy.release ();
3089 return false;
3093 datarefs_copy.release ();
3094 return true;
3097 /* Function vect_vfa_segment_size.
3099 Input:
3100 DR: The data reference.
3101 LENGTH_FACTOR: segment length to consider.
3103 Return a value suitable for the dr_with_seg_len::seg_len field.
3104 This is the "distance travelled" by the pointer from the first
3105 iteration in the segment to the last. Note that it does not include
3106 the size of the access; in effect it only describes the first byte. */
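/* For example, with DR_STEP == 4 and LENGTH_FACTOR == 8 the result is
   4 * (8 - 1) == 28: the pointer advances 28 bytes between the first and
   the last of the 8 iterations in the segment.  */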
3108 static tree
3109 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
3111 length_factor = size_binop (MINUS_EXPR,
3112 fold_convert (sizetype, length_factor),
3113 size_one_node);
3114 return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr)),
3115 length_factor);
3118 /* Return a value that, when added to abs (vect_vfa_segment_size (dr)),
3119 gives the worst-case number of bytes covered by the segment. */
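/* For a simple non-grouped 4-byte access the result is just 4, so together
   with the 28-byte segment length from the example above the segment covers
   at worst 28 + 4 == 32 bytes.  */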
3121 static unsigned HOST_WIDE_INT
3122 vect_vfa_access_size (data_reference *dr)
3124 stmt_vec_info stmt_vinfo = vinfo_for_stmt (vect_dr_stmt (dr));
3125 tree ref_type = TREE_TYPE (DR_REF (dr));
3126 unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type));
3127 unsigned HOST_WIDE_INT access_size = ref_size;
3128 if (DR_GROUP_FIRST_ELEMENT (stmt_vinfo))
3130 gcc_assert (DR_GROUP_FIRST_ELEMENT (stmt_vinfo) == vect_dr_stmt (dr));
3131 access_size *= DR_GROUP_SIZE (stmt_vinfo) - DR_GROUP_GAP (stmt_vinfo);
3133 if (STMT_VINFO_VEC_STMT (stmt_vinfo)
3134 && (vect_supportable_dr_alignment (dr, false)
3135 == dr_explicit_realign_optimized))
3137 /* We might access a full vector's worth. */
3138 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
3139 access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size;
3141 return access_size;
3144 /* Get the minimum alignment for all the scalar accesses that DR describes. */
3146 static unsigned int
3147 vect_vfa_align (const data_reference *dr)
3149 return TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr)));
3152 /* Function vect_compile_time_alias.
3154 Given data references A and B with equal base and offset, see whether
3155 the alias relation can be decided at compilation time. Return 1 if
3156 it can and the references alias, 0 if it can and the references do
3157 not alias, and -1 if we cannot decide at compile time. SEGMENT_LENGTH_A,
3158 SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
3159 of dr_with_seg_len::{seg_len,access_size} for A and B. */
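/* For example, two 4-byte accesses at offsets 0 and 64 with 16-byte segment
   lengths cover [0, 20) and [64, 84); those ranges provably do not overlap,
   so 0 would be returned.  */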
3161 static int
3162 vect_compile_time_alias (struct data_reference *a, struct data_reference *b,
3163 tree segment_length_a, tree segment_length_b,
3164 unsigned HOST_WIDE_INT access_size_a,
3165 unsigned HOST_WIDE_INT access_size_b)
3167 poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a));
3168 poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b));
3169 poly_uint64 const_length_a;
3170 poly_uint64 const_length_b;
3172 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3173 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3174 [a, a+12) */
3175 if (tree_int_cst_compare (DR_STEP (a), size_zero_node) < 0)
3177 const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
3178 offset_a = (offset_a + access_size_a) - const_length_a;
3180 else
3181 const_length_a = tree_to_poly_uint64 (segment_length_a);
3182 if (tree_int_cst_compare (DR_STEP (b), size_zero_node) < 0)
3184 const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
3185 offset_b = (offset_b + access_size_b) - const_length_b;
3187 else
3188 const_length_b = tree_to_poly_uint64 (segment_length_b);
3190 const_length_a += access_size_a;
3191 const_length_b += access_size_b;
3193 if (ranges_known_overlap_p (offset_a, const_length_a,
3194 offset_b, const_length_b))
3195 return 1;
3197 if (!ranges_maybe_overlap_p (offset_a, const_length_a,
3198 offset_b, const_length_b))
3199 return 0;
3201 return -1;
3204 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3205 in DDR is >= VF. */
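/* E.g. a minimum nonzero distance of 8 with VF == 4 means a vector
   iteration only touches scalar iterations i .. i+3, well short of the
   conflicting access 8 iterations away, so the runtime alias check for
   this DDR can be skipped by the caller.  */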
3207 static bool
3208 dependence_distance_ge_vf (data_dependence_relation *ddr,
3209 unsigned int loop_depth, poly_uint64 vf)
3211 if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3212 || DDR_NUM_DIST_VECTS (ddr) == 0)
3213 return false;
3215 /* If the dependence is exact, we should have limited the VF instead. */
3216 gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3218 unsigned int i;
3219 lambda_vector dist_v;
3220 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3222 HOST_WIDE_INT dist = dist_v[loop_depth];
3223 if (dist != 0
3224 && !(dist > 0 && DDR_REVERSED_P (ddr))
3225 && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
3226 return false;
3229 if (dump_enabled_p ())
3231 dump_printf_loc (MSG_NOTE, vect_location,
3232 "dependence distance between ");
3233 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
3234 dump_printf (MSG_NOTE, " and ");
3235 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
3236 dump_printf (MSG_NOTE, " is >= VF\n");
3239 return true;
3242 /* Dump LOWER_BOUND using flags DUMP_KIND. Dumps are known to be enabled. */
3244 static void
3245 dump_lower_bound (dump_flags_t dump_kind, const vec_lower_bound &lower_bound)
3247 dump_printf (dump_kind, "%s (", lower_bound.unsigned_p ? "unsigned" : "abs");
3248 dump_generic_expr (dump_kind, TDF_SLIM, lower_bound.expr);
3249 dump_printf (dump_kind, ") >= ");
3250 dump_dec (dump_kind, lower_bound.min_value);
3253 /* Record that the vectorized loop requires the vec_lower_bound described
3254 by EXPR, UNSIGNED_P and MIN_VALUE. */
3256 static void
3257 vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p,
3258 poly_uint64 min_value)
3260 vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3261 for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3262 if (operand_equal_p (lower_bounds[i].expr, expr, 0))
3264 unsigned_p &= lower_bounds[i].unsigned_p;
3265 min_value = upper_bound (lower_bounds[i].min_value, min_value);
3266 if (lower_bounds[i].unsigned_p != unsigned_p
3267 || maybe_lt (lower_bounds[i].min_value, min_value))
3269 lower_bounds[i].unsigned_p = unsigned_p;
3270 lower_bounds[i].min_value = min_value;
3271 if (dump_enabled_p ())
3273 dump_printf_loc (MSG_NOTE, vect_location,
3274 "updating run-time check to ");
3275 dump_lower_bound (MSG_NOTE, lower_bounds[i]);
3276 dump_printf (MSG_NOTE, "\n");
3279 return;
3282 vec_lower_bound lower_bound (expr, unsigned_p, min_value);
3283 if (dump_enabled_p ())
3285 dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that ");
3286 dump_lower_bound (MSG_NOTE, lower_bound);
3287 dump_printf (MSG_NOTE, "\n");
3289 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (lower_bound);
3292 /* Return true if it's unlikely that the step of the vectorized form of DR
3293 will span fewer than GAP bytes. */
3295 static bool
3296 vect_small_gap_p (loop_vec_info loop_vinfo, data_reference *dr, poly_int64 gap)
3298 stmt_vec_info stmt_info = vinfo_for_stmt (vect_dr_stmt (dr));
3299 HOST_WIDE_INT count
3300 = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
3301 if (DR_GROUP_FIRST_ELEMENT (stmt_info))
3302 count *= DR_GROUP_SIZE (vinfo_for_stmt (DR_GROUP_FIRST_ELEMENT (stmt_info)));
3303 return estimated_poly_value (gap) <= count * vect_get_scalar_dr_size (dr);
3306 /* Return true if we know that there is no alias between DR_A and DR_B
3307 when abs (DR_STEP (DR_A)) >= N for some N. When returning true, set
3308 *LOWER_BOUND_OUT to this N. */
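/* E.g. for two 4-byte accesses at byte offsets 0 and 4 from the same base
   and with the same step, *LOWER_BOUND_OUT is 4 + 4 - 0 == 8: once
   abs (DR_STEP) >= 8 the combined 8-byte footprint of one scalar iteration
   cannot overlap that of any other.  */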
3310 static bool
3311 vectorizable_with_step_bound_p (data_reference *dr_a, data_reference *dr_b,
3312 poly_uint64 *lower_bound_out)
3314 /* Check that there is a constant gap of known sign between DR_A
3315 and DR_B. */
3316 poly_int64 init_a, init_b;
3317 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0)
3318 || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0)
3319 || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)
3320 || !poly_int_tree_p (DR_INIT (dr_a), &init_a)
3321 || !poly_int_tree_p (DR_INIT (dr_b), &init_b)
3322 || !ordered_p (init_a, init_b))
3323 return false;
3325 /* Sort DR_A and DR_B by the address they access. */
3326 if (maybe_lt (init_b, init_a))
3328 std::swap (init_a, init_b);
3329 std::swap (dr_a, dr_b);
3332 /* If the two accesses could be dependent within a scalar iteration,
3333 make sure that we'd retain their order. */
3334 if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_a), init_b)
3335 && !vect_preserves_scalar_order_p (vect_dr_stmt (dr_a),
3336 vect_dr_stmt (dr_b)))
3337 return false;
3339 /* There is no alias if abs (DR_STEP) is greater than or equal to
3340 the bytes spanned by the combination of the two accesses. */
3341 *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_b) - init_a;
3342 return true;
3345 /* Function vect_prune_runtime_alias_test_list.
3347 Prune a list of ddrs to be tested at run-time by versioning for alias.
3348 Merge several alias checks into one if possible.
3349 Return FALSE if the resulting list of ddrs is longer than allowed by
3350 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
3352 bool
3353 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3355 typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3356 hash_set <tree_pair_hash> compared_objects;
3358 vec<ddr_p> may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3359 vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3360 = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3361 vec<vec_object_pair> &check_unequal_addrs
3362 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3363 poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3364 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3366 ddr_p ddr;
3367 unsigned int i;
3368 tree length_factor;
3370 DUMP_VECT_SCOPE ("vect_prune_runtime_alias_test_list");
3372 /* Step values are irrelevant for aliasing if the number of vector
3373 iterations is equal to the number of scalar iterations (which can
3374 happen for fully-SLP loops). */
3375 bool ignore_step_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U);
3377 if (!ignore_step_p)
3379 /* Convert the checks for nonzero steps into bound tests. */
3380 tree value;
3381 FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value)
3382 vect_check_lower_bound (loop_vinfo, value, true, 1);
3385 if (may_alias_ddrs.is_empty ())
3386 return true;
3388 comp_alias_ddrs.create (may_alias_ddrs.length ());
3390 unsigned int loop_depth
3391 = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3392 LOOP_VINFO_LOOP_NEST (loop_vinfo));
3394 /* First, we collect all data ref pairs for aliasing checks. */
3395 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3397 int comp_res;
3398 poly_uint64 lower_bound;
3399 struct data_reference *dr_a, *dr_b;
3400 gimple *dr_group_first_a, *dr_group_first_b;
3401 tree segment_length_a, segment_length_b;
3402 unsigned HOST_WIDE_INT access_size_a, access_size_b;
3403 unsigned int align_a, align_b;
3404 gimple *stmt_a, *stmt_b;
3406 /* Ignore the alias if the VF we chose ended up being no greater
3407 than the dependence distance. */
3408 if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3409 continue;
3411 if (DDR_OBJECT_A (ddr))
3413 vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3414 if (!compared_objects.add (new_pair))
3416 if (dump_enabled_p ())
3418 dump_printf_loc (MSG_NOTE, vect_location, "checking that ");
3419 dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.first);
3420 dump_printf (MSG_NOTE, " and ");
3421 dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.second);
3422 dump_printf (MSG_NOTE, " have different addresses\n");
3424 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3426 continue;
3429 dr_a = DDR_A (ddr);
3430 stmt_a = vect_dr_stmt (DDR_A (ddr));
3432 dr_b = DDR_B (ddr);
3433 stmt_b = vect_dr_stmt (DDR_B (ddr));
3435 /* Skip the pair if inter-iteration dependencies are irrelevant
3436 and intra-iteration dependencies are guaranteed to be honored. */
3437 if (ignore_step_p
3438 && (vect_preserves_scalar_order_p (stmt_a, stmt_b)
3439 || vectorizable_with_step_bound_p (dr_a, dr_b, &lower_bound)))
3441 if (dump_enabled_p ())
3443 dump_printf_loc (MSG_NOTE, vect_location,
3444 "no need for alias check between ");
3445 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
3446 dump_printf (MSG_NOTE, " and ");
3447 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
3448 dump_printf (MSG_NOTE, " when VF is 1\n");
3450 continue;
3453 /* See whether we can handle the alias using a bounds check on
3454 the step, and whether that's likely to be the best approach.
3455 (It might not be, for example, if the minimum step is much larger
3456 than the number of bytes handled by one vector iteration.) */
3457 if (!ignore_step_p
3458 && TREE_CODE (DR_STEP (dr_a)) != INTEGER_CST
3459 && vectorizable_with_step_bound_p (dr_a, dr_b, &lower_bound)
3460 && (vect_small_gap_p (loop_vinfo, dr_a, lower_bound)
3461 || vect_small_gap_p (loop_vinfo, dr_b, lower_bound)))
3463 bool unsigned_p = dr_known_forward_stride_p (dr_a);
3464 if (dump_enabled_p ())
3466 dump_printf_loc (MSG_NOTE, vect_location, "no alias between ");
3467 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
3468 dump_printf (MSG_NOTE, " and ");
3469 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
3470 dump_printf (MSG_NOTE, " when the step ");
3471 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_STEP (dr_a));
3472 dump_printf (MSG_NOTE, " is outside ");
3473 if (unsigned_p)
3474 dump_printf (MSG_NOTE, "[0");
3475 else
3477 dump_printf (MSG_NOTE, "(");
3478 dump_dec (MSG_NOTE, poly_int64 (-lower_bound));
3480 dump_printf (MSG_NOTE, ", ");
3481 dump_dec (MSG_NOTE, lower_bound);
3482 dump_printf (MSG_NOTE, ")\n");
3484 vect_check_lower_bound (loop_vinfo, DR_STEP (dr_a), unsigned_p,
3485 lower_bound);
3486 continue;
3489 dr_group_first_a = DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
3490 if (dr_group_first_a)
3492 stmt_a = dr_group_first_a;
3493 dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
3496 dr_group_first_b = DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
3497 if (dr_group_first_b)
3499 stmt_b = dr_group_first_b;
3500 dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
3503 if (ignore_step_p)
3505 segment_length_a = size_zero_node;
3506 segment_length_b = size_zero_node;
3508 else
3510 if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
3511 length_factor = scalar_loop_iters;
3512 else
3513 length_factor = size_int (vect_factor);
3514 segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
3515 segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
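/* In rough terms (a descriptive note, not target-specific): when both
   references advance by the same step, the segments only need to cover
   the scalar iterations of a single vector iteration (VECT_FACTOR of
   them); with differing steps we conservatively fall back to the full
   scalar iteration count.  */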
3517 access_size_a = vect_vfa_access_size (dr_a);
3518 access_size_b = vect_vfa_access_size (dr_b);
3519 align_a = vect_vfa_align (dr_a);
3520 align_b = vect_vfa_align (dr_b);
3522 comp_res = data_ref_compare_tree (DR_BASE_ADDRESS (dr_a),
3523 DR_BASE_ADDRESS (dr_b));
3524 if (comp_res == 0)
3525 comp_res = data_ref_compare_tree (DR_OFFSET (dr_a),
3526 DR_OFFSET (dr_b));
3528 /* See whether the alias is known at compilation time. */
3529 if (comp_res == 0
3530 && TREE_CODE (DR_STEP (dr_a)) == INTEGER_CST
3531 && TREE_CODE (DR_STEP (dr_b)) == INTEGER_CST
3532 && poly_int_tree_p (segment_length_a)
3533 && poly_int_tree_p (segment_length_b))
3535 int res = vect_compile_time_alias (dr_a, dr_b,
3536 segment_length_a,
3537 segment_length_b,
3538 access_size_a,
3539 access_size_b);
3540 if (res >= 0 && dump_enabled_p ())
3542 dump_printf_loc (MSG_NOTE, vect_location,
3543 "can tell at compile time that ");
3544 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
3545 dump_printf (MSG_NOTE, " and ");
3546 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
3547 if (res == 0)
3548 dump_printf (MSG_NOTE, " do not alias\n");
3549 else
3550 dump_printf (MSG_NOTE, " alias\n");
3553 if (res == 0)
3554 continue;
3556 if (res == 1)
3558 if (dump_enabled_p ())
3559 dump_printf_loc (MSG_NOTE, vect_location,
3560 "not vectorized: compilation time alias.\n");
3561 return false;
3565 dr_with_seg_len_pair_t dr_with_seg_len_pair
3566 (dr_with_seg_len (dr_a, segment_length_a, access_size_a, align_a),
3567 dr_with_seg_len (dr_b, segment_length_b, access_size_b, align_b));
3569 /* Canonicalize pairs by sorting the two DR members. */
3570 if (comp_res > 0)
3571 std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
3573 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3576 prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);
3578 unsigned int count = (comp_alias_ddrs.length ()
3579 + check_unequal_addrs.length ());
3581 dump_printf_loc (MSG_NOTE, vect_location,
3582 "improved number of alias checks from %d to %d\n",
3583 may_alias_ddrs.length (), count);
3584 if ((int) count > PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
3586 if (dump_enabled_p ())
3587 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3588 "number of versioning for alias "
3589 "run-time tests exceeds %d "
3590 "(--param vect-max-version-for-alias-checks)\n",
3591 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
3592 return false;
3595 return true;
3598 /* Check whether we can use an internal function for a gather load
3599 or scatter store. READ_P is true for loads and false for stores.
3600 MASKED_P is true if the load or store is conditional. MEMORY_TYPE is
3601 the type of the memory elements being loaded or stored. OFFSET_BITS
3602 is the number of bits in each scalar offset and OFFSET_SIGN is the
3603 sign of the offset. SCALE is the amount by which the offset should
3604 be multiplied *after* it has been converted to address width.
3606 Return true if the function is supported, storing the function
3607 id in *IFN_OUT and the type of a vector element in *ELEMENT_TYPE_OUT. */
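/* A rough illustration (the modes here are only an assumed example and
   depend entirely on the target): gathering V8SF elements from "float"
   memory with 32-bit signed offsets and SCALE == 4 would request
   IFN_GATHER_LOAD (IFN_MASK_GATHER_LOAD if MASKED_P), and succeeds only
   if internal_gather_scatter_fn_supported_p accepts that combination.  */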
3609 bool
3610 vect_gather_scatter_fn_p (bool read_p, bool masked_p, tree vectype,
3611 tree memory_type, unsigned int offset_bits,
3612 signop offset_sign, int scale,
3613 internal_fn *ifn_out, tree *element_type_out)
3615 unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
3616 unsigned int element_bits = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (vectype)));
3617 if (offset_bits > element_bits)
3618 /* Internal functions require the offset to be the same width as
3619 the vector elements. We can extend narrower offsets, but it isn't
3620 safe to truncate wider offsets. */
3621 return false;
3623 if (element_bits != memory_bits)
3624 /* For now the vector elements must be the same width as the
3625 memory elements. */
3626 return false;
3628 /* Work out which function we need. */
3629 internal_fn ifn;
3630 if (read_p)
3631 ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
3632 else
3633 ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
3635 /* Test whether the target supports this combination. */
3636 if (!internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
3637 offset_sign, scale))
3638 return false;
3640 *ifn_out = ifn;
3641 *element_type_out = TREE_TYPE (vectype);
3642 return true;
3645 /* CALL is a call to an internal gather load or scatter store function.
3646 Describe the operation in INFO. */
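/* As the assignments below rely on, these internal gather/scatter calls
   pass the loop-invariant base as argument 0, the vector of offsets as
   argument 1 and the scale as argument 2; the remaining fields of INFO
   are filled in from the statement's vectype and data reference.  */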
3648 static void
3649 vect_describe_gather_scatter_call (gcall *call, gather_scatter_info *info)
3651 stmt_vec_info stmt_info = vinfo_for_stmt (call);
3652 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3653 data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3655 info->ifn = gimple_call_internal_fn (call);
3656 info->decl = NULL_TREE;
3657 info->base = gimple_call_arg (call, 0);
3658 info->offset = gimple_call_arg (call, 1);
3659 info->offset_dt = vect_unknown_def_type;
3660 info->offset_vectype = NULL_TREE;
3661 info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
3662 info->element_type = TREE_TYPE (vectype);
3663 info->memory_type = TREE_TYPE (DR_REF (dr));
3666 /* Return true if a non-affine read or write in STMT is suitable for a
3667 gather load or scatter store. Describe the operation in *INFO if so. */
3669 bool
3670 vect_check_gather_scatter (gimple *stmt, loop_vec_info loop_vinfo,
3671 gather_scatter_info *info)
3673 HOST_WIDE_INT scale = 1;
3674 poly_int64 pbitpos, pbitsize;
3675 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3676 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3677 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3678 tree offtype = NULL_TREE;
3679 tree decl = NULL_TREE, base, off;
3680 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3681 tree memory_type = TREE_TYPE (DR_REF (dr));
3682 machine_mode pmode;
3683 int punsignedp, reversep, pvolatilep = 0;
3684 internal_fn ifn;
3685 tree element_type;
3686 bool masked_p = false;
3688 /* See whether this is already a call to a gather/scatter internal function.
3689 If not, see whether it's a masked load or store. */
3690 gcall *call = dyn_cast <gcall *> (stmt);
3691 if (call && gimple_call_internal_p (call))
3693 ifn = gimple_call_internal_fn (stmt);
3694 if (internal_gather_scatter_fn_p (ifn))
3696 vect_describe_gather_scatter_call (call, info);
3697 return true;
3699 masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE);
3702 /* True if we should aim to use internal functions rather than
3703 built-in functions. */
3704 bool use_ifn_p = (DR_IS_READ (dr)
3705 ? supports_vec_gather_load_p ()
3706 : supports_vec_scatter_store_p ());
3708 base = DR_REF (dr);
3709 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3710 see if we can use the def stmt of the address. */
3711 if (masked_p
3712 && TREE_CODE (base) == MEM_REF
3713 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3714 && integer_zerop (TREE_OPERAND (base, 1))
3715 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3717 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3718 if (is_gimple_assign (def_stmt)
3719 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3720 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3723 /* The gather and scatter builtins need an address of the form
3724 loop_invariant + vector * {1, 2, 4, 8}
3725 or
3726 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3727 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3728 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3729 multiplications and additions in it. To get a vector, we need
3730 a single SSA_NAME that will be defined in the loop and will
3731 contain everything that is not loop invariant and that can be
3732 vectorized. The following code attempts to find such a preexisting
3733 SSA_NAME OFF and put the loop invariants into a tree BASE
3734 that can be gimplified before the loop. */
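/* As a purely hypothetical example: for a read a[b[i]] with "a" loop
   invariant, BASE would end up as roughly &a plus any constant byte
   offset, OFF as the SSA_NAME holding the value loaded from b[i]
   (possibly after a sign extension), and SCALE as sizeof (a[0]).  */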
3735 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3736 &punsignedp, &reversep, &pvolatilep);
3737 gcc_assert (base && !reversep);
3738 poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
3740 if (TREE_CODE (base) == MEM_REF)
3742 if (!integer_zerop (TREE_OPERAND (base, 1)))
3744 if (off == NULL_TREE)
3745 off = wide_int_to_tree (sizetype, mem_ref_offset (base));
3746 else
3747 off = size_binop (PLUS_EXPR, off,
3748 fold_convert (sizetype, TREE_OPERAND (base, 1)));
3750 base = TREE_OPERAND (base, 0);
3752 else
3753 base = build_fold_addr_expr (base);
3755 if (off == NULL_TREE)
3756 off = size_zero_node;
3758 /* If base is not loop invariant, then if off is 0 we start with just
3759 the constant offset in the loop invariant BASE and continue with base
3760 as OFF; otherwise give up.
3761 We could handle that case by gimplifying the addition of base + off
3762 into some SSA_NAME and using that as off, but for now punt. */
3763 if (!expr_invariant_in_loop_p (loop, base))
3765 if (!integer_zerop (off))
3766 return false;
3767 off = base;
3768 base = size_int (pbytepos);
3770 /* Otherwise put base + constant offset into the loop invariant BASE
3771 and continue with OFF. */
3772 else
3774 base = fold_convert (sizetype, base);
3775 base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
3778 /* OFF at this point may be either a SSA_NAME or some tree expression
3779 from get_inner_reference. Try to peel off loop invariants from it
3780 into BASE as long as possible. */
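/* A minimal sketch of the peeling below, with hypothetical names:
   starting from OFF = (sizetype) j * 4 + 16 where 16 is loop invariant,
   the PLUS_EXPR case folds 16 into BASE, the MULT_EXPR case turns the
   multiplication into SCALE == 4, and the conversion case records the
   narrower type of "j" in OFFTYPE, leaving OFF as the SSA_NAME j.  */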
3781 STRIP_NOPS (off);
3782 while (offtype == NULL_TREE)
3784 enum tree_code code;
3785 tree op0, op1, add = NULL_TREE;
3787 if (TREE_CODE (off) == SSA_NAME)
3789 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3791 if (expr_invariant_in_loop_p (loop, off))
3792 return false;
3794 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3795 break;
3797 op0 = gimple_assign_rhs1 (def_stmt);
3798 code = gimple_assign_rhs_code (def_stmt);
3799 op1 = gimple_assign_rhs2 (def_stmt);
3801 else
3803 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3804 return false;
3805 code = TREE_CODE (off);
3806 extract_ops_from_tree (off, &code, &op0, &op1);
3808 switch (code)
3810 case POINTER_PLUS_EXPR:
3811 case PLUS_EXPR:
3812 if (expr_invariant_in_loop_p (loop, op0))
3814 add = op0;
3815 off = op1;
3816 do_add:
3817 add = fold_convert (sizetype, add);
3818 if (scale != 1)
3819 add = size_binop (MULT_EXPR, add, size_int (scale));
3820 base = size_binop (PLUS_EXPR, base, add);
3821 continue;
3823 if (expr_invariant_in_loop_p (loop, op1))
3825 add = op1;
3826 off = op0;
3827 goto do_add;
3829 break;
3830 case MINUS_EXPR:
3831 if (expr_invariant_in_loop_p (loop, op1))
3833 add = fold_convert (sizetype, op1);
3834 add = size_binop (MINUS_EXPR, size_zero_node, add);
3835 off = op0;
3836 goto do_add;
3838 break;
3839 case MULT_EXPR:
3840 if (scale == 1 && tree_fits_shwi_p (op1))
3842 int new_scale = tree_to_shwi (op1);
3843 /* Only treat this as a scaling operation if the target
3844 supports it. */
3845 if (use_ifn_p
3846 && !vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p,
3847 vectype, memory_type, 1,
3848 TYPE_SIGN (TREE_TYPE (op0)),
3849 new_scale, &ifn,
3850 &element_type))
3851 break;
3852 scale = new_scale;
3853 off = op0;
3854 continue;
3856 break;
3857 case SSA_NAME:
3858 off = op0;
3859 continue;
3860 CASE_CONVERT:
3861 if (!POINTER_TYPE_P (TREE_TYPE (op0))
3862 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3863 break;
3864 if (TYPE_PRECISION (TREE_TYPE (op0))
3865 == TYPE_PRECISION (TREE_TYPE (off)))
3867 off = op0;
3868 continue;
3871 /* The internal functions need the offset to be the same width
3872 as the elements of VECTYPE. Don't include operations that
3873 cast the offset from that width to a different width. */
3874 if (use_ifn_p
3875 && (int_size_in_bytes (TREE_TYPE (vectype))
3876 == int_size_in_bytes (TREE_TYPE (off))))
3877 break;
3879 if (TYPE_PRECISION (TREE_TYPE (op0))
3880 < TYPE_PRECISION (TREE_TYPE (off)))
3882 off = op0;
3883 offtype = TREE_TYPE (off);
3884 STRIP_NOPS (off);
3885 continue;
3887 break;
3888 default:
3889 break;
3891 break;
3894 /* If at the end OFF still isn't a SSA_NAME or isn't
3895 defined in the loop, punt. */
3896 if (TREE_CODE (off) != SSA_NAME
3897 || expr_invariant_in_loop_p (loop, off))
3898 return false;
3900 if (offtype == NULL_TREE)
3901 offtype = TREE_TYPE (off);
3903 if (use_ifn_p)
3905 if (!vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p, vectype,
3906 memory_type, TYPE_PRECISION (offtype),
3907 TYPE_SIGN (offtype), scale, &ifn,
3908 &element_type))
3909 return false;
3911 else
3913 if (DR_IS_READ (dr))
3915 if (targetm.vectorize.builtin_gather)
3916 decl = targetm.vectorize.builtin_gather (vectype, offtype, scale);
3918 else
3920 if (targetm.vectorize.builtin_scatter)
3921 decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale);
3924 if (!decl)
3925 return false;
3927 ifn = IFN_LAST;
3928 element_type = TREE_TYPE (vectype);
3931 info->ifn = ifn;
3932 info->decl = decl;
3933 info->base = base;
3934 info->offset = off;
3935 info->offset_dt = vect_unknown_def_type;
3936 info->offset_vectype = NULL_TREE;
3937 info->scale = scale;
3938 info->element_type = element_type;
3939 info->memory_type = memory_type;
3940 return true;
3943 /* Find the data references in STMT, analyze them with respect to LOOP and
3944 append them to DATAREFS. Return false if datarefs in this stmt cannot
3945 be handled. */
3947 bool
3948 vect_find_stmt_data_reference (loop_p loop, gimple *stmt,
3949 vec<data_reference_p> *datarefs)
3951 /* We can ignore clobbers for dataref analysis - they are removed during
3952 loop vectorization and BB vectorization checks dependences with a
3953 stmt walk. */
3954 if (gimple_clobber_p (stmt))
3955 return true;
3957 if (gimple_has_volatile_ops (stmt))
3959 if (dump_enabled_p ())
3961 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3962 "not vectorized: volatile type ");
3963 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3965 return false;
3968 if (stmt_can_throw_internal (stmt))
3970 if (dump_enabled_p ())
3972 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3973 "not vectorized: statement can throw an "
3974 "exception ");
3975 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3977 return false;
3980 auto_vec<data_reference_p, 2> refs;
3981 if (!find_data_references_in_stmt (loop, stmt, &refs))
3982 return false;
3984 if (refs.is_empty ())
3985 return true;
3987 if (refs.length () > 1)
3989 if (dump_enabled_p ())
3991 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3992 "not vectorized: more than one data ref "
3993 "in stmt: ");
3994 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3996 return false;
3999 if (gcall *call = dyn_cast <gcall *> (stmt))
4000 if (!gimple_call_internal_p (call)
4001 || (gimple_call_internal_fn (call) != IFN_MASK_LOAD
4002 && gimple_call_internal_fn (call) != IFN_MASK_STORE))
4004 if (dump_enabled_p ())
4006 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4007 "not vectorized: dr in a call ");
4008 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4010 return false;
4013 data_reference_p dr = refs.pop ();
4014 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
4015 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
4017 if (dump_enabled_p ())
4019 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4020 "not vectorized: statement is bitfield "
4021 "access ");
4022 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4024 return false;
4027 if (DR_BASE_ADDRESS (dr)
4028 && TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
4030 if (dump_enabled_p ())
4031 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4032 "not vectorized: base addr of dr is a "
4033 "constant\n");
4034 return false;
4037 datarefs->safe_push (dr);
4038 return true;
4041 /* Function vect_analyze_data_refs.
4043 Find all the data references in the loop or basic block.
4045 The general structure of the analysis of data refs in the vectorizer is as
4046 follows:
4047 1- vect_analyze_data_refs(loop/bb): call
4048 compute_data_dependences_for_loop/bb to find and analyze all data-refs
4049 in the loop/bb and their dependences.
4050 2- vect_analyze_dependences(): apply dependence testing using ddrs.
4051 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
4052 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
4056 bool
4057 vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf)
4059 struct loop *loop = NULL;
4060 unsigned int i;
4061 struct data_reference *dr;
4062 tree scalar_type;
4064 DUMP_VECT_SCOPE ("vect_analyze_data_refs");
4066 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4067 loop = LOOP_VINFO_LOOP (loop_vinfo);
4069 /* Go through the data-refs, check that the analysis succeeded. Update
4070 pointer from stmt_vec_info struct to DR and vectype. */
4072 vec<data_reference_p> datarefs = vinfo->datarefs;
4073 FOR_EACH_VEC_ELT (datarefs, i, dr)
4075 gimple *stmt;
4076 stmt_vec_info stmt_info;
4077 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
4078 bool simd_lane_access = false;
4079 poly_uint64 vf;
4081 gcc_assert (DR_REF (dr));
4082 stmt = vect_dr_stmt (dr);
4083 stmt_info = vinfo_for_stmt (stmt);
4085 /* Check that analysis of the data-ref succeeded. */
4086 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
4087 || !DR_STEP (dr))
4089 bool maybe_gather
4090 = DR_IS_READ (dr)
4091 && !TREE_THIS_VOLATILE (DR_REF (dr))
4092 && (targetm.vectorize.builtin_gather != NULL
4093 || supports_vec_gather_load_p ());
4094 bool maybe_scatter
4095 = DR_IS_WRITE (dr)
4096 && !TREE_THIS_VOLATILE (DR_REF (dr))
4097 && (targetm.vectorize.builtin_scatter != NULL
4098 || supports_vec_scatter_store_p ());
4099 bool maybe_simd_lane_access
4100 = is_a <loop_vec_info> (vinfo) && loop->simduid;
4102 /* If the target supports vector gather loads or scatter stores, or if
4103 this might be a SIMD lane access, see whether they can be used. */
4104 if (is_a <loop_vec_info> (vinfo)
4105 && (maybe_gather || maybe_scatter || maybe_simd_lane_access)
4106 && !nested_in_vect_loop_p (loop, stmt))
4108 struct data_reference *newdr
4109 = create_data_ref (NULL, loop_containing_stmt (stmt),
4110 DR_REF (dr), stmt, !maybe_scatter,
4111 DR_IS_CONDITIONAL_IN_STMT (dr));
4112 gcc_assert (newdr != NULL && DR_REF (newdr));
4113 if (DR_BASE_ADDRESS (newdr)
4114 && DR_OFFSET (newdr)
4115 && DR_INIT (newdr)
4116 && DR_STEP (newdr)
4117 && integer_zerop (DR_STEP (newdr)))
4119 if (maybe_simd_lane_access)
4121 tree off = DR_OFFSET (newdr);
4122 STRIP_NOPS (off);
4123 if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
4124 && TREE_CODE (off) == MULT_EXPR
4125 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
4127 tree step = TREE_OPERAND (off, 1);
4128 off = TREE_OPERAND (off, 0);
4129 STRIP_NOPS (off);
4130 if (CONVERT_EXPR_P (off)
4131 && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
4132 0)))
4133 < TYPE_PRECISION (TREE_TYPE (off)))
4134 off = TREE_OPERAND (off, 0);
4135 if (TREE_CODE (off) == SSA_NAME)
4137 gimple *def = SSA_NAME_DEF_STMT (off);
4138 tree reft = TREE_TYPE (DR_REF (newdr));
4139 if (is_gimple_call (def)
4140 && gimple_call_internal_p (def)
4141 && (gimple_call_internal_fn (def)
4142 == IFN_GOMP_SIMD_LANE))
4144 tree arg = gimple_call_arg (def, 0);
4145 gcc_assert (TREE_CODE (arg) == SSA_NAME);
4146 arg = SSA_NAME_VAR (arg);
4147 if (arg == loop->simduid
4148 /* For now. */
4149 && tree_int_cst_equal
4150 (TYPE_SIZE_UNIT (reft),
4151 step))
4153 DR_OFFSET (newdr) = ssize_int (0);
4154 DR_STEP (newdr) = step;
4155 DR_OFFSET_ALIGNMENT (newdr)
4156 = BIGGEST_ALIGNMENT;
4157 DR_STEP_ALIGNMENT (newdr)
4158 = highest_pow2_factor (step);
4159 dr = newdr;
4160 simd_lane_access = true;
4166 if (!simd_lane_access && (maybe_gather || maybe_scatter))
4168 dr = newdr;
4169 if (maybe_gather)
4170 gatherscatter = GATHER;
4171 else
4172 gatherscatter = SCATTER;
4175 if (gatherscatter == SG_NONE && !simd_lane_access)
4176 free_data_ref (newdr);
4179 if (gatherscatter == SG_NONE && !simd_lane_access)
4181 if (dump_enabled_p ())
4183 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4184 "not vectorized: data ref analysis "
4185 "failed ");
4186 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4188 if (is_a <bb_vec_info> (vinfo))
4190 /* In BB vectorization the ref can still participate
4191 in dependence analysis, we just can't vectorize it. */
4192 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4193 continue;
4195 return false;
4199 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == ADDR_EXPR
4200 && VAR_P (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0))
4201 && DECL_NONALIASED (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0)))
4203 if (dump_enabled_p ())
4205 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4206 "not vectorized: base object not addressable "
4207 "for stmt: ");
4208 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4210 if (is_a <bb_vec_info> (vinfo))
4212 /* In BB vectorization the ref can still participate
4213 in dependence analysis, we just can't vectorize it. */
4214 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4215 continue;
4217 return false;
4220 if (is_a <loop_vec_info> (vinfo)
4221 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
4223 if (nested_in_vect_loop_p (loop, stmt))
4225 if (dump_enabled_p ())
4227 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4228 "not vectorized: not suitable for strided "
4229 "load ");
4230 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4232 return false;
4234 STMT_VINFO_STRIDED_P (stmt_info) = true;
4237 /* Update DR field in stmt_vec_info struct. */
4239 /* If the dataref is in an inner-loop of the loop that is considered
4240 for vectorization, we also want to analyze the access relative to
4241 the outer-loop (DR contains information only relative to the
4242 inner-most enclosing loop). We do that by building a reference to the
4243 first location accessed by the inner-loop, and analyze it relative to
4244 the outer-loop. */
4245 if (loop && nested_in_vect_loop_p (loop, stmt))
4247 /* Build a reference to the first location accessed by the
4248 inner loop: *(BASE + INIT + OFFSET). By construction,
4249 this address must be invariant in the inner loop, so we
4250 can consider it as being used in the outer loop. */
4251 tree base = unshare_expr (DR_BASE_ADDRESS (dr));
4252 tree offset = unshare_expr (DR_OFFSET (dr));
4253 tree init = unshare_expr (DR_INIT (dr));
4254 tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
4255 init, offset);
4256 tree init_addr = fold_build_pointer_plus (base, init_offset);
4257 tree init_ref = build_fold_indirect_ref (init_addr);
4259 if (dump_enabled_p ())
4261 dump_printf_loc (MSG_NOTE, vect_location,
4262 "analyze in outer loop: ");
4263 dump_generic_expr (MSG_NOTE, TDF_SLIM, init_ref);
4264 dump_printf (MSG_NOTE, "\n");
4267 if (!dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
4268 init_ref, loop))
4269 /* dr_analyze_innermost already explained the failure. */
4270 return false;
4272 if (dump_enabled_p ())
4274 dump_printf_loc (MSG_NOTE, vect_location,
4275 "\touter base_address: ");
4276 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4277 STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
4278 dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
4279 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4280 STMT_VINFO_DR_OFFSET (stmt_info));
4281 dump_printf (MSG_NOTE,
4282 "\n\touter constant offset from base address: ");
4283 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4284 STMT_VINFO_DR_INIT (stmt_info));
4285 dump_printf (MSG_NOTE, "\n\touter step: ");
4286 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4287 STMT_VINFO_DR_STEP (stmt_info));
4288 dump_printf (MSG_NOTE, "\n\touter base alignment: %d\n",
4289 STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info));
4290 dump_printf (MSG_NOTE, "\n\touter base misalignment: %d\n",
4291 STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info));
4292 dump_printf (MSG_NOTE, "\n\touter offset alignment: %d\n",
4293 STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info));
4294 dump_printf (MSG_NOTE, "\n\touter step alignment: %d\n",
4295 STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
4299 gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
4300 STMT_VINFO_DATA_REF (stmt_info) = dr;
4301 if (simd_lane_access)
4303 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
4304 free_data_ref (datarefs[i]);
4305 datarefs[i] = dr;
4308 /* Set vectype for STMT. */
4309 scalar_type = TREE_TYPE (DR_REF (dr));
4310 STMT_VINFO_VECTYPE (stmt_info)
4311 = get_vectype_for_scalar_type (scalar_type);
4312 if (!STMT_VINFO_VECTYPE (stmt_info))
4314 if (dump_enabled_p ())
4316 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4317 "not vectorized: no vectype for stmt: ");
4318 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4319 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
4320 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
4321 scalar_type);
4322 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
4325 if (is_a <bb_vec_info> (vinfo))
4327 /* No vector type is fine, the ref can still participate
4328 in dependence analysis, we just can't vectorize it. */
4329 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4330 continue;
4333 if (gatherscatter != SG_NONE || simd_lane_access)
4335 STMT_VINFO_DATA_REF (stmt_info) = NULL;
4336 if (gatherscatter != SG_NONE)
4337 free_data_ref (dr);
4339 return false;
4341 else
4343 if (dump_enabled_p ())
4345 dump_printf_loc (MSG_NOTE, vect_location,
4346 "got vectype for stmt: ");
4347 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
4348 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4349 STMT_VINFO_VECTYPE (stmt_info));
4350 dump_printf (MSG_NOTE, "\n");
4354 /* Adjust the minimal vectorization factor according to the
4355 vector type. */
4356 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
4357 *min_vf = upper_bound (*min_vf, vf);
4359 if (gatherscatter != SG_NONE)
4361 gather_scatter_info gs_info;
4362 if (!vect_check_gather_scatter (stmt, as_a <loop_vec_info> (vinfo),
4363 &gs_info)
4364 || !get_vectype_for_scalar_type (TREE_TYPE (gs_info.offset)))
4366 STMT_VINFO_DATA_REF (stmt_info) = NULL;
4367 free_data_ref (dr);
4368 if (dump_enabled_p ())
4370 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4371 (gatherscatter == GATHER) ?
4372 "not vectorized: not suitable for gather "
4373 "load " :
4374 "not vectorized: not suitable for scatter "
4375 "store ");
4376 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4378 return false;
4381 free_data_ref (datarefs[i]);
4382 datarefs[i] = dr;
4383 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
4387 /* We used to stop processing and prune the list here. Verify we no
4388 longer need to. */
4389 gcc_assert (i == datarefs.length ());
4391 return true;
4395 /* Function vect_get_new_vect_var.
4397 Returns a name for a new variable. The current naming scheme uses a
4398 prefix ("vect", "vectp", "stmp" or "mask", depending on the value of
4399 VAR_KIND) for vectorizer-generated variables, and appends "_" and NAME
4400 to it if NAME is provided. */
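/* For instance, vect_get_new_vect_var (some_ptr_type, vect_pointer_var,
   "a") yields a temporary whose declared name is "vectp_a" (dumps may
   show an additional uniquifying suffix).  */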
4402 tree
4403 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
4405 const char *prefix;
4406 tree new_vect_var;
4408 switch (var_kind)
4410 case vect_simple_var:
4411 prefix = "vect";
4412 break;
4413 case vect_scalar_var:
4414 prefix = "stmp";
4415 break;
4416 case vect_mask_var:
4417 prefix = "mask";
4418 break;
4419 case vect_pointer_var:
4420 prefix = "vectp";
4421 break;
4422 default:
4423 gcc_unreachable ();
4426 if (name)
4428 char* tmp = concat (prefix, "_", name, NULL);
4429 new_vect_var = create_tmp_reg (type, tmp);
4430 free (tmp);
4432 else
4433 new_vect_var = create_tmp_reg (type, prefix);
4435 return new_vect_var;
4438 /* Like vect_get_new_vect_var but return an SSA name. */
4440 tree
4441 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
4443 const char *prefix;
4444 tree new_vect_var;
4446 switch (var_kind)
4448 case vect_simple_var:
4449 prefix = "vect";
4450 break;
4451 case vect_scalar_var:
4452 prefix = "stmp";
4453 break;
4454 case vect_pointer_var:
4455 prefix = "vectp";
4456 break;
4457 default:
4458 gcc_unreachable ();
4461 if (name)
4463 char* tmp = concat (prefix, "_", name, NULL);
4464 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4465 free (tmp);
4467 else
4468 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4470 return new_vect_var;
4473 /* Duplicate ptr info and set alignment/misalignment on NAME from DR. */
4475 static void
4476 vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr)
4478 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr));
4479 int misalign = DR_MISALIGNMENT (dr);
4480 if (misalign == DR_MISALIGNMENT_UNKNOWN)
4481 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4482 else
4483 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name),
4484 DR_TARGET_ALIGNMENT (dr), misalign);
4487 /* Function vect_create_addr_base_for_vector_ref.
4489 Create an expression that computes the address of the first memory location
4490 that will be accessed for a data reference.
4492 Input:
4493 STMT: The statement containing the data reference.
4494 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4495 OFFSET: Optional. If supplied, it is added to the initial address.
4496 LOOP: Specify relative to which loop-nest the address should be computed.
4497 For example, when the dataref is in an inner-loop nested in an
4498 outer-loop that is now being vectorized, LOOP can be either the
4499 outer-loop, or the inner-loop. The first memory location accessed
4500 by the following dataref ('in' points to short):
4502 for (i=0; i<N; i++)
4503 for (j=0; j<M; j++)
4504 s += in[i+j]
4506 is as follows:
4507 if LOOP=i_loop: &in (relative to i_loop)
4508 if LOOP=j_loop: &in+i*2B (relative to j_loop)
4509 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
4510 initial address. Unlike OFFSET, which is number of elements to
4511 be added, BYTE_OFFSET is measured in bytes.
4513 Output:
4514 1. Return an SSA_NAME whose value is the address of the memory location of
4515 the first vector of the data reference.
4516 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4517 these statement(s) which define the returned SSA_NAME.
4519 FORNOW: We are only handling array accesses with step 1. */
4521 tree
4522 vect_create_addr_base_for_vector_ref (gimple *stmt,
4523 gimple_seq *new_stmt_list,
4524 tree offset,
4525 tree byte_offset)
4527 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4528 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4529 const char *base_name;
4530 tree addr_base;
4531 tree dest;
4532 gimple_seq seq = NULL;
4533 tree vect_ptr_type;
4534 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
4535 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4536 innermost_loop_behavior *drb = vect_dr_behavior (dr);
4538 tree data_ref_base = unshare_expr (drb->base_address);
4539 tree base_offset = unshare_expr (drb->offset);
4540 tree init = unshare_expr (drb->init);
4542 if (loop_vinfo)
4543 base_name = get_name (data_ref_base);
4544 else
4546 base_offset = ssize_int (0);
4547 init = ssize_int (0);
4548 base_name = get_name (DR_REF (dr));
4551 /* Create base_offset */
4552 base_offset = size_binop (PLUS_EXPR,
4553 fold_convert (sizetype, base_offset),
4554 fold_convert (sizetype, init));
4556 if (offset)
4558 offset = fold_build2 (MULT_EXPR, sizetype,
4559 fold_convert (sizetype, offset), step);
4560 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4561 base_offset, offset);
4563 if (byte_offset)
4565 byte_offset = fold_convert (sizetype, byte_offset);
4566 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4567 base_offset, byte_offset);
4570 /* base + base_offset */
4571 if (loop_vinfo)
4572 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4573 else
4575 addr_base = build1 (ADDR_EXPR,
4576 build_pointer_type (TREE_TYPE (DR_REF (dr))),
4577 unshare_expr (DR_REF (dr)));
4580 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4581 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4582 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4583 gimple_seq_add_seq (new_stmt_list, seq);
4585 if (DR_PTR_INFO (dr)
4586 && TREE_CODE (addr_base) == SSA_NAME
4587 && !SSA_NAME_PTR_INFO (addr_base))
4589 vect_duplicate_ssa_name_ptr_info (addr_base, dr);
4590 if (offset || byte_offset)
4591 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4594 if (dump_enabled_p ())
4596 dump_printf_loc (MSG_NOTE, vect_location, "created ");
4597 dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
4598 dump_printf (MSG_NOTE, "\n");
4601 return addr_base;
4605 /* Function vect_create_data_ref_ptr.
4607 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4608 location accessed in the loop by STMT, along with the def-use update
4609 chain to appropriately advance the pointer through the loop iterations.
4610 Also set aliasing information for the pointer. This pointer is used by
4611 the callers to this function to create a memory reference expression for
4612 vector load/store access.
4614 Input:
4615 1. STMT: a stmt that references memory. Expected to be of the form
4616 GIMPLE_ASSIGN <name, data-ref> or
4617 GIMPLE_ASSIGN <data-ref, name>.
4618 2. AGGR_TYPE: the type of the reference, which should be either a vector
4619 or an array.
4620 3. AT_LOOP: the loop where the vector memref is to be created.
4621 4. OFFSET (optional): an offset to be added to the initial address accessed
4622 by the data-ref in STMT.
4623 5. BSI: location where the new stmts are to be placed if there is no loop
4624 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4625 pointing to the initial address.
4626 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4627 to the initial address accessed by the data-ref in STMT. This is
4628 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4629 in bytes.
4630 8. IV_STEP (optional, defaults to NULL): the amount that should be added
4631 to the IV during each iteration of the loop. NULL says to move
4632 by one copy of AGGR_TYPE up or down, depending on the step of the
4633 data reference.
4635 Output:
4636 1. Declare a new ptr to vector_type, and have it point to the base of the
4637 data reference (the initial address accessed by the data reference).
4638 For example, for vector of type V8HI, the following code is generated:
4640 v8hi *ap;
4641 ap = (v8hi *)initial_address;
4643 if OFFSET is not supplied:
4644 initial_address = &a[init];
4645 if OFFSET is supplied:
4646 initial_address = &a[init + OFFSET];
4647 if BYTE_OFFSET is supplied:
4648 initial_address = &a[init] + BYTE_OFFSET;
4650 Return the initial_address in INITIAL_ADDRESS.
4652 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4653 update the pointer in each iteration of the loop.
4655 Return the increment stmt that updates the pointer in PTR_INCR.
4657 3. Set INV_P to true if the access pattern of the data reference in the
4658 vectorized loop is invariant. Set it to false otherwise.
4660 4. Return the pointer. */
4662 tree
4663 vect_create_data_ref_ptr (gimple *stmt, tree aggr_type, struct loop *at_loop,
4664 tree offset, tree *initial_address,
4665 gimple_stmt_iterator *gsi, gimple **ptr_incr,
4666 bool only_init, bool *inv_p, tree byte_offset,
4667 tree iv_step)
4669 const char *base_name;
4670 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4671 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4672 struct loop *loop = NULL;
4673 bool nested_in_vect_loop = false;
4674 struct loop *containing_loop = NULL;
4675 tree aggr_ptr_type;
4676 tree aggr_ptr;
4677 tree new_temp;
4678 gimple_seq new_stmt_list = NULL;
4679 edge pe = NULL;
4680 basic_block new_bb;
4681 tree aggr_ptr_init;
4682 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4683 tree aptr;
4684 gimple_stmt_iterator incr_gsi;
4685 bool insert_after;
4686 tree indx_before_incr, indx_after_incr;
4687 gimple *incr;
4688 tree step;
4689 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4691 gcc_assert (iv_step != NULL_TREE
4692 || TREE_CODE (aggr_type) == ARRAY_TYPE
4693 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4695 if (loop_vinfo)
4697 loop = LOOP_VINFO_LOOP (loop_vinfo);
4698 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4699 containing_loop = (gimple_bb (stmt))->loop_father;
4700 pe = loop_preheader_edge (loop);
4702 else
4704 gcc_assert (bb_vinfo);
4705 only_init = true;
4706 *ptr_incr = NULL;
4709 /* Check the step (evolution) of the load in LOOP, and record
4710 whether it's invariant. */
4711 step = vect_dr_behavior (dr)->step;
4712 if (integer_zerop (step))
4713 *inv_p = true;
4714 else
4715 *inv_p = false;
4717 /* Create an expression for the first address accessed by this load
4718 in LOOP. */
4719 base_name = get_name (DR_BASE_ADDRESS (dr));
4721 if (dump_enabled_p ())
4723 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4724 dump_printf_loc (MSG_NOTE, vect_location,
4725 "create %s-pointer variable to type: ",
4726 get_tree_code_name (TREE_CODE (aggr_type)));
4727 dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4728 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4729 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4730 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4731 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4732 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4733 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4734 else
4735 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4736 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4737 dump_printf (MSG_NOTE, "\n");
4740 /* (1) Create the new aggregate-pointer variable.
4741 Vector and array types inherit the alias set of their component
4742 type by default so we need to use a ref-all pointer if the data
4743 reference does not conflict with the created aggregated data
4744 reference because it is not addressable. */
4745 bool need_ref_all = false;
4746 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4747 get_alias_set (DR_REF (dr))))
4748 need_ref_all = true;
4749 /* Likewise for any of the data references in the stmt group. */
4750 else if (DR_GROUP_SIZE (stmt_info) > 1)
4752 gimple *orig_stmt = DR_GROUP_FIRST_ELEMENT (stmt_info);
4755 stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
4756 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4757 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4758 get_alias_set (DR_REF (sdr))))
4760 need_ref_all = true;
4761 break;
4763 orig_stmt = DR_GROUP_NEXT_ELEMENT (sinfo);
4765 while (orig_stmt);
4767 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4768 need_ref_all);
4769 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4772 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4773 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4774 def-use update cycles for the pointer: one relative to the outer-loop
4775 (LOOP), which is what steps (3) and (4) below do. The other is relative
4776 to the inner-loop (which is the inner-most loop containing the dataref),
4777 and this is done by step (5) below.
4779 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4780 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4781 redundant. Steps (3),(4) create the following:
4783 vp0 = &base_addr;
4784 LOOP: vp1 = phi(vp0,vp2)
4787 vp2 = vp1 + step
4788 goto LOOP
4790 If there is an inner-loop nested in loop, then step (5) will also be
4791 applied, and an additional update in the inner-loop will be created:
4793 vp0 = &base_addr;
4794 LOOP: vp1 = phi(vp0,vp2)
4796 inner: vp3 = phi(vp1,vp4)
4797 vp4 = vp3 + inner_step
4798 if () goto inner
4800 vp2 = vp1 + step
4801 if () goto LOOP */
4803 /* (2) Calculate the initial address of the aggregate-pointer, and set
4804 the aggregate-pointer to point to it before the loop. */
4806 /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader. */
4808 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
4809 offset, byte_offset);
4810 if (new_stmt_list)
4812 if (pe)
4814 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4815 gcc_assert (!new_bb);
4817 else
4818 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4821 *initial_address = new_temp;
4822 aggr_ptr_init = new_temp;
4824 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4825 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4826 inner-loop nested in LOOP (during outer-loop vectorization). */
4828 /* No update in loop is required. */
4829 if (only_init && (!loop_vinfo || at_loop == loop))
4830 aptr = aggr_ptr_init;
4831 else
4833 if (iv_step == NULL_TREE)
4835 /* The step of the aggregate pointer is the type size. */
4836 iv_step = TYPE_SIZE_UNIT (aggr_type);
4837 /* One exception to the above is when the scalar step of the load in
4838 LOOP is zero. In this case the step here is also zero. */
4839 if (*inv_p)
4840 iv_step = size_zero_node;
4841 else if (tree_int_cst_sgn (step) == -1)
4842 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4845 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4847 create_iv (aggr_ptr_init,
4848 fold_convert (aggr_ptr_type, iv_step),
4849 aggr_ptr, loop, &incr_gsi, insert_after,
4850 &indx_before_incr, &indx_after_incr);
4851 incr = gsi_stmt (incr_gsi);
4852 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4854 /* Copy the points-to information if it exists. */
4855 if (DR_PTR_INFO (dr))
4857 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr);
4858 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr);
4860 if (ptr_incr)
4861 *ptr_incr = incr;
4863 aptr = indx_before_incr;
4866 if (!nested_in_vect_loop || only_init)
4867 return aptr;
4870 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4871 nested in LOOP, if one exists. */
4873 gcc_assert (nested_in_vect_loop);
4874 if (!only_init)
4876 standard_iv_increment_position (containing_loop, &incr_gsi,
4877 &insert_after);
4878 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4879 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4880 &indx_after_incr);
4881 incr = gsi_stmt (incr_gsi);
4882 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4884 /* Copy the points-to information if it exists. */
4885 if (DR_PTR_INFO (dr))
4887 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr);
4888 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr);
4890 if (ptr_incr)
4891 *ptr_incr = incr;
4893 return indx_before_incr;
4895 else
4896 gcc_unreachable ();
4900 /* Function bump_vector_ptr
4902 Increment a pointer (to a vector type) by vector-size. If requested,
4903 i.e. if PTR-INCR is given, then also connect the new increment stmt
4904 to the existing def-use update-chain of the pointer, by modifying
4905 the PTR_INCR as illustrated below:
4907 The pointer def-use update-chain before this function:
4908 DATAREF_PTR = phi (p_0, p_2)
4909 ....
4910 PTR_INCR: p_2 = DATAREF_PTR + step
4912 The pointer def-use update-chain after this function:
4913 DATAREF_PTR = phi (p_0, p_2)
4914 ....
4915 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4916 ....
4917 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4919 Input:
4920 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4921 in the loop.
4922 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4923 the loop. The increment amount across iterations is expected
4924 to be vector_size.
4925 BSI - location where the new update stmt is to be placed.
4926 STMT - the original scalar memory-access stmt that is being vectorized.
4927 BUMP - optional. The offset by which to bump the pointer. If not given,
4928 the offset is assumed to be vector_size.
4930 Output: Return NEW_DATAREF_PTR as illustrated above.
4934 tree
4935 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4936 gimple *stmt, tree bump)
4938 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4939 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4940 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4941 tree update = TYPE_SIZE_UNIT (vectype);
4942 gassign *incr_stmt;
4943 ssa_op_iter iter;
4944 use_operand_p use_p;
4945 tree new_dataref_ptr;
4947 if (bump)
4948 update = bump;
4950 if (TREE_CODE (dataref_ptr) == SSA_NAME)
4951 new_dataref_ptr = copy_ssa_name (dataref_ptr);
4952 else
4953 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4954 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4955 dataref_ptr, update);
4956 vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4958 /* Copy the points-to information if it exists. */
4959 if (DR_PTR_INFO (dr))
4961 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4962 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4965 if (!ptr_incr)
4966 return new_dataref_ptr;
4968 /* Update the vector-pointer's cross-iteration increment. */
4969 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4971 tree use = USE_FROM_PTR (use_p);
4973 if (use == dataref_ptr)
4974 SET_USE (use_p, new_dataref_ptr);
4975 else
4976 gcc_assert (operand_equal_p (use, update, 0));
4979 return new_dataref_ptr;
4983 /* Copy memory reference info such as base/clique from the SRC reference
4984 to the DEST MEM_REF. */
4986 void
4987 vect_copy_ref_info (tree dest, tree src)
4989 if (TREE_CODE (dest) != MEM_REF)
4990 return;
4992 tree src_base = src;
4993 while (handled_component_p (src_base))
4994 src_base = TREE_OPERAND (src_base, 0);
4995 if (TREE_CODE (src_base) != MEM_REF
4996 && TREE_CODE (src_base) != TARGET_MEM_REF)
4997 return;
4999 MR_DEPENDENCE_CLIQUE (dest) = MR_DEPENDENCE_CLIQUE (src_base);
5000 MR_DEPENDENCE_BASE (dest) = MR_DEPENDENCE_BASE (src_base);
5004 /* Function vect_create_destination_var.
5006 Create a new temporary of type VECTYPE. */
5008 tree
5009 vect_create_destination_var (tree scalar_dest, tree vectype)
5011 tree vec_dest;
5012 const char *name;
5013 char *new_name;
5014 tree type;
5015 enum vect_var_kind kind;
5017 kind = vectype
5018 ? VECTOR_BOOLEAN_TYPE_P (vectype)
5019 ? vect_mask_var
5020 : vect_simple_var
5021 : vect_scalar_var;
5022 type = vectype ? vectype : TREE_TYPE (scalar_dest);
5024 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
5026 name = get_name (scalar_dest);
5027 if (name)
5028 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
5029 else
5030 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
5031 vec_dest = vect_get_new_vect_var (type, kind, new_name);
5032 free (new_name);
5034 return vec_dest;
5037 /* Function vect_grouped_store_supported.
5039 Returns TRUE if interleave high and interleave low permutations
5040 are supported, and FALSE otherwise. */
5042 bool
5043 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
5045 machine_mode mode = TYPE_MODE (vectype);
5047 /* vect_permute_store_chain requires the group size to be equal to 3 or
5048 be a power of two. */
5049 if (count != 3 && exact_log2 (count) == -1)
5051 if (dump_enabled_p ())
5052 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5053 "the size of the group of accesses"
5054 " is not a power of 2 or not eqaul to 3\n");
5055 return false;
5058 /* Check that the permutation is supported. */
5059 if (VECTOR_MODE_P (mode))
5061 unsigned int i;
5062 if (count == 3)
5064 unsigned int j0 = 0, j1 = 0, j2 = 0;
5065 unsigned int i, j;
5067 unsigned int nelt;
5068 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5070 if (dump_enabled_p ())
5071 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5072 "cannot handle groups of 3 stores for"
5073 " variable-length vectors\n");
5074 return false;
5077 vec_perm_builder sel (nelt, nelt, 1);
5078 sel.quick_grow (nelt);
5079 vec_perm_indices indices;
5080 for (j = 0; j < 3; j++)
5082 int nelt0 = ((3 - j) * nelt) % 3;
5083 int nelt1 = ((3 - j) * nelt + 1) % 3;
5084 int nelt2 = ((3 - j) * nelt + 2) % 3;
5085 for (i = 0; i < nelt; i++)
5087 if (3 * i + nelt0 < nelt)
5088 sel[3 * i + nelt0] = j0++;
5089 if (3 * i + nelt1 < nelt)
5090 sel[3 * i + nelt1] = nelt + j1++;
5091 if (3 * i + nelt2 < nelt)
5092 sel[3 * i + nelt2] = 0;
5094 indices.new_vector (sel, 2, nelt);
5095 if (!can_vec_perm_const_p (mode, indices))
5097 if (dump_enabled_p ())
5098 dump_printf (MSG_MISSED_OPTIMIZATION,
5099 "permutation op not supported by target.\n");
5100 return false;
5103 for (i = 0; i < nelt; i++)
5105 if (3 * i + nelt0 < nelt)
5106 sel[3 * i + nelt0] = 3 * i + nelt0;
5107 if (3 * i + nelt1 < nelt)
5108 sel[3 * i + nelt1] = 3 * i + nelt1;
5109 if (3 * i + nelt2 < nelt)
5110 sel[3 * i + nelt2] = nelt + j2++;
5112 indices.new_vector (sel, 2, nelt);
5113 if (!can_vec_perm_const_p (mode, indices))
5115 if (dump_enabled_p ())
5116 dump_printf (MSG_MISSED_OPTIMIZATION,
5117 "permutation op not supported by target.\n");
5118 return false;
5121 return true;
5123 else
5125 /* If length is not equal to 3 then only power of 2 is supported. */
5126 gcc_assert (pow2p_hwi (count));
5127 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5129 /* The encoding has 2 interleaved stepped patterns. */
5130 vec_perm_builder sel (nelt, 2, 3);
5131 sel.quick_grow (6);
5132 for (i = 0; i < 3; i++)
5134 sel[i * 2] = i;
5135 sel[i * 2 + 1] = i + nelt;
5137 vec_perm_indices indices (sel, 2, nelt);
5138 if (can_vec_perm_const_p (mode, indices))
5140 for (i = 0; i < 6; i++)
5141 sel[i] += exact_div (nelt, 2);
5142 indices.new_vector (sel, 2, nelt);
5143 if (can_vec_perm_const_p (mode, indices))
5144 return true;
5149 if (dump_enabled_p ())
5150 dump_printf (MSG_MISSED_OPTIMIZATION,
5151 "permutaion op not supported by target.\n");
5152 return false;
5156 /* Return TRUE if vec_{mask_}store_lanes is available for COUNT vectors of
5157 type VECTYPE. MASKED_P says whether the masked form is needed. */
5159 bool
5160 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5161 bool masked_p)
5163 if (masked_p)
5164 return vect_lanes_optab_supported_p ("vec_mask_store_lanes",
5165 vec_mask_store_lanes_optab,
5166 vectype, count);
5167 else
5168 return vect_lanes_optab_supported_p ("vec_store_lanes",
5169 vec_store_lanes_optab,
5170 vectype, count);
5174 /* Function vect_permute_store_chain.
5176 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
5177 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
5178 the data correctly for the stores. Return the final references for stores
5179 in RESULT_CHAIN.
5181 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5182 The input is 4 vectors each containing 8 elements. We assign a number to
5183 each element, the input sequence is:
5185 1st vec: 0 1 2 3 4 5 6 7
5186 2nd vec: 8 9 10 11 12 13 14 15
5187 3rd vec: 16 17 18 19 20 21 22 23
5188 4th vec: 24 25 26 27 28 29 30 31
5190 The output sequence should be:
5192 1st vec: 0 8 16 24 1 9 17 25
5193 2nd vec: 2 10 18 26 3 11 19 27
5194 3rd vec: 4 12 20 28 5 13 21 29
5195 4th vec: 6 14 22 30 7 15 23 31
5197 i.e., we interleave the contents of the four vectors in their order.
5199 We use interleave_high/low instructions to create such output. The input of
5200 each interleave_high/low operation is two vectors:
5201 1st vec 2nd vec
5202 0 1 2 3 4 5 6 7
5203 the even elements of the result vector are obtained left-to-right from the
5204 high/low elements of the first vector. The odd elements of the result are
5205 obtained left-to-right from the high/low elements of the second vector.
5206 The output of interleave_high will be: 0 4 1 5
5207 and of interleave_low: 2 6 3 7
5210 The permutation is done in log LENGTH stages. In each stage interleave_high
5211 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5212 where the first argument is taken from the first half of DR_CHAIN and the
5213 second argument from its second half.
5214 In our example,
5216 I1: interleave_high (1st vec, 3rd vec)
5217 I2: interleave_low (1st vec, 3rd vec)
5218 I3: interleave_high (2nd vec, 4th vec)
5219 I4: interleave_low (2nd vec, 4th vec)
5221 The output for the first stage is:
5223 I1: 0 16 1 17 2 18 3 19
5224 I2: 4 20 5 21 6 22 7 23
5225 I3: 8 24 9 25 10 26 11 27
5226 I4: 12 28 13 29 14 30 15 31
5228 The output of the second stage, i.e. the final result is:
5230 I1: 0 8 16 24 1 9 17 25
5231 I2: 2 10 18 26 3 11 19 27
5232 I3: 4 12 20 28 5 13 21 29
5233 I4: 6 14 22 30 7 15 23 31. */
5235 void
5236 vect_permute_store_chain (vec<tree> dr_chain,
5237 unsigned int length,
5238 gimple *stmt,
5239 gimple_stmt_iterator *gsi,
5240 vec<tree> *result_chain)
5242 tree vect1, vect2, high, low;
5243 gimple *perm_stmt;
5244 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5245 tree perm_mask_low, perm_mask_high;
5246 tree data_ref;
5247 tree perm3_mask_low, perm3_mask_high;
5248 unsigned int i, j, n, log_length = exact_log2 (length);
5250 result_chain->quick_grow (length);
5251 memcpy (result_chain->address (), dr_chain.address (),
5252 length * sizeof (tree));
5254 if (length == 3)
5256 /* vect_grouped_store_supported ensures that this is constant. */
5257 unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5258 unsigned int j0 = 0, j1 = 0, j2 = 0;
5260 vec_perm_builder sel (nelt, nelt, 1);
5261 sel.quick_grow (nelt);
5262 vec_perm_indices indices;
5263 for (j = 0; j < 3; j++)
5265 int nelt0 = ((3 - j) * nelt) % 3;
5266 int nelt1 = ((3 - j) * nelt + 1) % 3;
5267 int nelt2 = ((3 - j) * nelt + 2) % 3;
5269 for (i = 0; i < nelt; i++)
5271 if (3 * i + nelt0 < nelt)
5272 sel[3 * i + nelt0] = j0++;
5273 if (3 * i + nelt1 < nelt)
5274 sel[3 * i + nelt1] = nelt + j1++;
5275 if (3 * i + nelt2 < nelt)
5276 sel[3 * i + nelt2] = 0;
5278 indices.new_vector (sel, 2, nelt);
5279 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5281 for (i = 0; i < nelt; i++)
5283 if (3 * i + nelt0 < nelt)
5284 sel[3 * i + nelt0] = 3 * i + nelt0;
5285 if (3 * i + nelt1 < nelt)
5286 sel[3 * i + nelt1] = 3 * i + nelt1;
5287 if (3 * i + nelt2 < nelt)
5288 sel[3 * i + nelt2] = nelt + j2++;
5290 indices.new_vector (sel, 2, nelt);
5291 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5293 vect1 = dr_chain[0];
5294 vect2 = dr_chain[1];
5296 /* Create interleaving stmt:
5297 low = VEC_PERM_EXPR <vect1, vect2,
5298 {j, nelt, *, j + 1, nelt + j + 1, *,
5299 j + 2, nelt + j + 2, *, ...}> */
5300 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5301 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5302 vect2, perm3_mask_low);
5303 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5305 vect1 = data_ref;
5306 vect2 = dr_chain[2];
5307 /* Create interleaving stmt:
5308 low = VEC_PERM_EXPR <vect1, vect2,
5309 {0, 1, nelt + j, 3, 4, nelt + j + 1,
5310 6, 7, nelt + j + 2, ...}> */
5311 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5312 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5313 vect2, perm3_mask_high);
5314 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5315 (*result_chain)[j] = data_ref;
5318 else
5320 /* If length is not equal to 3 then only a power of 2 is supported. */
5321 gcc_assert (pow2p_hwi (length));
5323 /* The encoding has 2 interleaved stepped patterns. */
5324 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5325 vec_perm_builder sel (nelt, 2, 3);
5326 sel.quick_grow (6);
5327 for (i = 0; i < 3; i++)
5329 sel[i * 2] = i;
5330 sel[i * 2 + 1] = i + nelt;
5332 vec_perm_indices indices (sel, 2, nelt);
5333 perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5335 for (i = 0; i < 6; i++)
5336 sel[i] += exact_div (nelt, 2);
5337 indices.new_vector (sel, 2, nelt);
5338 perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
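/* For example, for NELT == 8 these are the interleave masks from the comment
   above: PERM_MASK_HIGH selects {0, 8, 1, 9, 2, 10, 3, 11} and PERM_MASK_LOW
   selects {4, 12, 5, 13, 6, 14, 7, 15} from the concatenation of the two
   input vectors.  */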
5340 for (i = 0, n = log_length; i < n; i++)
5342 for (j = 0; j < length/2; j++)
5344 vect1 = dr_chain[j];
5345 vect2 = dr_chain[j+length/2];
5347 /* Create interleaving stmt:
5348 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
5349 ...}> */
5350 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
5351 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
5352 vect2, perm_mask_high);
5353 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5354 (*result_chain)[2*j] = high;
5356 /* Create interleaving stmt:
5357 low = VEC_PERM_EXPR <vect1, vect2,
5358 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
5359 ...}> */
5360 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
5361 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
5362 vect2, perm_mask_low);
5363 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5364 (*result_chain)[2*j+1] = low;
5366 memcpy (dr_chain.address (), result_chain->address (),
5367 length * sizeof (tree));
5372 /* Function vect_setup_realignment
5374 This function is called when vectorizing an unaligned load using
5375 the dr_explicit_realign[_optimized] scheme.
5376 This function generates the following code at the loop prolog:
5378 p = initial_addr;
5379 x msq_init = *(floor(p)); # prolog load
5380 realignment_token = call target_builtin;
5381 loop:
5382 x msq = phi (msq_init, ---)
5384 The stmts marked with x are generated only for the case of
5385 dr_explicit_realign_optimized.
5387 The code above sets up a new (vector) pointer, pointing to the first
5388 location accessed by STMT, and a "floor-aligned" load using that pointer.
5389 It also generates code to compute the "realignment-token" (if the relevant
5390 target hook was defined), and creates a phi-node at the loop-header bb
5391 whose arguments are the result of the prolog-load (created by this
5392 function) and the result of a load that takes place in the loop (to be
5393 created by the caller to this function).
5395 For the case of dr_explicit_realign_optimized:
5396 The caller to this function uses the phi-result (msq) to create the
5397 realignment code inside the loop, and sets up the missing phi argument,
5398 as follows:
5399 loop:
5400 msq = phi (msq_init, lsq)
5401 lsq = *(floor(p')); # load in loop
5402 result = realign_load (msq, lsq, realignment_token);
5404 For the case of dr_explicit_realign:
5405 loop:
5406 msq = *(floor(p)); # load in loop
5407 p' = p + (VS-1);
5408 lsq = *(floor(p')); # load in loop
5409 result = realign_load (msq, lsq, realignment_token);
5411 Input:
5412 STMT - (scalar) load stmt to be vectorized. This load accesses
5413 a memory location that may be unaligned.
5414 BSI - place where new code is to be inserted.
5415 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5416 is used.
5418 Output:
5419 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5420 target hook, if defined.
5421 Return value - the result of the loop-header phi node. */
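/* Illustrative sketch only (not part of the compiler; the helper name is
   hypothetical): the effect of realign_load on plain arrays.  MSQ and LSQ
   are the two floor-aligned loads and OFS is the element misalignment that
   the realignment token encodes.  */

static void
realign_load_sketch (const int *msq, const int *lsq, unsigned int nelt,
                     unsigned int ofs, int *result)
{
  /* The realigned vector is the window of NELT elements starting at offset
     OFS within the concatenation MSQ ++ LSQ.  */
  for (unsigned int i = 0; i < nelt; i++)
    result[i] = (i + ofs < nelt) ? msq[i + ofs] : lsq[i + ofs - nelt];
}

/* E.g. with NELT == 4 and OFS == 1, MSQ = {a0, a1, a2, a3} and
   LSQ = {a4, a5, a6, a7} give RESULT = {a1, a2, a3, a4}: the unaligned
   vector that starts one element past the aligned address.  */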
5423 tree
5424 vect_setup_realignment (gimple *stmt, gimple_stmt_iterator *gsi,
5425 tree *realignment_token,
5426 enum dr_alignment_support alignment_support_scheme,
5427 tree init_addr,
5428 struct loop **at_loop)
5430 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5431 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5432 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5433 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
5434 struct loop *loop = NULL;
5435 edge pe = NULL;
5436 tree scalar_dest = gimple_assign_lhs (stmt);
5437 tree vec_dest;
5438 gimple *inc;
5439 tree ptr;
5440 tree data_ref;
5441 basic_block new_bb;
5442 tree msq_init = NULL_TREE;
5443 tree new_temp;
5444 gphi *phi_stmt;
5445 tree msq = NULL_TREE;
5446 gimple_seq stmts = NULL;
5447 bool inv_p;
5448 bool compute_in_loop = false;
5449 bool nested_in_vect_loop = false;
5450 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
5451 struct loop *loop_for_initial_load = NULL;
5453 if (loop_vinfo)
5455 loop = LOOP_VINFO_LOOP (loop_vinfo);
5456 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
5459 gcc_assert (alignment_support_scheme == dr_explicit_realign
5460 || alignment_support_scheme == dr_explicit_realign_optimized);
5462 /* We need to generate three things:
5463 1. the misalignment computation
5464 2. the extra vector load (for the optimized realignment scheme).
5465 3. the phi node for the two vectors from which the realignment is
5466 done (for the optimized realignment scheme). */
5468 /* 1. Determine where to generate the misalignment computation.
5470 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5471 calculation will be generated by this function, outside the loop (in the
5472 preheader). Otherwise, INIT_ADDR had already been computed for us by the
5473 caller, inside the loop.
5475 Background: If the misalignment remains fixed throughout the iterations of
5476 the loop, then both realignment schemes are applicable, and also the
5477 misalignment computation can be done outside LOOP. This is because we are
5478 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5479 are a multiple of VS (the Vector Size), and therefore the misalignment in
5480 different vectorized LOOP iterations is always the same.
5481 The problem arises only if the memory access is in an inner-loop nested
5482 inside LOOP, which is now being vectorized using outer-loop vectorization.
5483 This is the only case when the misalignment of the memory access may not
5484 remain fixed throughout the iterations of the inner-loop (as explained in
5485 detail in vect_supportable_dr_alignment). In this case, not only is the
5486 optimized realignment scheme not applicable, but also the misalignment
5487 computation (and generation of the realignment token that is passed to
5488 REALIGN_LOAD) have to be done inside the loop.
5490 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5491 or not, which in turn determines if the misalignment is computed inside
5492 the inner-loop, or outside LOOP. */
5494 if (init_addr != NULL_TREE || !loop_vinfo)
5496 compute_in_loop = true;
5497 gcc_assert (alignment_support_scheme == dr_explicit_realign);
5501 /* 2. Determine where to generate the extra vector load.
5503 For the optimized realignment scheme, instead of generating two vector
5504 loads in each iteration, we generate a single extra vector load in the
5505 preheader of the loop, and in each iteration reuse the result of the
5506 vector load from the previous iteration. In case the memory access is in
5507 an inner-loop nested inside LOOP, which is now being vectorized using
5508 outer-loop vectorization, we need to determine whether this initial vector
5509 load should be generated at the preheader of the inner-loop, or can be
5510 generated at the preheader of LOOP. If the memory access has no evolution
5511 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5512 to be generated inside LOOP (in the preheader of the inner-loop). */
5514 if (nested_in_vect_loop)
5516 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5517 bool invariant_in_outerloop =
5518 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5519 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5521 else
5522 loop_for_initial_load = loop;
5523 if (at_loop)
5524 *at_loop = loop_for_initial_load;
5526 if (loop_for_initial_load)
5527 pe = loop_preheader_edge (loop_for_initial_load);
5529 /* 3. For the case of the optimized realignment, create the first vector
5530 load at the loop preheader. */
5532 if (alignment_support_scheme == dr_explicit_realign_optimized)
5534 /* Create msq_init = *(floor(p1)) in the loop preheader */
5535 gassign *new_stmt;
5537 gcc_assert (!compute_in_loop);
5538 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5539 ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
5540 NULL_TREE, &init_addr, NULL, &inc,
5541 true, &inv_p);
5542 if (TREE_CODE (ptr) == SSA_NAME)
5543 new_temp = copy_ssa_name (ptr);
5544 else
5545 new_temp = make_ssa_name (TREE_TYPE (ptr));
5546 unsigned int align = DR_TARGET_ALIGNMENT (dr);
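/* Compute floor(p) by masking off the low address bits: for a target
   alignment of 16 bytes this ANDs the pointer with -16, yielding the
   aligned address at or below PTR.  */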
5547 new_stmt = gimple_build_assign
5548 (new_temp, BIT_AND_EXPR, ptr,
5549 build_int_cst (TREE_TYPE (ptr), -(HOST_WIDE_INT) align));
5550 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5551 gcc_assert (!new_bb);
5552 data_ref
5553 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5554 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5555 vect_copy_ref_info (data_ref, DR_REF (dr));
5556 new_stmt = gimple_build_assign (vec_dest, data_ref);
5557 new_temp = make_ssa_name (vec_dest, new_stmt);
5558 gimple_assign_set_lhs (new_stmt, new_temp);
5559 if (pe)
5561 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5562 gcc_assert (!new_bb);
5564 else
5565 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5567 msq_init = gimple_assign_lhs (new_stmt);
5570 /* 4. Create realignment token using a target builtin, if available.
5571 It is done either inside the containing loop, or before LOOP (as
5572 determined above). */
5574 if (targetm.vectorize.builtin_mask_for_load)
5576 gcall *new_stmt;
5577 tree builtin_decl;
5579 /* Compute INIT_ADDR - the initial address accessed by this memref. */
5580 if (!init_addr)
5582 /* Generate the INIT_ADDR computation outside LOOP. */
5583 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
5584 NULL_TREE);
5585 if (loop)
5587 pe = loop_preheader_edge (loop);
5588 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5589 gcc_assert (!new_bb);
5591 else
5592 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5595 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5596 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5597 vec_dest =
5598 vect_create_destination_var (scalar_dest,
5599 gimple_call_return_type (new_stmt));
5600 new_temp = make_ssa_name (vec_dest, new_stmt);
5601 gimple_call_set_lhs (new_stmt, new_temp);
5603 if (compute_in_loop)
5604 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5605 else
5607 /* Generate the misalignment computation outside LOOP. */
5608 pe = loop_preheader_edge (loop);
5609 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5610 gcc_assert (!new_bb);
5613 *realignment_token = gimple_call_lhs (new_stmt);
5615 /* The result of the CALL_EXPR to this builtin is determined from
5616 the value of the parameter and no global variables are touched,
5617 which makes the builtin a "const" function. Requiring the
5618 builtin to have the "const" attribute makes it unnecessary
5619 to call mark_call_clobbered. */
5620 gcc_assert (TREE_READONLY (builtin_decl));
5623 if (alignment_support_scheme == dr_explicit_realign)
5624 return msq;
5626 gcc_assert (!compute_in_loop);
5627 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5630 /* 5. Create msq = phi <msq_init, lsq> in loop */
5632 pe = loop_preheader_edge (containing_loop);
5633 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5634 msq = make_ssa_name (vec_dest);
5635 phi_stmt = create_phi_node (msq, containing_loop->header);
5636 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5638 return msq;
5642 /* Function vect_grouped_load_supported.
5644 COUNT is the size of the load group (the number of statements plus the
5645 number of gaps). SINGLE_ELEMENT_P is true if there is actually
5646 only one statement, with a gap of COUNT - 1.
5648 Returns true if a suitable permute exists. */
5650 bool
5651 vect_grouped_load_supported (tree vectype, bool single_element_p,
5652 unsigned HOST_WIDE_INT count)
5654 machine_mode mode = TYPE_MODE (vectype);
5656 /* If this is single-element interleaving with an element distance
5657 that leaves unused vector loads around, punt - we at least create
5658 very sub-optimal code in that case (and blow up memory,
5659 see PR65518). */
5660 if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype)))
5662 if (dump_enabled_p ())
5663 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5664 "single-element interleaving not supported "
5665 "for not adjacent vector loads\n");
5666 return false;
5669 /* vect_permute_load_chain requires the group size to be equal to 3 or
5670 be a power of two. */
5671 if (count != 3 && exact_log2 (count) == -1)
5673 if (dump_enabled_p ())
5674 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5675 "the size of the group of accesses"
5676 " is not a power of 2 or not equal to 3\n");
5677 return false;
5680 /* Check that the permutation is supported. */
5681 if (VECTOR_MODE_P (mode))
5683 unsigned int i, j;
5684 if (count == 3)
5686 unsigned int nelt;
5687 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5689 if (dump_enabled_p ())
5690 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5691 "cannot handle groups of 3 loads for"
5692 " variable-length vectors\n");
5693 return false;
5696 vec_perm_builder sel (nelt, nelt, 1);
5697 sel.quick_grow (nelt);
5698 vec_perm_indices indices;
5699 unsigned int k;
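/* Check the two permutations that vect_permute_load_chain will use for
   each K.  E.g. for NELT == 8 and K == 0 the first mask is
   {0, 3, 6, 9, 12, 15, 0, 0} (the trailing 0s are placeholders) and the
   second is {0, 1, 2, 3, 4, 5, 10, 13}.  */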
5700 for (k = 0; k < 3; k++)
5702 for (i = 0; i < nelt; i++)
5703 if (3 * i + k < 2 * nelt)
5704 sel[i] = 3 * i + k;
5705 else
5706 sel[i] = 0;
5707 indices.new_vector (sel, 2, nelt);
5708 if (!can_vec_perm_const_p (mode, indices))
5710 if (dump_enabled_p ())
5711 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5712 "shuffle of 3 loads is not supported by"
5713 " target\n");
5714 return false;
5716 for (i = 0, j = 0; i < nelt; i++)
5717 if (3 * i + k < 2 * nelt)
5718 sel[i] = i;
5719 else
5720 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5721 indices.new_vector (sel, 2, nelt);
5722 if (!can_vec_perm_const_p (mode, indices))
5724 if (dump_enabled_p ())
5725 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5726 "shuffle of 3 loads is not supported by"
5727 " target\n");
5728 return false;
5731 return true;
5733 else
5735 /* If length is not equal to 3 then only a power of 2 is supported. */
5736 gcc_assert (pow2p_hwi (count));
5737 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5739 /* The encoding has a single stepped pattern. */
5740 vec_perm_builder sel (nelt, 1, 3);
5741 sel.quick_grow (3);
5742 for (i = 0; i < 3; i++)
5743 sel[i] = i * 2;
5744 vec_perm_indices indices (sel, 2, nelt);
5745 if (can_vec_perm_const_p (mode, indices))
5747 for (i = 0; i < 3; i++)
5748 sel[i] = i * 2 + 1;
5749 indices.new_vector (sel, 2, nelt);
5750 if (can_vec_perm_const_p (mode, indices))
5751 return true;
5756 if (dump_enabled_p ())
5757 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5758 "extract even/odd not supported by target\n");
5759 return false;
5762 /* Return TRUE if vec_{masked_}load_lanes is available for COUNT vectors of
5763 type VECTYPE. MASKED_P says whether the masked form is needed. */
5765 bool
5766 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5767 bool masked_p)
5769 if (masked_p)
5770 return vect_lanes_optab_supported_p ("vec_mask_load_lanes",
5771 vec_mask_load_lanes_optab,
5772 vectype, count);
5773 else
5774 return vect_lanes_optab_supported_p ("vec_load_lanes",
5775 vec_load_lanes_optab,
5776 vectype, count);
5779 /* Function vect_permute_load_chain.
5781 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5782 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5783 the input data correctly. Return the final references for loads in
5784 RESULT_CHAIN.
5786 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5787 The input is 4 vectors each containing 8 elements. We assign a number to each
5788 element, the input sequence is:
5790 1st vec: 0 1 2 3 4 5 6 7
5791 2nd vec: 8 9 10 11 12 13 14 15
5792 3rd vec: 16 17 18 19 20 21 22 23
5793 4th vec: 24 25 26 27 28 29 30 31
5795 The output sequence should be:
5797 1st vec: 0 4 8 12 16 20 24 28
5798 2nd vec: 1 5 9 13 17 21 25 29
5799 3rd vec: 2 6 10 14 18 22 26 30
5800 4th vec: 3 7 11 15 19 23 27 31
5802 i.e., the first output vector should contain the first elements of each
5803 interleaving group, etc.
5805 We use extract_even/odd instructions to create such output. The input of
5806 each extract_even/odd operation is two vectors
5807 1st vec 2nd vec
5808 0 1 2 3 4 5 6 7
5810 and the output is the vector of extracted even/odd elements. The output of
5811 extract_even will be: 0 2 4 6
5812 and of extract_odd: 1 3 5 7
5815 The permutation is done in log LENGTH stages. In each stage extract_even
5816 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5817 their order. In our example,
5819 E1: extract_even (1st vec, 2nd vec)
5820 E2: extract_odd (1st vec, 2nd vec)
5821 E3: extract_even (3rd vec, 4th vec)
5822 E4: extract_odd (3rd vec, 4th vec)
5824 The output for the first stage will be:
5826 E1: 0 2 4 6 8 10 12 14
5827 E2: 1 3 5 7 9 11 13 15
5828 E3: 16 18 20 22 24 26 28 30
5829 E4: 17 19 21 23 25 27 29 31
5831 In order to proceed and create the correct sequence for the next stage (or
5832 for the correct output, if the second stage is the last one, as in our
5833 example), we first put the output of extract_even operation and then the
5834 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5835 The input for the second stage is:
5837 1st vec (E1): 0 2 4 6 8 10 12 14
5838 2nd vec (E3): 16 18 20 22 24 26 28 30
5839 3rd vec (E2): 1 3 5 7 9 11 13 15
5840 4th vec (E4): 17 19 21 23 25 27 29 31
5842 The output of the second stage:
5844 E1: 0 4 8 12 16 20 24 28
5845 E2: 2 6 10 14 18 22 26 30
5846 E3: 1 5 9 13 17 21 25 29
5847 E4: 3 7 11 15 19 23 27 31
5849 And RESULT_CHAIN after reordering:
5851 1st vec (E1): 0 4 8 12 16 20 24 28
5852 2nd vec (E3): 1 5 9 13 17 21 25 29
5853 3rd vec (E2): 2 6 10 14 18 22 26 30
5854 4th vec (E4): 3 7 11 15 19 23 27 31. */
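/* Illustrative sketch only (not part of the vectorizer; the helper name is
   hypothetical): the extract_even/extract_odd operations above, modelled on
   plain arrays of NELT elements.  It mirrors the VEC_PERM_EXPR masks
   {0, 2, 4, ...} and {1, 3, 5, ...} built below.  */

static void
extract_even_odd_sketch (const int *v1, const int *v2, unsigned int nelt,
                         int *even, int *odd)
{
  for (unsigned int i = 0; i < nelt / 2; i++)
    {
      /* Even-indexed elements of the concatenation V1 ++ V2...  */
      even[i] = v1[2 * i];
      even[nelt / 2 + i] = v2[2 * i];
      /* ...and the odd-indexed ones.  */
      odd[i] = v1[2 * i + 1];
      odd[nelt / 2 + i] = v2[2 * i + 1];
    }
}

/* With the inputs of the example (v1 = {0, ..., 7}, v2 = {8, ..., 15}) this
   yields EVEN = {0, 2, 4, 6, 8, 10, 12, 14} and ODD = {1, 3, 5, 7, 9, 11,
   13, 15}, i.e. E1 and E2 above.  */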
5856 static void
5857 vect_permute_load_chain (vec<tree> dr_chain,
5858 unsigned int length,
5859 gimple *stmt,
5860 gimple_stmt_iterator *gsi,
5861 vec<tree> *result_chain)
5863 tree data_ref, first_vect, second_vect;
5864 tree perm_mask_even, perm_mask_odd;
5865 tree perm3_mask_low, perm3_mask_high;
5866 gimple *perm_stmt;
5867 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5868 unsigned int i, j, log_length = exact_log2 (length);
5870 result_chain->quick_grow (length);
5871 memcpy (result_chain->address (), dr_chain.address (),
5872 length * sizeof (tree));
5874 if (length == 3)
5876 /* vect_grouped_load_supported ensures that this is constant. */
5877 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5878 unsigned int k;
5880 vec_perm_builder sel (nelt, nelt, 1);
5881 sel.quick_grow (nelt);
5882 vec_perm_indices indices;
5883 for (k = 0; k < 3; k++)
5885 for (i = 0; i < nelt; i++)
5886 if (3 * i + k < 2 * nelt)
5887 sel[i] = 3 * i + k;
5888 else
5889 sel[i] = 0;
5890 indices.new_vector (sel, 2, nelt);
5891 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5893 for (i = 0, j = 0; i < nelt; i++)
5894 if (3 * i + k < 2 * nelt)
5895 sel[i] = i;
5896 else
5897 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5898 indices.new_vector (sel, 2, nelt);
5899 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
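/* For example, for NELT == 8 and K == 0, PERM3_MASK_LOW is
   {0, 3, 6, 9, 12, 15, 0, 0} (the trailing 0s are placeholders) and
   PERM3_MASK_HIGH is {0, 1, 2, 3, 4, 5, 10, 13}, which keeps the elements
   gathered so far and fills the tail from the third vector.  */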
5901 first_vect = dr_chain[0];
5902 second_vect = dr_chain[1];
5904 /* Create interleaving stmt (low part of):
5905 low = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5906 ...}> */
5907 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5908 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5909 second_vect, perm3_mask_low);
5910 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5912 /* Create interleaving stmt (high part of):
5913 high = VEC_PERM_EXPR <first_vect, second_vect, {0, 1, 2, ...,
5914 nelt + ((nelt + k) % 3), nelt + ((nelt + k) % 3) + 3, ...}> */
5915 first_vect = data_ref;
5916 second_vect = dr_chain[2];
5917 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5918 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5919 second_vect, perm3_mask_high);
5920 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5921 (*result_chain)[k] = data_ref;
5924 else
5926 /* If length is not equal to 3 then only a power of 2 is supported. */
5927 gcc_assert (pow2p_hwi (length));
5929 /* The encoding has a single stepped pattern. */
5930 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5931 vec_perm_builder sel (nelt, 1, 3);
5932 sel.quick_grow (3);
5933 for (i = 0; i < 3; ++i)
5934 sel[i] = i * 2;
5935 vec_perm_indices indices (sel, 2, nelt);
5936 perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);
5938 for (i = 0; i < 3; ++i)
5939 sel[i] = i * 2 + 1;
5940 indices.new_vector (sel, 2, nelt);
5941 perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
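/* For example, for NELT == 8 these are the masks from the comment above:
   PERM_MASK_EVEN selects {0, 2, 4, 6, 8, 10, 12, 14} and PERM_MASK_ODD
   selects {1, 3, 5, 7, 9, 11, 13, 15} from the concatenation of the two
   input vectors.  */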
5943 for (i = 0; i < log_length; i++)
5945 for (j = 0; j < length; j += 2)
5947 first_vect = dr_chain[j];
5948 second_vect = dr_chain[j+1];
5950 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5951 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5952 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5953 first_vect, second_vect,
5954 perm_mask_even);
5955 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5956 (*result_chain)[j/2] = data_ref;
5958 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5959 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5960 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5961 first_vect, second_vect,
5962 perm_mask_odd);
5963 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5964 (*result_chain)[j/2+length/2] = data_ref;
5966 memcpy (dr_chain.address (), result_chain->address (),
5967 length * sizeof (tree));
5972 /* Function vect_shift_permute_load_chain.
5974 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
5975 a sequence of stmts to reorder the input data accordingly.
5976 Return the final references for loads in RESULT_CHAIN.
5977 Return true if successful, false otherwise.
5979 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5980 The input is 3 vectors each containing 8 elements. We assign a
5981 number to each element, the input sequence is:
5983 1st vec: 0 1 2 3 4 5 6 7
5984 2nd vec: 8 9 10 11 12 13 14 15
5985 3rd vec: 16 17 18 19 20 21 22 23
5987 The output sequence should be:
5989 1st vec: 0 3 6 9 12 15 18 21
5990 2nd vec: 1 4 7 10 13 16 19 22
5991 3rd vec: 2 5 8 11 14 17 20 23
5993 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
5995 First we shuffle all 3 vectors to get correct elements order:
5997 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
5998 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
5999 3rd vec: (16 19 22) (17 20 23) (18 21)
6001 Next we unite and shift vector 3 times:
6003 1st step:
6004 shift right by 6 the concatenation of:
6005 "1st vec" and "2nd vec"
6006 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
6007 "2nd vec" and "3rd vec"
6008 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
6009 "3rd vec" and "1st vec"
6010 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
6011 | New vectors |
6013 So that now new vectors are:
6015 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
6016 2nd vec: (10 13) (16 19 22) (17 20 23)
6017 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
6019 2nd step:
6020 shift right by 5 the concatenation of:
6021 "1st vec" and "3rd vec"
6022 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
6023 "2nd vec" and "1st vec"
6024 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
6025 "3rd vec" and "2nd vec"
6026 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
6027 | New vectors |
6029 So that now new vectors are:
6031 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
6032 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
6033 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
6035 3rd step:
6036 shift right by 5 the concatenation of:
6037 "1st vec" and "1st vec"
6038 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
6039 shift right by 3 the concatenation of:
6040 "2nd vec" and "2nd vec"
6041 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
6042 | New vectors |
6044 So that now all vectors are READY:
6045 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
6046 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
6047 3rd vec: ( 1 4 7) (10 13) (16 19 22)
6049 This algorithm is faster than the one in vect_permute_load_chain if:
6050 1. "shift of a concatenation" is faster than general permutation.
6051 This is usually so.
6052 2. The TARGET machine can't execute vector instructions in parallel.
6053 This is because each step of the algorithm depends on the previous one.
6054 The algorithm in vect_permute_load_chain is much more parallel.
6056 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
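/* Illustrative sketch only (not part of the vectorizer; the helper name is
   hypothetical): each "shift right by N of a concatenation" step above is a
   single VEC_PERM_EXPR whose mask is {N, N + 1, ..., N + NELT - 1}, i.e. it
   selects an NELT-element window of the concatenation V1 ++ V2.  */

static void
shift_concat_sketch (const int *v1, const int *v2, unsigned int nelt,
                     unsigned int n, int *result)
{
  for (unsigned int i = 0; i < nelt; i++)
    result[i] = (n + i < nelt) ? v1[n + i] : v2[n + i - nelt];
}

/* With NELT == 8 and N == 6, applied to the first two shuffled vectors of
   the example, {0, 3, 6, 1, 4, 7, 2, 5} and {8, 11, 14, 9, 12, 15, 10, 13},
   the result is {2, 5, 8, 11, 14, 9, 12, 15}: the new 1st vec of the
   1st step.  */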
6059 static bool
6060 vect_shift_permute_load_chain (vec<tree> dr_chain,
6061 unsigned int length,
6062 gimple *stmt,
6063 gimple_stmt_iterator *gsi,
6064 vec<tree> *result_chain)
6066 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
6067 tree perm2_mask1, perm2_mask2, perm3_mask;
6068 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
6069 gimple *perm_stmt;
6071 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
6072 unsigned int i;
6073 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6074 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6076 unsigned HOST_WIDE_INT nelt, vf;
6077 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt)
6078 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
6079 /* Not supported for variable-length vectors. */
6080 return false;
6082 vec_perm_builder sel (nelt, nelt, 1);
6083 sel.quick_grow (nelt);
6085 result_chain->quick_grow (length);
6086 memcpy (result_chain->address (), dr_chain.address (),
6087 length * sizeof (tree));
6089 if (pow2p_hwi (length) && vf > 4)
6091 unsigned int j, log_length = exact_log2 (length);
6092 for (i = 0; i < nelt / 2; ++i)
6093 sel[i] = i * 2;
6094 for (i = 0; i < nelt / 2; ++i)
6095 sel[nelt / 2 + i] = i * 2 + 1;
6096 vec_perm_indices indices (sel, 2, nelt);
6097 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6099 if (dump_enabled_p ())
6100 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6101 "shuffle of 2 fields structure is not \
6102 supported by target\n");
6103 return false;
6105 perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);
6107 for (i = 0; i < nelt / 2; ++i)
6108 sel[i] = i * 2 + 1;
6109 for (i = 0; i < nelt / 2; ++i)
6110 sel[nelt / 2 + i] = i * 2;
6111 indices.new_vector (sel, 2, nelt);
6112 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6114 if (dump_enabled_p ())
6115 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6116 "shuffle of 2 fields structure is not \
6117 supported by target\n");
6118 return false;
6120 perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);
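/* For example, for NELT == 8, PERM2_MASK1 is {0, 2, 4, 6, 1, 3, 5, 7} and
   PERM2_MASK2 is {1, 3, 5, 7, 0, 2, 4, 6}: each gathers the even and odd
   fields of one input vector into its two halves (even then odd for the
   first vector, odd then even for the second).  */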
6122 /* Generating permutation constant to shift all elements.
6123 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
6124 for (i = 0; i < nelt; i++)
6125 sel[i] = nelt / 2 + i;
6126 indices.new_vector (sel, 2, nelt);
6127 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6129 if (dump_enabled_p ())
6130 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6131 "shift permutation is not supported by target\n");
6132 return false;
6134 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6136 /* Generating permutation constant to select vector from 2.
6137 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
6138 for (i = 0; i < nelt / 2; i++)
6139 sel[i] = i;
6140 for (i = nelt / 2; i < nelt; i++)
6141 sel[i] = nelt + i;
6142 indices.new_vector (sel, 2, nelt);
6143 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6145 if (dump_enabled_p ())
6146 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6147 "select is not supported by target\n");
6148 return false;
6150 select_mask = vect_gen_perm_mask_checked (vectype, indices);
6152 for (i = 0; i < log_length; i++)
6154 for (j = 0; j < length; j += 2)
6156 first_vect = dr_chain[j];
6157 second_vect = dr_chain[j + 1];
6159 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6160 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6161 first_vect, first_vect,
6162 perm2_mask1);
6163 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6164 vect[0] = data_ref;
6166 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6167 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6168 second_vect, second_vect,
6169 perm2_mask2);
6170 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6171 vect[1] = data_ref;
6173 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
6174 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6175 vect[0], vect[1], shift1_mask);
6176 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6177 (*result_chain)[j/2 + length/2] = data_ref;
6179 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
6180 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6181 vect[0], vect[1], select_mask);
6182 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6183 (*result_chain)[j/2] = data_ref;
6185 memcpy (dr_chain.address (), result_chain->address (),
6186 length * sizeof (tree));
6188 return true;
6190 if (length == 3 && vf > 2)
6192 unsigned int k = 0, l = 0;
6194 /* Generating permutation constant to get all elements in right order.
6195 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
6196 for (i = 0; i < nelt; i++)
6198 if (3 * k + (l % 3) >= nelt)
6200 k = 0;
6201 l += (3 - (nelt % 3));
6203 sel[i] = 3 * k + (l % 3);
6204 k++;
6206 vec_perm_indices indices (sel, 2, nelt);
6207 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6209 if (dump_enabled_p ())
6210 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6211 "shuffle of 3 fields structure is not \
6212 supported by target\n");
6213 return false;
6215 perm3_mask = vect_gen_perm_mask_checked (vectype, indices);
6217 /* Generating permutation constant to shift all elements.
6218 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
6219 for (i = 0; i < nelt; i++)
6220 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
6221 indices.new_vector (sel, 2, nelt);
6222 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6224 if (dump_enabled_p ())
6225 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6226 "shift permutation is not supported by target\n");
6227 return false;
6229 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6231 /* Generating permutation constant to shift all elements.
6232 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6233 for (i = 0; i < nelt; i++)
6234 sel[i] = 2 * (nelt / 3) + 1 + i;
6235 indices.new_vector (sel, 2, nelt);
6236 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6238 if (dump_enabled_p ())
6239 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6240 "shift permutation is not supported by target\n");
6241 return false;
6243 shift2_mask = vect_gen_perm_mask_checked (vectype, indices);
6245 /* Generating permutation constant to shift all elements.
6246 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
6247 for (i = 0; i < nelt; i++)
6248 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
6249 indices.new_vector (sel, 2, nelt);
6250 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6252 if (dump_enabled_p ())
6253 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6254 "shift permutation is not supported by target\n");
6255 return false;
6257 shift3_mask = vect_gen_perm_mask_checked (vectype, indices);
6259 /* Generating permutation constant to shift all elements.
6260 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6261 for (i = 0; i < nelt; i++)
6262 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
6263 indices.new_vector (sel, 2, nelt);
6264 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6266 if (dump_enabled_p ())
6267 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6268 "shift permutation is not supported by target\n");
6269 return false;
6271 shift4_mask = vect_gen_perm_mask_checked (vectype, indices);
6273 for (k = 0; k < 3; k++)
6275 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
6276 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6277 dr_chain[k], dr_chain[k],
6278 perm3_mask);
6279 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6280 vect[k] = data_ref;
6283 for (k = 0; k < 3; k++)
6285 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
6286 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6287 vect[k % 3], vect[(k + 1) % 3],
6288 shift1_mask);
6289 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6290 vect_shift[k] = data_ref;
6293 for (k = 0; k < 3; k++)
6295 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
6296 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6297 vect_shift[(4 - k) % 3],
6298 vect_shift[(3 - k) % 3],
6299 shift2_mask);
6300 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6301 vect[k] = data_ref;
6304 (*result_chain)[3 - (nelt % 3)] = vect[2];
6306 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
6307 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
6308 vect[0], shift3_mask);
6309 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6310 (*result_chain)[nelt % 3] = data_ref;
6312 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
6313 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
6314 vect[1], shift4_mask);
6315 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6316 (*result_chain)[0] = data_ref;
6317 return true;
6319 return false;
6322 /* Function vect_transform_grouped_load.
6324 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
6325 to perform their permutation and ascribe the resulting vectorized statements to
6326 the scalar statements.
6329 void
6330 vect_transform_grouped_load (gimple *stmt, vec<tree> dr_chain, int size,
6331 gimple_stmt_iterator *gsi)
6333 machine_mode mode;
6334 vec<tree> result_chain = vNULL;
6336 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
6337 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
6338 vectors, that are ready for vector computation. */
6339 result_chain.create (size);
6341 /* If the reassociation width for the vector type is 2 or greater, the target
6342 machine can execute 2 or more vector instructions in parallel. Otherwise try
6343 to get the chain for the load group using vect_shift_permute_load_chain.
6344 mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
6345 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
6346 || pow2p_hwi (size)
6347 || !vect_shift_permute_load_chain (dr_chain, size, stmt,
6348 gsi, &result_chain))
6349 vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
6350 vect_record_grouped_load_vectors (stmt, result_chain);
6351 result_chain.release ();
6354 /* RESULT_CHAIN contains the output of a group of grouped loads that were
6355 generated as part of the vectorization of STMT. Assign the statement
6356 for each vector to the associated scalar statement. */
6358 void
6359 vect_record_grouped_load_vectors (gimple *stmt, vec<tree> result_chain)
6361 gimple *first_stmt = DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
6362 gimple *next_stmt, *new_stmt;
6363 unsigned int i, gap_count;
6364 tree tmp_data_ref;
6366 /* Put a permuted data-ref in the VECTORIZED_STMT field.
6367 Since we scan the chain starting from its first node, their order
6368 corresponds to the order of data-refs in RESULT_CHAIN. */
6369 next_stmt = first_stmt;
6370 gap_count = 1;
6371 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
6373 if (!next_stmt)
6374 break;
6376 /* Skip the gaps. Loads created for the gaps will be removed by dead
6377 code elimination pass later. No need to check for the first stmt in
6378 the group, since it always exists.
6379 DR_GROUP_GAP is the number of steps in elements from the previous
6380 access (if there is no gap DR_GROUP_GAP is 1). We skip loads that
6381 correspond to the gaps. */
6382 if (next_stmt != first_stmt
6383 && gap_count < DR_GROUP_GAP (vinfo_for_stmt (next_stmt)))
6385 gap_count++;
6386 continue;
6389 while (next_stmt)
6391 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
6392 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
6393 copies, and we put the new vector statement in the first available
6394 RELATED_STMT. */
6395 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
6396 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
6397 else
6399 if (!DR_GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
6401 gimple *prev_stmt =
6402 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
6403 gimple *rel_stmt =
6404 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
6405 while (rel_stmt)
6407 prev_stmt = rel_stmt;
6408 rel_stmt =
6409 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
6412 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
6413 new_stmt;
6417 next_stmt = DR_GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
6418 gap_count = 1;
6419 /* If NEXT_STMT accesses the same DR as the previous statement,
6420 put the same TMP_DATA_REF as its vectorized statement; otherwise
6421 get the next data-ref from RESULT_CHAIN. */
6422 if (!next_stmt || !DR_GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
6423 break;
6428 /* Function vect_can_force_dr_alignment_p.
6430 Returns whether the alignment of a DECL can be forced to be aligned
6431 on an ALIGNMENT-bit boundary. */
6433 bool
6434 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
6436 if (!VAR_P (decl))
6437 return false;
6439 if (decl_in_symtab_p (decl)
6440 && !symtab_node::get (decl)->can_increase_alignment_p ())
6441 return false;
6443 if (TREE_STATIC (decl))
6444 return (alignment <= MAX_OFILE_ALIGNMENT);
6445 else
6446 return (alignment <= MAX_STACK_ALIGNMENT);
6450 /* Return whether the data reference DR is supported with respect to its
6451 alignment.
6452 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
6453 it is aligned, i.e., check if it is possible to vectorize it with different
6454 alignment. */
6456 enum dr_alignment_support
6457 vect_supportable_dr_alignment (struct data_reference *dr,
6458 bool check_aligned_accesses)
6460 gimple *stmt = vect_dr_stmt (dr);
6461 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6462 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6463 machine_mode mode = TYPE_MODE (vectype);
6464 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6465 struct loop *vect_loop = NULL;
6466 bool nested_in_vect_loop = false;
6468 if (aligned_access_p (dr) && !check_aligned_accesses)
6469 return dr_aligned;
6471 /* For now assume all conditional loads/stores support unaligned
6472 access without any special code. */
6473 if (is_gimple_call (stmt)
6474 && gimple_call_internal_p (stmt)
6475 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
6476 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
6477 return dr_unaligned_supported;
6479 if (loop_vinfo)
6481 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
6482 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
6485 /* Possibly unaligned access. */
6487 /* We can choose between using the implicit realignment scheme (generating
6488 a misaligned_move stmt) and the explicit realignment scheme (generating
6489 aligned loads with a REALIGN_LOAD). There are two variants to the
6490 explicit realignment scheme: optimized, and unoptimized.
6491 We can optimize the realignment only if the step between consecutive
6492 vector loads is equal to the vector size. Since the vector memory
6493 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
6494 is guaranteed that the misalignment amount remains the same throughout the
6495 execution of the vectorized loop. Therefore, we can create the
6496 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
6497 at the loop preheader.
6499 However, in the case of outer-loop vectorization, when vectorizing a
6500 memory access in the inner-loop nested within the LOOP that is now being
6501 vectorized, while it is guaranteed that the misalignment of the
6502 vectorized memory access will remain the same in different outer-loop
6503 iterations, it is *not* guaranteed that it will remain the same throughout
6504 the execution of the inner-loop. This is because the inner-loop advances
6505 with the original scalar step (and not in steps of VS). If the inner-loop
6506 step happens to be a multiple of VS, then the misalignment remains fixed
6507 and we can use the optimized realignment scheme. For example:
6509 for (i=0; i<N; i++)
6510 for (j=0; j<M; j++)
6511 s += a[i+j];
6513 When vectorizing the i-loop in the above example, the step between
6514 consecutive vector loads is 1, and so the misalignment does not remain
6515 fixed across the execution of the inner-loop, and the realignment cannot
6516 be optimized (as illustrated in the following pseudo vectorized loop):
6518 for (i=0; i<N; i+=4)
6519 for (j=0; j<M; j++){
6520 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
6521 // when j is {0,1,2,3,4,5,6,7,...} respectively.
6522 // (assuming that we start from an aligned address).
6525 We therefore have to use the unoptimized realignment scheme:
6527 for (i=0; i<N; i+=4)
6528 for (j=k; j<M; j+=4)
6529 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
6530 // that the misalignment of the initial address is
6531 // 0).
6533 The loop can then be vectorized as follows:
6535 for (k=0; k<4; k++){
6536 rt = get_realignment_token (&vp[k]);
6537 for (i=0; i<N; i+=4){
6538 v1 = vp[i+k];
6539 for (j=k; j<M; j+=4){
6540 v2 = vp[i+j+VS-1];
6541 va = REALIGN_LOAD <v1,v2,rt>;
6542 vs += va;
6543 v1 = v2;
6546 } */
6548 if (DR_IS_READ (dr))
6550 bool is_packed = false;
6551 tree type = (TREE_TYPE (DR_REF (dr)));
6553 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
6554 && (!targetm.vectorize.builtin_mask_for_load
6555 || targetm.vectorize.builtin_mask_for_load ()))
6557 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6559 /* If we are doing SLP then the accesses need not have the
6560 same alignment, instead it depends on the SLP group size. */
6561 if (loop_vinfo
6562 && STMT_SLP_TYPE (stmt_info)
6563 && !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6564 * DR_GROUP_SIZE (vinfo_for_stmt
6565 (DR_GROUP_FIRST_ELEMENT (stmt_info))),
6566 TYPE_VECTOR_SUBPARTS (vectype)))
6568 else if (!loop_vinfo
6569 || (nested_in_vect_loop
6570 && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
6571 GET_MODE_SIZE (TYPE_MODE (vectype)))))
6572 return dr_explicit_realign;
6573 else
6574 return dr_explicit_realign_optimized;
6576 if (!known_alignment_for_access_p (dr))
6577 is_packed = not_size_aligned (DR_REF (dr));
6579 if (targetm.vectorize.support_vector_misalignment
6580 (mode, type, DR_MISALIGNMENT (dr), is_packed))
6581 /* Can't software pipeline the loads, but can at least do them. */
6582 return dr_unaligned_supported;
6584 else
6586 bool is_packed = false;
6587 tree type = (TREE_TYPE (DR_REF (dr)));
6589 if (!known_alignment_for_access_p (dr))
6590 is_packed = not_size_aligned (DR_REF (dr));
6592 if (targetm.vectorize.support_vector_misalignment
6593 (mode, type, DR_MISALIGNMENT (dr), is_packed))
6594 return dr_unaligned_supported;
6597 /* Unsupported. */
6598 return dr_unaligned_unsupported;