PR tree-optimization/84740
gcc/tree-vect-data-refs.c (blob c0094989c7ba7e53353dc94f6773c4d3d1da38c0)
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2018 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "memmodel.h"
32 #include "tm_p.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "cgraph.h"
36 #include "dumpfile.h"
37 #include "alias.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "tree-eh.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop.h"
47 #include "cfgloop.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
50 #include "expr.h"
51 #include "builtins.h"
52 #include "params.h"
53 #include "tree-cfg.h"
54 #include "tree-hash-traits.h"
55 #include "vec-perm-indices.h"
56 #include "internal-fn.h"
58 /* Return true if load- or store-lanes optab OPTAB is implemented for
59 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
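/* For example, with COUNT == 3 and a V4SI VECTYPE this asks whether the
   target provides a mode for an array of three V4SI vectors and a
   convert optab handler between that array mode and V4SI (e.g. the
   AArch64 ld3/st3 structure loads and stores).  */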
61 static bool
62 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
63 tree vectype, unsigned HOST_WIDE_INT count)
65 machine_mode mode, array_mode;
66 bool limit_p;
68 mode = TYPE_MODE (vectype);
69 if (!targetm.array_mode (mode, count).exists (&array_mode))
71 poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
72 limit_p = !targetm.array_mode_supported_p (mode, count);
73 if (!int_mode_for_size (bits, limit_p).exists (&array_mode))
75 if (dump_enabled_p ())
76 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
77 "no array mode for %s["
78 HOST_WIDE_INT_PRINT_DEC "]\n",
79 GET_MODE_NAME (mode), count);
80 return false;
84 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
86 if (dump_enabled_p ())
87 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
88 "cannot use %s<%s><%s>\n", name,
89 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
90 return false;
93 if (dump_enabled_p ())
94 dump_printf_loc (MSG_NOTE, vect_location,
95 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
96 GET_MODE_NAME (mode));
98 return true;
102 /* Return the smallest scalar part of STMT.
103 This is used to determine the vectype of the stmt. We generally set the
104 vectype according to the type of the result (lhs). For stmts whose
105 result-type is different than the type of the arguments (e.g., demotion,
106 promotion), vectype will be reset appropriately (later). Note that we have
107 to visit the smallest datatype in this function, because that determines the
108 VF. If the smallest datatype in the loop is present only as the rhs of a
109 promotion operation - we'd miss it.
110 Such a case, where a variable of this datatype does not appear in the lhs
111 anywhere in the loop, can only occur if it's an invariant: e.g.:
112 'int_x = (int) short_inv', which we'd expect to have been optimized away by
113 invariant motion. However, we cannot rely on invariant motion to always
114 take invariants out of the loop, and so in the case of promotion we also
115 have to check the rhs.
116 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
117 types. */
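/* For example, for the widening statement 'int_x = (int) short_y' the
   result type is 4 bytes but the rhs type is 2 bytes, so the 2-byte
   type is returned and the vectorization factor is derived from it.  */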
119 tree
120 vect_get_smallest_scalar_type (gimple *stmt, HOST_WIDE_INT *lhs_size_unit,
121 HOST_WIDE_INT *rhs_size_unit)
123 tree scalar_type = gimple_expr_type (stmt);
124 HOST_WIDE_INT lhs, rhs;
126 /* During the analysis phase, this function is called on arbitrary
127 statements that might not have scalar results. */
128 if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
129 return scalar_type;
131 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
133 if (is_gimple_assign (stmt)
134 && (gimple_assign_cast_p (stmt)
135 || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
136 || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
137 || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
139 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
141 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
142 if (rhs < lhs)
143 scalar_type = rhs_type;
146 *lhs_size_unit = lhs;
147 *rhs_size_unit = rhs;
148 return scalar_type;
152 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
153 tested at run-time. Return TRUE if DDR was successfully inserted.
154 Return false if versioning is not supported. */
156 static bool
157 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
159 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
161 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
162 return false;
164 if (!runtime_alias_check_p (ddr, loop,
165 optimize_loop_nest_for_speed_p (loop)))
166 return false;
168 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
169 return true;
172 /* Record that loop LOOP_VINFO needs to check that VALUE is nonzero. */
174 static void
175 vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value)
177 vec<tree> checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo);
 178   for (unsigned int i = 0; i < checks.length (); ++i)
179 if (checks[i] == value)
180 return;
182 if (dump_enabled_p ())
184 dump_printf_loc (MSG_NOTE, vect_location, "need run-time check that ");
185 dump_generic_expr (MSG_NOTE, TDF_SLIM, value);
186 dump_printf (MSG_NOTE, " is nonzero\n");
188 LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value);
191 /* Return true if we know that the order of vectorized STMT_A and
192 vectorized STMT_B will be the same as the order of STMT_A and STMT_B.
193 At least one of the statements is a write. */
195 static bool
196 vect_preserves_scalar_order_p (gimple *stmt_a, gimple *stmt_b)
198 stmt_vec_info stmtinfo_a = vinfo_for_stmt (stmt_a);
199 stmt_vec_info stmtinfo_b = vinfo_for_stmt (stmt_b);
201 /* Single statements are always kept in their original order. */
202 if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
203 && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
204 return true;
206 /* STMT_A and STMT_B belong to overlapping groups. All loads in a
207 group are emitted at the position of the first scalar load and all
208 stores in a group are emitted at the position of the last scalar store.
209 Thus writes will happen no earlier than their current position
210 (but could happen later) while reads will happen no later than their
211 current position (but could happen earlier). Reordering is therefore
212 only possible if the first access is a write. */
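/* For example, if the earlier access is a grouped load and the later
   one a store, hoisting the load and sinking the store cannot swap
   them, so the scalar order is preserved; if the earlier access is a
   write, it may be sunk past a later read, so we must assume the order
   can change.  */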
213 gimple *earlier_stmt = get_earlier_stmt (stmt_a, stmt_b);
214 return !DR_IS_WRITE (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt)));
217 /* A subroutine of vect_analyze_data_ref_dependence. Handle
218 DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
219 distances. These distances are conservatively correct but they don't
220 reflect a guaranteed dependence.
222 Return true if this function does all the work necessary to avoid
223 an alias or false if the caller should use the dependence distances
224 to limit the vectorization factor in the usual way. LOOP_DEPTH is
225 the depth of the loop described by LOOP_VINFO and the other arguments
226 are as for vect_analyze_data_ref_dependence. */
228 static bool
229 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
230 loop_vec_info loop_vinfo,
231 int loop_depth, unsigned int *max_vf)
233 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
234 lambda_vector dist_v;
235 unsigned int i;
236 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
238 int dist = dist_v[loop_depth];
239 if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
241 /* If the user asserted safelen >= DIST consecutive iterations
242 can be executed concurrently, assume independence.
244 ??? An alternative would be to add the alias check even
245 in this case, and vectorize the fallback loop with the
246 maximum VF set to safelen. However, if the user has
247 explicitly given a length, it's less likely that that
248 would be a win. */
249 if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
251 if ((unsigned int) loop->safelen < *max_vf)
252 *max_vf = loop->safelen;
253 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
254 continue;
257 /* For dependence distances of 2 or more, we have the option
258 of limiting VF or checking for an alias at runtime.
259 Prefer to check at runtime if we can, to avoid limiting
260 the VF unnecessarily when the bases are in fact independent.
262 Note that the alias checks will be removed if the VF ends up
263 being small enough. */
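/* For example, a conservative distance of 2 would otherwise cap MAX_VF
   at 2; versioning with a runtime alias check keeps the full
   vectorization factor when the bases turn out to be independent.  */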
264 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
267 return true;
271 /* Function vect_analyze_data_ref_dependence.
273 Return TRUE if there (might) exist a dependence between a memory-reference
 274    DRA and a memory-reference DRB.  When versioning for alias can check the
 275    dependence at run-time, return FALSE.  Adjust *MAX_VF according to
276 the data dependence. */
278 static bool
279 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
280 loop_vec_info loop_vinfo,
281 unsigned int *max_vf)
283 unsigned int i;
284 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
285 struct data_reference *dra = DDR_A (ddr);
286 struct data_reference *drb = DDR_B (ddr);
287 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
288 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
289 lambda_vector dist_v;
290 unsigned int loop_depth;
292 /* In loop analysis all data references should be vectorizable. */
293 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
294 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
295 gcc_unreachable ();
297 /* Independent data accesses. */
298 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
299 return false;
301 if (dra == drb
302 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
303 return false;
305 /* We do not have to consider dependences between accesses that belong
306 to the same group. */
307 if (GROUP_FIRST_ELEMENT (stmtinfo_a)
308 && GROUP_FIRST_ELEMENT (stmtinfo_a) == GROUP_FIRST_ELEMENT (stmtinfo_b))
309 return false;
311 /* Even if we have an anti-dependence then, as the vectorized loop covers at
312 least two scalar iterations, there is always also a true dependence.
313 As the vectorizer does not re-order loads and stores we can ignore
314 the anti-dependence if TBAA can disambiguate both DRs similar to the
315 case with known negative distance anti-dependences (positive
316 distance anti-dependences would violate TBAA constraints). */
317 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
318 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
319 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
320 get_alias_set (DR_REF (drb))))
321 return false;
323 /* Unknown data dependence. */
324 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
326 /* If user asserted safelen consecutive iterations can be
327 executed concurrently, assume independence. */
328 if (loop->safelen >= 2)
330 if ((unsigned int) loop->safelen < *max_vf)
331 *max_vf = loop->safelen;
332 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
333 return false;
336 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
337 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
339 if (dump_enabled_p ())
341 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
342 "versioning for alias not supported for: "
343 "can't determine dependence between ");
344 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
345 DR_REF (dra));
346 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
347 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
348 DR_REF (drb));
349 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
351 return true;
354 if (dump_enabled_p ())
356 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
357 "versioning for alias required: "
358 "can't determine dependence between ");
359 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
360 DR_REF (dra));
361 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
362 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
363 DR_REF (drb));
364 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
367 /* Add to list of ddrs that need to be tested at run-time. */
368 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
371 /* Known data dependence. */
372 if (DDR_NUM_DIST_VECTS (ddr) == 0)
374 /* If user asserted safelen consecutive iterations can be
375 executed concurrently, assume independence. */
376 if (loop->safelen >= 2)
378 if ((unsigned int) loop->safelen < *max_vf)
379 *max_vf = loop->safelen;
380 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
381 return false;
384 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
385 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
387 if (dump_enabled_p ())
389 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
390 "versioning for alias not supported for: "
391 "bad dist vector for ");
392 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
393 DR_REF (dra));
394 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
395 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
396 DR_REF (drb));
397 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
399 return true;
402 if (dump_enabled_p ())
404 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
405 "versioning for alias required: "
406 "bad dist vector for ");
407 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
408 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
409 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
410 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
412 /* Add to list of ddrs that need to be tested at run-time. */
413 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
416 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
418 if (DDR_COULD_BE_INDEPENDENT_P (ddr)
419 && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
420 loop_depth, max_vf))
421 return false;
423 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
425 int dist = dist_v[loop_depth];
427 if (dump_enabled_p ())
428 dump_printf_loc (MSG_NOTE, vect_location,
429 "dependence distance = %d.\n", dist);
431 if (dist == 0)
433 if (dump_enabled_p ())
435 dump_printf_loc (MSG_NOTE, vect_location,
436 "dependence distance == 0 between ");
437 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
438 dump_printf (MSG_NOTE, " and ");
439 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
 440           dump_printf (MSG_NOTE, "\n");
443 /* When we perform grouped accesses and perform implicit CSE
444 by detecting equal accesses and doing disambiguation with
445 runtime alias tests like for
446 .. = a[i];
447 .. = a[i+1];
448 a[i] = ..;
449 a[i+1] = ..;
450 *p = ..;
451 .. = a[i];
452 .. = a[i+1];
453 where we will end up loading { a[i], a[i+1] } once, make
454 sure that inserting group loads before the first load and
455 stores after the last store will do the right thing.
456 Similar for groups like
457 a[i] = ...;
458 ... = a[i];
459 a[i+1] = ...;
460 where loads from the group interleave with the store. */
461 if (!vect_preserves_scalar_order_p (DR_STMT (dra), DR_STMT (drb)))
463 if (dump_enabled_p ())
464 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
465 "READ_WRITE dependence in interleaving.\n");
466 return true;
469 if (loop->safelen < 2)
471 tree indicator = dr_zero_step_indicator (dra);
472 if (TREE_CODE (indicator) != INTEGER_CST)
473 vect_check_nonzero_value (loop_vinfo, indicator);
474 else if (integer_zerop (indicator))
476 if (dump_enabled_p ())
477 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
478 "access also has a zero step\n");
479 return true;
482 continue;
485 if (dist > 0 && DDR_REVERSED_P (ddr))
487 /* If DDR_REVERSED_P the order of the data-refs in DDR was
488 reversed (to make distance vector positive), and the actual
489 distance is negative. */
490 if (dump_enabled_p ())
491 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
492 "dependence distance negative.\n");
493 /* Record a negative dependence distance to later limit the
494 amount of stmt copying / unrolling we can perform.
495 Only need to handle read-after-write dependence. */
496 if (DR_IS_READ (drb)
497 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
498 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
499 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
500 continue;
503 unsigned int abs_dist = abs (dist);
504 if (abs_dist >= 2 && abs_dist < *max_vf)
506 /* The dependence distance requires reduction of the maximal
507 vectorization factor. */
508 *max_vf = abs (dist);
509 if (dump_enabled_p ())
510 dump_printf_loc (MSG_NOTE, vect_location,
511 "adjusting maximal vectorization factor to %i\n",
512 *max_vf);
515 if (abs_dist >= *max_vf)
517 /* Dependence distance does not create dependence, as far as
518 vectorization is concerned, in this case. */
519 if (dump_enabled_p ())
520 dump_printf_loc (MSG_NOTE, vect_location,
521 "dependence distance >= VF.\n");
522 continue;
525 if (dump_enabled_p ())
527 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
528 "not vectorized, possible dependence "
529 "between data-refs ");
530 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
531 dump_printf (MSG_NOTE, " and ");
532 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
533 dump_printf (MSG_NOTE, "\n");
536 return true;
539 return false;
542 /* Function vect_analyze_data_ref_dependences.
544 Examine all the data references in the loop, and make sure there do not
545 exist any data dependences between them. Set *MAX_VF according to
546 the maximum vectorization factor the data dependences allow. */
548 bool
549 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
550 unsigned int *max_vf)
552 unsigned int i;
553 struct data_dependence_relation *ddr;
555 if (dump_enabled_p ())
556 dump_printf_loc (MSG_NOTE, vect_location,
557 "=== vect_analyze_data_ref_dependences ===\n");
559 LOOP_VINFO_DDRS (loop_vinfo)
560 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
561 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
562 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
563 /* We need read-read dependences to compute STMT_VINFO_SAME_ALIGN_REFS. */
564 if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
565 &LOOP_VINFO_DDRS (loop_vinfo),
566 LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
567 return false;
569 /* For epilogues we either have no aliases or alias versioning
 570      was applied to the original loop.  Therefore we may just get max_vf
 571      using the VF of the original loop.  */
572 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
573 *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
574 else
575 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
576 if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
577 return false;
579 return true;
583 /* Function vect_slp_analyze_data_ref_dependence.
585 Return TRUE if there (might) exist a dependence between a memory-reference
 586    DRA and a memory-reference DRB in DDR.  Return FALSE if the accesses
 587    are known to be independent for the purposes of SLP vectorization,
 588    e.g. because they belong to the same interleaving chain.  */
590 static bool
591 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
593 struct data_reference *dra = DDR_A (ddr);
594 struct data_reference *drb = DDR_B (ddr);
596 /* We need to check dependences of statements marked as unvectorizable
597 as well, they still can prohibit vectorization. */
599 /* Independent data accesses. */
600 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
601 return false;
603 if (dra == drb)
604 return false;
606 /* Read-read is OK. */
607 if (DR_IS_READ (dra) && DR_IS_READ (drb))
608 return false;
610 /* If dra and drb are part of the same interleaving chain consider
611 them independent. */
612 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
613 && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
614 == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
615 return false;
617 /* Unknown data dependence. */
618 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
620 if (dump_enabled_p ())
622 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
623 "can't determine dependence between ");
624 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
625 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
626 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
627 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
630 else if (dump_enabled_p ())
632 dump_printf_loc (MSG_NOTE, vect_location,
633 "determined dependence between ");
634 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
635 dump_printf (MSG_NOTE, " and ");
636 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
637 dump_printf (MSG_NOTE, "\n");
640 return true;
644 /* Analyze dependences involved in the transform of SLP NODE. STORES
645 contain the vector of scalar stores of this instance if we are
646 disambiguating the loads. */
648 static bool
649 vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
650 vec<gimple *> stores, gimple *last_store)
652 /* This walks over all stmts involved in the SLP load/store done
653 in NODE verifying we can sink them up to the last stmt in the
654 group. */
655 gimple *last_access = vect_find_last_scalar_stmt_in_slp (node);
656 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
658 gimple *access = SLP_TREE_SCALAR_STMTS (node)[k];
659 if (access == last_access)
660 continue;
661 data_reference *dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (access));
662 for (gimple_stmt_iterator gsi = gsi_for_stmt (access);
663 gsi_stmt (gsi) != last_access; gsi_next (&gsi))
665 gimple *stmt = gsi_stmt (gsi);
666 if (! gimple_vuse (stmt)
667 || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
668 continue;
670 /* If we couldn't record a (single) data reference for this
671 stmt we have to give up. */
672 /* ??? Here and below if dependence analysis fails we can resort
673 to the alias oracle which can handle more kinds of stmts. */
674 data_reference *dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
675 if (!dr_b)
676 return false;
678 bool dependent = false;
679 /* If we run into a store of this same instance (we've just
680 marked those) then delay dependence checking until we run
681 into the last store because this is where it will have
682 been sunk to (and we verify if we can do that as well). */
683 if (gimple_visited_p (stmt))
685 if (stmt != last_store)
686 continue;
687 unsigned i;
688 gimple *store;
689 FOR_EACH_VEC_ELT (stores, i, store)
691 data_reference *store_dr
692 = STMT_VINFO_DATA_REF (vinfo_for_stmt (store));
693 ddr_p ddr = initialize_data_dependence_relation
694 (dr_a, store_dr, vNULL);
695 dependent = vect_slp_analyze_data_ref_dependence (ddr);
696 free_dependence_relation (ddr);
697 if (dependent)
698 break;
701 else
703 ddr_p ddr = initialize_data_dependence_relation (dr_a,
704 dr_b, vNULL);
705 dependent = vect_slp_analyze_data_ref_dependence (ddr);
706 free_dependence_relation (ddr);
708 if (dependent)
709 return false;
712 return true;
 716 /* Function vect_slp_analyze_instance_dependence.
 718    Examine the data references in SLP instance INSTANCE and return TRUE
 719    if no data dependence prevents sinking its loads and stores to their
 720    vectorized positions.  */
722 bool
723 vect_slp_analyze_instance_dependence (slp_instance instance)
725 if (dump_enabled_p ())
726 dump_printf_loc (MSG_NOTE, vect_location,
727 "=== vect_slp_analyze_instance_dependence ===\n");
729 /* The stores of this instance are at the root of the SLP tree. */
730 slp_tree store = SLP_INSTANCE_TREE (instance);
731 if (! STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (store)[0])))
732 store = NULL;
734 /* Verify we can sink stores to the vectorized stmt insert location. */
735 gimple *last_store = NULL;
736 if (store)
738 if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
739 return false;
741 /* Mark stores in this instance and remember the last one. */
742 last_store = vect_find_last_scalar_stmt_in_slp (store);
743 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
744 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], true);
747 bool res = true;
749 /* Verify we can sink loads to the vectorized stmt insert location,
750 special-casing stores of this instance. */
751 slp_tree load;
752 unsigned int i;
753 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
754 if (! vect_slp_analyze_node_dependences (instance, load,
755 store
756 ? SLP_TREE_SCALAR_STMTS (store)
757 : vNULL, last_store))
759 res = false;
760 break;
763 /* Unset the visited flag. */
764 if (store)
765 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
766 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], false);
768 return res;
771 /* Record in VINFO the base alignment guarantee given by DRB. STMT is
772 the statement that contains DRB, which is useful for recording in the
773 dump file. */
775 static void
776 vect_record_base_alignment (vec_info *vinfo, gimple *stmt,
777 innermost_loop_behavior *drb)
779 bool existed;
780 innermost_loop_behavior *&entry
781 = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
782 if (!existed || entry->base_alignment < drb->base_alignment)
784 entry = drb;
785 if (dump_enabled_p ())
787 dump_printf_loc (MSG_NOTE, vect_location,
788 "recording new base alignment for ");
789 dump_generic_expr (MSG_NOTE, TDF_SLIM, drb->base_address);
790 dump_printf (MSG_NOTE, "\n");
791 dump_printf_loc (MSG_NOTE, vect_location,
792 " alignment: %d\n", drb->base_alignment);
793 dump_printf_loc (MSG_NOTE, vect_location,
794 " misalignment: %d\n", drb->base_misalignment);
795 dump_printf_loc (MSG_NOTE, vect_location,
796 " based on: ");
797 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
802 /* If the region we're going to vectorize is reached, all unconditional
803 data references occur at least once. We can therefore pool the base
804 alignment guarantees from each unconditional reference. Do this by
805 going through all the data references in VINFO and checking whether
806 the containing statement makes the reference unconditionally. If so,
807 record the alignment of the base address in VINFO so that it can be
808 used for all other references with the same base. */
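/* For example, if one unconditional reference establishes that base
   address P is 16-byte aligned, that guarantee is recorded keyed on P
   and reused when computing the misalignment of every other data
   reference whose base address is also P.  */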
810 void
811 vect_record_base_alignments (vec_info *vinfo)
813 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
814 struct loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
815 data_reference *dr;
816 unsigned int i;
817 FOR_EACH_VEC_ELT (vinfo->datarefs, i, dr)
818 if (!DR_IS_CONDITIONAL_IN_STMT (dr))
820 gimple *stmt = DR_STMT (dr);
821 vect_record_base_alignment (vinfo, stmt, &DR_INNERMOST (dr));
823 /* If DR is nested in the loop that is being vectorized, we can also
824 record the alignment of the base wrt the outer loop. */
825 if (loop && nested_in_vect_loop_p (loop, stmt))
827 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
828 vect_record_base_alignment
829 (vinfo, stmt, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
834 /* Return the target alignment for the vectorized form of DR. */
836 static unsigned int
837 vect_calculate_target_alignment (struct data_reference *dr)
839 gimple *stmt = DR_STMT (dr);
840 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
841 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
842 return targetm.vectorize.preferred_vector_alignment (vectype);
845 /* Function vect_compute_data_ref_alignment
847 Compute the misalignment of the data reference DR.
849 Output:
850 1. If during the misalignment computation it is found that the data reference
851 cannot be vectorized then false is returned.
852 2. DR_MISALIGNMENT (DR) is defined.
854 FOR NOW: No analysis is actually performed. Misalignment is calculated
855 only for trivial cases. TODO. */
857 bool
858 vect_compute_data_ref_alignment (struct data_reference *dr)
860 gimple *stmt = DR_STMT (dr);
861 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
862 vec_base_alignments *base_alignments = &stmt_info->vinfo->base_alignments;
863 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
864 struct loop *loop = NULL;
865 tree ref = DR_REF (dr);
866 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
868 if (dump_enabled_p ())
869 dump_printf_loc (MSG_NOTE, vect_location,
870 "vect_compute_data_ref_alignment:\n");
872 if (loop_vinfo)
873 loop = LOOP_VINFO_LOOP (loop_vinfo);
875 /* Initialize misalignment to unknown. */
876 SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
878 innermost_loop_behavior *drb = vect_dr_behavior (dr);
879 bool step_preserves_misalignment_p;
881 unsigned HOST_WIDE_INT vector_alignment
882 = vect_calculate_target_alignment (dr) / BITS_PER_UNIT;
883 DR_TARGET_ALIGNMENT (dr) = vector_alignment;
885 /* No step for BB vectorization. */
886 if (!loop)
888 gcc_assert (integer_zerop (drb->step));
889 step_preserves_misalignment_p = true;
892 /* In case the dataref is in an inner-loop of the loop that is being
893 vectorized (LOOP), we use the base and misalignment information
894 relative to the outer-loop (LOOP). This is ok only if the misalignment
895 stays the same throughout the execution of the inner-loop, which is why
896 we have to check that the stride of the dataref in the inner-loop evenly
897 divides by the vector alignment. */
898 else if (nested_in_vect_loop_p (loop, stmt))
900 step_preserves_misalignment_p
901 = (DR_STEP_ALIGNMENT (dr) % vector_alignment) == 0;
903 if (dump_enabled_p ())
905 if (step_preserves_misalignment_p)
906 dump_printf_loc (MSG_NOTE, vect_location,
907 "inner step divides the vector alignment.\n");
908 else
909 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
910 "inner step doesn't divide the vector"
911 " alignment.\n");
915 /* Similarly we can only use base and misalignment information relative to
916 an innermost loop if the misalignment stays the same throughout the
917 execution of the loop. As above, this is the case if the stride of
918 the dataref evenly divides by the alignment. */
919 else
921 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
922 step_preserves_misalignment_p
923 = multiple_p (DR_STEP_ALIGNMENT (dr) * vf, vector_alignment);
925 if (!step_preserves_misalignment_p && dump_enabled_p ())
926 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
927 "step doesn't divide the vector alignment.\n");
930 unsigned int base_alignment = drb->base_alignment;
931 unsigned int base_misalignment = drb->base_misalignment;
933 /* Calculate the maximum of the pooled base address alignment and the
934 alignment that we can compute for DR itself. */
935 innermost_loop_behavior **entry = base_alignments->get (drb->base_address);
936 if (entry && base_alignment < (*entry)->base_alignment)
938 base_alignment = (*entry)->base_alignment;
939 base_misalignment = (*entry)->base_misalignment;
942 if (drb->offset_alignment < vector_alignment
943 || !step_preserves_misalignment_p
944 /* We need to know whether the step wrt the vectorized loop is
945 negative when computing the starting misalignment below. */
946 || TREE_CODE (drb->step) != INTEGER_CST)
948 if (dump_enabled_p ())
950 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
951 "Unknown alignment for access: ");
952 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
953 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
955 return true;
958 if (base_alignment < vector_alignment)
960 tree base = drb->base_address;
961 if (TREE_CODE (base) == ADDR_EXPR)
962 base = TREE_OPERAND (base, 0);
963 if (!vect_can_force_dr_alignment_p (base,
964 vector_alignment * BITS_PER_UNIT))
966 if (dump_enabled_p ())
968 dump_printf_loc (MSG_NOTE, vect_location,
969 "can't force alignment of ref: ");
970 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
971 dump_printf (MSG_NOTE, "\n");
973 return true;
976 /* Force the alignment of the decl.
977 NOTE: This is the only change to the code we make during
978 the analysis phase, before deciding to vectorize the loop. */
979 if (dump_enabled_p ())
981 dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
982 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
983 dump_printf (MSG_NOTE, "\n");
986 DR_VECT_AUX (dr)->base_decl = base;
987 DR_VECT_AUX (dr)->base_misaligned = true;
988 base_misalignment = 0;
990 poly_int64 misalignment
991 = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();
993 /* If this is a backward running DR then first access in the larger
994 vectype actually is N-1 elements before the address in the DR.
995 Adjust misalign accordingly. */
996 if (tree_int_cst_sgn (drb->step) < 0)
997 /* PLUS because STEP is negative. */
998 misalignment += ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
999 * TREE_INT_CST_LOW (drb->step));
1001 unsigned int const_misalignment;
1002 if (!known_misalignment (misalignment, vector_alignment,
1003 &const_misalignment))
1005 if (dump_enabled_p ())
1007 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1008 "Non-constant misalignment for access: ");
1009 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
1010 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1012 return true;
1015 SET_DR_MISALIGNMENT (dr, const_misalignment);
1017 if (dump_enabled_p ())
1019 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1020 "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
1021 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
1022 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1025 return true;
1028 /* Function vect_update_misalignment_for_peel.
1029 Sets DR's misalignment
1030 - to 0 if it has the same alignment as DR_PEEL,
 1031    - to the misalignment computed using NPEEL if DR's misalignment is known,
1032 - to -1 (unknown) otherwise.
1034 DR - the data reference whose misalignment is to be adjusted.
1035 DR_PEEL - the data reference whose misalignment is being made
1036 zero in the vector loop by the peel.
1037 NPEEL - the number of iterations in the peel loop if the misalignment
1038 of DR_PEEL is known at compile time. */
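/* For example, with a 16-byte target alignment, a 4-byte element and
   DR currently misaligned by 12 bytes, peeling NPEEL == 3 iterations of
   a forward-running access advances DR by 12 bytes, giving a new
   misalignment of (12 + 3 * 4) & 15 == 8.  */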
1040 static void
1041 vect_update_misalignment_for_peel (struct data_reference *dr,
1042 struct data_reference *dr_peel, int npeel)
1044 unsigned int i;
1045 vec<dr_p> same_aligned_drs;
1046 struct data_reference *current_dr;
1047 int dr_size = vect_get_scalar_dr_size (dr);
1048 int dr_peel_size = vect_get_scalar_dr_size (dr_peel);
1049 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
1050 stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));
1052 /* For interleaved data accesses the step in the loop must be multiplied by
1053 the size of the interleaving group. */
1054 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1055 dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
1056 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
1057 dr_peel_size *= GROUP_SIZE (peel_stmt_info);
1059 /* It can be assumed that the data refs with the same alignment as dr_peel
1060 are aligned in the vector loop. */
1061 same_aligned_drs
1062 = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
1063 FOR_EACH_VEC_ELT (same_aligned_drs, i, current_dr)
1065 if (current_dr != dr)
1066 continue;
1067 gcc_assert (!known_alignment_for_access_p (dr)
1068 || !known_alignment_for_access_p (dr_peel)
1069 || (DR_MISALIGNMENT (dr) / dr_size
1070 == DR_MISALIGNMENT (dr_peel) / dr_peel_size));
1071 SET_DR_MISALIGNMENT (dr, 0);
1072 return;
1075 if (known_alignment_for_access_p (dr)
1076 && known_alignment_for_access_p (dr_peel))
1078 bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
1079 int misal = DR_MISALIGNMENT (dr);
1080 misal += negative ? -npeel * dr_size : npeel * dr_size;
1081 misal &= DR_TARGET_ALIGNMENT (dr) - 1;
1082 SET_DR_MISALIGNMENT (dr, misal);
1083 return;
1086 if (dump_enabled_p ())
1087 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1088 "to unknown (-1).\n");
1089 SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
1093 /* Function verify_data_ref_alignment
1095 Return TRUE if DR can be handled with respect to alignment. */
1097 static bool
1098 verify_data_ref_alignment (data_reference_p dr)
1100 enum dr_alignment_support supportable_dr_alignment
1101 = vect_supportable_dr_alignment (dr, false);
1102 if (!supportable_dr_alignment)
1104 if (dump_enabled_p ())
1106 if (DR_IS_READ (dr))
1107 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1108 "not vectorized: unsupported unaligned load.");
1109 else
1110 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1111 "not vectorized: unsupported unaligned "
1112 "store.");
1114 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
1115 DR_REF (dr));
1116 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1118 return false;
1121 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
1122 dump_printf_loc (MSG_NOTE, vect_location,
1123 "Vectorizing an unaligned access.\n");
1125 return true;
1128 /* Function vect_verify_datarefs_alignment
1130 Return TRUE if all data references in the loop can be
1131 handled with respect to alignment. */
1133 bool
1134 vect_verify_datarefs_alignment (loop_vec_info vinfo)
1136 vec<data_reference_p> datarefs = vinfo->datarefs;
1137 struct data_reference *dr;
1138 unsigned int i;
1140 FOR_EACH_VEC_ELT (datarefs, i, dr)
1142 gimple *stmt = DR_STMT (dr);
1143 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1145 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1146 continue;
1148 /* For interleaving, only the alignment of the first access matters. */
1149 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1150 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1151 continue;
1153 /* Strided accesses perform only component accesses, alignment is
1154 irrelevant for them. */
1155 if (STMT_VINFO_STRIDED_P (stmt_info)
1156 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1157 continue;
1159 if (! verify_data_ref_alignment (dr))
1160 return false;
1163 return true;
 1166 /* Given a memory reference EXP return whether its alignment is less
1167 than its size. */
1169 static bool
1170 not_size_aligned (tree exp)
1172 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1173 return true;
1175 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1176 > get_object_alignment (exp));
1179 /* Function vector_alignment_reachable_p
1181 Return true if vector alignment for DR is reachable by peeling
1182 a few loop iterations. Return false otherwise. */
1184 static bool
1185 vector_alignment_reachable_p (struct data_reference *dr)
1187 gimple *stmt = DR_STMT (dr);
1188 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1189 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1191 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
 1193       /* For interleaved access we peel only if the number of iterations in
 1194          the prolog loop ({VF - misalignment}) is a multiple of the
 1195          number of interleaved accesses.  */
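      /* E.g. with 8 elements per vector, a group size of 2 and a
         misalignment of 3 elements, the prolog would need 5 scalar
         iterations, which is not a multiple of 2, so peeling cannot
         align the whole group.  */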
1196 int elem_size, mis_in_elements;
1198 /* FORNOW: handle only known alignment. */
1199 if (!known_alignment_for_access_p (dr))
1200 return false;
1202 poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1203 poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
1204 elem_size = vector_element_size (vector_size, nelements);
1205 mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
1207 if (!multiple_p (nelements - mis_in_elements, GROUP_SIZE (stmt_info)))
1208 return false;
 1211   /* If the misalignment is known at compile time then allow peeling
1212 only if natural alignment is reachable through peeling. */
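  /* E.g. a 4-byte element misaligned by 2 bytes can never become aligned
     by peeling whole iterations, so peeling is not attempted.  */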
1213 if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
1215 HOST_WIDE_INT elmsize =
1216 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1217 if (dump_enabled_p ())
1219 dump_printf_loc (MSG_NOTE, vect_location,
1220 "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1221 dump_printf (MSG_NOTE,
1222 ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
1224 if (DR_MISALIGNMENT (dr) % elmsize)
1226 if (dump_enabled_p ())
1227 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1228 "data size does not divide the misalignment.\n");
1229 return false;
1233 if (!known_alignment_for_access_p (dr))
1235 tree type = TREE_TYPE (DR_REF (dr));
1236 bool is_packed = not_size_aligned (DR_REF (dr));
1237 if (dump_enabled_p ())
1238 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1239 "Unknown misalignment, %snaturally aligned\n",
1240 is_packed ? "not " : "");
1241 return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1244 return true;
1248 /* Calculate the cost of the memory access represented by DR. */
1250 static void
1251 vect_get_data_access_cost (struct data_reference *dr,
1252 unsigned int *inside_cost,
1253 unsigned int *outside_cost,
1254 stmt_vector_for_cost *body_cost_vec)
1256 gimple *stmt = DR_STMT (dr);
1257 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1258 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1259 int ncopies;
1261 if (PURE_SLP_STMT (stmt_info))
1262 ncopies = 1;
1263 else
1264 ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1266 if (DR_IS_READ (dr))
1267 vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1268 NULL, body_cost_vec, false);
1269 else
1270 vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
1272 if (dump_enabled_p ())
1273 dump_printf_loc (MSG_NOTE, vect_location,
1274 "vect_get_data_access_cost: inside_cost = %d, "
1275 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1279 typedef struct _vect_peel_info
1281 struct data_reference *dr;
1282 int npeel;
1283 unsigned int count;
1284 } *vect_peel_info;
1286 typedef struct _vect_peel_extended_info
1288 struct _vect_peel_info peel_info;
1289 unsigned int inside_cost;
1290 unsigned int outside_cost;
1291 } *vect_peel_extended_info;
1294 /* Peeling hashtable helpers. */
1296 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1298 static inline hashval_t hash (const _vect_peel_info *);
1299 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1302 inline hashval_t
1303 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1305 return (hashval_t) peel_info->npeel;
1308 inline bool
1309 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1311 return (a->npeel == b->npeel);
1315 /* Insert DR into peeling hash table with NPEEL as key. */
1317 static void
1318 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1319 loop_vec_info loop_vinfo, struct data_reference *dr,
1320 int npeel)
1322 struct _vect_peel_info elem, *slot;
1323 _vect_peel_info **new_slot;
1324 bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1326 elem.npeel = npeel;
1327 slot = peeling_htab->find (&elem);
1328 if (slot)
1329 slot->count++;
1330 else
1332 slot = XNEW (struct _vect_peel_info);
1333 slot->npeel = npeel;
1334 slot->dr = dr;
1335 slot->count = 1;
1336 new_slot = peeling_htab->find_slot (slot, INSERT);
1337 *new_slot = slot;
1340 if (!supportable_dr_alignment
1341 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1342 slot->count += VECT_MAX_COST;
1346 /* Traverse peeling hash table to find peeling option that aligns maximum
1347 number of data accesses. */
1350 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1351 _vect_peel_extended_info *max)
1353 vect_peel_info elem = *slot;
1355 if (elem->count > max->peel_info.count
1356 || (elem->count == max->peel_info.count
1357 && max->peel_info.npeel > elem->npeel))
1359 max->peel_info.npeel = elem->npeel;
1360 max->peel_info.count = elem->count;
1361 max->peel_info.dr = elem->dr;
1364 return 1;
1367 /* Get the costs of peeling NPEEL iterations checking data access costs
1368 for all data refs. If UNKNOWN_MISALIGNMENT is true, we assume DR0's
1369 misalignment will be zero after peeling. */
1371 static void
1372 vect_get_peeling_costs_all_drs (vec<data_reference_p> datarefs,
1373 struct data_reference *dr0,
1374 unsigned int *inside_cost,
1375 unsigned int *outside_cost,
1376 stmt_vector_for_cost *body_cost_vec,
1377 unsigned int npeel,
1378 bool unknown_misalignment)
1380 unsigned i;
1381 data_reference *dr;
1383 FOR_EACH_VEC_ELT (datarefs, i, dr)
1385 gimple *stmt = DR_STMT (dr);
1386 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1387 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1388 continue;
1390 /* For interleaving, only the alignment of the first access
1391 matters. */
1392 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1393 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1394 continue;
1396 /* Strided accesses perform only component accesses, alignment is
1397 irrelevant for them. */
1398 if (STMT_VINFO_STRIDED_P (stmt_info)
1399 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1400 continue;
1402 int save_misalignment;
1403 save_misalignment = DR_MISALIGNMENT (dr);
1404 if (npeel == 0)
1406 else if (unknown_misalignment && dr == dr0)
1407 SET_DR_MISALIGNMENT (dr, 0);
1408 else
1409 vect_update_misalignment_for_peel (dr, dr0, npeel);
1410 vect_get_data_access_cost (dr, inside_cost, outside_cost,
1411 body_cost_vec);
1412 SET_DR_MISALIGNMENT (dr, save_misalignment);
1416 /* Traverse peeling hash table and calculate cost for each peeling option.
1417 Find the one with the lowest cost. */
1420 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1421 _vect_peel_extended_info *min)
1423 vect_peel_info elem = *slot;
1424 int dummy;
1425 unsigned int inside_cost = 0, outside_cost = 0;
1426 gimple *stmt = DR_STMT (elem->dr);
1427 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1428 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1429 stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1430 epilogue_cost_vec;
1432 prologue_cost_vec.create (2);
1433 body_cost_vec.create (2);
1434 epilogue_cost_vec.create (2);
1436 vect_get_peeling_costs_all_drs (LOOP_VINFO_DATAREFS (loop_vinfo),
1437 elem->dr, &inside_cost, &outside_cost,
1438 &body_cost_vec, elem->npeel, false);
1440 body_cost_vec.release ();
1442 outside_cost += vect_get_known_peeling_cost
1443 (loop_vinfo, elem->npeel, &dummy,
1444 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1445 &prologue_cost_vec, &epilogue_cost_vec);
1447 /* Prologue and epilogue costs are added to the target model later.
1448 These costs depend only on the scalar iteration cost, the
1449 number of peeling iterations finally chosen, and the number of
1450 misaligned statements. So discard the information found here. */
1451 prologue_cost_vec.release ();
1452 epilogue_cost_vec.release ();
1454 if (inside_cost < min->inside_cost
1455 || (inside_cost == min->inside_cost
1456 && outside_cost < min->outside_cost))
1458 min->inside_cost = inside_cost;
1459 min->outside_cost = outside_cost;
1460 min->peel_info.dr = elem->dr;
1461 min->peel_info.npeel = elem->npeel;
1462 min->peel_info.count = elem->count;
1465 return 1;
1469 /* Choose best peeling option by traversing peeling hash table and either
1470 choosing an option with the lowest cost (if cost model is enabled) or the
1471 option that aligns as many accesses as possible. */
1473 static struct _vect_peel_extended_info
1474 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1475 loop_vec_info loop_vinfo)
1477 struct _vect_peel_extended_info res;
1479 res.peel_info.dr = NULL;
1481 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1483 res.inside_cost = INT_MAX;
1484 res.outside_cost = INT_MAX;
1485 peeling_htab->traverse <_vect_peel_extended_info *,
1486 vect_peeling_hash_get_lowest_cost> (&res);
1488 else
1490 res.peel_info.count = 0;
1491 peeling_htab->traverse <_vect_peel_extended_info *,
1492 vect_peeling_hash_get_most_frequent> (&res);
1493 res.inside_cost = 0;
1494 res.outside_cost = 0;
1497 return res;
1500 /* Return true if the new peeling NPEEL is supported. */
1502 static bool
1503 vect_peeling_supportable (loop_vec_info loop_vinfo, struct data_reference *dr0,
1504 unsigned npeel)
1506 unsigned i;
1507 struct data_reference *dr = NULL;
1508 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1509 gimple *stmt;
1510 stmt_vec_info stmt_info;
1511 enum dr_alignment_support supportable_dr_alignment;
1513 /* Ensure that all data refs can be vectorized after the peel. */
1514 FOR_EACH_VEC_ELT (datarefs, i, dr)
1516 int save_misalignment;
1518 if (dr == dr0)
1519 continue;
1521 stmt = DR_STMT (dr);
1522 stmt_info = vinfo_for_stmt (stmt);
1523 /* For interleaving, only the alignment of the first access
1524 matters. */
1525 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1526 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1527 continue;
1529 /* Strided accesses perform only component accesses, alignment is
1530 irrelevant for them. */
1531 if (STMT_VINFO_STRIDED_P (stmt_info)
1532 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1533 continue;
1535 save_misalignment = DR_MISALIGNMENT (dr);
1536 vect_update_misalignment_for_peel (dr, dr0, npeel);
1537 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1538 SET_DR_MISALIGNMENT (dr, save_misalignment);
1540 if (!supportable_dr_alignment)
1541 return false;
1544 return true;
1547 /* Function vect_enhance_data_refs_alignment
1549 This pass will use loop versioning and loop peeling in order to enhance
1550 the alignment of data references in the loop.
1552 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1553 original loop is to be vectorized. Any other loops that are created by
1554 the transformations performed in this pass - are not supposed to be
1555 vectorized. This restriction will be relaxed.
1557 This pass will require a cost model to guide it whether to apply peeling
1558 or versioning or a combination of the two. For example, the scheme that
1559 intel uses when given a loop with several memory accesses, is as follows:
1560 choose one memory access ('p') which alignment you want to force by doing
1561 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1562 other accesses are not necessarily aligned, or (2) use loop versioning to
1563 generate one loop in which all accesses are aligned, and another loop in
1564 which only 'p' is necessarily aligned.
1566 ("Automatic Intra-Register Vectorization for the Intel Architecture",
 1567    Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1568 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1570 Devising a cost model is the most critical aspect of this work. It will
1571 guide us on which access to peel for, whether to use loop versioning, how
1572 many versions to create, etc. The cost model will probably consist of
1573 generic considerations as well as target specific considerations (on
1574 powerpc for example, misaligned stores are more painful than misaligned
1575 loads).
1577 Here are the general steps involved in alignment enhancements:
1579 -- original loop, before alignment analysis:
1580 for (i=0; i<N; i++){
1581 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1582 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1585 -- After vect_compute_data_refs_alignment:
1586 for (i=0; i<N; i++){
1587 x = q[i]; # DR_MISALIGNMENT(q) = 3
1588 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1591 -- Possibility 1: we do loop versioning:
1592 if (p is aligned) {
1593 for (i=0; i<N; i++){ # loop 1A
1594 x = q[i]; # DR_MISALIGNMENT(q) = 3
1595 p[i] = y; # DR_MISALIGNMENT(p) = 0
1598 else {
1599 for (i=0; i<N; i++){ # loop 1B
1600 x = q[i]; # DR_MISALIGNMENT(q) = 3
1601 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1605 -- Possibility 2: we do loop peeling:
1606 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1607 x = q[i];
1608 p[i] = y;
1610 for (i = 3; i < N; i++){ # loop 2A
1611 x = q[i]; # DR_MISALIGNMENT(q) = 0
1612 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1615 -- Possibility 3: combination of loop peeling and versioning:
1616 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1617 x = q[i];
1618 p[i] = y;
1620 if (p is aligned) {
1621 for (i = 3; i<N; i++){ # loop 3A
1622 x = q[i]; # DR_MISALIGNMENT(q) = 0
1623 p[i] = y; # DR_MISALIGNMENT(p) = 0
1626 else {
1627 for (i = 3; i<N; i++){ # loop 3B
1628 x = q[i]; # DR_MISALIGNMENT(q) = 0
1629 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1633 These loops are later passed to loop_transform to be vectorized. The
1634 vectorizer will use the alignment information to guide the transformation
1635 (whether to generate regular loads/stores, or with special handling for
1636 misalignment). */
1638 bool
1639 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1641 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1642 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1643 enum dr_alignment_support supportable_dr_alignment;
1644 struct data_reference *dr0 = NULL, *first_store = NULL;
1645 struct data_reference *dr;
1646 unsigned int i, j;
1647 bool do_peeling = false;
1648 bool do_versioning = false;
1649 bool stat;
1650 gimple *stmt;
1651 stmt_vec_info stmt_info;
1652 unsigned int npeel = 0;
1653 bool one_misalignment_known = false;
1654 bool one_misalignment_unknown = false;
1655 bool one_dr_unsupportable = false;
1656 struct data_reference *unsupportable_dr = NULL;
1657 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1658 unsigned possible_npeel_number = 1;
1659 tree vectype;
1660 unsigned int mis, same_align_drs_max = 0;
1661 hash_table<peel_info_hasher> peeling_htab (1);
1663 if (dump_enabled_p ())
1664 dump_printf_loc (MSG_NOTE, vect_location,
1665 "=== vect_enhance_data_refs_alignment ===\n");
1667 /* Reset data so we can safely be called multiple times. */
1668 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1669 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1671 /* While cost model enhancements are expected in the future, the high level
1672 view of the code at this time is as follows:
1674 A) If there is a misaligned access then see if peeling to align
1675 this access can make all data references satisfy
1676 vect_supportable_dr_alignment. If so, update data structures
1677 as needed and return true.
1679 B) If peeling wasn't possible and there is a data reference with an
1680 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1681 then see if loop versioning checks can be used to make all data
1682 references satisfy vect_supportable_dr_alignment. If so, update
1683 data structures as needed and return true.
1685 C) If neither peeling nor versioning were successful then return false if
1686 any data reference does not satisfy vect_supportable_dr_alignment.
1688 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1690 Note, Possibility 3 above (which is peeling and versioning together) is not
1691 being done at this time. */
1693 /* (1) Peeling to force alignment. */
1695 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1696 Considerations:
1697 + How many accesses will become aligned due to the peeling
1698 - How many accesses will become unaligned due to the peeling,
1699 and the cost of misaligned accesses.
1700 - The cost of peeling (the extra runtime checks, the increase
1701 in code size). */
1703 FOR_EACH_VEC_ELT (datarefs, i, dr)
1705 stmt = DR_STMT (dr);
1706 stmt_info = vinfo_for_stmt (stmt);
1708 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1709 continue;
1711 /* For interleaving, only the alignment of the first access
1712 matters. */
1713 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1714 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1715 continue;
1717 /* For invariant accesses there is nothing to enhance. */
1718 if (integer_zerop (DR_STEP (dr)))
1719 continue;
1721 /* Strided accesses perform only component accesses, alignment is
1722 irrelevant for them. */
1723 if (STMT_VINFO_STRIDED_P (stmt_info)
1724 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1725 continue;
1727 supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1728 do_peeling = vector_alignment_reachable_p (dr);
1729 if (do_peeling)
1731 if (known_alignment_for_access_p (dr))
1733 unsigned int npeel_tmp = 0;
1734 bool negative = tree_int_cst_compare (DR_STEP (dr),
1735 size_zero_node) < 0;
1737 vectype = STMT_VINFO_VECTYPE (stmt_info);
1738 unsigned int target_align = DR_TARGET_ALIGNMENT (dr);
1739 unsigned int dr_size = vect_get_scalar_dr_size (dr);
1740 mis = (negative ? DR_MISALIGNMENT (dr) : -DR_MISALIGNMENT (dr));
1741 if (DR_MISALIGNMENT (dr) != 0)
1742 npeel_tmp = (mis & (target_align - 1)) / dr_size;
1744 /* For multiple types, it is possible that the bigger type access
1745 will have more than one peeling option. E.g., a loop with two
1746 types: one of size (vector size / 4), and the other one of
1747 size (vector size / 8). The vectorization factor will be 8. If both
1748 accesses are misaligned by 3, the first one needs one scalar
1749 iteration to be aligned, and the second one needs 5. But the
1750 first one will be aligned also by peeling 5 scalar
1751 iterations, and in that case both accesses will be aligned.
1752 Hence, in addition to the immediate peeling amount, we also want
1753 to try adding a full vector size, as long as we don't exceed the
1754 vectorization factor.
1755 We do this automatically when using the cost model, since we
1756 calculate the cost of every peeling option. */
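/* Editorial worked example with assumed numbers: for a 32-byte target
   alignment, an access of 8-byte elements misaligned by 24 bytes gets
   npeel_tmp = (-24 & 31) / 8 = 1, while a 4-byte access misaligned by
   12 bytes gets (-12 & 31) / 4 = 5.  Peeling 5 scalar iterations
   aligns both (24 + 5 * 8 = 64, a multiple of 32), so when more than
   one peeling option is considered the loop below also records
   1 + 32/8 = 5 as a candidate for the first access.  */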
1757 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1759 poly_uint64 nscalars = (STMT_SLP_TYPE (stmt_info)
1760 ? vf * GROUP_SIZE (stmt_info) : vf);
1761 possible_npeel_number
1762 = vect_get_num_vectors (nscalars, vectype);
1764 /* NPEEL_TMP is 0 when there is no misalignment, but also
1765 allow peeling NELEMENTS. */
1766 if (DR_MISALIGNMENT (dr) == 0)
1767 possible_npeel_number++;
1770 /* Save info about DR in the hash table. Also include peeling
1771 amounts according to the explanation above. */
1772 for (j = 0; j < possible_npeel_number; j++)
1774 vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1775 dr, npeel_tmp);
1776 npeel_tmp += target_align / dr_size;
1779 one_misalignment_known = true;
1781 else
1783 /* If we don't know any misalignment values, we prefer
1784 peeling for the data-ref that has the maximum number of
1785 data-refs with the same alignment, unless the target prefers
1786 to align stores over loads. */
1787 unsigned same_align_drs
1788 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1789 if (!dr0
1790 || same_align_drs_max < same_align_drs)
1792 same_align_drs_max = same_align_drs;
1793 dr0 = dr;
1795 /* For data-refs with the same number of related
1796 accesses prefer the one where the misalignment
1797 computation will be invariant in the outermost loop. */
1798 else if (same_align_drs_max == same_align_drs)
1800 struct loop *ivloop0, *ivloop;
1801 ivloop0 = outermost_invariant_loop_for_expr
1802 (loop, DR_BASE_ADDRESS (dr0));
1803 ivloop = outermost_invariant_loop_for_expr
1804 (loop, DR_BASE_ADDRESS (dr));
1805 if ((ivloop && !ivloop0)
1806 || (ivloop && ivloop0
1807 && flow_loop_nested_p (ivloop, ivloop0)))
1808 dr0 = dr;
1811 one_misalignment_unknown = true;
1813 /* Check for data refs with unsupportable alignment that
1814 can be peeled. */
1815 if (!supportable_dr_alignment)
1817 one_dr_unsupportable = true;
1818 unsupportable_dr = dr;
1821 if (!first_store && DR_IS_WRITE (dr))
1822 first_store = dr;
1825 else
1827 if (!aligned_access_p (dr))
1829 if (dump_enabled_p ())
1830 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1831 "vector alignment may not be reachable\n");
1832 break;
1837 /* Check if we can possibly peel the loop. */
1838 if (!vect_can_advance_ivs_p (loop_vinfo)
1839 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1840 || loop->inner)
1841 do_peeling = false;
1843 struct _vect_peel_extended_info peel_for_known_alignment;
1844 struct _vect_peel_extended_info peel_for_unknown_alignment;
1845 struct _vect_peel_extended_info best_peel;
1847 peel_for_unknown_alignment.inside_cost = INT_MAX;
1848 peel_for_unknown_alignment.outside_cost = INT_MAX;
1849 peel_for_unknown_alignment.peel_info.count = 0;
1851 if (do_peeling
1852 && one_misalignment_unknown)
1854 /* Check whether the target prefers to align stores over loads, i.e.,
1855 whether misaligned stores are more expensive than misaligned loads
1856 (taking drs with the same alignment into account). */
1857 unsigned int load_inside_cost = 0;
1858 unsigned int load_outside_cost = 0;
1859 unsigned int store_inside_cost = 0;
1860 unsigned int store_outside_cost = 0;
1861 unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;
1863 stmt_vector_for_cost dummy;
1864 dummy.create (2);
1865 vect_get_peeling_costs_all_drs (datarefs, dr0,
1866 &load_inside_cost,
1867 &load_outside_cost,
1868 &dummy, estimated_npeels, true);
1869 dummy.release ();
1871 if (first_store)
1873 dummy.create (2);
1874 vect_get_peeling_costs_all_drs (datarefs, first_store,
1875 &store_inside_cost,
1876 &store_outside_cost,
1877 &dummy, estimated_npeels, true);
1878 dummy.release ();
1880 else
1882 store_inside_cost = INT_MAX;
1883 store_outside_cost = INT_MAX;
1886 if (load_inside_cost > store_inside_cost
1887 || (load_inside_cost == store_inside_cost
1888 && load_outside_cost > store_outside_cost))
1890 dr0 = first_store;
1891 peel_for_unknown_alignment.inside_cost = store_inside_cost;
1892 peel_for_unknown_alignment.outside_cost = store_outside_cost;
1894 else
1896 peel_for_unknown_alignment.inside_cost = load_inside_cost;
1897 peel_for_unknown_alignment.outside_cost = load_outside_cost;
1900 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1901 prologue_cost_vec.create (2);
1902 epilogue_cost_vec.create (2);
1904 int dummy2;
1905 peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
1906 (loop_vinfo, estimated_npeels, &dummy2,
1907 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1908 &prologue_cost_vec, &epilogue_cost_vec);
1910 prologue_cost_vec.release ();
1911 epilogue_cost_vec.release ();
1913 peel_for_unknown_alignment.peel_info.count = 1
1914 + STMT_VINFO_SAME_ALIGN_REFS
1915 (vinfo_for_stmt (DR_STMT (dr0))).length ();
1918 peel_for_unknown_alignment.peel_info.npeel = 0;
1919 peel_for_unknown_alignment.peel_info.dr = dr0;
1921 best_peel = peel_for_unknown_alignment;
1923 peel_for_known_alignment.inside_cost = INT_MAX;
1924 peel_for_known_alignment.outside_cost = INT_MAX;
1925 peel_for_known_alignment.peel_info.count = 0;
1926 peel_for_known_alignment.peel_info.dr = NULL;
1928 if (do_peeling && one_misalignment_known)
1930 /* Peeling is possible and every data access is supported even when
1931 misaligned, so try to choose the best possible peeling from
1932 the hash table. */
1933 peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
1934 (&peeling_htab, loop_vinfo);
1937 /* Compare costs of peeling for known and unknown alignment. */
1938 if (peel_for_known_alignment.peel_info.dr != NULL
1939 && peel_for_unknown_alignment.inside_cost
1940 >= peel_for_known_alignment.inside_cost)
1942 best_peel = peel_for_known_alignment;
1944 /* If the best peeling for known alignment has NPEEL == 0, perform no
1945 peeling at all except if there is an unsupportable dr that we can
1946 align. */
1947 if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
1948 do_peeling = false;
1951 /* If there is an unsupportable data ref, prefer this over all choices so far
1952 since we'd have to discard a chosen peeling except when it accidentally
1953 aligned the unsupportable data ref. */
1954 if (one_dr_unsupportable)
1955 dr0 = unsupportable_dr;
1956 else if (do_peeling)
1958 /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
1959 TODO: Use nopeel_outside_cost or get rid of it? */
1960 unsigned nopeel_inside_cost = 0;
1961 unsigned nopeel_outside_cost = 0;
1963 stmt_vector_for_cost dummy;
1964 dummy.create (2);
1965 vect_get_peeling_costs_all_drs (datarefs, NULL, &nopeel_inside_cost,
1966 &nopeel_outside_cost, &dummy, 0, false);
1967 dummy.release ();
1969 /* Add epilogue costs. As we do not peel for alignment here, no prologue
1970 costs will be recorded. */
1971 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1972 prologue_cost_vec.create (2);
1973 epilogue_cost_vec.create (2);
1975 int dummy2;
1976 nopeel_outside_cost += vect_get_known_peeling_cost
1977 (loop_vinfo, 0, &dummy2,
1978 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1979 &prologue_cost_vec, &epilogue_cost_vec);
1981 prologue_cost_vec.release ();
1982 epilogue_cost_vec.release ();
1984 npeel = best_peel.peel_info.npeel;
1985 dr0 = best_peel.peel_info.dr;
1987 /* If not peeling is no more expensive than the best peeling found
1988 so far, don't perform any peeling. */
1989 if (nopeel_inside_cost <= best_peel.inside_cost)
1990 do_peeling = false;
1993 if (do_peeling)
1995 stmt = DR_STMT (dr0);
1996 stmt_info = vinfo_for_stmt (stmt);
1997 vectype = STMT_VINFO_VECTYPE (stmt_info);
1999 if (known_alignment_for_access_p (dr0))
2001 bool negative = tree_int_cst_compare (DR_STEP (dr0),
2002 size_zero_node) < 0;
2003 if (!npeel)
2005 /* Since it's known at compile time, compute the number of
2006 iterations in the peeled loop (the peeling factor) for use in
2007 updating DR_MISALIGNMENT values. The peeling factor is the
2008 vectorization factor minus the misalignment as an element
2009 count. */
2010 mis = negative ? DR_MISALIGNMENT (dr0) : -DR_MISALIGNMENT (dr0);
2011 unsigned int target_align = DR_TARGET_ALIGNMENT (dr0);
2012 npeel = ((mis & (target_align - 1))
2013 / vect_get_scalar_dr_size (dr0));
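/* Editorial worked example with assumed numbers: for target_align = 16,
   4-byte elements and DR_MISALIGNMENT (dr0) = 4, this computes
   npeel = (-4 & 15) / 4 = 3, and after 3 peeled scalar iterations dr0
   starts on a 16-byte boundary (4 + 3 * 4 = 16).  */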
2016 /* For interleaved data access every iteration accesses all the
2017 members of the group, therefore we divide the number of iterations
2018 by the group size. */
2019 stmt_info = vinfo_for_stmt (DR_STMT (dr0));
2020 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2021 npeel /= GROUP_SIZE (stmt_info);
2023 if (dump_enabled_p ())
2024 dump_printf_loc (MSG_NOTE, vect_location,
2025 "Try peeling by %d\n", npeel);
2028 /* Ensure that all datarefs can be vectorized after the peel. */
2029 if (!vect_peeling_supportable (loop_vinfo, dr0, npeel))
2030 do_peeling = false;
2032 /* Check if all datarefs are supportable and log. */
2033 if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
2035 stat = vect_verify_datarefs_alignment (loop_vinfo);
2036 if (!stat)
2037 do_peeling = false;
2038 else
2039 return stat;
2042 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
2043 if (do_peeling)
2045 unsigned max_allowed_peel
2046 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
2047 if (max_allowed_peel != (unsigned)-1)
2049 unsigned max_peel = npeel;
2050 if (max_peel == 0)
2052 unsigned int target_align = DR_TARGET_ALIGNMENT (dr0);
2053 max_peel = target_align / vect_get_scalar_dr_size (dr0) - 1;
2055 if (max_peel > max_allowed_peel)
2057 do_peeling = false;
2058 if (dump_enabled_p ())
2059 dump_printf_loc (MSG_NOTE, vect_location,
2060 "Disable peeling, max peels reached: %d\n", max_peel);
2065 /* Cost model #2 - if peeling may result in a remaining loop not
2066 iterating enough to be vectorized then do not peel. Since this
2067 is a cost heuristic rather than a correctness decision, use the
2068 most likely runtime value for variable vectorization factors. */
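/* Editorial example with assumed numbers: for an assumed VF of 4 and an
   unknown peel amount (npeel == 0), max_peel below is 3, so a loop known
   to run fewer than 4 + 3 = 7 iterations is not peeled.  */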
2069 if (do_peeling
2070 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2072 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2073 unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
2074 if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
2075 < assumed_vf + max_peel)
2076 do_peeling = false;
2079 if (do_peeling)
2081 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2082 If the misalignment of DR_i is identical to that of dr0 then set
2083 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
2084 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2085 by the peeling factor times the element size of DR_i (MOD the
2086 vectorization factor times the size). Otherwise, the
2087 misalignment of DR_i must be set to unknown. */
2088 FOR_EACH_VEC_ELT (datarefs, i, dr)
2089 if (dr != dr0)
2091 /* Strided accesses perform only component accesses, alignment
2092 is irrelevant for them. */
2093 stmt_info = vinfo_for_stmt (DR_STMT (dr));
2094 if (STMT_VINFO_STRIDED_P (stmt_info)
2095 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2096 continue;
2098 vect_update_misalignment_for_peel (dr, dr0, npeel);
2101 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
2102 if (npeel)
2103 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2104 else
2105 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2106 = DR_MISALIGNMENT (dr0);
2107 SET_DR_MISALIGNMENT (dr0, 0);
2108 if (dump_enabled_p ())
2110 dump_printf_loc (MSG_NOTE, vect_location,
2111 "Alignment of access forced using peeling.\n");
2112 dump_printf_loc (MSG_NOTE, vect_location,
2113 "Peeling for alignment will be applied.\n");
2116 /* The inside-loop cost will be accounted for in vectorizable_load
2117 and vectorizable_store correctly with adjusted alignments.
2118 Drop the body_cst_vec on the floor here. */
2119 stat = vect_verify_datarefs_alignment (loop_vinfo);
2120 gcc_assert (stat);
2121 return stat;
2125 /* (2) Versioning to force alignment. */
2127 /* Try versioning if:
2128 1) optimize loop for speed
2129 2) there is at least one unsupported misaligned data ref with an unknown
2130 misalignment, and
2131 3) all misaligned data refs with a known misalignment are supported, and
2132 4) the number of runtime alignment checks is within reason. */
2134 do_versioning =
2135 optimize_loop_nest_for_speed_p (loop)
2136 && (!loop->inner); /* FORNOW */
2138 if (do_versioning)
2140 FOR_EACH_VEC_ELT (datarefs, i, dr)
2142 stmt = DR_STMT (dr);
2143 stmt_info = vinfo_for_stmt (stmt);
2145 /* For interleaving, only the alignment of the first access
2146 matters. */
2147 if (aligned_access_p (dr)
2148 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2149 && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
2150 continue;
2152 if (STMT_VINFO_STRIDED_P (stmt_info))
2154 /* Strided loads perform only component accesses, alignment is
2155 irrelevant for them. */
2156 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
2157 continue;
2158 do_versioning = false;
2159 break;
2162 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
2164 if (!supportable_dr_alignment)
2166 gimple *stmt;
2167 int mask;
2168 tree vectype;
2170 if (known_alignment_for_access_p (dr)
2171 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2172 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
2174 do_versioning = false;
2175 break;
2178 stmt = DR_STMT (dr);
2179 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
2180 gcc_assert (vectype);
2182 /* At present we don't support versioning for alignment
2183 with variable VF, since there's no guarantee that the
2184 VF is a power of two. We could relax this if we added
2185 a way of enforcing a power-of-two size. */
2186 unsigned HOST_WIDE_INT size;
2187 if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
2189 do_versioning = false;
2190 break;
2193 /* The rightmost bits of an aligned address must be zeros.
2194 Construct the mask needed for this test. For example,
2195 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2196 mask must be 15 = 0xf. */
2197 mask = size - 1;
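/* Editorial sketch: for a 16-byte vector the versioned loop is in effect
   guarded by a test of the form

	(((uintptr_t) addr1 | (uintptr_t) addr2 | ...) & 15) == 0

   where addr1, addr2, ... stand for the addresses recorded in
   LOOP_VINFO_MAY_MISALIGN_STMTS; the exact sequence is emitted elsewhere
   by the vectorizer, so treat this only as an illustration of MASK.  */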
2199 /* FORNOW: use the same mask to test all potentially unaligned
2200 references in the loop. The vectorizer currently supports
2201 a single vector size, see the reference to
2202 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
2203 vectorization factor is computed. */
2204 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
2205 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
2206 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2207 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
2208 DR_STMT (dr));
2212 /* Versioning requires at least one misaligned data reference. */
2213 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2214 do_versioning = false;
2215 else if (!do_versioning)
2216 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2219 if (do_versioning)
2221 vec<gimple *> may_misalign_stmts
2222 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2223 gimple *stmt;
2225 /* It can now be assumed that the data references in the statements
2226 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2227 of the loop being vectorized. */
2228 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
2230 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2231 dr = STMT_VINFO_DATA_REF (stmt_info);
2232 SET_DR_MISALIGNMENT (dr, 0);
2233 if (dump_enabled_p ())
2234 dump_printf_loc (MSG_NOTE, vect_location,
2235 "Alignment of access forced using versioning.\n");
2238 if (dump_enabled_p ())
2239 dump_printf_loc (MSG_NOTE, vect_location,
2240 "Versioning for alignment will be applied.\n");
2242 /* Peeling and versioning can't be done together at this time. */
2243 gcc_assert (! (do_peeling && do_versioning));
2245 stat = vect_verify_datarefs_alignment (loop_vinfo);
2246 gcc_assert (stat);
2247 return stat;
2250 /* This point is reached if neither peeling nor versioning is being done. */
2251 gcc_assert (! (do_peeling || do_versioning));
2253 stat = vect_verify_datarefs_alignment (loop_vinfo);
2254 return stat;
2258 /* Function vect_find_same_alignment_drs.
2260 Update group and alignment relations according to the chosen
2261 vectorization factor. */
2263 static void
2264 vect_find_same_alignment_drs (struct data_dependence_relation *ddr)
2266 struct data_reference *dra = DDR_A (ddr);
2267 struct data_reference *drb = DDR_B (ddr);
2268 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2269 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2271 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
2272 return;
2274 if (dra == drb)
2275 return;
2277 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)
2278 || !operand_equal_p (DR_OFFSET (dra), DR_OFFSET (drb), 0)
2279 || !operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2280 return;
2282 /* Two references with distance zero have the same alignment. */
2283 poly_offset_int diff = (wi::to_poly_offset (DR_INIT (dra))
2284 - wi::to_poly_offset (DR_INIT (drb)));
2285 if (maybe_ne (diff, 0))
2287 /* Get the wider of the two alignments. */
2288 unsigned int align_a = (vect_calculate_target_alignment (dra)
2289 / BITS_PER_UNIT);
2290 unsigned int align_b = (vect_calculate_target_alignment (drb)
2291 / BITS_PER_UNIT);
2292 unsigned int max_align = MAX (align_a, align_b);
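/* Editorial example with assumed numbers: two accesses whose DR_INITs
   differ by 32 bytes have the same alignment when both target alignments
   are 16 bytes (32 is a multiple of MAX (16, 16)); a 20-byte gap would
   fail the multiple_p test below and no relation would be recorded.  */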
2294 /* Require the gap to be a multiple of the larger vector alignment. */
2295 if (!multiple_p (diff, max_align))
2296 return;
2299 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2300 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
2301 if (dump_enabled_p ())
2303 dump_printf_loc (MSG_NOTE, vect_location,
2304 "accesses have the same alignment: ");
2305 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2306 dump_printf (MSG_NOTE, " and ");
2307 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2308 dump_printf (MSG_NOTE, "\n");
2313 /* Function vect_analyze_data_refs_alignment
2315 Analyze the alignment of the data-references in the loop.
2316 Return FALSE if a data reference is found that cannot be vectorized. */
2318 bool
2319 vect_analyze_data_refs_alignment (loop_vec_info vinfo)
2321 if (dump_enabled_p ())
2322 dump_printf_loc (MSG_NOTE, vect_location,
2323 "=== vect_analyze_data_refs_alignment ===\n");
2325 /* Mark groups of data references with same alignment using
2326 data dependence information. */
2327 vec<ddr_p> ddrs = vinfo->ddrs;
2328 struct data_dependence_relation *ddr;
2329 unsigned int i;
2331 FOR_EACH_VEC_ELT (ddrs, i, ddr)
2332 vect_find_same_alignment_drs (ddr);
2334 vec<data_reference_p> datarefs = vinfo->datarefs;
2335 struct data_reference *dr;
2337 vect_record_base_alignments (vinfo);
2338 FOR_EACH_VEC_ELT (datarefs, i, dr)
2340 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
2341 if (STMT_VINFO_VECTORIZABLE (stmt_info)
2342 && !vect_compute_data_ref_alignment (dr))
2344 /* Strided accesses perform only component accesses, misalignment
2345 information is irrelevant for them. */
2346 if (STMT_VINFO_STRIDED_P (stmt_info)
2347 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2348 continue;
2350 if (dump_enabled_p ())
2351 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2352 "not vectorized: can't calculate alignment "
2353 "for data ref.\n");
2355 return false;
2359 return true;
2363 /* Analyze alignment of DRs of stmts in NODE. */
2365 static bool
2366 vect_slp_analyze_and_verify_node_alignment (slp_tree node)
2368 /* We vectorize from the first scalar stmt in the node unless
2369 the node is permuted in which case we start from the first
2370 element in the group. */
2371 gimple *first_stmt = SLP_TREE_SCALAR_STMTS (node)[0];
2372 data_reference_p first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2373 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2374 first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_stmt));
2376 data_reference_p dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2377 if (! vect_compute_data_ref_alignment (dr)
2378 /* For creating the data-ref pointer we need alignment of the
2379 first element anyway. */
2380 || (dr != first_dr
2381 && ! vect_compute_data_ref_alignment (first_dr))
2382 || ! verify_data_ref_alignment (dr))
2384 if (dump_enabled_p ())
2385 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2386 "not vectorized: bad data alignment in basic "
2387 "block.\n");
2388 return false;
2391 return true;
2394 /* Function vect_slp_analyze_and_verify_instance_alignment
2396 Analyze the alignment of the data-references in the SLP instance.
2397 Return FALSE if a data reference is found that cannot be vectorized. */
2399 bool
2400 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
2402 if (dump_enabled_p ())
2403 dump_printf_loc (MSG_NOTE, vect_location,
2404 "=== vect_slp_analyze_and_verify_instance_alignment ===\n");
2406 slp_tree node;
2407 unsigned i;
2408 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2409 if (! vect_slp_analyze_and_verify_node_alignment (node))
2410 return false;
2412 node = SLP_INSTANCE_TREE (instance);
2413 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]))
2414 && ! vect_slp_analyze_and_verify_node_alignment
2415 (SLP_INSTANCE_TREE (instance)))
2416 return false;
2418 return true;
2422 /* Analyze groups of accesses: check that DR belongs to a group of
2423 accesses of legal size, step, etc. Detect gaps, single element
2424 interleaving, and other special cases. Set grouped access info.
2425 Collect groups of strided stores for further use in SLP analysis.
2426 Worker for vect_analyze_group_access. */
2428 static bool
2429 vect_analyze_group_access_1 (struct data_reference *dr)
2431 tree step = DR_STEP (dr);
2432 tree scalar_type = TREE_TYPE (DR_REF (dr));
2433 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2434 gimple *stmt = DR_STMT (dr);
2435 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2436 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2437 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2438 HOST_WIDE_INT dr_step = -1;
2439 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2440 bool slp_impossible = false;
2442 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2443 size of the interleaving group (including gaps). */
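/* Editorial example: a load of 4-byte elements with DR_STEP 32 gives
   groupsize = 32 / 4 = 8, i.e. each scalar iteration advances over an
   8-element slot of which the group members (plus any gaps) are
   accessed.  */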
2444 if (tree_fits_shwi_p (step))
2446 dr_step = tree_to_shwi (step);
2447 /* Check that STEP is a multiple of type size. Otherwise there is
2448 a non-element-sized gap at the end of the group which we
2449 cannot represent in GROUP_GAP or GROUP_SIZE.
2450 ??? As we can handle non-constant step fine here we should
2451 simply remove uses of GROUP_GAP between the last and first
2452 element and instead rely on DR_STEP. GROUP_SIZE then would
2453 simply not include that gap. */
2454 if ((dr_step % type_size) != 0)
2456 if (dump_enabled_p ())
2458 dump_printf_loc (MSG_NOTE, vect_location,
2459 "Step ");
2460 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2461 dump_printf (MSG_NOTE,
2462 " is not a multiple of the element size for ");
2463 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2464 dump_printf (MSG_NOTE, "\n");
2466 return false;
2468 groupsize = absu_hwi (dr_step) / type_size;
2470 else
2471 groupsize = 0;
2473 /* A non-consecutive access is possible only if it is part of an interleaving group. */
2474 if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2476 /* Check whether this DR is part of an interleaving group of which
2477 it is a single element that is accessed in the loop. */
2479 /* Gaps are supported only for loads. STEP must be a multiple of the type
2480 size. */
2481 if (DR_IS_READ (dr)
2482 && (dr_step % type_size) == 0
2483 && groupsize > 0)
2485 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2486 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2487 GROUP_GAP (stmt_info) = groupsize - 1;
2488 if (dump_enabled_p ())
2490 dump_printf_loc (MSG_NOTE, vect_location,
2491 "Detected single element interleaving ");
2492 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2493 dump_printf (MSG_NOTE, " step ");
2494 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2495 dump_printf (MSG_NOTE, "\n");
2498 return true;
2501 if (dump_enabled_p ())
2503 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2504 "not consecutive access ");
2505 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2508 if (bb_vinfo)
2510 /* Mark the statement as unvectorizable. */
2511 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2512 return true;
2515 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2516 STMT_VINFO_STRIDED_P (stmt_info) = true;
2517 return true;
2520 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2522 /* First stmt in the interleaving chain. Check the chain. */
2523 gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2524 struct data_reference *data_ref = dr;
2525 unsigned int count = 1;
2526 tree prev_init = DR_INIT (data_ref);
2527 gimple *prev = stmt;
2528 HOST_WIDE_INT diff, gaps = 0;
2530 /* By construction, all group members have INTEGER_CST DR_INITs. */
2531 while (next)
2533 /* Skip identical data-refs. If two or more stmts share a
2534 data-ref (supported only for loads), we vectorize only the first
2535 stmt, and the rest get their vectorized loads from the first
2536 one. */
2537 if (!tree_int_cst_compare (DR_INIT (data_ref),
2538 DR_INIT (STMT_VINFO_DATA_REF (
2539 vinfo_for_stmt (next)))))
2541 if (DR_IS_WRITE (data_ref))
2543 if (dump_enabled_p ())
2544 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2545 "Two store stmts share the same dr.\n");
2546 return false;
2549 if (dump_enabled_p ())
2550 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2551 "Two or more load stmts share the same dr.\n");
2553 /* For load use the same data-ref load. */
2554 GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2556 prev = next;
2557 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2558 continue;
2561 prev = next;
2562 data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2564 /* All group members have the same STEP by construction. */
2565 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2567 /* Check that the distance between two accesses is equal to the type
2568 size. Otherwise, we have gaps. */
2569 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2570 - TREE_INT_CST_LOW (prev_init)) / type_size;
2571 if (diff != 1)
2573 /* FORNOW: SLP of accesses with gaps is not supported. */
2574 slp_impossible = true;
2575 if (DR_IS_WRITE (data_ref))
2577 if (dump_enabled_p ())
2578 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2579 "interleaved store with gaps\n");
2580 return false;
2583 gaps += diff - 1;
2586 last_accessed_element += diff;
2588 /* Store the gap from the previous member of the group. If there is no
2589 gap in the access, GROUP_GAP is always 1. */
2590 GROUP_GAP (vinfo_for_stmt (next)) = diff;
2592 prev_init = DR_INIT (data_ref);
2593 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2594 /* Count the number of data-refs in the chain. */
2595 count++;
2598 if (groupsize == 0)
2599 groupsize = count + gaps;
2601 /* This could be UINT_MAX but as we are generating code in a very
2602 inefficient way we have to cap earlier. See PR78699 for example. */
2603 if (groupsize > 4096)
2605 if (dump_enabled_p ())
2606 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2607 "group is too large\n");
2608 return false;
2611 /* Check that the size of the interleaving is equal to count for stores,
2612 i.e., that there are no gaps. */
2613 if (groupsize != count
2614 && !DR_IS_READ (dr))
2616 if (dump_enabled_p ())
2617 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2618 "interleaved store with gaps\n");
2619 return false;
2622 /* If there is a gap after the last load in the group it is the
2623 difference between the groupsize and the last accessed
2624 element.
2625 When there is no gap, this difference should be 0. */
2626 GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - last_accessed_element;
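/* Editorial worked example with assumed numbers: two loads of 4-byte
   elements at offsets 0 and 8 with DR_STEP 16 give groupsize = 4; the
   second member gets GROUP_GAP = (8 - 0) / 4 = 2, last_accessed_element
   becomes 3, and the group leader records a trailing gap of 4 - 3 = 1
   element here.  */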
2628 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2629 if (dump_enabled_p ())
2631 dump_printf_loc (MSG_NOTE, vect_location,
2632 "Detected interleaving ");
2633 if (DR_IS_READ (dr))
2634 dump_printf (MSG_NOTE, "load ");
2635 else
2636 dump_printf (MSG_NOTE, "store ");
2637 dump_printf (MSG_NOTE, "of size %u starting with ",
2638 (unsigned)groupsize);
2639 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
2640 if (GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
2641 dump_printf_loc (MSG_NOTE, vect_location,
2642 "There is a gap of %u elements after the group\n",
2643 GROUP_GAP (vinfo_for_stmt (stmt)));
2646 /* SLP: create an SLP data structure for every interleaving group of
2647 stores for further analysis in vect_analyze_slp. */
2648 if (DR_IS_WRITE (dr) && !slp_impossible)
2650 if (loop_vinfo)
2651 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2652 if (bb_vinfo)
2653 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2657 return true;
2660 /* Analyze groups of accesses: check that DR belongs to a group of
2661 accesses of legal size, step, etc. Detect gaps, single element
2662 interleaving, and other special cases. Set grouped access info.
2663 Collect groups of strided stores for further use in SLP analysis. */
2665 static bool
2666 vect_analyze_group_access (struct data_reference *dr)
2668 if (!vect_analyze_group_access_1 (dr))
2670 /* Dissolve the group if present. */
2671 gimple *next;
2672 gimple *stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dr)));
2673 while (stmt)
2675 stmt_vec_info vinfo = vinfo_for_stmt (stmt);
2676 next = GROUP_NEXT_ELEMENT (vinfo);
2677 GROUP_FIRST_ELEMENT (vinfo) = NULL;
2678 GROUP_NEXT_ELEMENT (vinfo) = NULL;
2679 stmt = next;
2681 return false;
2683 return true;
2686 /* Analyze the access pattern of the data-reference DR.
2687 In case of non-consecutive accesses call vect_analyze_group_access() to
2688 analyze groups of accesses. */
2690 static bool
2691 vect_analyze_data_ref_access (struct data_reference *dr)
2693 tree step = DR_STEP (dr);
2694 tree scalar_type = TREE_TYPE (DR_REF (dr));
2695 gimple *stmt = DR_STMT (dr);
2696 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2697 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2698 struct loop *loop = NULL;
2700 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
2701 return true;
2703 if (loop_vinfo)
2704 loop = LOOP_VINFO_LOOP (loop_vinfo);
2706 if (loop_vinfo && !step)
2708 if (dump_enabled_p ())
2709 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2710 "bad data-ref access in loop\n");
2711 return false;
2714 /* Allow loads with zero step in inner-loop vectorization. */
2715 if (loop_vinfo && integer_zerop (step))
2717 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2718 if (!nested_in_vect_loop_p (loop, stmt))
2719 return DR_IS_READ (dr);
2720 /* Allow references with zero step for outer loops marked
2721 with pragma omp simd only - it guarantees absence of
2722 loop-carried dependencies between inner loop iterations. */
2723 if (loop->safelen < 2)
2725 if (dump_enabled_p ())
2726 dump_printf_loc (MSG_NOTE, vect_location,
2727 "zero step in inner loop of nest\n");
2728 return false;
2732 if (loop && nested_in_vect_loop_p (loop, stmt))
2734 /* Interleaved accesses are not yet supported within outer-loop
2735 vectorization for references in the inner-loop. */
2736 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2738 /* For the rest of the analysis we use the outer-loop step. */
2739 step = STMT_VINFO_DR_STEP (stmt_info);
2740 if (integer_zerop (step))
2742 if (dump_enabled_p ())
2743 dump_printf_loc (MSG_NOTE, vect_location,
2744 "zero step in outer loop.\n");
2745 return DR_IS_READ (dr);
2749 /* Consecutive? */
2750 if (TREE_CODE (step) == INTEGER_CST)
2752 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2753 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2754 || (dr_step < 0
2755 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2757 /* Mark that it is not interleaving. */
2758 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2759 return true;
2763 if (loop && nested_in_vect_loop_p (loop, stmt))
2765 if (dump_enabled_p ())
2766 dump_printf_loc (MSG_NOTE, vect_location,
2767 "grouped access in outer loop.\n");
2768 return false;
2772 /* Assume this is a DR handled by the non-constant strided load case. */
2773 if (TREE_CODE (step) != INTEGER_CST)
2774 return (STMT_VINFO_STRIDED_P (stmt_info)
2775 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2776 || vect_analyze_group_access (dr)));
2778 /* Non-consecutive access - check whether it is part of an interleaving group. */
2779 return vect_analyze_group_access (dr);
2782 /* Comparison function used to sort data-references DRA and DRB so that
2783 references which may form a group end up adjacent. */
2785 static int
2786 dr_group_sort_cmp (const void *dra_, const void *drb_)
2788 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2789 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2790 int cmp;
2792 /* Stabilize sort. */
2793 if (dra == drb)
2794 return 0;
2796 /* DRs in different loops never belong to the same group. */
2797 loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father;
2798 loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father;
2799 if (loopa != loopb)
2800 return loopa->num < loopb->num ? -1 : 1;
2802 /* Ordering of DRs according to base. */
2803 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2804 DR_BASE_ADDRESS (drb));
2805 if (cmp != 0)
2806 return cmp;
2808 /* And according to DR_OFFSET. */
2809 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2810 if (cmp != 0)
2811 return cmp;
2813 /* Put reads before writes. */
2814 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2815 return DR_IS_READ (dra) ? -1 : 1;
2817 /* Then sort by access size. */
2818 cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2819 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2820 if (cmp != 0)
2821 return cmp;
2823 /* And then by step. */
2824 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2825 if (cmp != 0)
2826 return cmp;
2828 /* Then sort by DR_INIT. In case of identical DRs sort by stmt UID. */
2829 cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
2830 if (cmp == 0)
2831 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2832 return cmp;
2835 /* If OP is the result of a conversion, return the unconverted value,
2836 otherwise return null. */
2838 static tree
2839 strip_conversion (tree op)
2841 if (TREE_CODE (op) != SSA_NAME)
2842 return NULL_TREE;
2843 gimple *stmt = SSA_NAME_DEF_STMT (op);
2844 if (!is_gimple_assign (stmt)
2845 || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt)))
2846 return NULL_TREE;
2847 return gimple_assign_rhs1 (stmt);
2850 /* Return true if vectorizable_* routines can handle statements STMT1
2851 and STMT2 being in a single group. */
2853 static bool
2854 can_group_stmts_p (gimple *stmt1, gimple *stmt2)
2856 if (gimple_assign_single_p (stmt1))
2857 return gimple_assign_single_p (stmt2);
2859 if (is_gimple_call (stmt1) && gimple_call_internal_p (stmt1))
2861 /* Check for two masked loads or two masked stores. */
2862 if (!is_gimple_call (stmt2) || !gimple_call_internal_p (stmt2))
2863 return false;
2864 internal_fn ifn = gimple_call_internal_fn (stmt1);
2865 if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE)
2866 return false;
2867 if (ifn != gimple_call_internal_fn (stmt2))
2868 return false;
2870 /* Check that the masks are the same. Cope with casts of masks,
2871 like those created by build_mask_conversion. */
2872 tree mask1 = gimple_call_arg (stmt1, 2);
2873 tree mask2 = gimple_call_arg (stmt2, 2);
2874 if (!operand_equal_p (mask1, mask2, 0))
2876 mask1 = strip_conversion (mask1);
2877 if (!mask1)
2878 return false;
2879 mask2 = strip_conversion (mask2);
2880 if (!mask2)
2881 return false;
2882 if (!operand_equal_p (mask1, mask2, 0))
2883 return false;
2885 return true;
2888 return false;
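/* Editorial note on can_group_stmts_p above: under these rules, two
   IFN_MASK_LOAD calls whose masks are _1 and a conversion of _1 (as
   created by build_mask_conversion) can be grouped, whereas a masked and
   an unmasked access, or two calls with unrelated masks, cannot.  */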
2891 /* Function vect_analyze_data_ref_accesses.
2893 Analyze the access pattern of all the data references in the loop.
2895 FORNOW: the only access pattern that is considered vectorizable is a
2896 simple step 1 (consecutive) access.
2898 FORNOW: handle only arrays and pointer accesses. */
2900 bool
2901 vect_analyze_data_ref_accesses (vec_info *vinfo)
2903 unsigned int i;
2904 vec<data_reference_p> datarefs = vinfo->datarefs;
2905 struct data_reference *dr;
2907 if (dump_enabled_p ())
2908 dump_printf_loc (MSG_NOTE, vect_location,
2909 "=== vect_analyze_data_ref_accesses ===\n");
2911 if (datarefs.is_empty ())
2912 return true;
2914 /* Sort the array of datarefs to make building the interleaving chains
2915 linear. Don't modify the original vector's order; it is needed for
2916 determining which dependencies are reversed. */
2917 vec<data_reference_p> datarefs_copy = datarefs.copy ();
2918 datarefs_copy.qsort (dr_group_sort_cmp);
2920 /* Build the interleaving chains. */
2921 for (i = 0; i < datarefs_copy.length () - 1;)
2923 data_reference_p dra = datarefs_copy[i];
2924 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2925 stmt_vec_info lastinfo = NULL;
2926 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
2927 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a))
2929 ++i;
2930 continue;
2932 for (i = i + 1; i < datarefs_copy.length (); ++i)
2934 data_reference_p drb = datarefs_copy[i];
2935 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2936 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b)
2937 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
2938 break;
2940 /* ??? Imperfect sorting (non-compatible types, non-modulo
2941 accesses, same accesses) can lead to a group being artificially
2942 split here, as we don't just skip over those. If it really
2943 matters we can push those to a worklist and re-iterate
2944 over them. Then we can just skip ahead to the next DR here. */
2946 /* DRs in different loops should not be put into the same
2947 interleaving group. */
2948 if (gimple_bb (DR_STMT (dra))->loop_father
2949 != gimple_bb (DR_STMT (drb))->loop_father)
2950 break;
2952 /* Check that the data-refs have same first location (except init)
2953 and they are both either store or load (not load and store,
2954 not masked loads or stores). */
2955 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2956 || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2957 DR_BASE_ADDRESS (drb)) != 0
2958 || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
2959 || !can_group_stmts_p (DR_STMT (dra), DR_STMT (drb)))
2960 break;
2962 /* Check that the data-refs have the same constant size. */
2963 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2964 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2965 if (!tree_fits_uhwi_p (sza)
2966 || !tree_fits_uhwi_p (szb)
2967 || !tree_int_cst_equal (sza, szb))
2968 break;
2970 /* Check that the data-refs have the same step. */
2971 if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
2972 break;
2974 /* Check the types are compatible.
2975 ??? We don't distinguish this during sorting. */
2976 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2977 TREE_TYPE (DR_REF (drb))))
2978 break;
2980 /* Check that the DR_INITs are compile-time constants. */
2981 if (TREE_CODE (DR_INIT (dra)) != INTEGER_CST
2982 || TREE_CODE (DR_INIT (drb)) != INTEGER_CST)
2983 break;
2985 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
2986 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2987 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2988 HOST_WIDE_INT init_prev
2989 = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]));
2990 gcc_assert (init_a <= init_b
2991 && init_a <= init_prev
2992 && init_prev <= init_b);
2994 /* Do not place the same access in the interleaving chain twice. */
2995 if (init_b == init_prev)
2997 gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]))
2998 < gimple_uid (DR_STMT (drb)));
2999 /* ??? For now we simply "drop" the later reference which is
3000 otherwise the same rather than finishing off this group.
3001 In the end we'd want to re-process duplicates forming
3002 multiple groups from the refs, likely by just collecting
3003 all candidates (including duplicates and split points
3004 below) in a vector and then processing them together. */
3005 continue;
3008 /* If init_b == init_a + the size of the type * k, we have an
3009 interleaving, and DRA is accessed before DRB. */
3010 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
3011 if (type_size_a == 0
3012 || (init_b - init_a) % type_size_a != 0)
3013 break;
3015 /* For stores, require the accesses to be adjacent. This splits
3016 groups into chunks we support (we don't support vectorization
3017 of stores with gaps). */
3018 if (!DR_IS_READ (dra) && init_b - init_prev != type_size_a)
3019 break;
3021 /* A constant nonzero step must be greater than the difference
3022 between the data-refs' inits for them to stay in one group;
3023 otherwise split here, keeping groups to suitable sizes. */
3024 if (tree_fits_shwi_p (DR_STEP (dra)))
3026 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
3027 if (step != 0 && step <= (init_b - init_a))
3028 break;
3031 if (dump_enabled_p ())
3033 dump_printf_loc (MSG_NOTE, vect_location,
3034 "Detected interleaving ");
3035 if (DR_IS_READ (dra))
3036 dump_printf (MSG_NOTE, "load ");
3037 else
3038 dump_printf (MSG_NOTE, "store ");
3039 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
3040 dump_printf (MSG_NOTE, " and ");
3041 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
3042 dump_printf (MSG_NOTE, "\n");
3045 /* Link the found element into the group list. */
3046 if (!GROUP_FIRST_ELEMENT (stmtinfo_a))
3048 GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
3049 lastinfo = stmtinfo_a;
3051 GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
3052 GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
3053 lastinfo = stmtinfo_b;
3057 FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
3058 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
3059 && !vect_analyze_data_ref_access (dr))
3061 if (dump_enabled_p ())
3062 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3063 "not vectorized: complicated access pattern.\n");
3065 if (is_a <bb_vec_info> (vinfo))
3067 /* Mark the statement as not vectorizable. */
3068 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
3069 continue;
3071 else
3073 datarefs_copy.release ();
3074 return false;
3078 datarefs_copy.release ();
3079 return true;
3082 /* Function vect_vfa_segment_size.
3084 Input:
3085 DR: The data reference.
3086 LENGTH_FACTOR: segment length to consider.
3088 Return a value suitable for the dr_with_seg_len::seg_len field.
3089 This is the "distance travelled" by the pointer from the first
3090 iteration in the segment to the last. Note that it does not include
3091 the size of the access; in effect it only describes the first byte. */
3093 static tree
3094 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
3096 length_factor = size_binop (MINUS_EXPR,
3097 fold_convert (sizetype, length_factor),
3098 size_one_node);
3099 return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr)),
3100 length_factor);
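/* Editorial example with assumed numbers: for DR_STEP = 4 and
   LENGTH_FACTOR = 8, this returns 4 * (8 - 1) = 28, the distance from
   the first to the last scalar access of the segment; the access size
   itself is added back by vect_vfa_access_size.  */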
3103 /* Return a value that, when added to abs (vect_vfa_segment_size (dr)),
3104 gives the worst-case number of bytes covered by the segment. */
3106 static unsigned HOST_WIDE_INT
3107 vect_vfa_access_size (data_reference *dr)
3109 stmt_vec_info stmt_vinfo = vinfo_for_stmt (DR_STMT (dr));
3110 tree ref_type = TREE_TYPE (DR_REF (dr));
3111 unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type));
3112 unsigned HOST_WIDE_INT access_size = ref_size;
3113 if (GROUP_FIRST_ELEMENT (stmt_vinfo))
3115 gcc_assert (GROUP_FIRST_ELEMENT (stmt_vinfo) == DR_STMT (dr));
3116 access_size *= GROUP_SIZE (stmt_vinfo) - GROUP_GAP (stmt_vinfo);
3118 if (STMT_VINFO_VEC_STMT (stmt_vinfo)
3119 && (vect_supportable_dr_alignment (dr, false)
3120 == dr_explicit_realign_optimized))
3122 /* We might access a full vector's worth. */
3123 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
3124 access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size;
3126 return access_size;
3129 /* Get the minimum alignment for all the scalar accesses that DR describes. */
3131 static unsigned int
3132 vect_vfa_align (const data_reference *dr)
3134 return TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr)));
3137 /* Function vect_no_alias_p.
3139 Given data references A and B with equal base and offset, see whether
3140 the alias relation can be decided at compilation time. Return 1 if
3141 it can and the references alias, 0 if it can and the references do
3142 not alias, and -1 if we cannot decide at compile time. SEGMENT_LENGTH_A,
3143 SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
3144 of dr_with_seg_len::{seg_len,access_size} for A and B. */
3146 static int
3147 vect_compile_time_alias (struct data_reference *a, struct data_reference *b,
3148 tree segment_length_a, tree segment_length_b,
3149 unsigned HOST_WIDE_INT access_size_a,
3150 unsigned HOST_WIDE_INT access_size_b)
3152 poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a));
3153 poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b));
3154 poly_uint64 const_length_a;
3155 poly_uint64 const_length_b;
3157 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3158 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3159 [a, a+12) */
3160 if (tree_int_cst_compare (DR_STEP (a), size_zero_node) < 0)
3162 const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
3163 offset_a = (offset_a + access_size_a) - const_length_a;
3165 else
3166 const_length_a = tree_to_poly_uint64 (segment_length_a);
3167 if (tree_int_cst_compare (DR_STEP (b), size_zero_node) < 0)
3169 const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
3170 offset_b = (offset_b + access_size_b) - const_length_b;
3172 else
3173 const_length_b = tree_to_poly_uint64 (segment_length_b);
3175 const_length_a += access_size_a;
3176 const_length_b += access_size_b;
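/* Editorial example with assumed numbers: for positive steps, two
   references with DR_INITs 0 and 16, segment lengths 28 and access
   sizes 4 give the ranges [0, 32) and [16, 48); these are known to
   overlap, so 1 (a compile-time alias) is returned below.  */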
3178 if (ranges_known_overlap_p (offset_a, const_length_a,
3179 offset_b, const_length_b))
3180 return 1;
3182 if (!ranges_maybe_overlap_p (offset_a, const_length_a,
3183 offset_b, const_length_b))
3184 return 0;
3186 return -1;
3189 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3190 in DDR is >= VF. */
3192 static bool
3193 dependence_distance_ge_vf (data_dependence_relation *ddr,
3194 unsigned int loop_depth, poly_uint64 vf)
3196 if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3197 || DDR_NUM_DIST_VECTS (ddr) == 0)
3198 return false;
3200 /* If the dependence is exact, we should have limited the VF instead. */
3201 gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3203 unsigned int i;
3204 lambda_vector dist_v;
3205 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3207 HOST_WIDE_INT dist = dist_v[loop_depth];
3208 if (dist != 0
3209 && !(dist > 0 && DDR_REVERSED_P (ddr))
3210 && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
3211 return false;
3214 if (dump_enabled_p ())
3216 dump_printf_loc (MSG_NOTE, vect_location,
3217 "dependence distance between ");
3218 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
3219 dump_printf (MSG_NOTE, " and ");
3220 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
3221 dump_printf (MSG_NOTE, " is >= VF\n");
3224 return true;
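/* Editorial example for dependence_distance_ge_vf above: with a VF of 8,
   a non-reversed distance vector entry of 4 makes the function return
   false and a runtime alias check is still needed, whereas entries of 0
   or of absolute value >= 8 in every distance vector allow the DDR to be
   ignored.  */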
3227 /* Dump LOWER_BOUND using flags DUMP_KIND. Dumps are known to be enabled. */
3229 static void
3230 dump_lower_bound (int dump_kind, const vec_lower_bound &lower_bound)
3232 dump_printf (dump_kind, "%s (", lower_bound.unsigned_p ? "unsigned" : "abs");
3233 dump_generic_expr (dump_kind, TDF_SLIM, lower_bound.expr);
3234 dump_printf (dump_kind, ") >= ");
3235 dump_dec (dump_kind, lower_bound.min_value);
3238 /* Record that the vectorized loop requires the vec_lower_bound described
3239 by EXPR, UNSIGNED_P and MIN_VALUE. */
3241 static void
3242 vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p,
3243 poly_uint64 min_value)
3245 vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3246 for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3247 if (operand_equal_p (lower_bounds[i].expr, expr, 0))
3249 unsigned_p &= lower_bounds[i].unsigned_p;
3250 min_value = upper_bound (lower_bounds[i].min_value, min_value);
3251 if (lower_bounds[i].unsigned_p != unsigned_p
3252 || maybe_lt (lower_bounds[i].min_value, min_value))
3254 lower_bounds[i].unsigned_p = unsigned_p;
3255 lower_bounds[i].min_value = min_value;
3256 if (dump_enabled_p ())
3258 dump_printf_loc (MSG_NOTE, vect_location,
3259 "updating run-time check to ");
3260 dump_lower_bound (MSG_NOTE, lower_bounds[i]);
3261 dump_printf (MSG_NOTE, "\n");
3264 return;
3267 vec_lower_bound lower_bound (expr, unsigned_p, min_value);
3268 if (dump_enabled_p ())
3270 dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that ");
3271 dump_lower_bound (MSG_NOTE, lower_bound);
3272 dump_printf (MSG_NOTE, "\n");
3274 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (lower_bound);
3277 /* Return true if it's unlikely that the step of the vectorized form of DR
3278 will span fewer than GAP bytes. */
3280 static bool
3281 vect_small_gap_p (loop_vec_info loop_vinfo, data_reference *dr, poly_int64 gap)
3283 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
3284 HOST_WIDE_INT count
3285 = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
3286 if (GROUP_FIRST_ELEMENT (stmt_info))
3287 count *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
3288 return estimated_poly_value (gap) <= count * vect_get_scalar_dr_size (dr);
3291 /* Return true if we know that there is no alias between DR_A and DR_B
3292 when abs (DR_STEP (DR_A)) >= N for some N. When returning true, set
3293 *LOWER_BOUND_OUT to this N. */
3295 static bool
3296 vectorizable_with_step_bound_p (data_reference *dr_a, data_reference *dr_b,
3297 poly_uint64 *lower_bound_out)
3299 /* Check that there is a constant gap of known sign between DR_A
3300 and DR_B. */
3301 poly_int64 init_a, init_b;
3302 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0)
3303 || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0)
3304 || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)
3305 || !poly_int_tree_p (DR_INIT (dr_a), &init_a)
3306 || !poly_int_tree_p (DR_INIT (dr_b), &init_b)
3307 || !ordered_p (init_a, init_b))
3308 return false;
3310 /* Sort DR_A and DR_B by the address they access. */
3311 if (maybe_lt (init_b, init_a))
3313 std::swap (init_a, init_b);
3314 std::swap (dr_a, dr_b);
3317 /* If the two accesses could be dependent within a scalar iteration,
3318 make sure that we'd retain their order. */
3319 if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_a), init_b)
3320 && !vect_preserves_scalar_order_p (DR_STMT (dr_a), DR_STMT (dr_b)))
3321 return false;
3323 /* There is no alias if abs (DR_STEP) is greater than or equal to
3324 the bytes spanned by the combination of the two accesses. */
3325 *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_b) - init_a;
3326 return true;
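/* Editorial example with assumed numbers: for accesses at DR_INITs 0 and
   16 with 4-byte elements, *LOWER_BOUND_OUT is 16 + 4 - 0 = 20, i.e. the
   pair cannot alias in the vectorized loop as long as
   abs (DR_STEP) >= 20 bytes.  */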
3329 /* Function vect_prune_runtime_alias_test_list.
3331 Prune a list of ddrs to be tested at run-time by versioning for alias.
3332 Merge several alias checks into one if possible.
3333 Return FALSE if the resulting list of ddrs is longer than allowed by
3334 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
3336 bool
3337 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3339 typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3340 hash_set <tree_pair_hash> compared_objects;
3342 vec<ddr_p> may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3343 vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3344 = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3345 vec<vec_object_pair> &check_unequal_addrs
3346 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3347 poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3348 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3350 ddr_p ddr;
3351 unsigned int i;
3352 tree length_factor;
3354 if (dump_enabled_p ())
3355 dump_printf_loc (MSG_NOTE, vect_location,
3356 "=== vect_prune_runtime_alias_test_list ===\n");
3358 /* Step values are irrelevant for aliasing if the number of vector
3359 iterations is equal to the number of scalar iterations (which can
3360 happen for fully-SLP loops). */
3361 bool ignore_step_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U);
3363 if (!ignore_step_p)
3365 /* Convert the checks for nonzero steps into bound tests. */
3366 tree value;
3367 FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value)
3368 vect_check_lower_bound (loop_vinfo, value, true, 1);
3371 if (may_alias_ddrs.is_empty ())
3372 return true;
3374 comp_alias_ddrs.create (may_alias_ddrs.length ());
3376 unsigned int loop_depth
3377 = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3378 LOOP_VINFO_LOOP_NEST (loop_vinfo));
3380 /* First, we collect all data ref pairs for aliasing checks. */
3381 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3383 int comp_res;
3384 poly_uint64 lower_bound;
3385 struct data_reference *dr_a, *dr_b;
3386 gimple *dr_group_first_a, *dr_group_first_b;
3387 tree segment_length_a, segment_length_b;
3388 unsigned HOST_WIDE_INT access_size_a, access_size_b;
3389 unsigned int align_a, align_b;
3390 gimple *stmt_a, *stmt_b;
3392 /* Ignore the alias if the VF we chose ended up being no greater
3393 than the dependence distance. */
3394 if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3395 continue;
3397 if (DDR_OBJECT_A (ddr))
3399 vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3400 if (!compared_objects.add (new_pair))
3402 if (dump_enabled_p ())
3404 dump_printf_loc (MSG_NOTE, vect_location, "checking that ");
3405 dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.first);
3406 dump_printf (MSG_NOTE, " and ");
3407 dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.second);
3408 dump_printf (MSG_NOTE, " have different addresses\n");
3410 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3412 continue;
3415 dr_a = DDR_A (ddr);
3416 stmt_a = DR_STMT (DDR_A (ddr));
3418 dr_b = DDR_B (ddr);
3419 stmt_b = DR_STMT (DDR_B (ddr));
3421 /* Skip the pair if inter-iteration dependencies are irrelevant
3422 and intra-iteration dependencies are guaranteed to be honored. */
3423 if (ignore_step_p
3424 && (vect_preserves_scalar_order_p (stmt_a, stmt_b)
3425 || vectorizable_with_step_bound_p (dr_a, dr_b, &lower_bound)))
3427 if (dump_enabled_p ())
3429 dump_printf_loc (MSG_NOTE, vect_location,
3430 "no need for alias check between ");
3431 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
3432 dump_printf (MSG_NOTE, " and ");
3433 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
3434 dump_printf (MSG_NOTE, " when VF is 1\n");
3436 continue;
3439 /* See whether we can handle the alias using a bounds check on
3440 the step, and whether that's likely to be the best approach.
3441 (It might not be, for example, if the minimum step is much larger
3442 than the number of bytes handled by one vector iteration.) */
3443 if (!ignore_step_p
3444 && TREE_CODE (DR_STEP (dr_a)) != INTEGER_CST
3445 && vectorizable_with_step_bound_p (dr_a, dr_b, &lower_bound)
3446 && (vect_small_gap_p (loop_vinfo, dr_a, lower_bound)
3447 || vect_small_gap_p (loop_vinfo, dr_b, lower_bound)))
3449 bool unsigned_p = dr_known_forward_stride_p (dr_a);
3450 if (dump_enabled_p ())
3452 dump_printf_loc (MSG_NOTE, vect_location, "no alias between ");
3453 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
3454 dump_printf (MSG_NOTE, " and ");
3455 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
3456 dump_printf (MSG_NOTE, " when the step ");
3457 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_STEP (dr_a));
3458 dump_printf (MSG_NOTE, " is outside ");
3459 if (unsigned_p)
3460 dump_printf (MSG_NOTE, "[0");
3461 else
3463 dump_printf (MSG_NOTE, "(");
3464 dump_dec (MSG_NOTE, poly_int64 (-lower_bound));
3466 dump_printf (MSG_NOTE, ", ");
3467 dump_dec (MSG_NOTE, lower_bound);
3468 dump_printf (MSG_NOTE, ")\n");
3470 vect_check_lower_bound (loop_vinfo, DR_STEP (dr_a), unsigned_p,
3471 lower_bound);
3472 continue;
3475 dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
3476 if (dr_group_first_a)
3478 stmt_a = dr_group_first_a;
3479 dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
3482 dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
3483 if (dr_group_first_b)
3485 stmt_b = dr_group_first_b;
3486 dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
3489 if (ignore_step_p)
3491 segment_length_a = size_zero_node;
3492 segment_length_b = size_zero_node;
3494 else
3496 if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
3497 length_factor = scalar_loop_iters;
3498 else
3499 length_factor = size_int (vect_factor);
3500 segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
3501 segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
3503 access_size_a = vect_vfa_access_size (dr_a);
3504 access_size_b = vect_vfa_access_size (dr_b);
3505 align_a = vect_vfa_align (dr_a);
3506 align_b = vect_vfa_align (dr_b);
3508 comp_res = data_ref_compare_tree (DR_BASE_ADDRESS (dr_a),
3509 DR_BASE_ADDRESS (dr_b));
3510 if (comp_res == 0)
3511 comp_res = data_ref_compare_tree (DR_OFFSET (dr_a),
3512 DR_OFFSET (dr_b));
3514 /* See whether the alias is known at compilation time. */
3515 if (comp_res == 0
3516 && TREE_CODE (DR_STEP (dr_a)) == INTEGER_CST
3517 && TREE_CODE (DR_STEP (dr_b)) == INTEGER_CST
3518 && poly_int_tree_p (segment_length_a)
3519 && poly_int_tree_p (segment_length_b))
3521 int res = vect_compile_time_alias (dr_a, dr_b,
3522 segment_length_a,
3523 segment_length_b,
3524 access_size_a,
3525 access_size_b);
3526 if (res >= 0 && dump_enabled_p ())
3528 dump_printf_loc (MSG_NOTE, vect_location,
3529 "can tell at compile time that ");
3530 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
3531 dump_printf (MSG_NOTE, " and ");
3532 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
3533 if (res == 0)
3534 dump_printf (MSG_NOTE, " do not alias\n");
3535 else
3536 dump_printf (MSG_NOTE, " alias\n");
3539 if (res == 0)
3540 continue;
3542 if (res == 1)
3544 if (dump_enabled_p ())
3545 dump_printf_loc (MSG_NOTE, vect_location,
3546 "not vectorized: compilation time alias.\n");
3547 return false;
3551 dr_with_seg_len_pair_t dr_with_seg_len_pair
3552 (dr_with_seg_len (dr_a, segment_length_a, access_size_a, align_a),
3553 dr_with_seg_len (dr_b, segment_length_b, access_size_b, align_b));
3555 /* Canonicalize pairs by sorting the two DR members. */
3556 if (comp_res > 0)
3557 std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
3559 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3562 prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);
3564 unsigned int count = (comp_alias_ddrs.length ()
3565 + check_unequal_addrs.length ());
3567 dump_printf_loc (MSG_NOTE, vect_location,
3568 "improved number of alias checks from %d to %d\n",
3569 may_alias_ddrs.length (), count);
3570 if ((int) count > PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
3572 if (dump_enabled_p ())
3573 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3574 "number of versioning for alias "
3575 "run-time tests exceeds %d "
3576 "(--param vect-max-version-for-alias-checks)\n",
3577 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
3578 return false;
3581 return true;
3584 /* Check whether we can use an internal function for a gather load
3585 or scatter store. READ_P is true for loads and false for stores.
3586 MASKED_P is true if the load or store is conditional. MEMORY_TYPE is
3587 the type of the memory elements being loaded or stored. OFFSET_BITS
3588 is the number of bits in each scalar offset and OFFSET_SIGN is the
3589 sign of the offset. SCALE is the amount by which the offset should
3590 be multiplied *after* it has been converted to address width.
3592 Return true if the function is supported, storing the function
3593 id in *IFN_OUT and the type of a vector element in *ELEMENT_TYPE_OUT. */
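/* An illustrative sketch added for exposition (not tied to any particular
   target): given a hypothetical scalar loop

     for (int i = 0; i < n; i++)
       res += a[idx[i]];

   where 'a' is a double * and 'idx' an int *, a gather would typically be
   queried with OFFSET_BITS == 32 (the precision of 'int'), OFFSET_SIGN ==
   SIGNED and SCALE == sizeof (double) == 8.  The 32-bit offsets are
   narrower than the 64-bit vector elements, so the width check below can
   extend them; offsets wider than the elements would be rejected, since
   truncating them would not be safe.  */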
3595 bool
3596 vect_gather_scatter_fn_p (bool read_p, bool masked_p, tree vectype,
3597 tree memory_type, unsigned int offset_bits,
3598 signop offset_sign, int scale,
3599 internal_fn *ifn_out, tree *element_type_out)
3601 unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
3602 unsigned int element_bits = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (vectype)));
3603 if (offset_bits > element_bits)
3604 /* Internal functions require the offset to be the same width as
3605 the vector elements. We can extend narrower offsets, but it isn't
3606 safe to truncate wider offsets. */
3607 return false;
3609 if (element_bits != memory_bits)
3610 /* For now the vector elements must be the same width as the
3611 memory elements. */
3612 return false;
3614 /* Work out which function we need. */
3615 internal_fn ifn;
3616 if (read_p)
3617 ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
3618 else
3619 ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
3621 /* Test whether the target supports this combination. */
3622 if (!internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
3623 offset_sign, scale))
3624 return false;
3626 *ifn_out = ifn;
3627 *element_type_out = TREE_TYPE (vectype);
3628 return true;
3631 /* CALL is a call to an internal gather load or scatter store function.
3632 Describe the operation in INFO. */
3634 static void
3635 vect_describe_gather_scatter_call (gcall *call, gather_scatter_info *info)
3637 stmt_vec_info stmt_info = vinfo_for_stmt (call);
3638 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3639 data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3641 info->ifn = gimple_call_internal_fn (call);
3642 info->decl = NULL_TREE;
3643 info->base = gimple_call_arg (call, 0);
3644 info->offset = gimple_call_arg (call, 1);
3645 info->offset_dt = vect_unknown_def_type;
3646 info->offset_vectype = NULL_TREE;
3647 info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
3648 info->element_type = TREE_TYPE (vectype);
3649 info->memory_type = TREE_TYPE (DR_REF (dr));
3652 /* Return true if a non-affine read or write in STMT is suitable for a
3653 gather load or scatter store. Describe the operation in *INFO if so. */
3655 bool
3656 vect_check_gather_scatter (gimple *stmt, loop_vec_info loop_vinfo,
3657 gather_scatter_info *info)
3659 HOST_WIDE_INT scale = 1;
3660 poly_int64 pbitpos, pbitsize;
3661 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3662 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3663 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3664 tree offtype = NULL_TREE;
3665 tree decl = NULL_TREE, base, off;
3666 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3667 tree memory_type = TREE_TYPE (DR_REF (dr));
3668 machine_mode pmode;
3669 int punsignedp, reversep, pvolatilep = 0;
3670 internal_fn ifn;
3671 tree element_type;
3672 bool masked_p = false;
3674 /* See whether this is already a call to a gather/scatter internal function.
3675 If not, see whether it's a masked load or store. */
3676 gcall *call = dyn_cast <gcall *> (stmt);
3677 if (call && gimple_call_internal_p (call))
3679 ifn = gimple_call_internal_fn (stmt);
3680 if (internal_gather_scatter_fn_p (ifn))
3682 vect_describe_gather_scatter_call (call, info);
3683 return true;
3685 masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE);
3688 /* True if we should aim to use internal functions rather than
3689 built-in functions. */
3690 bool use_ifn_p = (DR_IS_READ (dr)
3691 ? supports_vec_gather_load_p ()
3692 : supports_vec_scatter_store_p ());
3694 base = DR_REF (dr);
3695 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF;
3696 see if we can use the def stmt of the address. */
3697 if (masked_p
3698 && TREE_CODE (base) == MEM_REF
3699 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3700 && integer_zerop (TREE_OPERAND (base, 1))
3701 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3703 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3704 if (is_gimple_assign (def_stmt)
3705 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3706 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3709 /* The gather and scatter builtins need an address of the form
3710 loop_invariant + vector * {1, 2, 4, 8}
3711 or
3712 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3713 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3714 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3715 multiplications and additions in it. To get a vector, we need
3716 a single SSA_NAME that will be defined in the loop and will
3717 contain everything that is not loop invariant and that can be
3718 vectorized. The following code attempts to find such a preexisting
3719 SSA_NAME OFF and put the loop invariants into a tree BASE
3720 that can be gimplified before the loop. */
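/* A purely illustrative example of the intended outcome (hypothetical
   source, not taken from any testcase): for an access such as
   a[idx[i] + k], where 'a' and 'k' are loop invariant and idx[i] varies
   with the loop, the decomposition below aims to produce
   OFF  = the SSA_NAME holding idx[i] (defined in the loop),
   BASE = &a plus k * sizeof (a[0]) (loop invariant, gimplifiable before
          the loop), and
   SCALE = sizeof (a[0]).  */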
3721 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3722 &punsignedp, &reversep, &pvolatilep);
3723 gcc_assert (base && !reversep);
3724 poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
3726 if (TREE_CODE (base) == MEM_REF)
3728 if (!integer_zerop (TREE_OPERAND (base, 1)))
3730 if (off == NULL_TREE)
3731 off = wide_int_to_tree (sizetype, mem_ref_offset (base));
3732 else
3733 off = size_binop (PLUS_EXPR, off,
3734 fold_convert (sizetype, TREE_OPERAND (base, 1)));
3736 base = TREE_OPERAND (base, 0);
3738 else
3739 base = build_fold_addr_expr (base);
3741 if (off == NULL_TREE)
3742 off = size_zero_node;
3744 /* If base is not loop invariant, then either OFF is 0, in which case we
3745 start with just the constant offset in the loop invariant BASE and
3746 continue with BASE as OFF, or else we give up.
3747 We could handle that case by gimplifying the addition of base + off
3748 into some SSA_NAME and use that as off, but for now punt. */
3749 if (!expr_invariant_in_loop_p (loop, base))
3751 if (!integer_zerop (off))
3752 return false;
3753 off = base;
3754 base = size_int (pbytepos);
3756 /* Otherwise put base + constant offset into the loop invariant BASE
3757 and continue with OFF. */
3758 else
3760 base = fold_convert (sizetype, base);
3761 base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
3764 /* OFF at this point may be either a SSA_NAME or some tree expression
3765 from get_inner_reference. Try to peel off loop invariants from it
3766 into BASE as long as possible. */
3767 STRIP_NOPS (off);
3768 while (offtype == NULL_TREE)
3770 enum tree_code code;
3771 tree op0, op1, add = NULL_TREE;
3773 if (TREE_CODE (off) == SSA_NAME)
3775 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3777 if (expr_invariant_in_loop_p (loop, off))
3778 return false;
3780 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3781 break;
3783 op0 = gimple_assign_rhs1 (def_stmt);
3784 code = gimple_assign_rhs_code (def_stmt);
3785 op1 = gimple_assign_rhs2 (def_stmt);
3787 else
3789 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3790 return false;
3791 code = TREE_CODE (off);
3792 extract_ops_from_tree (off, &code, &op0, &op1);
3794 switch (code)
3796 case POINTER_PLUS_EXPR:
3797 case PLUS_EXPR:
3798 if (expr_invariant_in_loop_p (loop, op0))
3800 add = op0;
3801 off = op1;
3802 do_add:
3803 add = fold_convert (sizetype, add);
3804 if (scale != 1)
3805 add = size_binop (MULT_EXPR, add, size_int (scale));
3806 base = size_binop (PLUS_EXPR, base, add);
3807 continue;
3809 if (expr_invariant_in_loop_p (loop, op1))
3811 add = op1;
3812 off = op0;
3813 goto do_add;
3815 break;
3816 case MINUS_EXPR:
3817 if (expr_invariant_in_loop_p (loop, op1))
3819 add = fold_convert (sizetype, op1);
3820 add = size_binop (MINUS_EXPR, size_zero_node, add);
3821 off = op0;
3822 goto do_add;
3824 break;
3825 case MULT_EXPR:
3826 if (scale == 1 && tree_fits_shwi_p (op1))
3828 int new_scale = tree_to_shwi (op1);
3829 /* Only treat this as a scaling operation if the target
3830 supports it. */
3831 if (use_ifn_p
3832 && !vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p,
3833 vectype, memory_type, 1,
3834 TYPE_SIGN (TREE_TYPE (op0)),
3835 new_scale, &ifn,
3836 &element_type))
3837 break;
3838 scale = new_scale;
3839 off = op0;
3840 continue;
3842 break;
3843 case SSA_NAME:
3844 off = op0;
3845 continue;
3846 CASE_CONVERT:
3847 if (!POINTER_TYPE_P (TREE_TYPE (op0))
3848 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3849 break;
3850 if (TYPE_PRECISION (TREE_TYPE (op0))
3851 == TYPE_PRECISION (TREE_TYPE (off)))
3853 off = op0;
3854 continue;
3857 /* The internal functions need the offset to be the same width
3858 as the elements of VECTYPE. Don't include operations that
3859 cast the offset from that width to a different width. */
3860 if (use_ifn_p
3861 && (int_size_in_bytes (TREE_TYPE (vectype))
3862 == int_size_in_bytes (TREE_TYPE (off))))
3863 break;
3865 if (TYPE_PRECISION (TREE_TYPE (op0))
3866 < TYPE_PRECISION (TREE_TYPE (off)))
3868 off = op0;
3869 offtype = TREE_TYPE (off);
3870 STRIP_NOPS (off);
3871 continue;
3873 break;
3874 default:
3875 break;
3877 break;
3880 /* If at the end OFF still isn't a SSA_NAME or isn't
3881 defined in the loop, punt. */
3882 if (TREE_CODE (off) != SSA_NAME
3883 || expr_invariant_in_loop_p (loop, off))
3884 return false;
3886 if (offtype == NULL_TREE)
3887 offtype = TREE_TYPE (off);
3889 if (use_ifn_p)
3891 if (!vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p, vectype,
3892 memory_type, TYPE_PRECISION (offtype),
3893 TYPE_SIGN (offtype), scale, &ifn,
3894 &element_type))
3895 return false;
3897 else
3899 if (DR_IS_READ (dr))
3901 if (targetm.vectorize.builtin_gather)
3902 decl = targetm.vectorize.builtin_gather (vectype, offtype, scale);
3904 else
3906 if (targetm.vectorize.builtin_scatter)
3907 decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale);
3910 if (!decl)
3911 return false;
3913 ifn = IFN_LAST;
3914 element_type = TREE_TYPE (vectype);
3917 info->ifn = ifn;
3918 info->decl = decl;
3919 info->base = base;
3920 info->offset = off;
3921 info->offset_dt = vect_unknown_def_type;
3922 info->offset_vectype = NULL_TREE;
3923 info->scale = scale;
3924 info->element_type = element_type;
3925 info->memory_type = memory_type;
3926 return true;
3929 /* Function vect_analyze_data_refs.
3931 Find all the data references in the loop or basic block.
3933 The general structure of the analysis of data refs in the vectorizer is as
3934 follows:
3935 1- vect_analyze_data_refs(loop/bb): call
3936 compute_data_dependences_for_loop/bb to find and analyze all data-refs
3937 in the loop/bb and their dependences.
3938 2- vect_analyze_dependences(): apply dependence testing using ddrs.
3939 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
3940 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
3944 bool
3945 vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf)
3947 struct loop *loop = NULL;
3948 unsigned int i;
3949 struct data_reference *dr;
3950 tree scalar_type;
3952 if (dump_enabled_p ())
3953 dump_printf_loc (MSG_NOTE, vect_location,
3954 "=== vect_analyze_data_refs ===\n");
3956 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3957 loop = LOOP_VINFO_LOOP (loop_vinfo);
3959 /* Go through the data-refs, check that the analysis succeeded. Update
3960 pointer from stmt_vec_info struct to DR and vectype. */
3962 vec<data_reference_p> datarefs = vinfo->datarefs;
3963 FOR_EACH_VEC_ELT (datarefs, i, dr)
3965 gimple *stmt;
3966 stmt_vec_info stmt_info;
3967 tree base, offset, init;
3968 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
3969 bool simd_lane_access = false;
3970 poly_uint64 vf;
3972 again:
3973 if (!dr || !DR_REF (dr))
3975 if (dump_enabled_p ())
3976 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3977 "not vectorized: unhandled data-ref\n");
3978 return false;
3981 stmt = DR_STMT (dr);
3982 stmt_info = vinfo_for_stmt (stmt);
3984 /* Discard clobbers from the dataref vector. We will remove
3985 clobber stmts during vectorization. */
3986 if (gimple_clobber_p (stmt))
3988 free_data_ref (dr);
3989 if (i == datarefs.length () - 1)
3991 datarefs.pop ();
3992 break;
3994 datarefs.ordered_remove (i);
3995 dr = datarefs[i];
3996 goto again;
3999 /* Check that analysis of the data-ref succeeded. */
4000 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
4001 || !DR_STEP (dr))
4003 bool maybe_gather
4004 = DR_IS_READ (dr)
4005 && !TREE_THIS_VOLATILE (DR_REF (dr))
4006 && (targetm.vectorize.builtin_gather != NULL
4007 || supports_vec_gather_load_p ());
4008 bool maybe_scatter
4009 = DR_IS_WRITE (dr)
4010 && !TREE_THIS_VOLATILE (DR_REF (dr))
4011 && (targetm.vectorize.builtin_scatter != NULL
4012 || supports_vec_scatter_store_p ());
4013 bool maybe_simd_lane_access
4014 = is_a <loop_vec_info> (vinfo) && loop->simduid;
4016 /* If the target supports vector gather loads or scatter stores, or if
4017 this might be a SIMD lane access, see whether we can use them. */
4018 if (is_a <loop_vec_info> (vinfo)
4019 && (maybe_gather || maybe_scatter || maybe_simd_lane_access)
4020 && !nested_in_vect_loop_p (loop, stmt))
4022 struct data_reference *newdr
4023 = create_data_ref (NULL, loop_containing_stmt (stmt),
4024 DR_REF (dr), stmt, !maybe_scatter,
4025 DR_IS_CONDITIONAL_IN_STMT (dr));
4026 gcc_assert (newdr != NULL && DR_REF (newdr));
4027 if (DR_BASE_ADDRESS (newdr)
4028 && DR_OFFSET (newdr)
4029 && DR_INIT (newdr)
4030 && DR_STEP (newdr)
4031 && integer_zerop (DR_STEP (newdr)))
4033 if (maybe_simd_lane_access)
4035 tree off = DR_OFFSET (newdr);
4036 STRIP_NOPS (off);
4037 if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
4038 && TREE_CODE (off) == MULT_EXPR
4039 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
4041 tree step = TREE_OPERAND (off, 1);
4042 off = TREE_OPERAND (off, 0);
4043 STRIP_NOPS (off);
4044 if (CONVERT_EXPR_P (off)
4045 && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
4046 0)))
4047 < TYPE_PRECISION (TREE_TYPE (off)))
4048 off = TREE_OPERAND (off, 0);
4049 if (TREE_CODE (off) == SSA_NAME)
4051 gimple *def = SSA_NAME_DEF_STMT (off);
4052 tree reft = TREE_TYPE (DR_REF (newdr));
4053 if (is_gimple_call (def)
4054 && gimple_call_internal_p (def)
4055 && (gimple_call_internal_fn (def)
4056 == IFN_GOMP_SIMD_LANE))
4058 tree arg = gimple_call_arg (def, 0);
4059 gcc_assert (TREE_CODE (arg) == SSA_NAME);
4060 arg = SSA_NAME_VAR (arg);
4061 if (arg == loop->simduid
4062 /* For now. */
4063 && tree_int_cst_equal
4064 (TYPE_SIZE_UNIT (reft),
4065 step))
4067 DR_OFFSET (newdr) = ssize_int (0);
4068 DR_STEP (newdr) = step;
4069 DR_OFFSET_ALIGNMENT (newdr)
4070 = BIGGEST_ALIGNMENT;
4071 DR_STEP_ALIGNMENT (newdr)
4072 = highest_pow2_factor (step);
4073 dr = newdr;
4074 simd_lane_access = true;
4080 if (!simd_lane_access && (maybe_gather || maybe_scatter))
4082 dr = newdr;
4083 if (maybe_gather)
4084 gatherscatter = GATHER;
4085 else
4086 gatherscatter = SCATTER;
4089 if (gatherscatter == SG_NONE && !simd_lane_access)
4090 free_data_ref (newdr);
4093 if (gatherscatter == SG_NONE && !simd_lane_access)
4095 if (dump_enabled_p ())
4097 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4098 "not vectorized: data ref analysis "
4099 "failed ");
4100 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4103 if (is_a <bb_vec_info> (vinfo))
4104 break;
4106 return false;
4110 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
4112 if (dump_enabled_p ())
4113 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4114 "not vectorized: base addr of dr is a "
4115 "constant\n");
4117 if (is_a <bb_vec_info> (vinfo))
4118 break;
4120 if (gatherscatter != SG_NONE || simd_lane_access)
4121 free_data_ref (dr);
4122 return false;
4125 if (TREE_THIS_VOLATILE (DR_REF (dr)))
4127 if (dump_enabled_p ())
4129 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4130 "not vectorized: volatile type ");
4131 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4134 if (is_a <bb_vec_info> (vinfo))
4135 break;
4137 return false;
4140 if (stmt_can_throw_internal (stmt))
4142 if (dump_enabled_p ())
4144 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4145 "not vectorized: statement can throw an "
4146 "exception ");
4147 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4150 if (is_a <bb_vec_info> (vinfo))
4151 break;
4153 if (gatherscatter != SG_NONE || simd_lane_access)
4154 free_data_ref (dr);
4155 return false;
4158 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
4159 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
4161 if (dump_enabled_p ())
4163 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4164 "not vectorized: statement is bitfield "
4165 "access ");
4166 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4169 if (is_a <bb_vec_info> (vinfo))
4170 break;
4172 if (gatherscatter != SG_NONE || simd_lane_access)
4173 free_data_ref (dr);
4174 return false;
4177 base = unshare_expr (DR_BASE_ADDRESS (dr));
4178 offset = unshare_expr (DR_OFFSET (dr));
4179 init = unshare_expr (DR_INIT (dr));
4181 if (is_gimple_call (stmt)
4182 && (!gimple_call_internal_p (stmt)
4183 || (gimple_call_internal_fn (stmt) != IFN_MASK_LOAD
4184 && gimple_call_internal_fn (stmt) != IFN_MASK_STORE)))
4186 if (dump_enabled_p ())
4188 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4189 "not vectorized: dr in a call ");
4190 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4193 if (is_a <bb_vec_info> (vinfo))
4194 break;
4196 if (gatherscatter != SG_NONE || simd_lane_access)
4197 free_data_ref (dr);
4198 return false;
4201 /* Update DR field in stmt_vec_info struct. */
4203 /* If the dataref is in an inner-loop of the loop that is considered
4204 for vectorization, we also want to analyze the access relative to
4205 the outer-loop (DR contains information only relative to the
4206 inner-most enclosing loop). We do that by building a reference to the
4207 first location accessed by the inner-loop, and analyze it relative to
4208 the outer-loop. */
4209 if (loop && nested_in_vect_loop_p (loop, stmt))
4211 /* Build a reference to the first location accessed by the
4212 inner loop: *(BASE + INIT + OFFSET). By construction,
4213 this address must be invariant in the inner loop, so we
4214 can consider it as being used in the outer loop. */
4215 tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
4216 init, offset);
4217 tree init_addr = fold_build_pointer_plus (base, init_offset);
4218 tree init_ref = build_fold_indirect_ref (init_addr);
4220 if (dump_enabled_p ())
4222 dump_printf_loc (MSG_NOTE, vect_location,
4223 "analyze in outer loop: ");
4224 dump_generic_expr (MSG_NOTE, TDF_SLIM, init_ref);
4225 dump_printf (MSG_NOTE, "\n");
4228 if (!dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
4229 init_ref, loop))
4230 /* dr_analyze_innermost already explained the failure. */
4231 return false;
4233 if (dump_enabled_p ())
4235 dump_printf_loc (MSG_NOTE, vect_location,
4236 "\touter base_address: ");
4237 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4238 STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
4239 dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
4240 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4241 STMT_VINFO_DR_OFFSET (stmt_info));
4242 dump_printf (MSG_NOTE,
4243 "\n\touter constant offset from base address: ");
4244 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4245 STMT_VINFO_DR_INIT (stmt_info));
4246 dump_printf (MSG_NOTE, "\n\touter step: ");
4247 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4248 STMT_VINFO_DR_STEP (stmt_info));
4249 dump_printf (MSG_NOTE, "\n\touter base alignment: %d\n",
4250 STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info));
4251 dump_printf (MSG_NOTE, "\n\touter base misalignment: %d\n",
4252 STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info));
4253 dump_printf (MSG_NOTE, "\n\touter offset alignment: %d\n",
4254 STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info));
4255 dump_printf (MSG_NOTE, "\n\touter step alignment: %d\n",
4256 STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
4260 if (STMT_VINFO_DATA_REF (stmt_info))
4262 if (dump_enabled_p ())
4264 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4265 "not vectorized: more than one data ref "
4266 "in stmt: ");
4267 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4270 if (is_a <bb_vec_info> (vinfo))
4271 break;
4273 if (gatherscatter != SG_NONE || simd_lane_access)
4274 free_data_ref (dr);
4275 return false;
4278 STMT_VINFO_DATA_REF (stmt_info) = dr;
4279 if (simd_lane_access)
4281 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
4282 free_data_ref (datarefs[i]);
4283 datarefs[i] = dr;
4286 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == ADDR_EXPR
4287 && VAR_P (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0))
4288 && DECL_NONALIASED (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0)))
4290 if (dump_enabled_p ())
4292 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4293 "not vectorized: base object not addressable "
4294 "for stmt: ");
4295 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4297 if (is_a <bb_vec_info> (vinfo))
4299 /* In BB vectorization the ref can still participate
4300 in dependence analysis; we just can't vectorize it. */
4301 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4302 continue;
4304 return false;
4307 /* Set vectype for STMT. */
4308 scalar_type = TREE_TYPE (DR_REF (dr));
4309 STMT_VINFO_VECTYPE (stmt_info)
4310 = get_vectype_for_scalar_type (scalar_type);
4311 if (!STMT_VINFO_VECTYPE (stmt_info))
4313 if (dump_enabled_p ())
4315 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4316 "not vectorized: no vectype for stmt: ");
4317 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4318 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
4319 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
4320 scalar_type);
4321 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
4324 if (is_a <bb_vec_info> (vinfo))
4326 /* Having no vector type is fine: the ref can still participate
4327 in dependence analysis; we just can't vectorize it. */
4328 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4329 continue;
4332 if (gatherscatter != SG_NONE || simd_lane_access)
4334 STMT_VINFO_DATA_REF (stmt_info) = NULL;
4335 if (gatherscatter != SG_NONE)
4336 free_data_ref (dr);
4338 return false;
4340 else
4342 if (dump_enabled_p ())
4344 dump_printf_loc (MSG_NOTE, vect_location,
4345 "got vectype for stmt: ");
4346 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
4347 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4348 STMT_VINFO_VECTYPE (stmt_info));
4349 dump_printf (MSG_NOTE, "\n");
4353 /* Adjust the minimal vectorization factor according to the
4354 vector type. */
4355 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
4356 *min_vf = upper_bound (*min_vf, vf);
4358 if (gatherscatter != SG_NONE)
4360 gather_scatter_info gs_info;
4361 if (!vect_check_gather_scatter (stmt, as_a <loop_vec_info> (vinfo),
4362 &gs_info)
4363 || !get_vectype_for_scalar_type (TREE_TYPE (gs_info.offset)))
4365 STMT_VINFO_DATA_REF (stmt_info) = NULL;
4366 free_data_ref (dr);
4367 if (dump_enabled_p ())
4369 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4370 (gatherscatter == GATHER) ?
4371 "not vectorized: not suitable for gather "
4372 "load " :
4373 "not vectorized: not suitable for scatter "
4374 "store ");
4375 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4377 return false;
4380 free_data_ref (datarefs[i]);
4381 datarefs[i] = dr;
4382 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
4385 else if (is_a <loop_vec_info> (vinfo)
4386 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
4388 if (nested_in_vect_loop_p (loop, stmt))
4390 if (dump_enabled_p ())
4392 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4393 "not vectorized: not suitable for strided "
4394 "load ");
4395 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4397 return false;
4399 STMT_VINFO_STRIDED_P (stmt_info) = true;
4403 /* If we stopped the analysis at the first dataref we could not analyze
4404 when trying to vectorize a basic-block, mark the rest of the datarefs
4405 as not vectorizable and truncate the vector of datarefs. That
4406 avoids spending useless time analyzing their dependences. */
4407 if (i != datarefs.length ())
4409 gcc_assert (is_a <bb_vec_info> (vinfo));
4410 for (unsigned j = i; j < datarefs.length (); ++j)
4412 data_reference_p dr = datarefs[j];
4413 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
4414 free_data_ref (dr);
4416 datarefs.truncate (i);
4419 return true;
4423 /* Function vect_get_new_vect_var.
4425 Returns a name for a new variable. The current naming scheme chooses
4426 a prefix ("vect", "vectp", "stmp" or "mask") according to the value of
4427 VAR_KIND, and appends an underscore followed by NAME to that prefix
4428 when NAME is provided. */
4430 tree
4431 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
4433 const char *prefix;
4434 tree new_vect_var;
4436 switch (var_kind)
4438 case vect_simple_var:
4439 prefix = "vect";
4440 break;
4441 case vect_scalar_var:
4442 prefix = "stmp";
4443 break;
4444 case vect_mask_var:
4445 prefix = "mask";
4446 break;
4447 case vect_pointer_var:
4448 prefix = "vectp";
4449 break;
4450 default:
4451 gcc_unreachable ();
4454 if (name)
4456 char* tmp = concat (prefix, "_", name, NULL);
4457 new_vect_var = create_tmp_reg (type, tmp);
4458 free (tmp);
4460 else
4461 new_vect_var = create_tmp_reg (type, prefix);
4463 return new_vect_var;
4466 /* Like vect_get_new_vect_var but return an SSA name. */
4468 tree
4469 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
4471 const char *prefix;
4472 tree new_vect_var;
4474 switch (var_kind)
4476 case vect_simple_var:
4477 prefix = "vect";
4478 break;
4479 case vect_scalar_var:
4480 prefix = "stmp";
4481 break;
4482 case vect_pointer_var:
4483 prefix = "vectp";
4484 break;
4485 default:
4486 gcc_unreachable ();
4489 if (name)
4491 char* tmp = concat (prefix, "_", name, NULL);
4492 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4493 free (tmp);
4495 else
4496 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4498 return new_vect_var;
4501 /* Duplicate ptr info and set alignment/misalignment on NAME from DR. */
4503 static void
4504 vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr)
4506 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr));
4507 int misalign = DR_MISALIGNMENT (dr);
4508 if (misalign == DR_MISALIGNMENT_UNKNOWN)
4509 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4510 else
4511 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name),
4512 DR_TARGET_ALIGNMENT (dr), misalign);
4515 /* Function vect_create_addr_base_for_vector_ref.
4517 Create an expression that computes the address of the first memory location
4518 that will be accessed for a data reference.
4520 Input:
4521 STMT: The statement containing the data reference.
4522 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4523 OFFSET: Optional. If supplied, it is added to the initial address.
4524 LOOP: Specify relative to which loop-nest the address should be computed.
4525 For example, when the dataref is in an inner-loop nested in an
4526 outer-loop that is now being vectorized, LOOP can be either the
4527 outer-loop, or the inner-loop. The first memory location accessed
4528 by the following dataref ('in' points to short):
4530 for (i=0; i<N; i++)
4531 for (j=0; j<M; j++)
4532 s += in[i+j]
4534 is as follows:
4535 if LOOP=i_loop: &in (relative to i_loop)
4536 if LOOP=j_loop: &in+i*2B (relative to j_loop)
4537 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
4538 initial address. Unlike OFFSET, which is a number of elements to
4539 be added, BYTE_OFFSET is measured in bytes.
4541 Output:
4542 1. Return an SSA_NAME whose value is the address of the memory location of
4543 the first vector of the data reference.
4544 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4545 these statement(s) which define the returned SSA_NAME.
4547 FORNOW: We are only handling array accesses with step 1. */
4549 tree
4550 vect_create_addr_base_for_vector_ref (gimple *stmt,
4551 gimple_seq *new_stmt_list,
4552 tree offset,
4553 tree byte_offset)
4555 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4556 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4557 const char *base_name;
4558 tree addr_base;
4559 tree dest;
4560 gimple_seq seq = NULL;
4561 tree vect_ptr_type;
4562 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
4563 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4564 innermost_loop_behavior *drb = vect_dr_behavior (dr);
4566 tree data_ref_base = unshare_expr (drb->base_address);
4567 tree base_offset = unshare_expr (drb->offset);
4568 tree init = unshare_expr (drb->init);
4570 if (loop_vinfo)
4571 base_name = get_name (data_ref_base);
4572 else
4574 base_offset = ssize_int (0);
4575 init = ssize_int (0);
4576 base_name = get_name (DR_REF (dr));
4579 /* Create base_offset */
4580 base_offset = size_binop (PLUS_EXPR,
4581 fold_convert (sizetype, base_offset),
4582 fold_convert (sizetype, init));
4584 if (offset)
4586 offset = fold_build2 (MULT_EXPR, sizetype,
4587 fold_convert (sizetype, offset), step);
4588 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4589 base_offset, offset);
4591 if (byte_offset)
4593 byte_offset = fold_convert (sizetype, byte_offset);
4594 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4595 base_offset, byte_offset);
4598 /* base + base_offset */
4599 if (loop_vinfo)
4600 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4601 else
4603 addr_base = build1 (ADDR_EXPR,
4604 build_pointer_type (TREE_TYPE (DR_REF (dr))),
4605 unshare_expr (DR_REF (dr)));
4608 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4609 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4610 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4611 gimple_seq_add_seq (new_stmt_list, seq);
4613 if (DR_PTR_INFO (dr)
4614 && TREE_CODE (addr_base) == SSA_NAME
4615 && !SSA_NAME_PTR_INFO (addr_base))
4617 vect_duplicate_ssa_name_ptr_info (addr_base, dr);
4618 if (offset || byte_offset)
4619 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4622 if (dump_enabled_p ())
4624 dump_printf_loc (MSG_NOTE, vect_location, "created ");
4625 dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
4626 dump_printf (MSG_NOTE, "\n");
4629 return addr_base;
4633 /* Function vect_create_data_ref_ptr.
4635 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4636 location accessed in the loop by STMT, along with the def-use update
4637 chain to appropriately advance the pointer through the loop iterations.
4638 Also set aliasing information for the pointer. This pointer is used by
4639 the callers to this function to create a memory reference expression for
4640 vector load/store access.
4642 Input:
4643 1. STMT: a stmt that references memory. Expected to be of the form
4644 GIMPLE_ASSIGN <name, data-ref> or
4645 GIMPLE_ASSIGN <data-ref, name>.
4646 2. AGGR_TYPE: the type of the reference, which should be either a vector
4647 or an array.
4648 3. AT_LOOP: the loop where the vector memref is to be created.
4649 4. OFFSET (optional): an offset to be added to the initial address accessed
4650 by the data-ref in STMT.
4651 5. BSI: location where the new stmts are to be placed if there is no loop
4652 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4653 pointing to the initial address.
4654 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4655 to the initial address accessed by the data-ref in STMT. This is
4656 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4657 in bytes.
4658 8. IV_STEP (optional, defaults to NULL): the amount that should be added
4659 to the IV during each iteration of the loop. NULL says to move
4660 by one copy of AGGR_TYPE up or down, depending on the step of the
4661 data reference.
4663 Output:
4664 1. Declare a new ptr to vector_type, and have it point to the base of the
4665 data reference (the initial address accessed by the data reference).
4666 For example, for vector of type V8HI, the following code is generated:
4668 v8hi *ap;
4669 ap = (v8hi *)initial_address;
4671 if OFFSET is not supplied:
4672 initial_address = &a[init];
4673 if OFFSET is supplied:
4674 initial_address = &a[init + OFFSET];
4675 if BYTE_OFFSET is supplied:
4676 initial_address = &a[init] + BYTE_OFFSET;
4678 Return the initial_address in INITIAL_ADDRESS.
4680 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4681 update the pointer in each iteration of the loop.
4683 Return the increment stmt that updates the pointer in PTR_INCR.
4685 3. Set INV_P to true if the access pattern of the data reference in the
4686 vectorized loop is invariant. Set it to false otherwise.
4688 4. Return the pointer. */
4690 tree
4691 vect_create_data_ref_ptr (gimple *stmt, tree aggr_type, struct loop *at_loop,
4692 tree offset, tree *initial_address,
4693 gimple_stmt_iterator *gsi, gimple **ptr_incr,
4694 bool only_init, bool *inv_p, tree byte_offset,
4695 tree iv_step)
4697 const char *base_name;
4698 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4699 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4700 struct loop *loop = NULL;
4701 bool nested_in_vect_loop = false;
4702 struct loop *containing_loop = NULL;
4703 tree aggr_ptr_type;
4704 tree aggr_ptr;
4705 tree new_temp;
4706 gimple_seq new_stmt_list = NULL;
4707 edge pe = NULL;
4708 basic_block new_bb;
4709 tree aggr_ptr_init;
4710 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4711 tree aptr;
4712 gimple_stmt_iterator incr_gsi;
4713 bool insert_after;
4714 tree indx_before_incr, indx_after_incr;
4715 gimple *incr;
4716 tree step;
4717 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4719 gcc_assert (iv_step != NULL_TREE
4720 || TREE_CODE (aggr_type) == ARRAY_TYPE
4721 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4723 if (loop_vinfo)
4725 loop = LOOP_VINFO_LOOP (loop_vinfo);
4726 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4727 containing_loop = (gimple_bb (stmt))->loop_father;
4728 pe = loop_preheader_edge (loop);
4730 else
4732 gcc_assert (bb_vinfo);
4733 only_init = true;
4734 *ptr_incr = NULL;
4737 /* Check the step (evolution) of the load in LOOP, and record
4738 whether it's invariant. */
4739 step = vect_dr_behavior (dr)->step;
4740 if (integer_zerop (step))
4741 *inv_p = true;
4742 else
4743 *inv_p = false;
4745 /* Create an expression for the first address accessed by this load
4746 in LOOP. */
4747 base_name = get_name (DR_BASE_ADDRESS (dr));
4749 if (dump_enabled_p ())
4751 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4752 dump_printf_loc (MSG_NOTE, vect_location,
4753 "create %s-pointer variable to type: ",
4754 get_tree_code_name (TREE_CODE (aggr_type)));
4755 dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4756 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4757 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4758 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4759 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4760 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4761 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4762 else
4763 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4764 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4765 dump_printf (MSG_NOTE, "\n");
4768 /* (1) Create the new aggregate-pointer variable.
4769 Vector and array types inherit the alias set of their component
4770 type by default so we need to use a ref-all pointer if the data
4771 reference does not conflict with the created aggregated data
4772 reference because it is not addressable. */
4773 bool need_ref_all = false;
4774 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4775 get_alias_set (DR_REF (dr))))
4776 need_ref_all = true;
4777 /* Likewise for any of the data references in the stmt group. */
4778 else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
4780 gimple *orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
4783 stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
4784 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4785 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4786 get_alias_set (DR_REF (sdr))))
4788 need_ref_all = true;
4789 break;
4791 orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);
4793 while (orig_stmt);
4795 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4796 need_ref_all);
4797 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4800 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4801 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4802 def-use update cycles for the pointer: one relative to the outer-loop
4803 (LOOP), which is what steps (3) and (4) below do. The other is relative
4804 to the inner-loop (which is the inner-most loop containing the dataref),
4805 and this is done by step (5) below.
4807 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4808 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4809 redundant. Steps (3),(4) create the following:
4811 vp0 = &base_addr;
4812 LOOP: vp1 = phi(vp0,vp2)
4815 vp2 = vp1 + step
4816 goto LOOP
4818 If there is an inner-loop nested in loop, then step (5) will also be
4819 applied, and an additional update in the inner-loop will be created:
4821 vp0 = &base_addr;
4822 LOOP: vp1 = phi(vp0,vp2)
4824 inner: vp3 = phi(vp1,vp4)
4825 vp4 = vp3 + inner_step
4826 if () goto inner
4828 vp2 = vp1 + step
4829 if () goto LOOP */
4831 /* (2) Calculate the initial address of the aggregate-pointer, and set
4832 the aggregate-pointer to point to it before the loop. */
4834 /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader. */
4836 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
4837 offset, byte_offset);
4838 if (new_stmt_list)
4840 if (pe)
4842 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4843 gcc_assert (!new_bb);
4845 else
4846 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4849 *initial_address = new_temp;
4850 aggr_ptr_init = new_temp;
4852 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4853 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4854 inner-loop nested in LOOP (during outer-loop vectorization). */
4856 /* No update in loop is required. */
4857 if (only_init && (!loop_vinfo || at_loop == loop))
4858 aptr = aggr_ptr_init;
4859 else
4861 if (iv_step == NULL_TREE)
4863 /* The step of the aggregate pointer is the type size. */
4864 iv_step = TYPE_SIZE_UNIT (aggr_type);
4865 /* One exception to the above is when the scalar step of the load in
4866 LOOP is zero. In this case the step here is also zero. */
4867 if (*inv_p)
4868 iv_step = size_zero_node;
4869 else if (tree_int_cst_sgn (step) == -1)
4870 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4873 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4875 create_iv (aggr_ptr_init,
4876 fold_convert (aggr_ptr_type, iv_step),
4877 aggr_ptr, loop, &incr_gsi, insert_after,
4878 &indx_before_incr, &indx_after_incr);
4879 incr = gsi_stmt (incr_gsi);
4880 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4882 /* Copy the points-to information if it exists. */
4883 if (DR_PTR_INFO (dr))
4885 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr);
4886 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr);
4888 if (ptr_incr)
4889 *ptr_incr = incr;
4891 aptr = indx_before_incr;
4894 if (!nested_in_vect_loop || only_init)
4895 return aptr;
4898 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4899 nested in LOOP, if it exists. */
4901 gcc_assert (nested_in_vect_loop);
4902 if (!only_init)
4904 standard_iv_increment_position (containing_loop, &incr_gsi,
4905 &insert_after);
4906 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4907 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4908 &indx_after_incr);
4909 incr = gsi_stmt (incr_gsi);
4910 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4912 /* Copy the points-to information if it exists. */
4913 if (DR_PTR_INFO (dr))
4915 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr);
4916 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr);
4918 if (ptr_incr)
4919 *ptr_incr = incr;
4921 return indx_before_incr;
4923 else
4924 gcc_unreachable ();
4928 /* Function bump_vector_ptr
4930 Increment a pointer (to a vector type) by vector-size. If requested,
4931 i.e. if PTR_INCR is given, then also connect the new increment stmt
4932 to the existing def-use update-chain of the pointer, by modifying
4933 the PTR_INCR as illustrated below:
4935 The pointer def-use update-chain before this function:
4936 DATAREF_PTR = phi (p_0, p_2)
4937 ....
4938 PTR_INCR: p_2 = DATAREF_PTR + step
4940 The pointer def-use update-chain after this function:
4941 DATAREF_PTR = phi (p_0, p_2)
4942 ....
4943 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4944 ....
4945 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4947 Input:
4948 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4949 in the loop.
4950 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4951 the loop. The increment amount across iterations is expected
4952 to be vector_size.
4953 BSI - location where the new update stmt is to be placed.
4954 STMT - the original scalar memory-access stmt that is being vectorized.
4955 BUMP - optional. The offset by which to bump the pointer. If not given,
4956 the offset is assumed to be vector_size.
4958 Output: Return NEW_DATAREF_PTR as illustrated above.
4962 tree
4963 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4964 gimple *stmt, tree bump)
4966 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4967 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4968 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4969 tree update = TYPE_SIZE_UNIT (vectype);
4970 gassign *incr_stmt;
4971 ssa_op_iter iter;
4972 use_operand_p use_p;
4973 tree new_dataref_ptr;
4975 if (bump)
4976 update = bump;
4978 if (TREE_CODE (dataref_ptr) == SSA_NAME)
4979 new_dataref_ptr = copy_ssa_name (dataref_ptr);
4980 else
4981 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4982 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4983 dataref_ptr, update);
4984 vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4986 /* Copy the points-to information if it exists. */
4987 if (DR_PTR_INFO (dr))
4989 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4990 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4993 if (!ptr_incr)
4994 return new_dataref_ptr;
4996 /* Update the vector-pointer's cross-iteration increment. */
4997 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4999 tree use = USE_FROM_PTR (use_p);
5001 if (use == dataref_ptr)
5002 SET_USE (use_p, new_dataref_ptr);
5003 else
5004 gcc_assert (operand_equal_p (use, update, 0));
5007 return new_dataref_ptr;
5011 /* Function vect_create_destination_var.
5013 Create a new temporary of type VECTYPE. */
5015 tree
5016 vect_create_destination_var (tree scalar_dest, tree vectype)
5018 tree vec_dest;
5019 const char *name;
5020 char *new_name;
5021 tree type;
5022 enum vect_var_kind kind;
5024 kind = vectype
5025 ? VECTOR_BOOLEAN_TYPE_P (vectype)
5026 ? vect_mask_var
5027 : vect_simple_var
5028 : vect_scalar_var;
5029 type = vectype ? vectype : TREE_TYPE (scalar_dest);
5031 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
5033 name = get_name (scalar_dest);
5034 if (name)
5035 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
5036 else
5037 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
5038 vec_dest = vect_get_new_vect_var (type, kind, new_name);
5039 free (new_name);
5041 return vec_dest;
5044 /* Function vect_grouped_store_supported.
5046 Returns TRUE if interleave high and interleave low permutations
5047 are supported, and FALSE otherwise. */
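/* Illustrative note (editorial): for a hypothetical 4-element vector mode
   the two selectors checked below are { 0, 4, 1, 5 } and { 2, 6, 3, 7 },
   i.e. the interleave-high and interleave-low patterns that
   vect_permute_store_chain generates for power-of-two group sizes; the
   count == 3 case instead checks the two three-way shuffle masks built in
   that function.  */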
5049 bool
5050 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
5052 machine_mode mode = TYPE_MODE (vectype);
5054 /* vect_permute_store_chain requires the group size to be equal to 3 or
5055 be a power of two. */
5056 if (count != 3 && exact_log2 (count) == -1)
5058 if (dump_enabled_p ())
5059 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5060 "the size of the group of accesses"
5061 " is not a power of 2 or not eqaul to 3\n");
5062 return false;
5065 /* Check that the permutation is supported. */
5066 if (VECTOR_MODE_P (mode))
5068 unsigned int i;
5069 if (count == 3)
5071 unsigned int j0 = 0, j1 = 0, j2 = 0;
5072 unsigned int i, j;
5074 unsigned int nelt;
5075 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5077 if (dump_enabled_p ())
5078 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5079 "cannot handle groups of 3 stores for"
5080 " variable-length vectors\n");
5081 return false;
5084 vec_perm_builder sel (nelt, nelt, 1);
5085 sel.quick_grow (nelt);
5086 vec_perm_indices indices;
5087 for (j = 0; j < 3; j++)
5089 int nelt0 = ((3 - j) * nelt) % 3;
5090 int nelt1 = ((3 - j) * nelt + 1) % 3;
5091 int nelt2 = ((3 - j) * nelt + 2) % 3;
5092 for (i = 0; i < nelt; i++)
5094 if (3 * i + nelt0 < nelt)
5095 sel[3 * i + nelt0] = j0++;
5096 if (3 * i + nelt1 < nelt)
5097 sel[3 * i + nelt1] = nelt + j1++;
5098 if (3 * i + nelt2 < nelt)
5099 sel[3 * i + nelt2] = 0;
5101 indices.new_vector (sel, 2, nelt);
5102 if (!can_vec_perm_const_p (mode, indices))
5104 if (dump_enabled_p ())
5105 dump_printf (MSG_MISSED_OPTIMIZATION,
5106 "permutation op not supported by target.\n");
5107 return false;
5110 for (i = 0; i < nelt; i++)
5112 if (3 * i + nelt0 < nelt)
5113 sel[3 * i + nelt0] = 3 * i + nelt0;
5114 if (3 * i + nelt1 < nelt)
5115 sel[3 * i + nelt1] = 3 * i + nelt1;
5116 if (3 * i + nelt2 < nelt)
5117 sel[3 * i + nelt2] = nelt + j2++;
5119 indices.new_vector (sel, 2, nelt);
5120 if (!can_vec_perm_const_p (mode, indices))
5122 if (dump_enabled_p ())
5123 dump_printf (MSG_MISSED_OPTIMIZATION,
5124 "permutation op not supported by target.\n");
5125 return false;
5128 return true;
5130 else
5132 /* If length is not equal to 3 then only a power of 2 is supported. */
5133 gcc_assert (pow2p_hwi (count));
5134 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5136 /* The encoding has 2 interleaved stepped patterns. */
5137 vec_perm_builder sel (nelt, 2, 3);
5138 sel.quick_grow (6);
5139 for (i = 0; i < 3; i++)
5141 sel[i * 2] = i;
5142 sel[i * 2 + 1] = i + nelt;
5144 vec_perm_indices indices (sel, 2, nelt);
5145 if (can_vec_perm_const_p (mode, indices))
5147 for (i = 0; i < 6; i++)
5148 sel[i] += exact_div (nelt, 2);
5149 indices.new_vector (sel, 2, nelt);
5150 if (can_vec_perm_const_p (mode, indices))
5151 return true;
5156 if (dump_enabled_p ())
5157 dump_printf (MSG_MISSED_OPTIMIZATION,
5158 "permutaion op not supported by target.\n");
5159 return false;
5163 /* Return TRUE if vec_{mask_}store_lanes is available for COUNT vectors of
5164 type VECTYPE. MASKED_P says whether the masked form is needed. */
5166 bool
5167 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5168 bool masked_p)
5170 if (masked_p)
5171 return vect_lanes_optab_supported_p ("vec_mask_store_lanes",
5172 vec_mask_store_lanes_optab,
5173 vectype, count);
5174 else
5175 return vect_lanes_optab_supported_p ("vec_store_lanes",
5176 vec_store_lanes_optab,
5177 vectype, count);
5181 /* Function vect_permute_store_chain.
5183 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
5184 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
5185 the data correctly for the stores. Return the final references for stores
5186 in RESULT_CHAIN.
5188 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5189 The input is 4 vectors each containing 8 elements. We assign a number to
5190 each element, the input sequence is:
5192 1st vec: 0 1 2 3 4 5 6 7
5193 2nd vec: 8 9 10 11 12 13 14 15
5194 3rd vec: 16 17 18 19 20 21 22 23
5195 4th vec: 24 25 26 27 28 29 30 31
5197 The output sequence should be:
5199 1st vec: 0 8 16 24 1 9 17 25
5200 2nd vec: 2 10 18 26 3 11 19 27
5201 3rd vec: 4 12 20 28 5 13 21 29
5202 4th vec: 6 14 22 30 7 15 23 31
5204 i.e., we interleave the contents of the four vectors in their order.
5206 We use interleave_high/low instructions to create such output. The input of
5207 each interleave_high/low operation is two vectors:
5208 1st vec 2nd vec
5209 0 1 2 3 4 5 6 7
5210 the even elements of the result vector are obtained left-to-right from the
5211 high/low elements of the first vector. The odd elements of the result are
5212 obtained left-to-right from the high/low elements of the second vector.
5213 The output of interleave_high will be: 0 4 1 5
5214 and of interleave_low: 2 6 3 7
5217 The permutation is done in log LENGTH stages. In each stage interleave_high
5218 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5219 where the first argument is taken from the first half of DR_CHAIN and the
5220 second argument from its second half.
5221 In our example,
5223 I1: interleave_high (1st vec, 3rd vec)
5224 I2: interleave_low (1st vec, 3rd vec)
5225 I3: interleave_high (2nd vec, 4th vec)
5226 I4: interleave_low (2nd vec, 4th vec)
5228 The output for the first stage is:
5230 I1: 0 16 1 17 2 18 3 19
5231 I2: 4 20 5 21 6 22 7 23
5232 I3: 8 24 9 25 10 26 11 27
5233 I4: 12 28 13 29 14 30 15 31
5235 The output of the second stage, i.e. the final result is:
5237 I1: 0 8 16 24 1 9 17 25
5238 I2: 2 10 18 26 3 11 19 27
5239 I3: 4 12 20 28 5 13 21 29
5240 I4: 6 14 22 30 7 15 23 31. */
5242 void
5243 vect_permute_store_chain (vec<tree> dr_chain,
5244 unsigned int length,
5245 gimple *stmt,
5246 gimple_stmt_iterator *gsi,
5247 vec<tree> *result_chain)
5249 tree vect1, vect2, high, low;
5250 gimple *perm_stmt;
5251 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5252 tree perm_mask_low, perm_mask_high;
5253 tree data_ref;
5254 tree perm3_mask_low, perm3_mask_high;
5255 unsigned int i, j, n, log_length = exact_log2 (length);
5257 result_chain->quick_grow (length);
5258 memcpy (result_chain->address (), dr_chain.address (),
5259 length * sizeof (tree));
5261 if (length == 3)
5263 /* vect_grouped_store_supported ensures that this is constant. */
5264 unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5265 unsigned int j0 = 0, j1 = 0, j2 = 0;
5267 vec_perm_builder sel (nelt, nelt, 1);
5268 sel.quick_grow (nelt);
5269 vec_perm_indices indices;
5270 for (j = 0; j < 3; j++)
5272 int nelt0 = ((3 - j) * nelt) % 3;
5273 int nelt1 = ((3 - j) * nelt + 1) % 3;
5274 int nelt2 = ((3 - j) * nelt + 2) % 3;
5276 for (i = 0; i < nelt; i++)
5278 if (3 * i + nelt0 < nelt)
5279 sel[3 * i + nelt0] = j0++;
5280 if (3 * i + nelt1 < nelt)
5281 sel[3 * i + nelt1] = nelt + j1++;
5282 if (3 * i + nelt2 < nelt)
5283 sel[3 * i + nelt2] = 0;
5285 indices.new_vector (sel, 2, nelt);
5286 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5288 for (i = 0; i < nelt; i++)
5290 if (3 * i + nelt0 < nelt)
5291 sel[3 * i + nelt0] = 3 * i + nelt0;
5292 if (3 * i + nelt1 < nelt)
5293 sel[3 * i + nelt1] = 3 * i + nelt1;
5294 if (3 * i + nelt2 < nelt)
5295 sel[3 * i + nelt2] = nelt + j2++;
5297 indices.new_vector (sel, 2, nelt);
5298 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5300 vect1 = dr_chain[0];
5301 vect2 = dr_chain[1];
5303 /* Create interleaving stmt:
5304 low = VEC_PERM_EXPR <vect1, vect2,
5305 {j, nelt, *, j + 1, nelt + j + 1, *,
5306 j + 2, nelt + j + 2, *, ...}> */
5307 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5308 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5309 vect2, perm3_mask_low);
5310 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5312 vect1 = data_ref;
5313 vect2 = dr_chain[2];
5314 /* Create interleaving stmt:
5315 low = VEC_PERM_EXPR <vect1, vect2,
5316 {0, 1, nelt + j, 3, 4, nelt + j + 1,
5317 6, 7, nelt + j + 2, ...}> */
5318 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5319 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5320 vect2, perm3_mask_high);
5321 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5322 (*result_chain)[j] = data_ref;
5325 else
5327 /* If length is not equal to 3 then only a power of 2 is supported. */
5328 gcc_assert (pow2p_hwi (length));
5330 /* The encoding has 2 interleaved stepped patterns. */
5331 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5332 vec_perm_builder sel (nelt, 2, 3);
5333 sel.quick_grow (6);
5334 for (i = 0; i < 3; i++)
5336 sel[i * 2] = i;
5337 sel[i * 2 + 1] = i + nelt;
5339 vec_perm_indices indices (sel, 2, nelt);
5340 perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5342 for (i = 0; i < 6; i++)
5343 sel[i] += exact_div (nelt, 2);
5344 indices.new_vector (sel, 2, nelt);
5345 perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5347 for (i = 0, n = log_length; i < n; i++)
5349 for (j = 0; j < length/2; j++)
5351 vect1 = dr_chain[j];
5352 vect2 = dr_chain[j+length/2];
5354 /* Create interleaving stmt:
5355 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
5356 ...}> */
5357 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
5358 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
5359 vect2, perm_mask_high);
5360 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5361 (*result_chain)[2*j] = high;
5363 /* Create interleaving stmt:
5364 low = VEC_PERM_EXPR <vect1, vect2,
5365 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
5366 ...}> */
5367 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
5368 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
5369 vect2, perm_mask_low);
5370 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5371 (*result_chain)[2*j+1] = low;
5373 memcpy (dr_chain.address (), result_chain->address (),
5374 length * sizeof (tree));
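/* A minimal standalone sketch (not part of GCC; for illustration only) of
   the interleave_high/interleave_low scheme that vect_permute_store_chain
   implements above for the power-of-two case.  It models vectors as plain
   int arrays; NELT, LEN and the driver in main are assumptions of this
   demo, and "high"/"low" follow the numbering convention of the worked
   example in the comment (interleave_high combines the first halves).  */

#include <stdio.h>
#include <string.h>

#define NELT 8   /* elements per "vector" */
#define LEN  4   /* number of vectors in the chain */

/* Interleave the first halves of A and B.  */
static void
interleave_high (const int *a, const int *b, int *out)
{
  for (int i = 0; i < NELT / 2; i++)
    {
      out[2 * i] = a[i];
      out[2 * i + 1] = b[i];
    }
}

/* Interleave the second halves of A and B.  */
static void
interleave_low (const int *a, const int *b, int *out)
{
  for (int i = 0; i < NELT / 2; i++)
    {
      out[2 * i] = a[NELT / 2 + i];
      out[2 * i + 1] = b[NELT / 2 + i];
    }
}

int
main (void)
{
  int chain[LEN][NELT], result[LEN][NELT];
  for (int j = 0; j < LEN; j++)
    for (int i = 0; i < NELT; i++)
      chain[j][i] = j * NELT + i;       /* elements 0..31, as above */

  /* log2 (LEN) == 2 stages; vector j is paired with vector j + LEN/2.  */
  for (int stage = 0; stage < 2; stage++)
    {
      for (int j = 0; j < LEN / 2; j++)
        {
          interleave_high (chain[j], chain[j + LEN / 2], result[2 * j]);
          interleave_low (chain[j], chain[j + LEN / 2], result[2 * j + 1]);
        }
      memcpy (chain, result, sizeof (chain));
    }

  /* Prints 0 8 16 24 1 9 17 25 / 2 10 18 26 3 11 19 27 / ... , the final
     result shown in the comment above.  */
  for (int j = 0; j < LEN; j++)
    {
      for (int i = 0; i < NELT; i++)
        printf ("%d ", chain[j][i]);
      printf ("\n");
    }
  return 0;
}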
5379 /* Function vect_setup_realignment
5381 This function is called when vectorizing an unaligned load using
5382 the dr_explicit_realign[_optimized] scheme.
5383 This function generates the following code at the loop prolog:
5385 p = initial_addr;
5386 x msq_init = *(floor(p)); # prolog load
5387 realignment_token = call target_builtin;
5388 loop:
5389 x msq = phi (msq_init, ---)
5391 The stmts marked with x are generated only for the case of
5392 dr_explicit_realign_optimized.
5394 The code above sets up a new (vector) pointer, pointing to the first
5395 location accessed by STMT, and a "floor-aligned" load using that pointer.
5396 It also generates code to compute the "realignment-token" (if the relevant
5397 target hook was defined), and creates a phi-node at the loop-header bb
5398 whose arguments are the result of the prolog-load (created by this
5399 function) and the result of a load that takes place in the loop (to be
5400 created by the caller to this function).
5402 For the case of dr_explicit_realign_optimized:
5403 The caller to this function uses the phi-result (msq) to create the
5404 realignment code inside the loop, and sets up the missing phi argument,
5405 as follows:
5406 loop:
5407 msq = phi (msq_init, lsq)
5408 lsq = *(floor(p')); # load in loop
5409 result = realign_load (msq, lsq, realignment_token);
5411 For the case of dr_explicit_realign:
5412 loop:
5413 msq = *(floor(p)); # load in loop
5414 p' = p + (VS-1);
5415 lsq = *(floor(p')); # load in loop
5416 result = realign_load (msq, lsq, realignment_token);
5418 Input:
5419 STMT - (scalar) load stmt to be vectorized. This load accesses
5420 a memory location that may be unaligned.
5421 GSI - place where new code is to be inserted.
5422 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5423 is used.
5425 Output:
5426 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5427 target hook, if defined.
5428 Return value - the result of the loop-header phi node. */
5430 tree
5431 vect_setup_realignment (gimple *stmt, gimple_stmt_iterator *gsi,
5432 tree *realignment_token,
5433 enum dr_alignment_support alignment_support_scheme,
5434 tree init_addr,
5435 struct loop **at_loop)
5437 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5438 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5439 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5440 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
5441 struct loop *loop = NULL;
5442 edge pe = NULL;
5443 tree scalar_dest = gimple_assign_lhs (stmt);
5444 tree vec_dest;
5445 gimple *inc;
5446 tree ptr;
5447 tree data_ref;
5448 basic_block new_bb;
5449 tree msq_init = NULL_TREE;
5450 tree new_temp;
5451 gphi *phi_stmt;
5452 tree msq = NULL_TREE;
5453 gimple_seq stmts = NULL;
5454 bool inv_p;
5455 bool compute_in_loop = false;
5456 bool nested_in_vect_loop = false;
5457 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
5458 struct loop *loop_for_initial_load = NULL;
5460 if (loop_vinfo)
5462 loop = LOOP_VINFO_LOOP (loop_vinfo);
5463 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
5466 gcc_assert (alignment_support_scheme == dr_explicit_realign
5467 || alignment_support_scheme == dr_explicit_realign_optimized);
5469 /* We need to generate three things:
5470 1. the misalignment computation
5471 2. the extra vector load (for the optimized realignment scheme).
5472 3. the phi node for the two vectors from which the realignment is
5473 done (for the optimized realignment scheme). */
5475 /* 1. Determine where to generate the misalignment computation.
5477 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5478 calculation will be generated by this function, outside the loop (in the
5479 preheader). Otherwise, INIT_ADDR had already been computed for us by the
5480 caller, inside the loop.
5482 Background: If the misalignment remains fixed throughout the iterations of
5483 the loop, then both realignment schemes are applicable, and also the
5484 misalignment computation can be done outside LOOP. This is because we are
5485 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5486 are a multiple of VS (the Vector Size), and therefore the misalignment in
5487 different vectorized LOOP iterations is always the same.
5488 The problem arises only if the memory access is in an inner-loop nested
5489 inside LOOP, which is now being vectorized using outer-loop vectorization.
5490 This is the only case when the misalignment of the memory access may not
5491 remain fixed throughout the iterations of the inner-loop (as explained in
5492 detail in vect_supportable_dr_alignment). In this case, not only is the
5493 optimized realignment scheme not applicable, but also the misalignment
5494 computation (and generation of the realignment token that is passed to
5495 REALIGN_LOAD) has to be done inside the loop.
5497 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5498 or not, which in turn determines if the misalignment is computed inside
5499 the inner-loop, or outside LOOP. */
5501 if (init_addr != NULL_TREE || !loop_vinfo)
5503 compute_in_loop = true;
5504 gcc_assert (alignment_support_scheme == dr_explicit_realign);
5508 /* 2. Determine where to generate the extra vector load.
5510 For the optimized realignment scheme, instead of generating two vector
5511 loads in each iteration, we generate a single extra vector load in the
5512 preheader of the loop, and in each iteration reuse the result of the
5513 vector load from the previous iteration. In case the memory access is in
5514 an inner-loop nested inside LOOP, which is now being vectorized using
5515 outer-loop vectorization, we need to determine whether this initial vector
5516 load should be generated at the preheader of the inner-loop, or can be
5517 generated at the preheader of LOOP. If the memory access has no evolution
5518 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5519 to be generated inside LOOP (in the preheader of the inner-loop). */
5521 if (nested_in_vect_loop)
5523 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5524 bool invariant_in_outerloop =
5525 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5526 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5528 else
5529 loop_for_initial_load = loop;
5530 if (at_loop)
5531 *at_loop = loop_for_initial_load;
5533 if (loop_for_initial_load)
5534 pe = loop_preheader_edge (loop_for_initial_load);
5536 /* 3. For the case of the optimized realignment, create the first vector
5537 load at the loop preheader. */
5539 if (alignment_support_scheme == dr_explicit_realign_optimized)
5541 /* Create msq_init = *(floor(p1)) in the loop preheader */
5542 gassign *new_stmt;
5544 gcc_assert (!compute_in_loop);
5545 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5546 ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
5547 NULL_TREE, &init_addr, NULL, &inc,
5548 true, &inv_p);
5549 if (TREE_CODE (ptr) == SSA_NAME)
5550 new_temp = copy_ssa_name (ptr);
5551 else
5552 new_temp = make_ssa_name (TREE_TYPE (ptr));
5553 unsigned int align = DR_TARGET_ALIGNMENT (dr);
5554 new_stmt = gimple_build_assign
5555 (new_temp, BIT_AND_EXPR, ptr,
5556 build_int_cst (TREE_TYPE (ptr), -(HOST_WIDE_INT) align));
5557 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5558 gcc_assert (!new_bb);
5559 data_ref
5560 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5561 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5562 new_stmt = gimple_build_assign (vec_dest, data_ref);
5563 new_temp = make_ssa_name (vec_dest, new_stmt);
5564 gimple_assign_set_lhs (new_stmt, new_temp);
5565 if (pe)
5567 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5568 gcc_assert (!new_bb);
5570 else
5571 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5573 msq_init = gimple_assign_lhs (new_stmt);
5576 /* 4. Create realignment token using a target builtin, if available.
5577 It is done either inside the containing loop, or before LOOP (as
5578 determined above). */
5580 if (targetm.vectorize.builtin_mask_for_load)
5582 gcall *new_stmt;
5583 tree builtin_decl;
5585 /* Compute INIT_ADDR - the initial address accessed by this memref. */
5586 if (!init_addr)
5588 /* Generate the INIT_ADDR computation outside LOOP. */
5589 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
5590 NULL_TREE);
5591 if (loop)
5593 pe = loop_preheader_edge (loop);
5594 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5595 gcc_assert (!new_bb);
5597 else
5598 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5601 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5602 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5603 vec_dest =
5604 vect_create_destination_var (scalar_dest,
5605 gimple_call_return_type (new_stmt));
5606 new_temp = make_ssa_name (vec_dest, new_stmt);
5607 gimple_call_set_lhs (new_stmt, new_temp);
5609 if (compute_in_loop)
5610 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5611 else
5613 /* Generate the misalignment computation outside LOOP. */
5614 pe = loop_preheader_edge (loop);
5615 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5616 gcc_assert (!new_bb);
5619 *realignment_token = gimple_call_lhs (new_stmt);
5621 /* The result of the CALL_EXPR to this builtin is determined from
5622 the value of the parameter and no global variables are touched
5623 which makes the builtin a "const" function. Requiring the
5624 builtin to have the "const" attribute makes it unnecessary
5625 to call mark_call_clobbered. */
5626 gcc_assert (TREE_READONLY (builtin_decl));
5629 if (alignment_support_scheme == dr_explicit_realign)
5630 return msq;
5632 gcc_assert (!compute_in_loop);
5633 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5636 /* 5. Create msq = phi <msq_init, lsq> in loop */
5638 pe = loop_preheader_edge (containing_loop);
5639 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5640 msq = make_ssa_name (vec_dest);
5641 phi_stmt = create_phi_node (msq, containing_loop->header);
5642 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5644 return msq;
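/* A minimal standalone sketch (not part of GCC; for illustration only) of
   the dr_explicit_realign_optimized flow that vect_setup_realignment
   prepares above: one "floor-aligned" prolog load (msq_init), after which
   each iteration loads the next aligned chunk (lsq) and recombines it with
   the previous one using the misalignment as the realignment token.  VS,
   the buffer and realign_load are assumptions of this demo, not GCC APIs.  */

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define VS 8    /* assumed vector size in bytes */

/* Emulated realign_load: take VS bytes starting at OFFSET from the
   concatenation of the two aligned loads MSQ and LSQ.  */
static void
realign_load (const uint8_t *msq, const uint8_t *lsq, unsigned offset,
              uint8_t *out)
{
  uint8_t both[2 * VS];
  memcpy (both, msq, VS);
  memcpy (both + VS, lsq, VS);
  memcpy (out, both + offset, VS);
}

int
main (void)
{
  static uint8_t buf[64] __attribute__ ((aligned (VS)));
  for (unsigned i = 0; i < sizeof buf; i++)
    buf[i] = i;

  const uint8_t *p = buf + 3;             /* unaligned access */
  unsigned token = (uintptr_t) p % VS;    /* "realignment token" */
  const uint8_t *fp = p - token;          /* floor (p) */

  uint8_t msq[VS], lsq[VS], result[VS];
  memcpy (msq, fp, VS);                   /* prolog load: msq_init */

  for (int iter = 0; iter < 3; iter++)
    {
      fp += VS;
      memcpy (lsq, fp, VS);               /* aligned load in the loop */
      realign_load (msq, lsq, token, result);
      printf ("iter %d: first byte %d\n", iter, result[0]);
      memcpy (msq, lsq, VS);              /* reuse lsq as the next msq */
    }
  return 0;
}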
5648 /* Function vect_grouped_load_supported.
5650 COUNT is the size of the load group (the number of statements plus the
5651 number of gaps). SINGLE_ELEMENT_P is true if there is actually
5652 only one statement, with a gap of COUNT - 1.
5654 Returns true if a suitable permute exists. */
5656 bool
5657 vect_grouped_load_supported (tree vectype, bool single_element_p,
5658 unsigned HOST_WIDE_INT count)
5660 machine_mode mode = TYPE_MODE (vectype);
5662 /* If this is single-element interleaving with an element distance
5663 that leaves unused vector loads around, punt - we at least create
5664 very sub-optimal code in that case (and blow up memory,
5665 see PR65518). */
5666 if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype)))
5668 if (dump_enabled_p ())
5669 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5670 "single-element interleaving not supported "
5671 "for not adjacent vector loads\n");
5672 return false;
5675 /* vect_permute_load_chain requires the group size to be equal to 3 or
5676 be a power of two. */
5677 if (count != 3 && exact_log2 (count) == -1)
5679 if (dump_enabled_p ())
5680 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5681 "the size of the group of accesses"
5682 " is not a power of 2 or not equal to 3\n");
5683 return false;
5686 /* Check that the permutation is supported. */
5687 if (VECTOR_MODE_P (mode))
5689 unsigned int i, j;
5690 if (count == 3)
5692 unsigned int nelt;
5693 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5695 if (dump_enabled_p ())
5696 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5697 "cannot handle groups of 3 loads for"
5698 " variable-length vectors\n");
5699 return false;
5702 vec_perm_builder sel (nelt, nelt, 1);
5703 sel.quick_grow (nelt);
5704 vec_perm_indices indices;
5705 unsigned int k;
5706 for (k = 0; k < 3; k++)
5708 for (i = 0; i < nelt; i++)
5709 if (3 * i + k < 2 * nelt)
5710 sel[i] = 3 * i + k;
5711 else
5712 sel[i] = 0;
5713 indices.new_vector (sel, 2, nelt);
5714 if (!can_vec_perm_const_p (mode, indices))
5716 if (dump_enabled_p ())
5717 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5718 "shuffle of 3 loads is not supported by"
5719 " target\n");
5720 return false;
5722 for (i = 0, j = 0; i < nelt; i++)
5723 if (3 * i + k < 2 * nelt)
5724 sel[i] = i;
5725 else
5726 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5727 indices.new_vector (sel, 2, nelt);
5728 if (!can_vec_perm_const_p (mode, indices))
5730 if (dump_enabled_p ())
5731 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5732 "shuffle of 3 loads is not supported by"
5733 " target\n");
5734 return false;
5737 return true;
5739 else
5741 /* If length is not equal to 3 then only power of 2 is supported. */
5742 gcc_assert (pow2p_hwi (count));
5743 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5745 /* The encoding has a single stepped pattern. */
5746 vec_perm_builder sel (nelt, 1, 3);
5747 sel.quick_grow (3);
5748 for (i = 0; i < 3; i++)
5749 sel[i] = i * 2;
5750 vec_perm_indices indices (sel, 2, nelt);
5751 if (can_vec_perm_const_p (mode, indices))
5753 for (i = 0; i < 3; i++)
5754 sel[i] = i * 2 + 1;
5755 indices.new_vector (sel, 2, nelt);
5756 if (can_vec_perm_const_p (mode, indices))
5757 return true;
5762 if (dump_enabled_p ())
5763 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5764 "extract even/odd not supported by target\n");
5765 return false;
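/* A minimal standalone sketch (not part of GCC; for illustration only)
   that prints, for an assumed NELT of 8, the two selector vectors that
   vect_grouped_load_supported asks the target about when COUNT == 3.
   The selector construction mirrors the loops above; NELT and the
   printing are assumptions of this demo.  */

#include <stdio.h>

#define NELT 8

int
main (void)
{
  unsigned sel[NELT];

  for (unsigned k = 0; k < 3; k++)
    {
      /* First selector: gather elements k, k + 3, k + 6, ... from the
         concatenation of the first two vectors (0 used as a filler).  */
      for (unsigned i = 0; i < NELT; i++)
        sel[i] = 3 * i + k < 2 * NELT ? 3 * i + k : 0;
      printf ("k=%u low :", k);
      for (unsigned i = 0; i < NELT; i++)
        printf (" %u", sel[i]);
      printf ("\n");

      /* Second selector: keep what was gathered so far and fill the
         remaining slots from the third vector.  */
      unsigned j = 0;
      for (unsigned i = 0; i < NELT; i++)
        sel[i] = (3 * i + k < 2 * NELT
                  ? i : NELT + ((NELT + k) % 3) + 3 * j++);
      printf ("k=%u high:", k);
      for (unsigned i = 0; i < NELT; i++)
        printf (" %u", sel[i]);
      printf ("\n");
    }
  return 0;
}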
5768 /* Return TRUE if vec_{masked_}load_lanes is available for COUNT vectors of
5769 type VECTYPE. MASKED_P says whether the masked form is needed. */
5771 bool
5772 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5773 bool masked_p)
5775 if (masked_p)
5776 return vect_lanes_optab_supported_p ("vec_mask_load_lanes",
5777 vec_mask_load_lanes_optab,
5778 vectype, count);
5779 else
5780 return vect_lanes_optab_supported_p ("vec_load_lanes",
5781 vec_load_lanes_optab,
5782 vectype, count);
5785 /* Function vect_permute_load_chain.
5787 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5788 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5789 the input data correctly. Return the final references for loads in
5790 RESULT_CHAIN.
5792 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5793 The input is 4 vectors each containing 8 elements. We assign a number to each
5794 element, the input sequence is:
5796 1st vec: 0 1 2 3 4 5 6 7
5797 2nd vec: 8 9 10 11 12 13 14 15
5798 3rd vec: 16 17 18 19 20 21 22 23
5799 4th vec: 24 25 26 27 28 29 30 31
5801 The output sequence should be:
5803 1st vec: 0 4 8 12 16 20 24 28
5804 2nd vec: 1 5 9 13 17 21 25 29
5805 3rd vec: 2 6 10 14 18 22 26 30
5806 4th vec: 3 7 11 15 19 23 27 31
5808 i.e., the first output vector should contain the first elements of each
5809 interleaving group, etc.
5811 We use extract_even/odd instructions to create such output. The input of
5812 each extract_even/odd operation is two vectors
5813 1st vec 2nd vec
5814 0 1 2 3 4 5 6 7
5816 and the output is the vector of extracted even/odd elements. The output of
5817 extract_even will be: 0 2 4 6
5818 and of extract_odd: 1 3 5 7
5821 The permutation is done in log LENGTH stages. In each stage extract_even
5822 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5823 their order. In our example,
5825 E1: extract_even (1st vec, 2nd vec)
5826 E2: extract_odd (1st vec, 2nd vec)
5827 E3: extract_even (3rd vec, 4th vec)
5828 E4: extract_odd (3rd vec, 4th vec)
5830 The output for the first stage will be:
5832 E1: 0 2 4 6 8 10 12 14
5833 E2: 1 3 5 7 9 11 13 15
5834 E3: 16 18 20 22 24 26 28 30
5835 E4: 17 19 21 23 25 27 29 31
5837 In order to proceed and create the correct sequence for the next stage (or
5838 for the correct output, if the second stage is the last one, as in our
5839 example), we first put the output of extract_even operation and then the
5840 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5841 The input for the second stage is:
5843 1st vec (E1): 0 2 4 6 8 10 12 14
5844 2nd vec (E3): 16 18 20 22 24 26 28 30
5845 3rd vec (E2): 1 3 5 7 9 11 13 15
5846 4th vec (E4): 17 19 21 23 25 27 29 31
5848 The output of the second stage:
5850 E1: 0 4 8 12 16 20 24 28
5851 E2: 2 6 10 14 18 22 26 30
5852 E3: 1 5 9 13 17 21 25 29
5853 E4: 3 7 11 15 19 23 27 31
5855 And RESULT_CHAIN after reordering:
5857 1st vec (E1): 0 4 8 12 16 20 24 28
5858 2nd vec (E3): 1 5 9 13 17 21 25 29
5859 3rd vec (E2): 2 6 10 14 18 22 26 30
5860 4th vec (E4): 3 7 11 15 19 23 27 31. */
5862 static void
5863 vect_permute_load_chain (vec<tree> dr_chain,
5864 unsigned int length,
5865 gimple *stmt,
5866 gimple_stmt_iterator *gsi,
5867 vec<tree> *result_chain)
5869 tree data_ref, first_vect, second_vect;
5870 tree perm_mask_even, perm_mask_odd;
5871 tree perm3_mask_low, perm3_mask_high;
5872 gimple *perm_stmt;
5873 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5874 unsigned int i, j, log_length = exact_log2 (length);
5876 result_chain->quick_grow (length);
5877 memcpy (result_chain->address (), dr_chain.address (),
5878 length * sizeof (tree));
5880 if (length == 3)
5882 /* vect_grouped_load_supported ensures that this is constant. */
5883 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5884 unsigned int k;
5886 vec_perm_builder sel (nelt, nelt, 1);
5887 sel.quick_grow (nelt);
5888 vec_perm_indices indices;
5889 for (k = 0; k < 3; k++)
5891 for (i = 0; i < nelt; i++)
5892 if (3 * i + k < 2 * nelt)
5893 sel[i] = 3 * i + k;
5894 else
5895 sel[i] = 0;
5896 indices.new_vector (sel, 2, nelt);
5897 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5899 for (i = 0, j = 0; i < nelt; i++)
5900 if (3 * i + k < 2 * nelt)
5901 sel[i] = i;
5902 else
5903 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5904 indices.new_vector (sel, 2, nelt);
5905 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5907 first_vect = dr_chain[0];
5908 second_vect = dr_chain[1];
5910 /* Create interleaving stmt (low part of):
5911 low = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5912 ...}> */
5913 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5914 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5915 second_vect, perm3_mask_low);
5916 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5918 /* Create interleaving stmt (high part of):
5919 high = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5920 ...}> */
5921 first_vect = data_ref;
5922 second_vect = dr_chain[2];
5923 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5924 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5925 second_vect, perm3_mask_high);
5926 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5927 (*result_chain)[k] = data_ref;
5930 else
5932 /* If length is not equal to 3 then only power of 2 is supported. */
5933 gcc_assert (pow2p_hwi (length));
5935 /* The encoding has a single stepped pattern. */
5936 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5937 vec_perm_builder sel (nelt, 1, 3);
5938 sel.quick_grow (3);
5939 for (i = 0; i < 3; ++i)
5940 sel[i] = i * 2;
5941 vec_perm_indices indices (sel, 2, nelt);
5942 perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);
5944 for (i = 0; i < 3; ++i)
5945 sel[i] = i * 2 + 1;
5946 indices.new_vector (sel, 2, nelt);
5947 perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
5949 for (i = 0; i < log_length; i++)
5951 for (j = 0; j < length; j += 2)
5953 first_vect = dr_chain[j];
5954 second_vect = dr_chain[j+1];
5956 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5957 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5958 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5959 first_vect, second_vect,
5960 perm_mask_even);
5961 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5962 (*result_chain)[j/2] = data_ref;
5964 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5965 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5966 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5967 first_vect, second_vect,
5968 perm_mask_odd);
5969 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5970 (*result_chain)[j/2+length/2] = data_ref;
5972 memcpy (dr_chain.address (), result_chain->address (),
5973 length * sizeof (tree));
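/* A minimal standalone sketch (not part of GCC; for illustration only) of
   the extract_even/extract_odd scheme that vect_permute_load_chain
   implements above for the power-of-two case, using plain int arrays.
   NELT, LEN and the driver in main are assumptions of this demo.  */

#include <stdio.h>
#include <string.h>

#define NELT 8   /* elements per "vector" */
#define LEN  4   /* number of vectors in the chain */

/* Even-numbered elements of the concatenation of A and B.  */
static void
extract_even (const int *a, const int *b, int *out)
{
  for (int i = 0; i < NELT / 2; i++)
    {
      out[i] = a[2 * i];
      out[NELT / 2 + i] = b[2 * i];
    }
}

/* Odd-numbered elements of the concatenation of A and B.  */
static void
extract_odd (const int *a, const int *b, int *out)
{
  for (int i = 0; i < NELT / 2; i++)
    {
      out[i] = a[2 * i + 1];
      out[NELT / 2 + i] = b[2 * i + 1];
    }
}

int
main (void)
{
  int chain[LEN][NELT], result[LEN][NELT];
  for (int j = 0; j < LEN; j++)
    for (int i = 0; i < NELT; i++)
      chain[j][i] = j * NELT + i;       /* elements 0..31, as above */

  /* log2 (LEN) == 2 stages; even results come first, odd results second,
     exactly as RESULT_CHAIN is filled above.  */
  for (int stage = 0; stage < 2; stage++)
    {
      for (int j = 0; j < LEN; j += 2)
        {
          extract_even (chain[j], chain[j + 1], result[j / 2]);
          extract_odd (chain[j], chain[j + 1], result[j / 2 + LEN / 2]);
        }
      memcpy (chain, result, sizeof (chain));
    }

  /* Prints 0 4 8 ... 28 / 1 5 9 ... 29 / 2 6 ... 30 / 3 7 ... 31, the
     reordered RESULT_CHAIN from the comment above.  */
  for (int j = 0; j < LEN; j++)
    {
      for (int i = 0; i < NELT; i++)
        printf ("%d ", chain[j][i]);
      printf ("\n");
    }
  return 0;
}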
5978 /* Function vect_shift_permute_load_chain.
5980 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
5981 sequence of stmts to reorder the input data accordingly.
5982 Return the final references for loads in RESULT_CHAIN.
5983 Return true if successful, false otherwise.
5985 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5986 The input is 3 vectors each containing 8 elements. We assign a
5987 number to each element, the input sequence is:
5989 1st vec: 0 1 2 3 4 5 6 7
5990 2nd vec: 8 9 10 11 12 13 14 15
5991 3rd vec: 16 17 18 19 20 21 22 23
5993 The output sequence should be:
5995 1st vec: 0 3 6 9 12 15 18 21
5996 2nd vec: 1 4 7 10 13 16 19 22
5997 3rd vec: 2 5 8 11 14 17 20 23
5999 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
6001 First we shuffle all 3 vectors to get the correct element order:
6003 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
6004 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
6005 3rd vec: (16 19 22) (17 20 23) (18 21)
6007 Next we unite and shift the vectors 3 times:
6009 1st step:
6010 shift right by 6 the concatenation of:
6011 "1st vec" and "2nd vec"
6012 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
6013 "2nd vec" and "3rd vec"
6014 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
6015 "3rd vec" and "1st vec"
6016 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
6017 | New vectors |
6019 So that now new vectors are:
6021 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
6022 2nd vec: (10 13) (16 19 22) (17 20 23)
6023 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
6025 2nd step:
6026 shift right by 5 the concatenation of:
6027 "1st vec" and "3rd vec"
6028 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
6029 "2nd vec" and "1st vec"
6030 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
6031 "3rd vec" and "2nd vec"
6032 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
6033 | New vectors |
6035 So that now new vectors are:
6037 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
6038 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
6039 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
6041 3rd step:
6042 shift right by 5 the concatenation of:
6043 "1st vec" and "1st vec"
6044 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
6045 shift right by 3 the concatenation of:
6046 "2nd vec" and "2nd vec"
6047 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
6048 | New vectors |
6050 So that now all vectors are READY:
6051 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
6052 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
6053 3rd vec: ( 1 4 7) (10 13) (16 19 22)
6055 This algorithm is faster than the one in vect_permute_load_chain if:
6056 1. "shift of a concatenation" is faster than general permutation.
6057 This is usually so.
6058 2. The TARGET machine can't execute vector instructions in parallel.
6059 This is because each step of the algorithm depends on the previous one.
6060 The algorithm in vect_permute_load_chain is much more parallel.
6062 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
6065 static bool
6066 vect_shift_permute_load_chain (vec<tree> dr_chain,
6067 unsigned int length,
6068 gimple *stmt,
6069 gimple_stmt_iterator *gsi,
6070 vec<tree> *result_chain)
6072 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
6073 tree perm2_mask1, perm2_mask2, perm3_mask;
6074 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
6075 gimple *perm_stmt;
6077 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
6078 unsigned int i;
6079 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6080 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6082 unsigned HOST_WIDE_INT nelt, vf;
6083 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt)
6084 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
6085 /* Not supported for variable-length vectors. */
6086 return false;
6088 vec_perm_builder sel (nelt, nelt, 1);
6089 sel.quick_grow (nelt);
6091 result_chain->quick_grow (length);
6092 memcpy (result_chain->address (), dr_chain.address (),
6093 length * sizeof (tree));
6095 if (pow2p_hwi (length) && vf > 4)
6097 unsigned int j, log_length = exact_log2 (length);
6098 for (i = 0; i < nelt / 2; ++i)
6099 sel[i] = i * 2;
6100 for (i = 0; i < nelt / 2; ++i)
6101 sel[nelt / 2 + i] = i * 2 + 1;
6102 vec_perm_indices indices (sel, 2, nelt);
6103 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6105 if (dump_enabled_p ())
6106 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6107 "shuffle of 2 fields structure is not \
6108 supported by target\n");
6109 return false;
6111 perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);
6113 for (i = 0; i < nelt / 2; ++i)
6114 sel[i] = i * 2 + 1;
6115 for (i = 0; i < nelt / 2; ++i)
6116 sel[nelt / 2 + i] = i * 2;
6117 indices.new_vector (sel, 2, nelt);
6118 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6120 if (dump_enabled_p ())
6121 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6122 "shuffle of 2 fields structure is not \
6123 supported by target\n");
6124 return false;
6126 perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);
6128 /* Generating permutation constant to shift all elements.
6129 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
6130 for (i = 0; i < nelt; i++)
6131 sel[i] = nelt / 2 + i;
6132 indices.new_vector (sel, 2, nelt);
6133 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6135 if (dump_enabled_p ())
6136 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6137 "shift permutation is not supported by target\n");
6138 return false;
6140 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6142 /* Generating permutation constant to select a vector from 2 vectors.
6143 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
6144 for (i = 0; i < nelt / 2; i++)
6145 sel[i] = i;
6146 for (i = nelt / 2; i < nelt; i++)
6147 sel[i] = nelt + i;
6148 indices.new_vector (sel, 2, nelt);
6149 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6151 if (dump_enabled_p ())
6152 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6153 "select is not supported by target\n");
6154 return false;
6156 select_mask = vect_gen_perm_mask_checked (vectype, indices);
6158 for (i = 0; i < log_length; i++)
6160 for (j = 0; j < length; j += 2)
6162 first_vect = dr_chain[j];
6163 second_vect = dr_chain[j + 1];
6165 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6166 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6167 first_vect, first_vect,
6168 perm2_mask1);
6169 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6170 vect[0] = data_ref;
6172 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6173 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6174 second_vect, second_vect,
6175 perm2_mask2);
6176 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6177 vect[1] = data_ref;
6179 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
6180 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6181 vect[0], vect[1], shift1_mask);
6182 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6183 (*result_chain)[j/2 + length/2] = data_ref;
6185 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
6186 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6187 vect[0], vect[1], select_mask);
6188 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6189 (*result_chain)[j/2] = data_ref;
6191 memcpy (dr_chain.address (), result_chain->address (),
6192 length * sizeof (tree));
6194 return true;
6196 if (length == 3 && vf > 2)
6198 unsigned int k = 0, l = 0;
6200 /* Generating permutation constant to get all elements in right order.
6201 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
6202 for (i = 0; i < nelt; i++)
6204 if (3 * k + (l % 3) >= nelt)
6206 k = 0;
6207 l += (3 - (nelt % 3));
6209 sel[i] = 3 * k + (l % 3);
6210 k++;
6212 vec_perm_indices indices (sel, 2, nelt);
6213 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6215 if (dump_enabled_p ())
6216 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6217 "shuffle of 3 fields structure is not \
6218 supported by target\n");
6219 return false;
6221 perm3_mask = vect_gen_perm_mask_checked (vectype, indices);
6223 /* Generating permutation constant to shift all elements.
6224 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
6225 for (i = 0; i < nelt; i++)
6226 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
6227 indices.new_vector (sel, 2, nelt);
6228 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6230 if (dump_enabled_p ())
6231 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6232 "shift permutation is not supported by target\n");
6233 return false;
6235 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6237 /* Generating permutation constant to shift all elements.
6238 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6239 for (i = 0; i < nelt; i++)
6240 sel[i] = 2 * (nelt / 3) + 1 + i;
6241 indices.new_vector (sel, 2, nelt);
6242 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6244 if (dump_enabled_p ())
6245 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6246 "shift permutation is not supported by target\n");
6247 return false;
6249 shift2_mask = vect_gen_perm_mask_checked (vectype, indices);
6251 /* Generating permutation constant to shift all elements.
6252 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
6253 for (i = 0; i < nelt; i++)
6254 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
6255 indices.new_vector (sel, 2, nelt);
6256 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6258 if (dump_enabled_p ())
6259 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6260 "shift permutation is not supported by target\n");
6261 return false;
6263 shift3_mask = vect_gen_perm_mask_checked (vectype, indices);
6265 /* Generating permutation constant to shift all elements.
6266 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6267 for (i = 0; i < nelt; i++)
6268 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
6269 indices.new_vector (sel, 2, nelt);
6270 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6272 if (dump_enabled_p ())
6273 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6274 "shift permutation is not supported by target\n");
6275 return false;
6277 shift4_mask = vect_gen_perm_mask_checked (vectype, indices);
6279 for (k = 0; k < 3; k++)
6281 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
6282 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6283 dr_chain[k], dr_chain[k],
6284 perm3_mask);
6285 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6286 vect[k] = data_ref;
6289 for (k = 0; k < 3; k++)
6291 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
6292 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6293 vect[k % 3], vect[(k + 1) % 3],
6294 shift1_mask);
6295 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6296 vect_shift[k] = data_ref;
6299 for (k = 0; k < 3; k++)
6301 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
6302 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6303 vect_shift[(4 - k) % 3],
6304 vect_shift[(3 - k) % 3],
6305 shift2_mask);
6306 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6307 vect[k] = data_ref;
6310 (*result_chain)[3 - (nelt % 3)] = vect[2];
6312 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
6313 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
6314 vect[0], shift3_mask);
6315 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6316 (*result_chain)[nelt % 3] = data_ref;
6318 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
6319 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
6320 vect[1], shift4_mask);
6321 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6322 (*result_chain)[0] = data_ref;
6323 return true;
6325 return false;
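/* A minimal standalone sketch (not part of GCC; for illustration only)
   that recomputes, for an assumed NELT of 8, the selectors built by
   vect_shift_permute_load_chain for a group of 3 loads, so the
   "{0 3 6 1 4 7 2 5}" shuffle and the shift masks quoted in the comments
   above can be reproduced.  NELT and the printing are demo assumptions.  */

#include <stdio.h>

#define NELT 8

static void
print_sel (const char *name, const unsigned *sel)
{
  printf ("%-7s:", name);
  for (unsigned i = 0; i < NELT; i++)
    printf (" %u", sel[i]);
  printf ("\n");
}

int
main (void)
{
  unsigned sel[NELT];

  /* perm3: bring the elements into {0 3 6 1 4 7 2 5} order.  */
  unsigned k = 0, l = 0;
  for (unsigned i = 0; i < NELT; i++)
    {
      if (3 * k + (l % 3) >= NELT)
        {
          k = 0;
          l += 3 - (NELT % 3);
        }
      sel[i] = 3 * k + (l % 3);
      k++;
    }
  print_sel ("perm3", sel);

  /* shift1: {6 7 8 9 10 11 12 13} for NELT == 8.  */
  for (unsigned i = 0; i < NELT; i++)
    sel[i] = 2 * (NELT / 3) + (NELT % 3) + i;
  print_sel ("shift1", sel);

  /* shift2: {5 6 7 8 9 10 11 12}.  */
  for (unsigned i = 0; i < NELT; i++)
    sel[i] = 2 * (NELT / 3) + 1 + i;
  print_sel ("shift2", sel);

  /* shift3: {3 4 5 6 7 8 9 10}.  */
  for (unsigned i = 0; i < NELT; i++)
    sel[i] = (NELT / 3) + (NELT % 3) / 2 + i;
  print_sel ("shift3", sel);

  /* shift4: {5 6 7 8 9 10 11 12}.  */
  for (unsigned i = 0; i < NELT; i++)
    sel[i] = 2 * (NELT / 3) + (NELT % 3) / 2 + i;
  print_sel ("shift4", sel);

  return 0;
}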
6328 /* Function vect_transform_grouped_load.
6330 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
6331 to perform their permutation and ascribe the result vectorized statements to
6332 the scalar statements.
6335 void
6336 vect_transform_grouped_load (gimple *stmt, vec<tree> dr_chain, int size,
6337 gimple_stmt_iterator *gsi)
6339 machine_mode mode;
6340 vec<tree> result_chain = vNULL;
6342 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
6343 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
6344 vectors, that are ready for vector computation. */
6345 result_chain.create (size);
6347 /* If the reassociation width for the vector type is 2 or greater, the target
6348 machine can execute 2 or more vector instructions in parallel. Otherwise
6349 try to get the chain for the loads group using vect_shift_permute_load_chain.
6350 mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
6351 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
6352 || pow2p_hwi (size)
6353 || !vect_shift_permute_load_chain (dr_chain, size, stmt,
6354 gsi, &result_chain))
6355 vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
6356 vect_record_grouped_load_vectors (stmt, result_chain);
6357 result_chain.release ();
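/* A minimal standalone sketch (not part of GCC; for illustration only) of
   the fall-back order used just above: the parallel-friendly
   extract even/odd chain is chosen when the target can issue several
   permutes at once or the group size is a power of two, and otherwise only
   when the shift-based chain failed.  WIDTH and SHIFT_OK stand for the
   reassociation width and the outcome of the shift-based attempt; both are
   assumptions of this demo, not GCC interfaces.  */

#include <stdbool.h>
#include <stdio.h>

static bool
use_generic_permute (int width, unsigned size, bool shift_ok)
{
  if (width > 1 || (size & (size - 1)) == 0)
    return true;
  return !shift_ok;
}

int
main (void)
{
  printf ("%d\n", use_generic_permute (1, 3, true));   /* 0: shift chain */
  printf ("%d\n", use_generic_permute (2, 3, true));   /* 1: generic */
  printf ("%d\n", use_generic_permute (1, 3, false));  /* 1: fallback */
  return 0;
}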
6360 /* RESULT_CHAIN contains the output of a group of grouped loads that were
6361 generated as part of the vectorization of STMT. Assign the statement
6362 for each vector to the associated scalar statement. */
6364 void
6365 vect_record_grouped_load_vectors (gimple *stmt, vec<tree> result_chain)
6367 gimple *first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
6368 gimple *next_stmt, *new_stmt;
6369 unsigned int i, gap_count;
6370 tree tmp_data_ref;
6372 /* Put a permuted data-ref in the VECTORIZED_STMT field.
6373 Since we scan the chain starting from its first node, their order
6374 corresponds to the order of data-refs in RESULT_CHAIN. */
6375 next_stmt = first_stmt;
6376 gap_count = 1;
6377 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
6379 if (!next_stmt)
6380 break;
6382 /* Skip the gaps. Loads created for the gaps will be removed by dead
6383 code elimination pass later. No need to check for the first stmt in
6384 the group, since it always exists.
6385 GROUP_GAP is the number of steps in elements from the previous
6386 access (if there is no gap GROUP_GAP is 1). We skip loads that
6387 correspond to the gaps. */
6388 if (next_stmt != first_stmt
6389 && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
6391 gap_count++;
6392 continue;
6395 while (next_stmt)
6397 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
6398 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
6399 copies, and we put the new vector statement in the first available
6400 RELATED_STMT. */
6401 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
6402 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
6403 else
6405 if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
6407 gimple *prev_stmt =
6408 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
6409 gimple *rel_stmt =
6410 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
6411 while (rel_stmt)
6413 prev_stmt = rel_stmt;
6414 rel_stmt =
6415 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
6418 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
6419 new_stmt;
6423 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
6424 gap_count = 1;
6425 /* If NEXT_STMT accesses the same DR as the previous statement,
6426 put the same TMP_DATA_REF as its vectorized statement; otherwise
6427 get the next data-ref from RESULT_CHAIN. */
6428 if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
6429 break;
6434 /* Function vect_can_force_dr_alignment_p.
6436 Returns whether the alignment of a DECL can be forced to be aligned
6437 on ALIGNMENT bit boundary. */
6439 bool
6440 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
6442 if (!VAR_P (decl))
6443 return false;
6445 if (decl_in_symtab_p (decl)
6446 && !symtab_node::get (decl)->can_increase_alignment_p ())
6447 return false;
6449 if (TREE_STATIC (decl))
6450 return (alignment <= MAX_OFILE_ALIGNMENT);
6451 else
6452 return (alignment <= MAX_STACK_ALIGNMENT);
6456 /* Return whether the data reference DR is supported with respect to its
6457 alignment.
6458 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
6459 if it is aligned, i.e., check if it is possible to vectorize it with different
6460 alignment. */
6462 enum dr_alignment_support
6463 vect_supportable_dr_alignment (struct data_reference *dr,
6464 bool check_aligned_accesses)
6466 gimple *stmt = DR_STMT (dr);
6467 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6468 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6469 machine_mode mode = TYPE_MODE (vectype);
6470 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6471 struct loop *vect_loop = NULL;
6472 bool nested_in_vect_loop = false;
6474 if (aligned_access_p (dr) && !check_aligned_accesses)
6475 return dr_aligned;
6477 /* For now assume all conditional loads/stores support unaligned
6478 access without any special code. */
6479 if (is_gimple_call (stmt)
6480 && gimple_call_internal_p (stmt)
6481 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
6482 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
6483 return dr_unaligned_supported;
6485 if (loop_vinfo)
6487 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
6488 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
6491 /* Possibly unaligned access. */
6493 /* We can choose between using the implicit realignment scheme (generating
6494 a misaligned_move stmt) and the explicit realignment scheme (generating
6495 aligned loads with a REALIGN_LOAD). There are two variants to the
6496 explicit realignment scheme: optimized, and unoptimized.
6497 We can optimize the realignment only if the step between consecutive
6498 vector loads is equal to the vector size. Since the vector memory
6499 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
6500 is guaranteed that the misalignment amount remains the same throughout the
6501 execution of the vectorized loop. Therefore, we can create the
6502 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
6503 at the loop preheader.
6505 However, in the case of outer-loop vectorization, when vectorizing a
6506 memory access in the inner-loop nested within the LOOP that is now being
6507 vectorized, while it is guaranteed that the misalignment of the
6508 vectorized memory access will remain the same in different outer-loop
6509 iterations, it is *not* guaranteed that it will remain the same throughout
6510 the execution of the inner-loop. This is because the inner-loop advances
6511 with the original scalar step (and not in steps of VS). If the inner-loop
6512 step happens to be a multiple of VS, then the misalignment remains fixed
6513 and we can use the optimized realignment scheme. For example:
6515 for (i=0; i<N; i++)
6516 for (j=0; j<M; j++)
6517 s += a[i+j];
6519 When vectorizing the i-loop in the above example, the step between
6520 consecutive vector loads is 1, and so the misalignment does not remain
6521 fixed across the execution of the inner-loop, and the realignment cannot
6522 be optimized (as illustrated in the following pseudo vectorized loop):
6524 for (i=0; i<N; i+=4)
6525 for (j=0; j<M; j++){
6526 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
6527 // when j is {0,1,2,3,4,5,6,7,...} respectively.
6528 // (assuming that we start from an aligned address).
6531 We therefore have to use the unoptimized realignment scheme:
6533 for (i=0; i<N; i+=4)
6534 for (j=k; j<M; j+=4)
6535 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
6536 // that the misalignment of the initial address is
6537 // 0).
6539 The loop can then be vectorized as follows:
6541 for (k=0; k<4; k++){
6542 rt = get_realignment_token (&vp[k]);
6543 for (i=0; i<N; i+=4){
6544 v1 = vp[i+k];
6545 for (j=k; j<M; j+=4){
6546 v2 = vp[i+j+VS-1];
6547 va = REALIGN_LOAD <v1,v2,rt>;
6548 vs += va;
6549 v1 = v2;
6552 } */
6554 if (DR_IS_READ (dr))
6556 bool is_packed = false;
6557 tree type = (TREE_TYPE (DR_REF (dr)));
6559 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
6560 && (!targetm.vectorize.builtin_mask_for_load
6561 || targetm.vectorize.builtin_mask_for_load ()))
6563 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6565 /* If we are doing SLP then the accesses need not have the
6566 same alignment, instead it depends on the SLP group size. */
6567 if (loop_vinfo
6568 && STMT_SLP_TYPE (stmt_info)
6569 && !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6570 * GROUP_SIZE (vinfo_for_stmt
6571 (GROUP_FIRST_ELEMENT (stmt_info))),
6572 TYPE_VECTOR_SUBPARTS (vectype)))
6574 else if (!loop_vinfo
6575 || (nested_in_vect_loop
6576 && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
6577 GET_MODE_SIZE (TYPE_MODE (vectype)))))
6578 return dr_explicit_realign;
6579 else
6580 return dr_explicit_realign_optimized;
6582 if (!known_alignment_for_access_p (dr))
6583 is_packed = not_size_aligned (DR_REF (dr));
6585 if (targetm.vectorize.support_vector_misalignment
6586 (mode, type, DR_MISALIGNMENT (dr), is_packed))
6587 /* Can't software pipeline the loads, but can at least do them. */
6588 return dr_unaligned_supported;
6590 else
6592 bool is_packed = false;
6593 tree type = (TREE_TYPE (DR_REF (dr)));
6595 if (!known_alignment_for_access_p (dr))
6596 is_packed = not_size_aligned (DR_REF (dr));
6598 if (targetm.vectorize.support_vector_misalignment
6599 (mode, type, DR_MISALIGNMENT (dr), is_packed))
6600 return dr_unaligned_supported;
6603 /* Unsupported. */
6604 return dr_unaligned_unsupported;
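/* A minimal standalone sketch (not part of GCC; for illustration only)
   that reproduces the misalignment pattern from the nested-loop example
   in the comment above.  Assuming a vector of VF elements and an array
   whose start is vector aligned, it prints the misalignment (in elements)
   of &a[i+j] as seen by the inner loop, showing that it cycles rather
   than staying fixed, which is why only dr_explicit_realign (and not the
   optimized variant) applies there.  VF and the bounds are assumptions.  */

#include <stdio.h>

#define VF 4

int
main (void)
{
  for (int i = 0; i < 8; i += VF)        /* outer (vectorized) loop */
    {
      printf ("i=%d:", i);
      for (int j = 0; j < 8; j++)        /* inner loop, scalar step 1 */
        printf (" %d", (i + j) % VF);    /* misalignment of &a[i+j] */
      printf ("\n");                     /* 0 1 2 3 0 1 2 3 each row */
    }
  return 0;
}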