gcc/tree-vect-data-refs.c
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2018 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "memmodel.h"
32 #include "tm_p.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "cgraph.h"
36 #include "dumpfile.h"
37 #include "alias.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "tree-eh.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop.h"
47 #include "cfgloop.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
50 #include "expr.h"
51 #include "builtins.h"
52 #include "params.h"
53 #include "tree-cfg.h"
54 #include "tree-hash-traits.h"
55 #include "vec-perm-indices.h"
56 #include "internal-fn.h"
58 /* Return true if load- or store-lanes optab OPTAB is implemented for
59 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
61 static bool
62 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
63 tree vectype, unsigned HOST_WIDE_INT count)
65 machine_mode mode, array_mode;
66 bool limit_p;
68 mode = TYPE_MODE (vectype);
69 if (!targetm.array_mode (mode, count).exists (&array_mode))
71 poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
72 limit_p = !targetm.array_mode_supported_p (mode, count);
73 if (!int_mode_for_size (bits, limit_p).exists (&array_mode))
75 if (dump_enabled_p ())
76 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
77 "no array mode for %s[%wu]\n",
78 GET_MODE_NAME (mode), count);
79 return false;
83 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
85 if (dump_enabled_p ())
86 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
87 "cannot use %s<%s><%s>\n", name,
88 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
89 return false;
92 if (dump_enabled_p ())
93 dump_printf_loc (MSG_NOTE, vect_location,
94 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
95 GET_MODE_NAME (mode));
97 return true;
101 /* Return the smallest scalar part of STMT_INFO.
102 This is used to determine the vectype of the stmt. We generally set the
103 vectype according to the type of the result (lhs). For stmts whose
104 result-type is different than the type of the arguments (e.g., demotion,
105 promotion), vectype will be reset appropriately (later). Note that we have
106 to visit the smallest datatype in this function, because that determines the
107 VF. If the smallest datatype in the loop is present only as the rhs of a
108 promotion operation - we'd miss it.
109 Such a case, where a variable of this datatype does not appear in the lhs
110 anywhere in the loop, can only occur if it's an invariant: e.g.:
111 'int_x = (int) short_inv', which we'd expect to have been optimized away by
112 invariant motion. However, we cannot rely on invariant motion to always
113 take invariants out of the loop, and so in the case of promotion we also
114 have to check the rhs.
115 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
116 types. */
118 tree
119 vect_get_smallest_scalar_type (stmt_vec_info stmt_info,
120 HOST_WIDE_INT *lhs_size_unit,
121 HOST_WIDE_INT *rhs_size_unit)
123 tree scalar_type = gimple_expr_type (stmt_info->stmt);
124 HOST_WIDE_INT lhs, rhs;
126 /* During the analysis phase, this function is called on arbitrary
127 statements that might not have scalar results. */
128 if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
129 return scalar_type;
131 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
133 gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
134 if (assign
135 && (gimple_assign_cast_p (assign)
136 || gimple_assign_rhs_code (assign) == DOT_PROD_EXPR
137 || gimple_assign_rhs_code (assign) == WIDEN_SUM_EXPR
138 || gimple_assign_rhs_code (assign) == WIDEN_MULT_EXPR
139 || gimple_assign_rhs_code (assign) == WIDEN_LSHIFT_EXPR
140 || gimple_assign_rhs_code (assign) == FLOAT_EXPR))
142 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (assign));
144 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
145 if (rhs < lhs)
146 scalar_type = rhs_type;
149 *lhs_size_unit = lhs;
150 *rhs_size_unit = rhs;
151 return scalar_type;
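/* A hypothetical example of why the rhs must be inspected: in a loop body like

     out[i] = (int) in8[i] + c;

   where in8 has 1-byte elements, the smallest scalar type returned for the
   conversion statement is the 8-bit element type, so with a 16-byte vector
   size the vectorization factor becomes 16 rather than the 4 implied by the
   32-bit result type.  */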
155 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
156 tested at run-time. Return TRUE if DDR was successfully inserted.
157 Return false if versioning is not supported. */
159 static opt_result
160 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
162 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
164 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
165 return opt_result::failure_at (vect_location,
166 "will not create alias checks, as"
167 " --param vect-max-version-for-alias-checks"
168 " == 0\n");
170 opt_result res
171 = runtime_alias_check_p (ddr, loop,
172 optimize_loop_nest_for_speed_p (loop));
173 if (!res)
174 return res;
176 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
177 return opt_result::success ();
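/* For reference, the parameter tested above corresponds to the command-line
   knob --param vect-max-version-for-alias-checks.  A hypothetical invocation

     gcc -O3 --param vect-max-version-for-alias-checks=0 foo.c

   therefore disables runtime alias versioning and makes this function fail
   for every DDR.  */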
180 /* Record that loop LOOP_VINFO needs to check that VALUE is nonzero. */
182 static void
183 vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value)
185 vec<tree> checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo);
186 for (unsigned int i = 0; i < checks.length(); ++i)
187 if (checks[i] == value)
188 return;
190 if (dump_enabled_p ())
191 dump_printf_loc (MSG_NOTE, vect_location,
192 "need run-time check that %T is nonzero\n",
193 value);
194 LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value);
197 /* Return true if we know that the order of vectorized DR_INFO_A and
198 vectorized DR_INFO_B will be the same as the order of DR_INFO_A and
199 DR_INFO_B. At least one of the accesses is a write. */
201 static bool
202 vect_preserves_scalar_order_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b)
204 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
205 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
207 /* Single statements are always kept in their original order. */
208 if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
209 && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
210 return true;
212 /* STMT_A and STMT_B belong to overlapping groups. All loads in a
213 group are emitted at the position of the first scalar load and all
214 stores in a group are emitted at the position of the last scalar store.
215 Thus writes will happen no earlier than their current position
216 (but could happen later) while reads will happen no later than their
217 current position (but could happen earlier). Reordering is therefore
218 only possible if the first access is a write. */
219 stmtinfo_a = vect_orig_stmt (stmtinfo_a);
220 stmtinfo_b = vect_orig_stmt (stmtinfo_b);
221 stmt_vec_info earlier_stmt_info = get_earlier_stmt (stmtinfo_a, stmtinfo_b);
222 return !DR_IS_WRITE (STMT_VINFO_DATA_REF (earlier_stmt_info));
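/* Illustration (hypothetical accesses): given a grouped store followed by a
   grouped load,

     a[i] = x;
     y = a[i+1];

   the store is emitted at the position of the last scalar store of its group
   and may therefore sink below the load, so the scalar order is not treated
   as preserved; if the earlier access is a read, both allowed motions keep
   the read before the write.  */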
225 /* A subroutine of vect_analyze_data_ref_dependence. Handle
226 DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
227 distances. These distances are conservatively correct but they don't
228 reflect a guaranteed dependence.
230 Return true if this function does all the work necessary to avoid
231 an alias or false if the caller should use the dependence distances
232 to limit the vectorization factor in the usual way. LOOP_DEPTH is
233 the depth of the loop described by LOOP_VINFO and the other arguments
234 are as for vect_analyze_data_ref_dependence. */
236 static bool
237 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
238 loop_vec_info loop_vinfo,
239 int loop_depth, unsigned int *max_vf)
241 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
242 lambda_vector dist_v;
243 unsigned int i;
244 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
246 int dist = dist_v[loop_depth];
247 if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
249 /* If the user asserted safelen >= DIST consecutive iterations
250 can be executed concurrently, assume independence.
252 ??? An alternative would be to add the alias check even
253 in this case, and vectorize the fallback loop with the
254 maximum VF set to safelen. However, if the user has
255 explicitly given a length, it's less likely that that
256 would be a win. */
257 if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
259 if ((unsigned int) loop->safelen < *max_vf)
260 *max_vf = loop->safelen;
261 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
262 continue;
265 /* For dependence distances of 2 or more, we have the option
266 of limiting VF or checking for an alias at runtime.
267 Prefer to check at runtime if we can, to avoid limiting
268 the VF unnecessarily when the bases are in fact independent.
270 Note that the alias checks will be removed if the VF ends up
271 being small enough. */
272 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
273 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
274 return (!STMT_VINFO_GATHER_SCATTER_P (dr_info_a->stmt)
275 && !STMT_VINFO_GATHER_SCATTER_P (dr_info_b->stmt)
276 && vect_mark_for_runtime_alias_test (ddr, loop_vinfo));
279 return true;
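/* A sketch of how the safelen path above is normally triggered: annotating
   the loop with

     #pragma omp simd safelen(8)

   sets loop->safelen to 8, so any dependence distance of up to 8 is assumed
   not to matter here and *max_vf is simply capped at 8 instead of emitting a
   runtime alias check.  */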
283 /* Function vect_analyze_data_ref_dependence.
285 FIXME: I needed to change the sense of the returned flag.
287 Return FALSE if there (might) exist a dependence between a memory-reference
288 DRA and a memory-reference DRB. When versioning for alias may check a
289 dependence at run-time, return TRUE. Adjust *MAX_VF according to
290 the data dependence. */
292 static opt_result
293 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
294 loop_vec_info loop_vinfo,
295 unsigned int *max_vf)
297 unsigned int i;
298 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
299 struct data_reference *dra = DDR_A (ddr);
300 struct data_reference *drb = DDR_B (ddr);
301 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (dra);
302 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (drb);
303 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
304 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
305 lambda_vector dist_v;
306 unsigned int loop_depth;
308 /* In loop analysis all data references should be vectorizable. */
309 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
310 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
311 gcc_unreachable ();
313 /* Independent data accesses. */
314 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
315 return opt_result::success ();
317 if (dra == drb
318 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
319 return opt_result::success ();
321 /* We do not have to consider dependences between accesses that belong
322 to the same group, unless the stride could be smaller than the
323 group size. */
324 if (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
325 && (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
326 == DR_GROUP_FIRST_ELEMENT (stmtinfo_b))
327 && !STMT_VINFO_STRIDED_P (stmtinfo_a))
328 return opt_result::success ();
330 /* Even if we have an anti-dependence then, as the vectorized loop covers at
331 least two scalar iterations, there is always also a true dependence.
332 As the vectorizer does not re-order loads and stores we can ignore
333 the anti-dependence if TBAA can disambiguate both DRs similar to the
334 case with known negative distance anti-dependences (positive
335 distance anti-dependences would violate TBAA constraints). */
336 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
337 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
338 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
339 get_alias_set (DR_REF (drb))))
340 return opt_result::success ();
342 /* Unknown data dependence. */
343 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
345 /* If user asserted safelen consecutive iterations can be
346 executed concurrently, assume independence. */
347 if (loop->safelen >= 2)
349 if ((unsigned int) loop->safelen < *max_vf)
350 *max_vf = loop->safelen;
351 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
352 return opt_result::success ();
355 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
356 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
357 return opt_result::failure_at
358 (stmtinfo_a->stmt,
359 "versioning for alias not supported for: "
360 "can't determine dependence between %T and %T\n",
361 DR_REF (dra), DR_REF (drb));
363 if (dump_enabled_p ())
364 dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
365 "versioning for alias required: "
366 "can't determine dependence between %T and %T\n",
367 DR_REF (dra), DR_REF (drb));
369 /* Add to list of ddrs that need to be tested at run-time. */
370 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
373 /* Known data dependence. */
374 if (DDR_NUM_DIST_VECTS (ddr) == 0)
376 /* If user asserted safelen consecutive iterations can be
377 executed concurrently, assume independence. */
378 if (loop->safelen >= 2)
380 if ((unsigned int) loop->safelen < *max_vf)
381 *max_vf = loop->safelen;
382 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
383 return opt_result::success ();
386 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
387 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
388 return opt_result::failure_at
389 (stmtinfo_a->stmt,
390 "versioning for alias not supported for: "
391 "bad dist vector for %T and %T\n",
392 DR_REF (dra), DR_REF (drb));
394 if (dump_enabled_p ())
395 dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
396 "versioning for alias required: "
397 "bad dist vector for %T and %T\n",
398 DR_REF (dra), DR_REF (drb));
399 /* Add to list of ddrs that need to be tested at run-time. */
400 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
403 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
405 if (DDR_COULD_BE_INDEPENDENT_P (ddr)
406 && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
407 loop_depth, max_vf))
408 return opt_result::success ();
410 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
412 int dist = dist_v[loop_depth];
414 if (dump_enabled_p ())
415 dump_printf_loc (MSG_NOTE, vect_location,
416 "dependence distance = %d.\n", dist);
418 if (dist == 0)
420 if (dump_enabled_p ())
421 dump_printf_loc (MSG_NOTE, vect_location,
422 "dependence distance == 0 between %T and %T\n",
423 DR_REF (dra), DR_REF (drb));
425 /* When we perform grouped accesses and perform implicit CSE
426 by detecting equal accesses and doing disambiguation with
427 runtime alias tests like for
428 .. = a[i];
429 .. = a[i+1];
430 a[i] = ..;
431 a[i+1] = ..;
432 *p = ..;
433 .. = a[i];
434 .. = a[i+1];
435 where we will end up loading { a[i], a[i+1] } once, make
436 sure that inserting group loads before the first load and
437 stores after the last store will do the right thing.
438 Similar for groups like
439 a[i] = ...;
440 ... = a[i];
441 a[i+1] = ...;
442 where loads from the group interleave with the store. */
443 if (!vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
444 return opt_result::failure_at (stmtinfo_a->stmt,
445 "READ_WRITE dependence"
446 " in interleaving.\n");
448 if (loop->safelen < 2)
450 tree indicator = dr_zero_step_indicator (dra);
451 if (!indicator || integer_zerop (indicator))
452 return opt_result::failure_at (stmtinfo_a->stmt,
453 "access also has a zero step\n");
454 else if (TREE_CODE (indicator) != INTEGER_CST)
455 vect_check_nonzero_value (loop_vinfo, indicator);
457 continue;
460 if (dist > 0 && DDR_REVERSED_P (ddr))
462 /* If DDR_REVERSED_P the order of the data-refs in DDR was
463 reversed (to make distance vector positive), and the actual
464 distance is negative. */
465 if (dump_enabled_p ())
466 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
467 "dependence distance negative.\n");
468 /* Record a negative dependence distance to later limit the
469 amount of stmt copying / unrolling we can perform.
470 Only need to handle read-after-write dependence. */
471 if (DR_IS_READ (drb)
472 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
473 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
474 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
475 continue;
478 unsigned int abs_dist = abs (dist);
479 if (abs_dist >= 2 && abs_dist < *max_vf)
481 /* The dependence distance requires reduction of the maximal
482 vectorization factor. */
483 *max_vf = abs (dist);
484 if (dump_enabled_p ())
485 dump_printf_loc (MSG_NOTE, vect_location,
486 "adjusting maximal vectorization factor to %i\n",
487 *max_vf);
490 if (abs_dist >= *max_vf)
492 /* Dependence distance does not create dependence, as far as
493 vectorization is concerned, in this case. */
494 if (dump_enabled_p ())
495 dump_printf_loc (MSG_NOTE, vect_location,
496 "dependence distance >= VF.\n");
497 continue;
500 return opt_result::failure_at (stmtinfo_a->stmt,
501 "not vectorized, possible dependence "
502 "between data-refs %T and %T\n",
503 DR_REF (dra), DR_REF (drb));
506 return opt_result::success ();
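/* Worked example (hypothetical loop): a read-after-write recurrence such as

     for (i = 3; i < n; i++)
       a[i] = a[i-3] + b[i];

   has a dependence distance of 3, so the code above lowers *max_vf to 3;
   only when the distance is at least the final VF does the dependence become
   irrelevant for vectorization.  */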
509 /* Function vect_analyze_data_ref_dependences.
511 Examine all the data references in the loop, and make sure there do not
512 exist any data dependences between them. Set *MAX_VF according to
513 the maximum vectorization factor the data dependences allow. */
515 opt_result
516 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
517 unsigned int *max_vf)
519 unsigned int i;
520 struct data_dependence_relation *ddr;
522 DUMP_VECT_SCOPE ("vect_analyze_data_ref_dependences");
524 if (!LOOP_VINFO_DDRS (loop_vinfo).exists ())
526 LOOP_VINFO_DDRS (loop_vinfo)
527 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
528 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
529 /* We need read-read dependences to compute
530 STMT_VINFO_SAME_ALIGN_REFS. */
531 bool res = compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
532 &LOOP_VINFO_DDRS (loop_vinfo),
533 LOOP_VINFO_LOOP_NEST (loop_vinfo),
534 true);
535 gcc_assert (res);
538 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
540 /* For epilogues we either have no aliases or alias versioning
541 was applied to original loop. Therefore we may just get max_vf
542 using VF of original loop. */
543 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
544 *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
545 else
546 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
548 opt_result res
549 = vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf);
550 if (!res)
551 return res;
554 return opt_result::success ();
558 /* Function vect_slp_analyze_data_ref_dependence.
560 Return TRUE if there (might) exist a dependence between a memory-reference
561 DRA and a memory-reference DRB for VINFO. When versioning for alias
562 may check a dependence at run-time, return FALSE. */
565 static bool
566 vect_slp_analyze_data_ref_dependence (vec_info *vinfo,
567 struct data_dependence_relation *ddr)
569 struct data_reference *dra = DDR_A (ddr);
570 struct data_reference *drb = DDR_B (ddr);
571 dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
572 dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
574 /* We need to check dependences of statements marked as unvectorizable
575 as well, they still can prohibit vectorization. */
577 /* Independent data accesses. */
578 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
579 return false;
581 if (dra == drb)
582 return false;
584 /* Read-read is OK. */
585 if (DR_IS_READ (dra) && DR_IS_READ (drb))
586 return false;
588 /* If dra and drb are part of the same interleaving chain consider
589 them independent. */
590 if (STMT_VINFO_GROUPED_ACCESS (dr_info_a->stmt)
591 && (DR_GROUP_FIRST_ELEMENT (dr_info_a->stmt)
592 == DR_GROUP_FIRST_ELEMENT (dr_info_b->stmt)))
593 return false;
595 /* Unknown data dependence. */
596 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
598 if (dump_enabled_p ())
599 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
600 "can't determine dependence between %T and %T\n",
601 DR_REF (dra), DR_REF (drb));
603 else if (dump_enabled_p ())
604 dump_printf_loc (MSG_NOTE, vect_location,
605 "determined dependence between %T and %T\n",
606 DR_REF (dra), DR_REF (drb));
608 return true;
612 /* Analyze dependences involved in the transform of SLP NODE. STORES
613 contain the vector of scalar stores of this instance if we are
614 disambiguating the loads. */
616 static bool
617 vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
618 vec<stmt_vec_info> stores,
619 stmt_vec_info last_store_info)
621 /* This walks over all stmts involved in the SLP load/store done
622 in NODE verifying we can sink them up to the last stmt in the
623 group. */
624 stmt_vec_info last_access_info = vect_find_last_scalar_stmt_in_slp (node);
625 vec_info *vinfo = last_access_info->vinfo;
626 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
628 stmt_vec_info access_info = SLP_TREE_SCALAR_STMTS (node)[k];
629 if (access_info == last_access_info)
630 continue;
631 data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
632 ao_ref ref;
633 bool ref_initialized_p = false;
634 for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
635 gsi_stmt (gsi) != last_access_info->stmt; gsi_next (&gsi))
637 gimple *stmt = gsi_stmt (gsi);
638 if (! gimple_vuse (stmt)
639 || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
640 continue;
642 /* If we couldn't record a (single) data reference for this
643 stmt we have to resort to the alias oracle. */
644 stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
645 data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
646 if (!dr_b)
648 /* We are moving a store or sinking a load - this means
649 we cannot use TBAA for disambiguation. */
650 if (!ref_initialized_p)
651 ao_ref_init (&ref, DR_REF (dr_a));
652 if (stmt_may_clobber_ref_p_1 (stmt, &ref, false)
653 || ref_maybe_used_by_stmt_p (stmt, &ref, false))
654 return false;
655 continue;
658 bool dependent = false;
659 /* If we run into a store of this same instance (we've just
660 marked those) then delay dependence checking until we run
661 into the last store because this is where it will have
662 been sunk to (and we verify if we can do that as well). */
663 if (gimple_visited_p (stmt))
665 if (stmt_info != last_store_info)
666 continue;
667 unsigned i;
668 stmt_vec_info store_info;
669 FOR_EACH_VEC_ELT (stores, i, store_info)
671 data_reference *store_dr = STMT_VINFO_DATA_REF (store_info);
672 ddr_p ddr = initialize_data_dependence_relation
673 (dr_a, store_dr, vNULL);
674 dependent
675 = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
676 free_dependence_relation (ddr);
677 if (dependent)
678 break;
681 else
683 ddr_p ddr = initialize_data_dependence_relation (dr_a,
684 dr_b, vNULL);
685 dependent = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
686 free_dependence_relation (ddr);
688 if (dependent)
689 return false;
692 return true;
696 /* Function vect_slp_analyze_instance_dependence.
698 Examine all the data references in the SLP instance, and make sure there
699 do not exist any data dependences between them that would prevent the
700 instance from being vectorized. */
702 bool
703 vect_slp_analyze_instance_dependence (slp_instance instance)
705 DUMP_VECT_SCOPE ("vect_slp_analyze_instance_dependence");
707 /* The stores of this instance are at the root of the SLP tree. */
708 slp_tree store = SLP_INSTANCE_TREE (instance);
709 if (! STMT_VINFO_DATA_REF (SLP_TREE_SCALAR_STMTS (store)[0]))
710 store = NULL;
712 /* Verify we can sink stores to the vectorized stmt insert location. */
713 stmt_vec_info last_store_info = NULL;
714 if (store)
716 if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
717 return false;
719 /* Mark stores in this instance and remember the last one. */
720 last_store_info = vect_find_last_scalar_stmt_in_slp (store);
721 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
722 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, true);
725 bool res = true;
727 /* Verify we can sink loads to the vectorized stmt insert location,
728 special-casing stores of this instance. */
729 slp_tree load;
730 unsigned int i;
731 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
732 if (! vect_slp_analyze_node_dependences (instance, load,
733 store
734 ? SLP_TREE_SCALAR_STMTS (store)
735 : vNULL, last_store_info))
737 res = false;
738 break;
741 /* Unset the visited flag. */
742 if (store)
743 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
744 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, false);
746 return res;
749 /* Record the base alignment guarantee given by DRB, which occurs
750 in STMT_INFO. */
752 static void
753 vect_record_base_alignment (stmt_vec_info stmt_info,
754 innermost_loop_behavior *drb)
756 vec_info *vinfo = stmt_info->vinfo;
757 bool existed;
758 innermost_loop_behavior *&entry
759 = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
760 if (!existed || entry->base_alignment < drb->base_alignment)
762 entry = drb;
763 if (dump_enabled_p ())
764 dump_printf_loc (MSG_NOTE, vect_location,
765 "recording new base alignment for %T\n"
766 " alignment: %d\n"
767 " misalignment: %d\n"
768 " based on: %G",
769 drb->base_address,
770 drb->base_alignment,
771 drb->base_misalignment,
772 stmt_info->stmt);
776 /* If the region we're going to vectorize is reached, all unconditional
777 data references occur at least once. We can therefore pool the base
778 alignment guarantees from each unconditional reference. Do this by
779 going through all the data references in VINFO and checking whether
780 the containing statement makes the reference unconditionally. If so,
781 record the alignment of the base address in VINFO so that it can be
782 used for all other references with the same base. */
784 void
785 vect_record_base_alignments (vec_info *vinfo)
787 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
788 struct loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
789 data_reference *dr;
790 unsigned int i;
791 FOR_EACH_VEC_ELT (vinfo->shared->datarefs, i, dr)
793 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
794 stmt_vec_info stmt_info = dr_info->stmt;
795 if (!DR_IS_CONDITIONAL_IN_STMT (dr)
796 && STMT_VINFO_VECTORIZABLE (stmt_info)
797 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
799 vect_record_base_alignment (stmt_info, &DR_INNERMOST (dr));
801 /* If DR is nested in the loop that is being vectorized, we can also
802 record the alignment of the base wrt the outer loop. */
803 if (loop && nested_in_vect_loop_p (loop, stmt_info))
804 vect_record_base_alignment
805 (stmt_info, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
810 /* Return the target alignment for the vectorized form of DR_INFO. */
812 static unsigned int
813 vect_calculate_target_alignment (dr_vec_info *dr_info)
815 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
816 return targetm.vectorize.preferred_vector_alignment (vectype);
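/* Note: the default implementation of this hook simply returns
   TYPE_ALIGN (vectype); individual targets may override it, e.g. to ask for
   a full vector-width alignment even when the vector type itself is given a
   smaller alignment.  */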
819 /* Function vect_compute_data_ref_alignment
821 Compute the misalignment of the data reference DR_INFO.
823 Output:
824 1. DR_MISALIGNMENT (DR_INFO) is defined.
826 FOR NOW: No analysis is actually performed. Misalignment is calculated
827 only for trivial cases. TODO. */
829 static void
830 vect_compute_data_ref_alignment (dr_vec_info *dr_info)
832 stmt_vec_info stmt_info = dr_info->stmt;
833 vec_base_alignments *base_alignments = &stmt_info->vinfo->base_alignments;
834 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
835 struct loop *loop = NULL;
836 tree ref = DR_REF (dr_info->dr);
837 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
839 if (dump_enabled_p ())
840 dump_printf_loc (MSG_NOTE, vect_location,
841 "vect_compute_data_ref_alignment:\n");
843 if (loop_vinfo)
844 loop = LOOP_VINFO_LOOP (loop_vinfo);
846 /* Initialize misalignment to unknown. */
847 SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
849 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
850 return;
852 innermost_loop_behavior *drb = vect_dr_behavior (dr_info);
853 bool step_preserves_misalignment_p;
855 unsigned HOST_WIDE_INT vector_alignment
856 = vect_calculate_target_alignment (dr_info) / BITS_PER_UNIT;
857 DR_TARGET_ALIGNMENT (dr_info) = vector_alignment;
859 /* No step for BB vectorization. */
860 if (!loop)
862 gcc_assert (integer_zerop (drb->step));
863 step_preserves_misalignment_p = true;
866 /* In case the dataref is in an inner-loop of the loop that is being
867 vectorized (LOOP), we use the base and misalignment information
868 relative to the outer-loop (LOOP). This is ok only if the misalignment
869 stays the same throughout the execution of the inner-loop, which is why
870 we have to check that the stride of the dataref in the inner-loop is a
871 multiple of the vector alignment. */
872 else if (nested_in_vect_loop_p (loop, stmt_info))
874 step_preserves_misalignment_p
875 = (DR_STEP_ALIGNMENT (dr_info->dr) % vector_alignment) == 0;
877 if (dump_enabled_p ())
879 if (step_preserves_misalignment_p)
880 dump_printf_loc (MSG_NOTE, vect_location,
881 "inner step divides the vector alignment.\n");
882 else
883 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
884 "inner step doesn't divide the vector"
885 " alignment.\n");
889 /* Similarly we can only use base and misalignment information relative to
890 an innermost loop if the misalignment stays the same throughout the
891 execution of the loop. As above, this is the case if the stride of
892 the dataref, multiplied by the vectorization factor, is a multiple of the alignment. */
893 else
895 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
896 step_preserves_misalignment_p
897 = multiple_p (DR_STEP_ALIGNMENT (dr_info->dr) * vf, vector_alignment);
899 if (!step_preserves_misalignment_p && dump_enabled_p ())
900 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
901 "step doesn't divide the vector alignment.\n");
904 unsigned int base_alignment = drb->base_alignment;
905 unsigned int base_misalignment = drb->base_misalignment;
907 /* Calculate the maximum of the pooled base address alignment and the
908 alignment that we can compute for DR itself. */
909 innermost_loop_behavior **entry = base_alignments->get (drb->base_address);
910 if (entry && base_alignment < (*entry)->base_alignment)
912 base_alignment = (*entry)->base_alignment;
913 base_misalignment = (*entry)->base_misalignment;
916 if (drb->offset_alignment < vector_alignment
917 || !step_preserves_misalignment_p
918 /* We need to know whether the step wrt the vectorized loop is
919 negative when computing the starting misalignment below. */
920 || TREE_CODE (drb->step) != INTEGER_CST)
922 if (dump_enabled_p ())
923 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
924 "Unknown alignment for access: %T\n", ref);
925 return;
928 if (base_alignment < vector_alignment)
930 unsigned int max_alignment;
931 tree base = get_base_for_alignment (drb->base_address, &max_alignment);
932 if (max_alignment < vector_alignment
933 || !vect_can_force_dr_alignment_p (base,
934 vector_alignment * BITS_PER_UNIT))
936 if (dump_enabled_p ())
937 dump_printf_loc (MSG_NOTE, vect_location,
938 "can't force alignment of ref: %T\n", ref);
939 return;
942 /* Force the alignment of the decl.
943 NOTE: This is the only change to the code we make during
944 the analysis phase, before deciding to vectorize the loop. */
945 if (dump_enabled_p ())
946 dump_printf_loc (MSG_NOTE, vect_location,
947 "force alignment of %T\n", ref);
949 dr_info->base_decl = base;
950 dr_info->base_misaligned = true;
951 base_misalignment = 0;
953 poly_int64 misalignment
954 = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();
956 /* If this is a backward running DR then first access in the larger
957 vectype actually is N-1 elements before the address in the DR.
958 Adjust misalign accordingly. */
959 if (tree_int_cst_sgn (drb->step) < 0)
960 /* PLUS because STEP is negative. */
961 misalignment += ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
962 * TREE_INT_CST_LOW (drb->step));
964 unsigned int const_misalignment;
965 if (!known_misalignment (misalignment, vector_alignment,
966 &const_misalignment))
968 if (dump_enabled_p ())
969 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
970 "Non-constant misalignment for access: %T\n", ref);
971 return;
974 SET_DR_MISALIGNMENT (dr_info, const_misalignment);
976 if (dump_enabled_p ())
977 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
978 "misalign = %d bytes of ref %T\n",
979 DR_MISALIGNMENT (dr_info), ref);
981 return;
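/* Worked example with made-up numbers: with a 16-byte target alignment, a
   base address whose known misalignment is 0 and a DR_INIT of 20 bytes, the
   code above computes misalignment = 0 + 20 and records
   DR_MISALIGNMENT = 20 % 16 = 4.  */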
984 /* Function vect_update_misalignment_for_peel.
985 Sets DR_INFO's misalignment
986 - to 0 if it has the same alignment as DR_PEEL_INFO,
987 - to the misalignment computed using NPEEL if DR_INFO's alignment is known,
988 - to -1 (unknown) otherwise.
990 DR_INFO - the data reference whose misalignment is to be adjusted.
991 DR_PEEL_INFO - the data reference whose misalignment is being made
992 zero in the vector loop by the peel.
993 NPEEL - the number of iterations in the peel loop if the misalignment
994 of DR_PEEL_INFO is known at compile time. */
996 static void
997 vect_update_misalignment_for_peel (dr_vec_info *dr_info,
998 dr_vec_info *dr_peel_info, int npeel)
1000 unsigned int i;
1001 vec<dr_p> same_aligned_drs;
1002 struct data_reference *current_dr;
1003 int dr_size = vect_get_scalar_dr_size (dr_info);
1004 int dr_peel_size = vect_get_scalar_dr_size (dr_peel_info);
1005 stmt_vec_info stmt_info = dr_info->stmt;
1006 stmt_vec_info peel_stmt_info = dr_peel_info->stmt;
1008 /* For interleaved data accesses the step in the loop must be multiplied by
1009 the size of the interleaving group. */
1010 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1011 dr_size *= DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info));
1012 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
1013 dr_peel_size *= DR_GROUP_SIZE (peel_stmt_info);
1015 /* It can be assumed that the data refs with the same alignment as dr_peel
1016 are aligned in the vector loop. */
1017 same_aligned_drs = STMT_VINFO_SAME_ALIGN_REFS (peel_stmt_info);
1018 FOR_EACH_VEC_ELT (same_aligned_drs, i, current_dr)
1020 if (current_dr != dr_info->dr)
1021 continue;
1022 gcc_assert (!known_alignment_for_access_p (dr_info)
1023 || !known_alignment_for_access_p (dr_peel_info)
1024 || (DR_MISALIGNMENT (dr_info) / dr_size
1025 == DR_MISALIGNMENT (dr_peel_info) / dr_peel_size));
1026 SET_DR_MISALIGNMENT (dr_info, 0);
1027 return;
1030 if (known_alignment_for_access_p (dr_info)
1031 && known_alignment_for_access_p (dr_peel_info))
1033 bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
1034 size_zero_node) < 0;
1035 int misal = DR_MISALIGNMENT (dr_info);
1036 misal += negative ? -npeel * dr_size : npeel * dr_size;
1037 misal &= DR_TARGET_ALIGNMENT (dr_info) - 1;
1038 SET_DR_MISALIGNMENT (dr_info, misal);
1039 return;
1042 if (dump_enabled_p ())
1043 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1044 "to unknown (-1).\n");
1045 SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
1049 /* Function verify_data_ref_alignment
1051 Return TRUE if DR_INFO can be handled with respect to alignment. */
1053 static opt_result
1054 verify_data_ref_alignment (dr_vec_info *dr_info)
1056 enum dr_alignment_support supportable_dr_alignment
1057 = vect_supportable_dr_alignment (dr_info, false);
1058 if (!supportable_dr_alignment)
1059 return opt_result::failure_at
1060 (dr_info->stmt->stmt,
1061 DR_IS_READ (dr_info->dr)
1062 ? "not vectorized: unsupported unaligned load: %T\n"
1063 : "not vectorized: unsupported unaligned store: %T\n",
1064 DR_REF (dr_info->dr));
1066 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
1067 dump_printf_loc (MSG_NOTE, vect_location,
1068 "Vectorizing an unaligned access.\n");
1070 return opt_result::success ();
1073 /* Function vect_verify_datarefs_alignment
1075 Return TRUE if all data references in the loop can be
1076 handled with respect to alignment. */
1078 opt_result
1079 vect_verify_datarefs_alignment (loop_vec_info vinfo)
1081 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
1082 struct data_reference *dr;
1083 unsigned int i;
1085 FOR_EACH_VEC_ELT (datarefs, i, dr)
1087 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
1088 stmt_vec_info stmt_info = dr_info->stmt;
1090 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1091 continue;
1093 /* For interleaving, only the alignment of the first access matters. */
1094 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1095 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1096 continue;
1098 /* Strided accesses perform only component accesses, alignment is
1099 irrelevant for them. */
1100 if (STMT_VINFO_STRIDED_P (stmt_info)
1101 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1102 continue;
1104 opt_result res = verify_data_ref_alignment (dr_info);
1105 if (!res)
1106 return res;
1109 return opt_result::success ();
1112 /* Given a memory reference EXP, return whether its alignment is less
1113 than its size. */
1115 static bool
1116 not_size_aligned (tree exp)
1118 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1119 return true;
1121 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1122 > get_object_alignment (exp));
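/* E.g. (hypothetical): an int member of a packed struct has TYPE_SIZE of 32
   bits while get_object_alignment may only guarantee 8 bits, so such a
   reference is considered not size-aligned.  */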
1125 /* Function vector_alignment_reachable_p
1127 Return true if vector alignment for DR_INFO is reachable by peeling
1128 a few loop iterations. Return false otherwise. */
1130 static bool
1131 vector_alignment_reachable_p (dr_vec_info *dr_info)
1133 stmt_vec_info stmt_info = dr_info->stmt;
1134 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1136 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1138 /* For interleaved access we peel only if number of iterations in
1139 the prolog loop ({VF - misalignment}), is a multiple of the
1140 number of the interleaved accesses. */
1141 int elem_size, mis_in_elements;
1143 /* FORNOW: handle only known alignment. */
1144 if (!known_alignment_for_access_p (dr_info))
1145 return false;
1147 poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1148 poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
1149 elem_size = vector_element_size (vector_size, nelements);
1150 mis_in_elements = DR_MISALIGNMENT (dr_info) / elem_size;
1152 if (!multiple_p (nelements - mis_in_elements, DR_GROUP_SIZE (stmt_info)))
1153 return false;
1156 /* If misalignment is known at the compile time then allow peeling
1157 only if natural alignment is reachable through peeling. */
1158 if (known_alignment_for_access_p (dr_info) && !aligned_access_p (dr_info))
1160 HOST_WIDE_INT elmsize =
1161 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1162 if (dump_enabled_p ())
1164 dump_printf_loc (MSG_NOTE, vect_location,
1165 "data size = %wd. misalignment = %d.\n", elmsize,
1166 DR_MISALIGNMENT (dr_info));
1168 if (DR_MISALIGNMENT (dr_info) % elmsize)
1170 if (dump_enabled_p ())
1171 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1172 "data size does not divide the misalignment.\n");
1173 return false;
1177 if (!known_alignment_for_access_p (dr_info))
1179 tree type = TREE_TYPE (DR_REF (dr_info->dr));
1180 bool is_packed = not_size_aligned (DR_REF (dr_info->dr));
1181 if (dump_enabled_p ())
1182 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1183 "Unknown misalignment, %snaturally aligned\n",
1184 is_packed ? "not " : "");
1185 return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1188 return true;
1192 /* Calculate the cost of the memory access represented by DR_INFO. */
1194 static void
1195 vect_get_data_access_cost (dr_vec_info *dr_info,
1196 unsigned int *inside_cost,
1197 unsigned int *outside_cost,
1198 stmt_vector_for_cost *body_cost_vec,
1199 stmt_vector_for_cost *prologue_cost_vec)
1201 stmt_vec_info stmt_info = dr_info->stmt;
1202 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1203 int ncopies;
1205 if (PURE_SLP_STMT (stmt_info))
1206 ncopies = 1;
1207 else
1208 ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1210 if (DR_IS_READ (dr_info->dr))
1211 vect_get_load_cost (stmt_info, ncopies, true, inside_cost, outside_cost,
1212 prologue_cost_vec, body_cost_vec, false);
1213 else
1214 vect_get_store_cost (stmt_info, ncopies, inside_cost, body_cost_vec);
1216 if (dump_enabled_p ())
1217 dump_printf_loc (MSG_NOTE, vect_location,
1218 "vect_get_data_access_cost: inside_cost = %d, "
1219 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1223 typedef struct _vect_peel_info
1225 dr_vec_info *dr_info;
1226 int npeel;
1227 unsigned int count;
1228 } *vect_peel_info;
1230 typedef struct _vect_peel_extended_info
1232 struct _vect_peel_info peel_info;
1233 unsigned int inside_cost;
1234 unsigned int outside_cost;
1235 } *vect_peel_extended_info;
1238 /* Peeling hashtable helpers. */
1240 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1242 static inline hashval_t hash (const _vect_peel_info *);
1243 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1246 inline hashval_t
1247 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1249 return (hashval_t) peel_info->npeel;
1252 inline bool
1253 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1255 return (a->npeel == b->npeel);
1259 /* Insert DR_INFO into peeling hash table with NPEEL as key. */
1261 static void
1262 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1263 loop_vec_info loop_vinfo, dr_vec_info *dr_info,
1264 int npeel)
1266 struct _vect_peel_info elem, *slot;
1267 _vect_peel_info **new_slot;
1268 bool supportable_dr_alignment
1269 = vect_supportable_dr_alignment (dr_info, true);
1271 elem.npeel = npeel;
1272 slot = peeling_htab->find (&elem);
1273 if (slot)
1274 slot->count++;
1275 else
1277 slot = XNEW (struct _vect_peel_info);
1278 slot->npeel = npeel;
1279 slot->dr_info = dr_info;
1280 slot->count = 1;
1281 new_slot = peeling_htab->find_slot (slot, INSERT);
1282 *new_slot = slot;
1285 if (!supportable_dr_alignment
1286 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1287 slot->count += VECT_MAX_COST;
1291 /* Traverse peeling hash table to find peeling option that aligns maximum
1292 number of data accesses. */
1295 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1296 _vect_peel_extended_info *max)
1298 vect_peel_info elem = *slot;
1300 if (elem->count > max->peel_info.count
1301 || (elem->count == max->peel_info.count
1302 && max->peel_info.npeel > elem->npeel))
1304 max->peel_info.npeel = elem->npeel;
1305 max->peel_info.count = elem->count;
1306 max->peel_info.dr_info = elem->dr_info;
1309 return 1;
1312 /* Get the costs of peeling NPEEL iterations for LOOP_VINFO, checking
1313 data access costs for all data refs. If UNKNOWN_MISALIGNMENT is true,
1314 we assume DR0_INFO's misalignment will be zero after peeling. */
1316 static void
1317 vect_get_peeling_costs_all_drs (loop_vec_info loop_vinfo,
1318 dr_vec_info *dr0_info,
1319 unsigned int *inside_cost,
1320 unsigned int *outside_cost,
1321 stmt_vector_for_cost *body_cost_vec,
1322 stmt_vector_for_cost *prologue_cost_vec,
1323 unsigned int npeel,
1324 bool unknown_misalignment)
1326 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1327 unsigned i;
1328 data_reference *dr;
1330 FOR_EACH_VEC_ELT (datarefs, i, dr)
1332 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1333 stmt_vec_info stmt_info = dr_info->stmt;
1334 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1335 continue;
1337 /* For interleaving, only the alignment of the first access
1338 matters. */
1339 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1340 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1341 continue;
1343 /* Strided accesses perform only component accesses, alignment is
1344 irrelevant for them. */
1345 if (STMT_VINFO_STRIDED_P (stmt_info)
1346 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1347 continue;
1349 int save_misalignment;
1350 save_misalignment = DR_MISALIGNMENT (dr_info);
1351 if (npeel == 0)
1353 else if (unknown_misalignment && dr_info == dr0_info)
1354 SET_DR_MISALIGNMENT (dr_info, 0);
1355 else
1356 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
1357 vect_get_data_access_cost (dr_info, inside_cost, outside_cost,
1358 body_cost_vec, prologue_cost_vec);
1359 SET_DR_MISALIGNMENT (dr_info, save_misalignment);
1363 /* Traverse peeling hash table and calculate cost for each peeling option.
1364 Find the one with the lowest cost. */
1367 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1368 _vect_peel_extended_info *min)
1370 vect_peel_info elem = *slot;
1371 int dummy;
1372 unsigned int inside_cost = 0, outside_cost = 0;
1373 stmt_vec_info stmt_info = elem->dr_info->stmt;
1374 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1375 stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1376 epilogue_cost_vec;
1378 prologue_cost_vec.create (2);
1379 body_cost_vec.create (2);
1380 epilogue_cost_vec.create (2);
1382 vect_get_peeling_costs_all_drs (loop_vinfo, elem->dr_info, &inside_cost,
1383 &outside_cost, &body_cost_vec,
1384 &prologue_cost_vec, elem->npeel, false);
1386 body_cost_vec.release ();
1388 outside_cost += vect_get_known_peeling_cost
1389 (loop_vinfo, elem->npeel, &dummy,
1390 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1391 &prologue_cost_vec, &epilogue_cost_vec);
1393 /* Prologue and epilogue costs are added to the target model later.
1394 These costs depend only on the scalar iteration cost, the
1395 number of peeling iterations finally chosen, and the number of
1396 misaligned statements. So discard the information found here. */
1397 prologue_cost_vec.release ();
1398 epilogue_cost_vec.release ();
1400 if (inside_cost < min->inside_cost
1401 || (inside_cost == min->inside_cost
1402 && outside_cost < min->outside_cost))
1404 min->inside_cost = inside_cost;
1405 min->outside_cost = outside_cost;
1406 min->peel_info.dr_info = elem->dr_info;
1407 min->peel_info.npeel = elem->npeel;
1408 min->peel_info.count = elem->count;
1411 return 1;
1415 /* Choose best peeling option by traversing peeling hash table and either
1416 choosing an option with the lowest cost (if cost model is enabled) or the
1417 option that aligns as many accesses as possible. */
1419 static struct _vect_peel_extended_info
1420 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1421 loop_vec_info loop_vinfo)
1423 struct _vect_peel_extended_info res;
1425 res.peel_info.dr_info = NULL;
1427 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1429 res.inside_cost = INT_MAX;
1430 res.outside_cost = INT_MAX;
1431 peeling_htab->traverse <_vect_peel_extended_info *,
1432 vect_peeling_hash_get_lowest_cost> (&res);
1434 else
1436 res.peel_info.count = 0;
1437 peeling_htab->traverse <_vect_peel_extended_info *,
1438 vect_peeling_hash_get_most_frequent> (&res);
1439 res.inside_cost = 0;
1440 res.outside_cost = 0;
1443 return res;
1446 /* Return true if the new peeling NPEEL is supported. */
1448 static bool
1449 vect_peeling_supportable (loop_vec_info loop_vinfo, dr_vec_info *dr0_info,
1450 unsigned npeel)
1452 unsigned i;
1453 struct data_reference *dr = NULL;
1454 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1455 enum dr_alignment_support supportable_dr_alignment;
1457 /* Ensure that all data refs can be vectorized after the peel. */
1458 FOR_EACH_VEC_ELT (datarefs, i, dr)
1460 int save_misalignment;
1462 if (dr == dr0_info->dr)
1463 continue;
1465 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1466 stmt_vec_info stmt_info = dr_info->stmt;
1467 /* For interleaving, only the alignment of the first access
1468 matters. */
1469 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1470 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1471 continue;
1473 /* Strided accesses perform only component accesses, alignment is
1474 irrelevant for them. */
1475 if (STMT_VINFO_STRIDED_P (stmt_info)
1476 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1477 continue;
1479 save_misalignment = DR_MISALIGNMENT (dr_info);
1480 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
1481 supportable_dr_alignment
1482 = vect_supportable_dr_alignment (dr_info, false);
1483 SET_DR_MISALIGNMENT (dr_info, save_misalignment);
1485 if (!supportable_dr_alignment)
1486 return false;
1489 return true;
1492 /* Function vect_enhance_data_refs_alignment
1494 This pass will use loop versioning and loop peeling in order to enhance
1495 the alignment of data references in the loop.
1497 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1498 original loop is to be vectorized. Any other loops that are created by
1499 the transformations performed in this pass - are not supposed to be
1500 vectorized. This restriction will be relaxed.
1502 This pass will require a cost model to guide it whether to apply peeling
1503 or versioning or a combination of the two. For example, the scheme that
1504 Intel uses when given a loop with several memory accesses is as follows:
1505 choose one memory access ('p') whose alignment you want to force by doing
1506 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1507 other accesses are not necessarily aligned, or (2) use loop versioning to
1508 generate one loop in which all accesses are aligned, and another loop in
1509 which only 'p' is necessarily aligned.
1511 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1512 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1513 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1515 Devising a cost model is the most critical aspect of this work. It will
1516 guide us on which access to peel for, whether to use loop versioning, how
1517 many versions to create, etc. The cost model will probably consist of
1518 generic considerations as well as target specific considerations (on
1519 powerpc for example, misaligned stores are more painful than misaligned
1520 loads).
1522 Here are the general steps involved in alignment enhancements:
1524 -- original loop, before alignment analysis:
1525 for (i=0; i<N; i++){
1526 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1527 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1530 -- After vect_compute_data_refs_alignment:
1531 for (i=0; i<N; i++){
1532 x = q[i]; # DR_MISALIGNMENT(q) = 3
1533 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1536 -- Possibility 1: we do loop versioning:
1537 if (p is aligned) {
1538 for (i=0; i<N; i++){ # loop 1A
1539 x = q[i]; # DR_MISALIGNMENT(q) = 3
1540 p[i] = y; # DR_MISALIGNMENT(p) = 0
1543 else {
1544 for (i=0; i<N; i++){ # loop 1B
1545 x = q[i]; # DR_MISALIGNMENT(q) = 3
1546 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1550 -- Possibility 2: we do loop peeling:
1551 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1552 x = q[i];
1553 p[i] = y;
1555 for (i = 3; i < N; i++){ # loop 2A
1556 x = q[i]; # DR_MISALIGNMENT(q) = 0
1557 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1560 -- Possibility 3: combination of loop peeling and versioning:
1561 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1562 x = q[i];
1563 p[i] = y;
1565 if (p is aligned) {
1566 for (i = 3; i<N; i++){ # loop 3A
1567 x = q[i]; # DR_MISALIGNMENT(q) = 0
1568 p[i] = y; # DR_MISALIGNMENT(p) = 0
1571 else {
1572 for (i = 3; i<N; i++){ # loop 3B
1573 x = q[i]; # DR_MISALIGNMENT(q) = 0
1574 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1578 These loops are later passed to loop_transform to be vectorized. The
1579 vectorizer will use the alignment information to guide the transformation
1580 (whether to generate regular loads/stores, or with special handling for
1581 misalignment). */
1583 opt_result
1584 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1586 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1587 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1588 enum dr_alignment_support supportable_dr_alignment;
1589 dr_vec_info *first_store = NULL;
1590 dr_vec_info *dr0_info = NULL;
1591 struct data_reference *dr;
1592 unsigned int i, j;
1593 bool do_peeling = false;
1594 bool do_versioning = false;
1595 unsigned int npeel = 0;
1596 bool one_misalignment_known = false;
1597 bool one_misalignment_unknown = false;
1598 bool one_dr_unsupportable = false;
1599 dr_vec_info *unsupportable_dr_info = NULL;
1600 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1601 unsigned possible_npeel_number = 1;
1602 tree vectype;
1603 unsigned int mis, same_align_drs_max = 0;
1604 hash_table<peel_info_hasher> peeling_htab (1);
1606 DUMP_VECT_SCOPE ("vect_enhance_data_refs_alignment");
1608 /* Reset data so we can safely be called multiple times. */
1609 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1610 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1612 /* While cost model enhancements are expected in the future, the high level
1613 view of the code at this time is as follows:
1615 A) If there is a misaligned access then see if peeling to align
1616 this access can make all data references satisfy
1617 vect_supportable_dr_alignment. If so, update data structures
1618 as needed and return true.
1620 B) If peeling wasn't possible and there is a data reference with an
1621 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1622 then see if loop versioning checks can be used to make all data
1623 references satisfy vect_supportable_dr_alignment. If so, update
1624 data structures as needed and return true.
1626 C) If neither peeling nor versioning were successful then return false if
1627 any data reference does not satisfy vect_supportable_dr_alignment.
1629 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1631 Note, Possibility 3 above (which is peeling and versioning together) is not
1632 being done at this time. */
1634 /* (1) Peeling to force alignment. */
1636 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1637 Considerations:
1638 + How many accesses will become aligned due to the peeling
1639 - How many accesses will become unaligned due to the peeling,
1640 and the cost of misaligned accesses.
1641 - The cost of peeling (the extra runtime checks, the increase
1642 in code size). */
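/* A concrete (hypothetical) instance of the arithmetic used below: with a
   16-byte target alignment and 4-byte elements, DR_MISALIGNMENT == 4 is
   cured by peeling (16 - 4) / 4 == 3 scalar iterations, so 3 is the first
   npeel value entered into the peeling hash table for that data-ref.  */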
1644 FOR_EACH_VEC_ELT (datarefs, i, dr)
1646 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1647 stmt_vec_info stmt_info = dr_info->stmt;
1649 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1650 continue;
1652 /* For interleaving, only the alignment of the first access
1653 matters. */
1654 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1655 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1656 continue;
1658 /* For scatter-gather or invariant accesses there is nothing
1659 to enhance. */
1660 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1661 || integer_zerop (DR_STEP (dr)))
1662 continue;
1664 /* Strided accesses perform only component accesses, alignment is
1665 irrelevant for them. */
1666 if (STMT_VINFO_STRIDED_P (stmt_info)
1667 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1668 continue;
1670 supportable_dr_alignment = vect_supportable_dr_alignment (dr_info, true);
1671 do_peeling = vector_alignment_reachable_p (dr_info);
1672 if (do_peeling)
1674 if (known_alignment_for_access_p (dr_info))
1676 unsigned int npeel_tmp = 0;
1677 bool negative = tree_int_cst_compare (DR_STEP (dr),
1678 size_zero_node) < 0;
1680 vectype = STMT_VINFO_VECTYPE (stmt_info);
1681 unsigned int target_align = DR_TARGET_ALIGNMENT (dr_info);
1682 unsigned int dr_size = vect_get_scalar_dr_size (dr_info);
1683 mis = (negative
1684 ? DR_MISALIGNMENT (dr_info)
1685 : -DR_MISALIGNMENT (dr_info));
1686 if (DR_MISALIGNMENT (dr_info) != 0)
1687 npeel_tmp = (mis & (target_align - 1)) / dr_size;
1689 /* For multiple types, it is possible that the bigger type access
1690 will have more than one peeling option. E.g., a loop with two
1691 types: one of size (vector size / 4), and the other one of
1692 size (vector size / 8). The vectorization factor will be 8. If both
1693 accesses are misaligned by 3, the first one needs one scalar
1694 iteration to be aligned, and the second one needs 5. But the
1695 first one will be aligned also by peeling 5 scalar
1696 iterations, and in that case both accesses will be aligned.
1697 Hence, except for the immediate peeling amount, we also want
1698 to try to add full vector size, while we don't exceed
1699 vectorization factor.
1700 We do this automatically for cost model, since we calculate
1701 cost for every peeling option. */
1702 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1704 poly_uint64 nscalars = (STMT_SLP_TYPE (stmt_info)
1705 ? vf * DR_GROUP_SIZE (stmt_info) : vf);
1706 possible_npeel_number
1707 = vect_get_num_vectors (nscalars, vectype);
1709 /* NPEEL_TMP is 0 when there is no misalignment, but also
1710 allow peeling NELEMENTS. */
1711 if (DR_MISALIGNMENT (dr_info) == 0)
1712 possible_npeel_number++;
1715 /* Save info about DR in the hash table. Also include peeling
1716 amounts according to the explanation above. */
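/* Illustrative example: starting from NPEEL_TMP = 1 with a 16-byte
   TARGET_ALIGN and 4-byte elements, the candidates recorded are
   1, 5, 9, ..., i.e. one additional vector's worth of peeling per entry.  */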
1717 for (j = 0; j < possible_npeel_number; j++)
1719 vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1720 dr_info, npeel_tmp);
1721 npeel_tmp += target_align / dr_size;
1724 one_misalignment_known = true;
1726 else
1728 /* If we don't know any misalignment values, we prefer
1729 peeling for the data-ref that has the maximum number of data-refs
1730 with the same alignment, unless the target prefers to align
1731 stores over loads. */
1732 unsigned same_align_drs
1733 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1734 if (!dr0_info
1735 || same_align_drs_max < same_align_drs)
1737 same_align_drs_max = same_align_drs;
1738 dr0_info = dr_info;
1740 /* For data-refs with the same number of related
1741 accesses prefer the one where the misalign
1742 computation will be invariant in the outermost loop. */
1743 else if (same_align_drs_max == same_align_drs)
1745 struct loop *ivloop0, *ivloop;
1746 ivloop0 = outermost_invariant_loop_for_expr
1747 (loop, DR_BASE_ADDRESS (dr0_info->dr));
1748 ivloop = outermost_invariant_loop_for_expr
1749 (loop, DR_BASE_ADDRESS (dr));
1750 if ((ivloop && !ivloop0)
1751 || (ivloop && ivloop0
1752 && flow_loop_nested_p (ivloop, ivloop0)))
1753 dr0_info = dr_info;
1756 one_misalignment_unknown = true;
1758 /* Check for data refs with unsupportable alignment that
1759 can be peeled. */
1760 if (!supportable_dr_alignment)
1762 one_dr_unsupportable = true;
1763 unsupportable_dr_info = dr_info;
1766 if (!first_store && DR_IS_WRITE (dr))
1767 first_store = dr_info;
1770 else
1772 if (!aligned_access_p (dr_info))
1774 if (dump_enabled_p ())
1775 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1776 "vector alignment may not be reachable\n");
1777 break;
1782 /* Check if we can possibly peel the loop. */
1783 if (!vect_can_advance_ivs_p (loop_vinfo)
1784 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1785 || loop->inner)
1786 do_peeling = false;
1788 struct _vect_peel_extended_info peel_for_known_alignment;
1789 struct _vect_peel_extended_info peel_for_unknown_alignment;
1790 struct _vect_peel_extended_info best_peel;
1792 peel_for_unknown_alignment.inside_cost = INT_MAX;
1793 peel_for_unknown_alignment.outside_cost = INT_MAX;
1794 peel_for_unknown_alignment.peel_info.count = 0;
1796 if (do_peeling
1797 && one_misalignment_unknown)
1799 /* Check if the target requires to prefer stores over loads, i.e., if
1800 misaligned stores are more expensive than misaligned loads (taking
1801 drs with same alignment into account). */
1802 unsigned int load_inside_cost = 0;
1803 unsigned int load_outside_cost = 0;
1804 unsigned int store_inside_cost = 0;
1805 unsigned int store_outside_cost = 0;
1806 unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;
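/* With the misalignment unknown, assume that on average half a vector's
   worth of scalar iterations gets peeled, e.g. 4 when the vectorization
   factor used for costing is 8.  */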
1808 stmt_vector_for_cost dummy;
1809 dummy.create (2);
1810 vect_get_peeling_costs_all_drs (loop_vinfo, dr0_info,
1811 &load_inside_cost,
1812 &load_outside_cost,
1813 &dummy, &dummy, estimated_npeels, true);
1814 dummy.release ();
1816 if (first_store)
1818 dummy.create (2);
1819 vect_get_peeling_costs_all_drs (loop_vinfo, first_store,
1820 &store_inside_cost,
1821 &store_outside_cost,
1822 &dummy, &dummy,
1823 estimated_npeels, true);
1824 dummy.release ();
1826 else
1828 store_inside_cost = INT_MAX;
1829 store_outside_cost = INT_MAX;
1832 if (load_inside_cost > store_inside_cost
1833 || (load_inside_cost == store_inside_cost
1834 && load_outside_cost > store_outside_cost))
1836 dr0_info = first_store;
1837 peel_for_unknown_alignment.inside_cost = store_inside_cost;
1838 peel_for_unknown_alignment.outside_cost = store_outside_cost;
1840 else
1842 peel_for_unknown_alignment.inside_cost = load_inside_cost;
1843 peel_for_unknown_alignment.outside_cost = load_outside_cost;
1846 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1847 prologue_cost_vec.create (2);
1848 epilogue_cost_vec.create (2);
1850 int dummy2;
1851 peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
1852 (loop_vinfo, estimated_npeels, &dummy2,
1853 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1854 &prologue_cost_vec, &epilogue_cost_vec);
1856 prologue_cost_vec.release ();
1857 epilogue_cost_vec.release ();
1859 peel_for_unknown_alignment.peel_info.count = 1
1860 + STMT_VINFO_SAME_ALIGN_REFS (dr0_info->stmt).length ();
1863 peel_for_unknown_alignment.peel_info.npeel = 0;
1864 peel_for_unknown_alignment.peel_info.dr_info = dr0_info;
1866 best_peel = peel_for_unknown_alignment;
1868 peel_for_known_alignment.inside_cost = INT_MAX;
1869 peel_for_known_alignment.outside_cost = INT_MAX;
1870 peel_for_known_alignment.peel_info.count = 0;
1871 peel_for_known_alignment.peel_info.dr_info = NULL;
1873 if (do_peeling && one_misalignment_known)
1875 /* Peeling is possible, but there is no data access that is not supported
1876 unless aligned. So we try to choose the best possible peeling from
1877 the hash table. */
1878 peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
1879 (&peeling_htab, loop_vinfo);
1882 /* Compare costs of peeling for known and unknown alignment. */
1883 if (peel_for_known_alignment.peel_info.dr_info != NULL
1884 && peel_for_unknown_alignment.inside_cost
1885 >= peel_for_known_alignment.inside_cost)
1887 best_peel = peel_for_known_alignment;
1889 /* If the best peeling for known alignment has NPEEL == 0, perform no
1890 peeling at all except if there is an unsupportable dr that we can
1891 align. */
1892 if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
1893 do_peeling = false;
1896 /* If there is an unsupportable data ref, prefer this over all choices so far
1897 since we'd have to discard a chosen peeling unless it happened to
1898 align the unsupportable data ref as well. */
1899 if (one_dr_unsupportable)
1900 dr0_info = unsupportable_dr_info;
1901 else if (do_peeling)
1903 /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
1904 TODO: Use nopeel_outside_cost or get rid of it? */
1905 unsigned nopeel_inside_cost = 0;
1906 unsigned nopeel_outside_cost = 0;
1908 stmt_vector_for_cost dummy;
1909 dummy.create (2);
1910 vect_get_peeling_costs_all_drs (loop_vinfo, NULL, &nopeel_inside_cost,
1911 &nopeel_outside_cost, &dummy, &dummy,
1912 0, false);
1913 dummy.release ();
1915 /* Add epilogue costs. As we do not peel for alignment here, no prologue
1916 costs will be recorded. */
1917 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1918 prologue_cost_vec.create (2);
1919 epilogue_cost_vec.create (2);
1921 int dummy2;
1922 nopeel_outside_cost += vect_get_known_peeling_cost
1923 (loop_vinfo, 0, &dummy2,
1924 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1925 &prologue_cost_vec, &epilogue_cost_vec);
1927 prologue_cost_vec.release ();
1928 epilogue_cost_vec.release ();
1930 npeel = best_peel.peel_info.npeel;
1931 dr0_info = best_peel.peel_info.dr_info;
1933 /* If not peeling at all is no more expensive than the best peeling we
1934 have found so far, don't perform any peeling. */
1935 if (nopeel_inside_cost <= best_peel.inside_cost)
1936 do_peeling = false;
1939 if (do_peeling)
1941 stmt_vec_info stmt_info = dr0_info->stmt;
1942 vectype = STMT_VINFO_VECTYPE (stmt_info);
1944 if (known_alignment_for_access_p (dr0_info))
1946 bool negative = tree_int_cst_compare (DR_STEP (dr0_info->dr),
1947 size_zero_node) < 0;
1948 if (!npeel)
1950 /* Since it's known at compile time, compute the number of
1951 iterations in the peeled loop (the peeling factor) for use in
1952 updating DR_MISALIGNMENT values. The peeling factor is the
1953 vectorization factor minus the misalignment as an element
1954 count. */
1955 mis = (negative
1956 ? DR_MISALIGNMENT (dr0_info)
1957 : -DR_MISALIGNMENT (dr0_info));
1958 unsigned int target_align = DR_TARGET_ALIGNMENT (dr0_info);
1959 npeel = ((mis & (target_align - 1))
1960 / vect_get_scalar_dr_size (dr0_info));
1963 /* For interleaved data access every iteration accesses all the
1964 members of the group, therefore we divide the number of iterations
1965 by the group size. */
1966 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1967 npeel /= DR_GROUP_SIZE (stmt_info);
1969 if (dump_enabled_p ())
1970 dump_printf_loc (MSG_NOTE, vect_location,
1971 "Try peeling by %d\n", npeel);
1974 /* Ensure that all datarefs can be vectorized after the peel. */
1975 if (!vect_peeling_supportable (loop_vinfo, dr0_info, npeel))
1976 do_peeling = false;
1978 /* Check if all datarefs are supportable and log. */
1979 if (do_peeling && known_alignment_for_access_p (dr0_info) && npeel == 0)
1981 opt_result stat = vect_verify_datarefs_alignment (loop_vinfo);
1982 if (!stat)
1983 do_peeling = false;
1984 else
1985 return stat;
1988 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
1989 if (do_peeling)
1991 unsigned max_allowed_peel
1992 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
1993 if (max_allowed_peel != (unsigned)-1)
1995 unsigned max_peel = npeel;
1996 if (max_peel == 0)
1998 unsigned int target_align = DR_TARGET_ALIGNMENT (dr0_info);
1999 max_peel = (target_align
2000 / vect_get_scalar_dr_size (dr0_info) - 1);
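/* Illustrative example: a 16-byte target alignment with 4-byte scalars
   bounds the runtime-decided peel amount by 16/4 - 1 = 3 iterations.  */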
2002 if (max_peel > max_allowed_peel)
2004 do_peeling = false;
2005 if (dump_enabled_p ())
2006 dump_printf_loc (MSG_NOTE, vect_location,
2007 "Disable peeling, max peels reached: %d\n", max_peel);
2012 /* Cost model #2 - if peeling may result in a remaining loop not
2013 iterating enough to be vectorized then do not peel. Since this
2014 is a cost heuristic rather than a correctness decision, use the
2015 most likely runtime value for variable vectorization factors. */
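/* Illustrative example: with an assumed VF of 4 and the peel amount left
   to run time (up to VF - 1 = 3 iterations), at least 4 + 3 = 7 scalar
   iterations must be known to remain, otherwise we do not peel.  */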
2016 if (do_peeling
2017 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2019 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2020 unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
2021 if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
2022 < assumed_vf + max_peel)
2023 do_peeling = false;
2026 if (do_peeling)
2028 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2029 If the misalignment of DR_i is identical to that of dr0 then set
2030 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
2031 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2032 by the peeling factor times the element size of DR_i (MOD the
2033 vectorization factor times the size). Otherwise, the
2034 misalignment of DR_i must be set to unknown. */
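/* Illustrative example (assuming a 16-byte boundary): if dr0 is peeled by
   NPEEL = 1 and DR_i accesses 8-byte elements with a known misalignment
   of 8, its updated misalignment is (8 + 1*8) mod 16 = 0, i.e. DR_i also
   becomes aligned after the peel.  */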
2035 FOR_EACH_VEC_ELT (datarefs, i, dr)
2036 if (dr != dr0_info->dr)
2038 /* Strided accesses perform only component accesses, alignment
2039 is irrelevant for them. */
2040 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2041 stmt_info = dr_info->stmt;
2042 if (STMT_VINFO_STRIDED_P (stmt_info)
2043 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2044 continue;
2046 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
2049 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0_info;
2050 if (npeel)
2051 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2052 else
2053 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2054 = DR_MISALIGNMENT (dr0_info);
2055 SET_DR_MISALIGNMENT (dr0_info, 0);
2056 if (dump_enabled_p ())
2058 dump_printf_loc (MSG_NOTE, vect_location,
2059 "Alignment of access forced using peeling.\n");
2060 dump_printf_loc (MSG_NOTE, vect_location,
2061 "Peeling for alignment will be applied.\n");
2064 /* The inside-loop cost will be accounted for in vectorizable_load
2065 and vectorizable_store correctly with adjusted alignments.
2066 Drop the body_cst_vec on the floor here. */
2067 opt_result stat = vect_verify_datarefs_alignment (loop_vinfo);
2068 gcc_assert (stat);
2069 return stat;
2073 /* (2) Versioning to force alignment. */
2075 /* Try versioning if:
2076 1) optimize loop for speed
2077 2) there is at least one unsupported misaligned data ref with an unknown
2078 misalignment, and
2079 3) all misaligned data refs with a known misalignment are supported, and
2080 4) the number of runtime alignment checks is within reason. */
2082 do_versioning =
2083 optimize_loop_nest_for_speed_p (loop)
2084 && (!loop->inner); /* FORNOW */
2086 if (do_versioning)
2088 FOR_EACH_VEC_ELT (datarefs, i, dr)
2090 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2091 stmt_vec_info stmt_info = dr_info->stmt;
2093 /* For interleaving, only the alignment of the first access
2094 matters. */
2095 if (aligned_access_p (dr_info)
2096 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2097 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info))
2098 continue;
2100 if (STMT_VINFO_STRIDED_P (stmt_info))
2102 /* Strided loads perform only component accesses, alignment is
2103 irrelevant for them. */
2104 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
2105 continue;
2106 do_versioning = false;
2107 break;
2110 supportable_dr_alignment
2111 = vect_supportable_dr_alignment (dr_info, false);
2113 if (!supportable_dr_alignment)
2115 int mask;
2116 tree vectype;
2118 if (known_alignment_for_access_p (dr_info)
2119 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2120 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
2122 do_versioning = false;
2123 break;
2126 vectype = STMT_VINFO_VECTYPE (stmt_info);
2127 gcc_assert (vectype);
2129 /* At present we don't support versioning for alignment
2130 with variable VF, since there's no guarantee that the
2131 VF is a power of two. We could relax this if we added
2132 a way of enforcing a power-of-two size. */
2133 unsigned HOST_WIDE_INT size;
2134 if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
2136 do_versioning = false;
2137 break;
2140 /* The rightmost bits of an aligned address must be zeros.
2141 Construct the mask needed for this test. For example,
2142 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2143 mask must be 15 = 0xf. */
2144 mask = size - 1;
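/* The versioning condition built later (see
   vect_create_cond_for_align_checks) essentially ORs together the
   addresses of all may-misalign references and tests that the result
   ANDed with this mask is zero, which is why a single mask must fit
   every reference.  */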
2146 /* FORNOW: use the same mask to test all potentially unaligned
2147 references in the loop. The vectorizer currently supports
2148 a single vector size, see the reference to
2149 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
2150 vectorization factor is computed. */
2151 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
2152 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
2153 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2154 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (stmt_info);
2158 /* Versioning requires at least one misaligned data reference. */
2159 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2160 do_versioning = false;
2161 else if (!do_versioning)
2162 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2165 if (do_versioning)
2167 vec<stmt_vec_info> may_misalign_stmts
2168 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2169 stmt_vec_info stmt_info;
2171 /* It can now be assumed that the data references in the statements
2172 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2173 of the loop being vectorized. */
2174 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
2176 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
2177 SET_DR_MISALIGNMENT (dr_info, 0);
2178 if (dump_enabled_p ())
2179 dump_printf_loc (MSG_NOTE, vect_location,
2180 "Alignment of access forced using versioning.\n");
2183 if (dump_enabled_p ())
2184 dump_printf_loc (MSG_NOTE, vect_location,
2185 "Versioning for alignment will be applied.\n");
2187 /* Peeling and versioning can't be done together at this time. */
2188 gcc_assert (! (do_peeling && do_versioning));
2190 opt_result stat = vect_verify_datarefs_alignment (loop_vinfo);
2191 gcc_assert (stat);
2192 return stat;
2195 /* This point is reached if neither peeling nor versioning is being done. */
2196 gcc_assert (! (do_peeling || do_versioning));
2198 opt_result stat = vect_verify_datarefs_alignment (loop_vinfo);
2199 return stat;
2203 /* Function vect_find_same_alignment_drs.
2205 Update group and alignment relations in VINFO according to the chosen
2206 vectorization factor. */
2208 static void
2209 vect_find_same_alignment_drs (vec_info *vinfo, data_dependence_relation *ddr)
2211 struct data_reference *dra = DDR_A (ddr);
2212 struct data_reference *drb = DDR_B (ddr);
2213 dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
2214 dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
2215 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
2216 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
2218 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
2219 return;
2221 if (dra == drb)
2222 return;
2224 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
2225 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
2226 return;
2228 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)
2229 || !operand_equal_p (DR_OFFSET (dra), DR_OFFSET (drb), 0)
2230 || !operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2231 return;
2233 /* Two references with distance zero have the same alignment. */
2234 poly_offset_int diff = (wi::to_poly_offset (DR_INIT (dra))
2235 - wi::to_poly_offset (DR_INIT (drb)));
2236 if (maybe_ne (diff, 0))
2238 /* Get the wider of the two alignments. */
2239 unsigned int align_a = (vect_calculate_target_alignment (dr_info_a)
2240 / BITS_PER_UNIT);
2241 unsigned int align_b = (vect_calculate_target_alignment (dr_info_b)
2242 / BITS_PER_UNIT);
2243 unsigned int max_align = MAX (align_a, align_b);
2245 /* Require the gap to be a multiple of the larger vector alignment. */
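/* Illustrative example: with 16-byte target alignments on both
   references, DR_INITs that differ by 32 bytes still share the same
   alignment, whereas a 20-byte gap does not.  */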
2246 if (!multiple_p (diff, max_align))
2247 return;
2250 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2251 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
2252 if (dump_enabled_p ())
2253 dump_printf_loc (MSG_NOTE, vect_location,
2254 "accesses have the same alignment: %T and %T\n",
2255 DR_REF (dra), DR_REF (drb));
2259 /* Function vect_analyze_data_refs_alignment
2261 Analyze the alignment of the data-references in the loop.
2262 Return FALSE if a data reference is found that cannot be vectorized. */
2264 opt_result
2265 vect_analyze_data_refs_alignment (loop_vec_info vinfo)
2267 DUMP_VECT_SCOPE ("vect_analyze_data_refs_alignment");
2269 /* Mark groups of data references with same alignment using
2270 data dependence information. */
2271 vec<ddr_p> ddrs = vinfo->shared->ddrs;
2272 struct data_dependence_relation *ddr;
2273 unsigned int i;
2275 FOR_EACH_VEC_ELT (ddrs, i, ddr)
2276 vect_find_same_alignment_drs (vinfo, ddr);
2278 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
2279 struct data_reference *dr;
2281 vect_record_base_alignments (vinfo);
2282 FOR_EACH_VEC_ELT (datarefs, i, dr)
2284 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
2285 if (STMT_VINFO_VECTORIZABLE (dr_info->stmt))
2286 vect_compute_data_ref_alignment (dr_info);
2289 return opt_result::success ();
2293 /* Analyze alignment of DRs of stmts in NODE. */
2295 static bool
2296 vect_slp_analyze_and_verify_node_alignment (slp_tree node)
2298 /* We vectorize from the first scalar stmt in the node unless
2299 the node is permuted in which case we start from the first
2300 element in the group. */
2301 stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2302 dr_vec_info *first_dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2303 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2304 first_stmt_info = DR_GROUP_FIRST_ELEMENT (first_stmt_info);
2306 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2307 vect_compute_data_ref_alignment (dr_info);
2308 /* For creating the data-ref pointer we need alignment of the
2309 first element anyway. */
2310 if (dr_info != first_dr_info)
2311 vect_compute_data_ref_alignment (first_dr_info);
2312 if (! verify_data_ref_alignment (dr_info))
2314 if (dump_enabled_p ())
2315 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2316 "not vectorized: bad data alignment in basic "
2317 "block.\n");
2318 return false;
2321 return true;
2324 /* Function vect_slp_analyze_instance_alignment
2326 Analyze the alignment of the data-references in the SLP instance.
2327 Return FALSE if a data reference is found that cannot be vectorized. */
2329 bool
2330 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
2332 DUMP_VECT_SCOPE ("vect_slp_analyze_and_verify_instance_alignment");
2334 slp_tree node;
2335 unsigned i;
2336 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2337 if (! vect_slp_analyze_and_verify_node_alignment (node))
2338 return false;
2340 node = SLP_INSTANCE_TREE (instance);
2341 if (STMT_VINFO_DATA_REF (SLP_TREE_SCALAR_STMTS (node)[0])
2342 && ! vect_slp_analyze_and_verify_node_alignment
2343 (SLP_INSTANCE_TREE (instance)))
2344 return false;
2346 return true;
2350 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2351 accesses of legal size, step, etc. Detect gaps, single element
2352 interleaving, and other special cases. Set grouped access info.
2353 Collect groups of strided stores for further use in SLP analysis.
2354 Worker for vect_analyze_group_access. */
2356 static bool
2357 vect_analyze_group_access_1 (dr_vec_info *dr_info)
2359 data_reference *dr = dr_info->dr;
2360 tree step = DR_STEP (dr);
2361 tree scalar_type = TREE_TYPE (DR_REF (dr));
2362 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2363 stmt_vec_info stmt_info = dr_info->stmt;
2364 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2365 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2366 HOST_WIDE_INT dr_step = -1;
2367 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2368 bool slp_impossible = false;
2370 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2371 size of the interleaving group (including gaps). */
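/* Illustrative example: a 4-byte element accessed with a constant
   DR_STEP of 16 gives GROUPSIZE 4, i.e. four element slots (including
   any gaps) covered by the group per scalar iteration.  */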
2372 if (tree_fits_shwi_p (step))
2374 dr_step = tree_to_shwi (step);
2375 /* Check that STEP is a multiple of type size. Otherwise there is
2376 a non-element-sized gap at the end of the group which we
2377 cannot represent in DR_GROUP_GAP or DR_GROUP_SIZE.
2378 ??? As we can handle non-constant step fine here we should
2379 simply remove uses of DR_GROUP_GAP between the last and first
2380 element and instead rely on DR_STEP. DR_GROUP_SIZE then would
2381 simply not include that gap. */
2382 if ((dr_step % type_size) != 0)
2384 if (dump_enabled_p ())
2385 dump_printf_loc (MSG_NOTE, vect_location,
2386 "Step %T is not a multiple of the element size"
2387 " for %T\n",
2388 step, DR_REF (dr));
2389 return false;
2391 groupsize = absu_hwi (dr_step) / type_size;
2393 else
2394 groupsize = 0;
2396 /* A non-consecutive access is possible only if it is part of an interleaving group. */
2397 if (!DR_GROUP_FIRST_ELEMENT (stmt_info))
2399 /* Check if this DR is a part of interleaving, and is a single
2400 element of the group that is accessed in the loop. */
2402 /* Gaps are supported only for loads. STEP must be a multiple of the type
2403 size. */
2404 if (DR_IS_READ (dr)
2405 && (dr_step % type_size) == 0
2406 && groupsize > 0)
2408 DR_GROUP_FIRST_ELEMENT (stmt_info) = stmt_info;
2409 DR_GROUP_SIZE (stmt_info) = groupsize;
2410 DR_GROUP_GAP (stmt_info) = groupsize - 1;
2411 if (dump_enabled_p ())
2412 dump_printf_loc (MSG_NOTE, vect_location,
2413 "Detected single element interleaving %T"
2414 " step %T\n",
2415 DR_REF (dr), step);
2417 return true;
2420 if (dump_enabled_p ())
2421 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2422 "not consecutive access %G", stmt_info->stmt);
2424 if (bb_vinfo)
2426 /* Mark the statement as unvectorizable. */
2427 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
2428 return true;
2431 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2432 STMT_VINFO_STRIDED_P (stmt_info) = true;
2433 return true;
2436 if (DR_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info)
2438 /* First stmt in the interleaving chain. Check the chain. */
2439 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2440 struct data_reference *data_ref = dr;
2441 unsigned int count = 1;
2442 tree prev_init = DR_INIT (data_ref);
2443 stmt_vec_info prev = stmt_info;
2444 HOST_WIDE_INT diff, gaps = 0;
2446 /* By construction, all group members have INTEGER_CST DR_INITs. */
2447 while (next)
2449 /* Skip same data-refs. In case that two or more stmts share
2450 data-ref (supported only for loads), we vectorize only the first
2451 stmt, and the rest get their vectorized loads from the first
2452 one. */
2453 if (!tree_int_cst_compare (DR_INIT (data_ref),
2454 DR_INIT (STMT_VINFO_DATA_REF (next))))
2456 if (DR_IS_WRITE (data_ref))
2458 if (dump_enabled_p ())
2459 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2460 "Two store stmts share the same dr.\n");
2461 return false;
2464 if (dump_enabled_p ())
2465 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2466 "Two or more load stmts share the same dr.\n");
2468 /* For load use the same data-ref load. */
2469 DR_GROUP_SAME_DR_STMT (next) = prev;
2471 prev = next;
2472 next = DR_GROUP_NEXT_ELEMENT (next);
2473 continue;
2476 prev = next;
2477 data_ref = STMT_VINFO_DATA_REF (next);
2479 /* All group members have the same STEP by construction. */
2480 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2482 /* Check that the distance between two accesses is equal to the type
2483 size. Otherwise, we have gaps. */
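/* Illustrative example: group members of 4-byte elements with DR_INITs
   0, 4 and 12 give DIFF values 1 and 2; the second pair leaves a
   one-element gap.  */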
2484 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2485 - TREE_INT_CST_LOW (prev_init)) / type_size;
2486 if (diff != 1)
2488 /* FORNOW: SLP of accesses with gaps is not supported. */
2489 slp_impossible = true;
2490 if (DR_IS_WRITE (data_ref))
2492 if (dump_enabled_p ())
2493 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2494 "interleaved store with gaps\n");
2495 return false;
2498 gaps += diff - 1;
2501 last_accessed_element += diff;
2503 /* Store the gap from the previous member of the group. If there is no
2504 gap in the access, DR_GROUP_GAP is always 1. */
2505 DR_GROUP_GAP (next) = diff;
2507 prev_init = DR_INIT (data_ref);
2508 next = DR_GROUP_NEXT_ELEMENT (next);
2509 /* Count the number of data-refs in the chain. */
2510 count++;
2513 if (groupsize == 0)
2514 groupsize = count + gaps;
2516 /* This could be UINT_MAX but as we are generating code in a very
2517 inefficient way we have to cap earlier. See PR78699 for example. */
2518 if (groupsize > 4096)
2520 if (dump_enabled_p ())
2521 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2522 "group is too large\n");
2523 return false;
2526 /* Check that the size of the interleaving is equal to count for stores,
2527 i.e., that there are no gaps. */
2528 if (groupsize != count
2529 && !DR_IS_READ (dr))
2531 groupsize = count;
2532 STMT_VINFO_STRIDED_P (stmt_info) = true;
2535 /* If there is a gap after the last load in the group it is the
2536 difference between the groupsize and the last accessed
2537 element.
2538 When there is no gap, this difference should be 0. */
2539 DR_GROUP_GAP (stmt_info) = groupsize - last_accessed_element;
2541 DR_GROUP_SIZE (stmt_info) = groupsize;
2542 if (dump_enabled_p ())
2544 dump_printf_loc (MSG_NOTE, vect_location,
2545 "Detected interleaving ");
2546 if (DR_IS_READ (dr))
2547 dump_printf (MSG_NOTE, "load ");
2548 else if (STMT_VINFO_STRIDED_P (stmt_info))
2549 dump_printf (MSG_NOTE, "strided store ");
2550 else
2551 dump_printf (MSG_NOTE, "store ");
2552 dump_printf (MSG_NOTE, "of size %u starting with %G",
2553 (unsigned)groupsize, stmt_info->stmt);
2554 if (DR_GROUP_GAP (stmt_info) != 0)
2555 dump_printf_loc (MSG_NOTE, vect_location,
2556 "There is a gap of %u elements after the group\n",
2557 DR_GROUP_GAP (stmt_info));
2560 /* SLP: create an SLP data structure for every interleaving group of
2561 stores for further analysis in vect_analyze_slp. */
2562 if (DR_IS_WRITE (dr) && !slp_impossible)
2564 if (loop_vinfo)
2565 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt_info);
2566 if (bb_vinfo)
2567 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
2571 return true;
2574 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2575 accesses of legal size, step, etc. Detect gaps, single element
2576 interleaving, and other special cases. Set grouped access info.
2577 Collect groups of strided stores for further use in SLP analysis. */
2579 static bool
2580 vect_analyze_group_access (dr_vec_info *dr_info)
2582 if (!vect_analyze_group_access_1 (dr_info))
2584 /* Dissolve the group if present. */
2585 stmt_vec_info stmt_info = DR_GROUP_FIRST_ELEMENT (dr_info->stmt);
2586 while (stmt_info)
2588 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2589 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2590 DR_GROUP_NEXT_ELEMENT (stmt_info) = NULL;
2591 stmt_info = next;
2593 return false;
2595 return true;
2598 /* Analyze the access pattern of the data-reference DR_INFO.
2599 In case of non-consecutive accesses call vect_analyze_group_access() to
2600 analyze groups of accesses. */
2602 static bool
2603 vect_analyze_data_ref_access (dr_vec_info *dr_info)
2605 data_reference *dr = dr_info->dr;
2606 tree step = DR_STEP (dr);
2607 tree scalar_type = TREE_TYPE (DR_REF (dr));
2608 stmt_vec_info stmt_info = dr_info->stmt;
2609 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2610 struct loop *loop = NULL;
2612 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
2613 return true;
2615 if (loop_vinfo)
2616 loop = LOOP_VINFO_LOOP (loop_vinfo);
2618 if (loop_vinfo && !step)
2620 if (dump_enabled_p ())
2621 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2622 "bad data-ref access in loop\n");
2623 return false;
2626 /* Allow loads with zero step in inner-loop vectorization. */
2627 if (loop_vinfo && integer_zerop (step))
2629 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2630 if (!nested_in_vect_loop_p (loop, stmt_info))
2631 return DR_IS_READ (dr);
2632 /* Allow references with zero step for outer loops marked
2633 with pragma omp simd only - it guarantees absence of
2634 loop-carried dependencies between inner loop iterations. */
2635 if (loop->safelen < 2)
2637 if (dump_enabled_p ())
2638 dump_printf_loc (MSG_NOTE, vect_location,
2639 "zero step in inner loop of nest\n");
2640 return false;
2644 if (loop && nested_in_vect_loop_p (loop, stmt_info))
2646 /* Interleaved accesses are not yet supported within outer-loop
2647 vectorization for references in the inner-loop. */
2648 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2650 /* For the rest of the analysis we use the outer-loop step. */
2651 step = STMT_VINFO_DR_STEP (stmt_info);
2652 if (integer_zerop (step))
2654 if (dump_enabled_p ())
2655 dump_printf_loc (MSG_NOTE, vect_location,
2656 "zero step in outer loop.\n");
2657 return DR_IS_READ (dr);
2661 /* Consecutive? */
2662 if (TREE_CODE (step) == INTEGER_CST)
2664 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2665 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2666 || (dr_step < 0
2667 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2669 /* Mark that it is not interleaving. */
2670 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2671 return true;
2675 if (loop && nested_in_vect_loop_p (loop, stmt_info))
2677 if (dump_enabled_p ())
2678 dump_printf_loc (MSG_NOTE, vect_location,
2679 "grouped access in outer loop.\n");
2680 return false;
2684 /* Assume this is a DR handled by non-constant strided load case. */
2685 if (TREE_CODE (step) != INTEGER_CST)
2686 return (STMT_VINFO_STRIDED_P (stmt_info)
2687 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2688 || vect_analyze_group_access (dr_info)));
2690 /* Not consecutive access - check if it's a part of interleaving group. */
2691 return vect_analyze_group_access (dr_info);
2694 /* Compare two data-references DRA and DRB so that they can be sorted
2695 into chunks suitable for grouping. */
2697 static int
2698 dr_group_sort_cmp (const void *dra_, const void *drb_)
2700 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2701 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2702 int cmp;
2704 /* Stabilize sort. */
2705 if (dra == drb)
2706 return 0;
2708 /* DRs in different loops never belong to the same group. */
2709 loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father;
2710 loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father;
2711 if (loopa != loopb)
2712 return loopa->num < loopb->num ? -1 : 1;
2714 /* Ordering of DRs according to base. */
2715 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2716 DR_BASE_ADDRESS (drb));
2717 if (cmp != 0)
2718 return cmp;
2720 /* And according to DR_OFFSET. */
2721 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2722 if (cmp != 0)
2723 return cmp;
2725 /* Put reads before writes. */
2726 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2727 return DR_IS_READ (dra) ? -1 : 1;
2729 /* Then sort after access size. */
2730 cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2731 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2732 if (cmp != 0)
2733 return cmp;
2735 /* And after step. */
2736 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2737 if (cmp != 0)
2738 return cmp;
2740 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
2741 cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
2742 if (cmp == 0)
2743 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2744 return cmp;
2747 /* If OP is the result of a conversion, return the unconverted value,
2748 otherwise return null. */
2750 static tree
2751 strip_conversion (tree op)
2753 if (TREE_CODE (op) != SSA_NAME)
2754 return NULL_TREE;
2755 gimple *stmt = SSA_NAME_DEF_STMT (op);
2756 if (!is_gimple_assign (stmt)
2757 || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt)))
2758 return NULL_TREE;
2759 return gimple_assign_rhs1 (stmt);
2762 /* Return true if vectorizable_* routines can handle statements STMT1_INFO
2763 and STMT2_INFO being in a single group. */
2765 static bool
2766 can_group_stmts_p (stmt_vec_info stmt1_info, stmt_vec_info stmt2_info)
2768 if (gimple_assign_single_p (stmt1_info->stmt))
2769 return gimple_assign_single_p (stmt2_info->stmt);
2771 gcall *call1 = dyn_cast <gcall *> (stmt1_info->stmt);
2772 if (call1 && gimple_call_internal_p (call1))
2774 /* Check for two masked loads or two masked stores. */
2775 gcall *call2 = dyn_cast <gcall *> (stmt2_info->stmt);
2776 if (!call2 || !gimple_call_internal_p (call2))
2777 return false;
2778 internal_fn ifn = gimple_call_internal_fn (call1);
2779 if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE)
2780 return false;
2781 if (ifn != gimple_call_internal_fn (call2))
2782 return false;
2784 /* Check that the masks are the same. Cope with casts of masks,
2785 like those created by build_mask_conversion. */
2786 tree mask1 = gimple_call_arg (call1, 2);
2787 tree mask2 = gimple_call_arg (call2, 2);
2788 if (!operand_equal_p (mask1, mask2, 0))
2790 mask1 = strip_conversion (mask1);
2791 if (!mask1)
2792 return false;
2793 mask2 = strip_conversion (mask2);
2794 if (!mask2)
2795 return false;
2796 if (!operand_equal_p (mask1, mask2, 0))
2797 return false;
2799 return true;
2802 return false;
2805 /* Function vect_analyze_data_ref_accesses.
2807 Analyze the access pattern of all the data references in the loop.
2809 FORNOW: the only access pattern that is considered vectorizable is a
2810 simple step 1 (consecutive) access.
2812 FORNOW: handle only arrays and pointer accesses. */
2814 opt_result
2815 vect_analyze_data_ref_accesses (vec_info *vinfo)
2817 unsigned int i;
2818 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
2819 struct data_reference *dr;
2821 DUMP_VECT_SCOPE ("vect_analyze_data_ref_accesses");
2823 if (datarefs.is_empty ())
2824 return opt_result::success ();
2826 /* Sort the array of datarefs to make building the interleaving chains
2827 linear. Don't modify the original vector's order, it is needed for
2828 determining what dependencies are reversed. */
2829 vec<data_reference_p> datarefs_copy = datarefs.copy ();
2830 datarefs_copy.qsort (dr_group_sort_cmp);
2832 /* Build the interleaving chains. */
2833 for (i = 0; i < datarefs_copy.length () - 1;)
2835 data_reference_p dra = datarefs_copy[i];
2836 dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
2837 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
2838 stmt_vec_info lastinfo = NULL;
2839 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
2840 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a))
2842 ++i;
2843 continue;
2845 for (i = i + 1; i < datarefs_copy.length (); ++i)
2847 data_reference_p drb = datarefs_copy[i];
2848 dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
2849 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
2850 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b)
2851 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
2852 break;
2854 /* ??? Imperfect sorting (non-compatible types, non-modulo
2855 accesses, same accesses) can lead to a group being artificially
2856 split here as we don't just skip over those. If it really
2857 matters we can push those to a worklist and re-iterate
2858 over them. Then we can just skip ahead to the next DR here. */
2860 /* DRs in a different loop should not be put into the same
2861 interleaving group. */
2862 if (gimple_bb (DR_STMT (dra))->loop_father
2863 != gimple_bb (DR_STMT (drb))->loop_father)
2864 break;
2866 /* Check that the data-refs have the same first location (except init)
2867 and that they are both either store or load (not load and store,
2868 and not a mix of masked and unmasked accesses). */
2869 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2870 || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2871 DR_BASE_ADDRESS (drb)) != 0
2872 || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
2873 || !can_group_stmts_p (stmtinfo_a, stmtinfo_b))
2874 break;
2876 /* Check that the data-refs have the same constant size. */
2877 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2878 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2879 if (!tree_fits_uhwi_p (sza)
2880 || !tree_fits_uhwi_p (szb)
2881 || !tree_int_cst_equal (sza, szb))
2882 break;
2884 /* Check that the data-refs have the same step. */
2885 if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
2886 break;
2888 /* Check the types are compatible.
2889 ??? We don't distinguish this during sorting. */
2890 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2891 TREE_TYPE (DR_REF (drb))))
2892 break;
2894 /* Check that the DR_INITs are compile-time constants. */
2895 if (TREE_CODE (DR_INIT (dra)) != INTEGER_CST
2896 || TREE_CODE (DR_INIT (drb)) != INTEGER_CST)
2897 break;
2899 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
2900 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2901 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2902 HOST_WIDE_INT init_prev
2903 = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]));
2904 gcc_assert (init_a <= init_b
2905 && init_a <= init_prev
2906 && init_prev <= init_b);
2908 /* Do not place the same access in the interleaving chain twice. */
2909 if (init_b == init_prev)
2911 gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]))
2912 < gimple_uid (DR_STMT (drb)));
2913 /* ??? For now we simply "drop" the later reference which is
2914 otherwise the same rather than finishing off this group.
2915 In the end we'd want to re-process duplicates forming
2916 multiple groups from the refs, likely by just collecting
2917 all candidates (including duplicates and split points
2918 below) in a vector and then processing them together. */
2919 continue;
2922 /* If init_b == init_a + the size of the type * k, we have an
2923 interleaving, and DRA is accessed before DRB. */
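/* Illustrative example: two 4-byte accesses with DR_INITs 0 and 8 can
   belong to one group (k = 2), whereas DR_INITs 0 and 6 cannot.  */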
2924 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
2925 if (type_size_a == 0
2926 || (init_b - init_a) % type_size_a != 0)
2927 break;
2929 /* For stores the accesses must be adjacent. This splits
2930 groups into chunks we support (we don't support vectorization
2931 of stores with gaps). */
2932 if (!DR_IS_READ (dra) && init_b - init_prev != type_size_a)
2933 break;
2935 /* If the step (when constant and nonzero) is no greater than the
2936 difference between the data-refs' inits, the second ref starts in a
2937 later iteration; split the group here to keep suitable sizes. */
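/* Illustrative example: with a step of 8 and 4-byte elements at
   DR_INITs 0 and 8, the second access already belongs to the next
   iteration's group, so the chain is split between them.  */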
2938 if (tree_fits_shwi_p (DR_STEP (dra)))
2940 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
2941 if (step != 0 && step <= (init_b - init_a))
2942 break;
2945 if (dump_enabled_p ())
2946 dump_printf_loc (MSG_NOTE, vect_location,
2947 DR_IS_READ (dra)
2948 ? "Detected interleaving load %T and %T\n"
2949 : "Detected interleaving store %T and %T\n",
2950 DR_REF (dra), DR_REF (drb));
2952 /* Link the found element into the group list. */
2953 if (!DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
2955 DR_GROUP_FIRST_ELEMENT (stmtinfo_a) = stmtinfo_a;
2956 lastinfo = stmtinfo_a;
2958 DR_GROUP_FIRST_ELEMENT (stmtinfo_b) = stmtinfo_a;
2959 DR_GROUP_NEXT_ELEMENT (lastinfo) = stmtinfo_b;
2960 lastinfo = stmtinfo_b;
2964 FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
2966 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
2967 if (STMT_VINFO_VECTORIZABLE (dr_info->stmt)
2968 && !vect_analyze_data_ref_access (dr_info))
2970 if (dump_enabled_p ())
2971 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2972 "not vectorized: complicated access pattern.\n");
2974 if (is_a <bb_vec_info> (vinfo))
2976 /* Mark the statement as not vectorizable. */
2977 STMT_VINFO_VECTORIZABLE (dr_info->stmt) = false;
2978 continue;
2980 else
2982 datarefs_copy.release ();
2983 return opt_result::failure_at (dr_info->stmt->stmt,
2984 "not vectorized:"
2985 " complicated access pattern.\n");
2990 datarefs_copy.release ();
2991 return opt_result::success ();
2994 /* Function vect_vfa_segment_size.
2996 Input:
2997 DR_INFO: The data reference.
2998 LENGTH_FACTOR: segment length to consider.
3000 Return a value suitable for the dr_with_seg_len::seg_len field.
3001 This is the "distance travelled" by the pointer from the first
3002 iteration in the segment to the last. Note that it does not include
3003 the size of the access; in effect it only describes the first byte. */
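/* Illustrative example: with a DR_STEP of 4 bytes and LENGTH_FACTOR
   equal to a vectorization factor of 8, the segment length is
   4 * (8 - 1) = 28 bytes.  */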
3005 static tree
3006 vect_vfa_segment_size (dr_vec_info *dr_info, tree length_factor)
3008 length_factor = size_binop (MINUS_EXPR,
3009 fold_convert (sizetype, length_factor),
3010 size_one_node);
3011 return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr_info->dr)),
3012 length_factor);
3015 /* Return a value that, when added to abs (vect_vfa_segment_size (DR_INFO)),
3016 gives the worst-case number of bytes covered by the segment. */
3018 static unsigned HOST_WIDE_INT
3019 vect_vfa_access_size (dr_vec_info *dr_info)
3021 stmt_vec_info stmt_vinfo = dr_info->stmt;
3022 tree ref_type = TREE_TYPE (DR_REF (dr_info->dr));
3023 unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type));
3024 unsigned HOST_WIDE_INT access_size = ref_size;
3025 if (DR_GROUP_FIRST_ELEMENT (stmt_vinfo))
3027 gcc_assert (DR_GROUP_FIRST_ELEMENT (stmt_vinfo) == stmt_vinfo);
3028 access_size *= DR_GROUP_SIZE (stmt_vinfo) - DR_GROUP_GAP (stmt_vinfo);
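/* Illustrative example: a 4-byte reference leading a group of size 4
   with a gap of 1 yields an access size of 4 * (4 - 1) = 12 bytes.  */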
3030 if (STMT_VINFO_VEC_STMT (stmt_vinfo)
3031 && (vect_supportable_dr_alignment (dr_info, false)
3032 == dr_explicit_realign_optimized))
3034 /* We might access a full vector's worth. */
3035 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
3036 access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size;
3038 return access_size;
3041 /* Get the minimum alignment for all the scalar accesses that DR_INFO
3042 describes. */
3044 static unsigned int
3045 vect_vfa_align (dr_vec_info *dr_info)
3047 return TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr_info->dr)));
3050 /* Function vect_compile_time_alias.
3052 Given data references A and B with equal base and offset, see whether
3053 the alias relation can be decided at compilation time. Return 1 if
3054 it can and the references alias, 0 if it can and the references do
3055 not alias, and -1 if we cannot decide at compile time. SEGMENT_LENGTH_A,
3056 SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
3057 of dr_with_seg_len::{seg_len,access_size} for A and B. */
3059 static int
3060 vect_compile_time_alias (dr_vec_info *a, dr_vec_info *b,
3061 tree segment_length_a, tree segment_length_b,
3062 unsigned HOST_WIDE_INT access_size_a,
3063 unsigned HOST_WIDE_INT access_size_b)
3065 poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a->dr));
3066 poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b->dr));
3067 poly_uint64 const_length_a;
3068 poly_uint64 const_length_b;
3070 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3071 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3072 [a, a+12) */
3073 if (tree_int_cst_compare (DR_STEP (a->dr), size_zero_node) < 0)
3075 const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
3076 offset_a = (offset_a + access_size_a) - const_length_a;
3078 else
3079 const_length_a = tree_to_poly_uint64 (segment_length_a);
3080 if (tree_int_cst_compare (DR_STEP (b->dr), size_zero_node) < 0)
3082 const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
3083 offset_b = (offset_b + access_size_b) - const_length_b;
3085 else
3086 const_length_b = tree_to_poly_uint64 (segment_length_b);
3088 const_length_a += access_size_a;
3089 const_length_b += access_size_b;
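/* Illustrative example: byte ranges [0, 16) and [32, 48) are known not
   to overlap (return 0), while [0, 16) and [8, 24) are known to overlap
   (return 1).  */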
3091 if (ranges_known_overlap_p (offset_a, const_length_a,
3092 offset_b, const_length_b))
3093 return 1;
3095 if (!ranges_maybe_overlap_p (offset_a, const_length_a,
3096 offset_b, const_length_b))
3097 return 0;
3099 return -1;
3102 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3103 in DDR is >= VF. */
3105 static bool
3106 dependence_distance_ge_vf (data_dependence_relation *ddr,
3107 unsigned int loop_depth, poly_uint64 vf)
3109 if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3110 || DDR_NUM_DIST_VECTS (ddr) == 0)
3111 return false;
3113 /* If the dependence is exact, we should have limited the VF instead. */
3114 gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3116 unsigned int i;
3117 lambda_vector dist_v;
3118 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3120 HOST_WIDE_INT dist = dist_v[loop_depth];
3121 if (dist != 0
3122 && !(dist > 0 && DDR_REVERSED_P (ddr))
3123 && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
3124 return false;
3127 if (dump_enabled_p ())
3128 dump_printf_loc (MSG_NOTE, vect_location,
3129 "dependence distance between %T and %T is >= VF\n",
3130 DR_REF (DDR_A (ddr)), DR_REF (DDR_B (ddr)));
3132 return true;
3135 /* Dump LOWER_BOUND using flags DUMP_KIND. Dumps are known to be enabled. */
3137 static void
3138 dump_lower_bound (dump_flags_t dump_kind, const vec_lower_bound &lower_bound)
3140 dump_printf (dump_kind, "%s (%T) >= ",
3141 lower_bound.unsigned_p ? "unsigned" : "abs",
3142 lower_bound.expr);
3143 dump_dec (dump_kind, lower_bound.min_value);
3146 /* Record that the vectorized loop requires the vec_lower_bound described
3147 by EXPR, UNSIGNED_P and MIN_VALUE. */
3149 static void
3150 vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p,
3151 poly_uint64 min_value)
3153 vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3154 for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3155 if (operand_equal_p (lower_bounds[i].expr, expr, 0))
3157 unsigned_p &= lower_bounds[i].unsigned_p;
3158 min_value = upper_bound (lower_bounds[i].min_value, min_value);
3159 if (lower_bounds[i].unsigned_p != unsigned_p
3160 || maybe_lt (lower_bounds[i].min_value, min_value))
3162 lower_bounds[i].unsigned_p = unsigned_p;
3163 lower_bounds[i].min_value = min_value;
3164 if (dump_enabled_p ())
3166 dump_printf_loc (MSG_NOTE, vect_location,
3167 "updating run-time check to ");
3168 dump_lower_bound (MSG_NOTE, lower_bounds[i]);
3169 dump_printf (MSG_NOTE, "\n");
3172 return;
3175 vec_lower_bound lower_bound (expr, unsigned_p, min_value);
3176 if (dump_enabled_p ())
3178 dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that ");
3179 dump_lower_bound (MSG_NOTE, lower_bound);
3180 dump_printf (MSG_NOTE, "\n");
3182 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (lower_bound);
3185 /* Return true if it's unlikely that the step of the vectorized form of DR_INFO
3186 will span fewer than GAP bytes. */
3188 static bool
3189 vect_small_gap_p (loop_vec_info loop_vinfo, dr_vec_info *dr_info,
3190 poly_int64 gap)
3192 stmt_vec_info stmt_info = dr_info->stmt;
3193 HOST_WIDE_INT count
3194 = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
3195 if (DR_GROUP_FIRST_ELEMENT (stmt_info))
3196 count *= DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info));
3197 return (estimated_poly_value (gap)
3198 <= count * vect_get_scalar_dr_size (dr_info));
3201 /* Return true if we know that there is no alias between DR_INFO_A and
3202 DR_INFO_B when abs (DR_STEP (DR_INFO_A->dr)) >= N for some N.
3203 When returning true, set *LOWER_BOUND_OUT to this N. */
3205 static bool
3206 vectorizable_with_step_bound_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b,
3207 poly_uint64 *lower_bound_out)
3209 /* Check that there is a constant gap of known sign between DR_A
3210 and DR_B. */
3211 data_reference *dr_a = dr_info_a->dr;
3212 data_reference *dr_b = dr_info_b->dr;
3213 poly_int64 init_a, init_b;
3214 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0)
3215 || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0)
3216 || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)
3217 || !poly_int_tree_p (DR_INIT (dr_a), &init_a)
3218 || !poly_int_tree_p (DR_INIT (dr_b), &init_b)
3219 || !ordered_p (init_a, init_b))
3220 return false;
3222 /* Sort DR_A and DR_B by the address they access. */
3223 if (maybe_lt (init_b, init_a))
3225 std::swap (init_a, init_b);
3226 std::swap (dr_info_a, dr_info_b);
3227 std::swap (dr_a, dr_b);
3230 /* If the two accesses could be dependent within a scalar iteration,
3231 make sure that we'd retain their order. */
3232 if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_info_a), init_b)
3233 && !vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
3234 return false;
3236 /* There is no alias if abs (DR_STEP) is greater than or equal to
3237 the bytes spanned by the combination of the two accesses. */
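/* Illustrative example: two 4-byte references whose DR_INITs differ by
   8 bytes span 8 + 4 = 12 bytes, so any abs (DR_STEP) >= 12 rules out
   an alias at run time.  */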
3238 *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_info_b) - init_a;
3239 return true;
3242 /* Function vect_prune_runtime_alias_test_list.
3244 Prune a list of ddrs to be tested at run-time by versioning for alias.
3245 Merge several alias checks into one if possible.
3246 Return FALSE if the resulting list of ddrs is longer than allowed by
3247 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
3249 opt_result
3250 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3252 typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3253 hash_set <tree_pair_hash> compared_objects;
3255 vec<ddr_p> may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3256 vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3257 = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3258 vec<vec_object_pair> &check_unequal_addrs
3259 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3260 poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3261 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3263 ddr_p ddr;
3264 unsigned int i;
3265 tree length_factor;
3267 DUMP_VECT_SCOPE ("vect_prune_runtime_alias_test_list");
3269 /* Step values are irrelevant for aliasing if the number of vector
3270 iterations is equal to the number of scalar iterations (which can
3271 happen for fully-SLP loops). */
3272 bool ignore_step_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U);
3274 if (!ignore_step_p)
3276 /* Convert the checks for nonzero steps into bound tests. */
3277 tree value;
3278 FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value)
3279 vect_check_lower_bound (loop_vinfo, value, true, 1);
3282 if (may_alias_ddrs.is_empty ())
3283 return opt_result::success ();
3285 comp_alias_ddrs.create (may_alias_ddrs.length ());
3287 unsigned int loop_depth
3288 = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3289 LOOP_VINFO_LOOP_NEST (loop_vinfo));
3291 /* First, we collect all data ref pairs for aliasing checks. */
3292 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3294 int comp_res;
3295 poly_uint64 lower_bound;
3296 tree segment_length_a, segment_length_b;
3297 unsigned HOST_WIDE_INT access_size_a, access_size_b;
3298 unsigned int align_a, align_b;
3300 /* Ignore the alias if the VF we chose ended up being no greater
3301 than the dependence distance. */
3302 if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3303 continue;
3305 if (DDR_OBJECT_A (ddr))
3307 vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3308 if (!compared_objects.add (new_pair))
3310 if (dump_enabled_p ())
3311 dump_printf_loc (MSG_NOTE, vect_location,
3312 "checking that %T and %T"
3313 " have different addresses\n",
3314 new_pair.first, new_pair.second);
3315 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3317 continue;
3320 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
3321 stmt_vec_info stmt_info_a = dr_info_a->stmt;
3323 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
3324 stmt_vec_info stmt_info_b = dr_info_b->stmt;
3326 /* Skip the pair if inter-iteration dependencies are irrelevant
3327 and intra-iteration dependencies are guaranteed to be honored. */
3328 if (ignore_step_p
3329 && (vect_preserves_scalar_order_p (dr_info_a, dr_info_b)
3330 || vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3331 &lower_bound)))
3333 if (dump_enabled_p ())
3334 dump_printf_loc (MSG_NOTE, vect_location,
3335 "no need for alias check between "
3336 "%T and %T when VF is 1\n",
3337 DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
3338 continue;
3341 /* See whether we can handle the alias using a bounds check on
3342 the step, and whether that's likely to be the best approach.
3343 (It might not be, for example, if the minimum step is much larger
3344 than the number of bytes handled by one vector iteration.) */
3345 if (!ignore_step_p
3346 && TREE_CODE (DR_STEP (dr_info_a->dr)) != INTEGER_CST
3347 && vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3348 &lower_bound)
3349 && (vect_small_gap_p (loop_vinfo, dr_info_a, lower_bound)
3350 || vect_small_gap_p (loop_vinfo, dr_info_b, lower_bound)))
3352 bool unsigned_p = dr_known_forward_stride_p (dr_info_a->dr);
3353 if (dump_enabled_p ())
3355 dump_printf_loc (MSG_NOTE, vect_location, "no alias between "
3356 "%T and %T when the step %T is outside ",
3357 DR_REF (dr_info_a->dr),
3358 DR_REF (dr_info_b->dr),
3359 DR_STEP (dr_info_a->dr));
3360 if (unsigned_p)
3361 dump_printf (MSG_NOTE, "[0");
3362 else
3364 dump_printf (MSG_NOTE, "(");
3365 dump_dec (MSG_NOTE, poly_int64 (-lower_bound));
3367 dump_printf (MSG_NOTE, ", ");
3368 dump_dec (MSG_NOTE, lower_bound);
3369 dump_printf (MSG_NOTE, ")\n");
3371 vect_check_lower_bound (loop_vinfo, DR_STEP (dr_info_a->dr),
3372 unsigned_p, lower_bound);
3373 continue;
3376 stmt_vec_info dr_group_first_a = DR_GROUP_FIRST_ELEMENT (stmt_info_a);
3377 if (dr_group_first_a)
3379 stmt_info_a = dr_group_first_a;
3380 dr_info_a = STMT_VINFO_DR_INFO (stmt_info_a);
3383 stmt_vec_info dr_group_first_b = DR_GROUP_FIRST_ELEMENT (stmt_info_b);
3384 if (dr_group_first_b)
3386 stmt_info_b = dr_group_first_b;
3387 dr_info_b = STMT_VINFO_DR_INFO (stmt_info_b);
3390 if (ignore_step_p)
3392 segment_length_a = size_zero_node;
3393 segment_length_b = size_zero_node;
3395 else
3397 if (!operand_equal_p (DR_STEP (dr_info_a->dr),
3398 DR_STEP (dr_info_b->dr), 0))
3399 length_factor = scalar_loop_iters;
3400 else
3401 length_factor = size_int (vect_factor);
3402 segment_length_a = vect_vfa_segment_size (dr_info_a, length_factor);
3403 segment_length_b = vect_vfa_segment_size (dr_info_b, length_factor);
3405 access_size_a = vect_vfa_access_size (dr_info_a);
3406 access_size_b = vect_vfa_access_size (dr_info_b);
3407 align_a = vect_vfa_align (dr_info_a);
3408 align_b = vect_vfa_align (dr_info_b);
3410 comp_res = data_ref_compare_tree (DR_BASE_ADDRESS (dr_info_a->dr),
3411 DR_BASE_ADDRESS (dr_info_b->dr));
3412 if (comp_res == 0)
3413 comp_res = data_ref_compare_tree (DR_OFFSET (dr_info_a->dr),
3414 DR_OFFSET (dr_info_b->dr));
3416 /* See whether the alias is known at compilation time. */
3417 if (comp_res == 0
3418 && TREE_CODE (DR_STEP (dr_info_a->dr)) == INTEGER_CST
3419 && TREE_CODE (DR_STEP (dr_info_b->dr)) == INTEGER_CST
3420 && poly_int_tree_p (segment_length_a)
3421 && poly_int_tree_p (segment_length_b))
3423 int res = vect_compile_time_alias (dr_info_a, dr_info_b,
3424 segment_length_a,
3425 segment_length_b,
3426 access_size_a,
3427 access_size_b);
3428 if (res >= 0 && dump_enabled_p ())
3430 dump_printf_loc (MSG_NOTE, vect_location,
3431 "can tell at compile time that %T and %T",
3432 DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
3433 if (res == 0)
3434 dump_printf (MSG_NOTE, " do not alias\n");
3435 else
3436 dump_printf (MSG_NOTE, " alias\n");
3439 if (res == 0)
3440 continue;
3442 if (res == 1)
3443 return opt_result::failure_at (stmt_info_b->stmt,
3444 "not vectorized:"
3445 " compilation time alias: %G%G",
3446 stmt_info_a->stmt,
3447 stmt_info_b->stmt);
3450 dr_with_seg_len_pair_t dr_with_seg_len_pair
3451 (dr_with_seg_len (dr_info_a->dr, segment_length_a,
3452 access_size_a, align_a),
3453 dr_with_seg_len (dr_info_b->dr, segment_length_b,
3454 access_size_b, align_b));
3456 /* Canonicalize pairs by sorting the two DR members. */
3457 if (comp_res > 0)
3458 std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
3460 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
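/* Try to reduce the number of runtime checks, e.g. by merging checks
   against neighbouring parts of the same objects, before comparing the
   count against the versioning limit below.  */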
3463 prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);
3465 unsigned int count = (comp_alias_ddrs.length ()
3466 + check_unequal_addrs.length ());
3468 dump_printf_loc (MSG_NOTE, vect_location,
3469 "improved number of alias checks from %d to %d\n",
3470 may_alias_ddrs.length (), count);
3471 if ((int) count > PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
3472 return opt_result::failure_at
3473 (vect_location,
3474 "number of versioning for alias "
3475 "run-time tests exceeds %d "
3476 "(--param vect-max-version-for-alias-checks)\n",
3477 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
3479 return opt_result::success ();
3482 /* Check whether we can use an internal function for a gather load
3483 or scatter store. READ_P is true for loads and false for stores.
3484 MASKED_P is true if the load or store is conditional. MEMORY_TYPE is
3485 the type of the memory elements being loaded or stored. OFFSET_BITS
3486 is the number of bits in each scalar offset and OFFSET_SIGN is the
3487 sign of the offset. SCALE is the amount by which the offset should
3488 be multiplied *after* it has been converted to address width.
3490 Return true if the function is supported, storing the function
3491 id in *IFN_OUT and the type of a vector element in *ELEMENT_TYPE_OUT. */
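/* For example (illustrative only, not from the original comment): a
   gather of 32-bit elements indexed by 32-bit signed offsets with a
   scale of 4 would query support for IFN_GATHER_LOAD (or
   IFN_MASK_GATHER_LOAD when MASKED_P) with OFFSET_BITS == 32 and
   SCALE == 4.  */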
3493 bool
3494 vect_gather_scatter_fn_p (bool read_p, bool masked_p, tree vectype,
3495 tree memory_type, unsigned int offset_bits,
3496 signop offset_sign, int scale,
3497 internal_fn *ifn_out, tree *element_type_out)
3499 unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
3500 unsigned int element_bits = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (vectype)));
3501 if (offset_bits > element_bits)
3502 /* Internal functions require the offset to be the same width as
3503 the vector elements. We can extend narrower offsets, but it isn't
3504 safe to truncate wider offsets. */
3505 return false;
3507 if (element_bits != memory_bits)
3508 /* For now the vector elements must be the same width as the
3509 memory elements. */
3510 return false;
3512 /* Work out which function we need. */
3513 internal_fn ifn;
3514 if (read_p)
3515 ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
3516 else
3517 ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
3519 /* Test whether the target supports this combination. */
3520 if (!internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
3521 offset_sign, scale))
3522 return false;
3524 *ifn_out = ifn;
3525 *element_type_out = TREE_TYPE (vectype);
3526 return true;
3529 /* STMT_INFO is a call to an internal gather load or scatter store function.
3530 Describe the operation in INFO. */
3532 static void
3533 vect_describe_gather_scatter_call (stmt_vec_info stmt_info,
3534 gather_scatter_info *info)
3536 gcall *call = as_a <gcall *> (stmt_info->stmt);
3537 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3538 data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3540 info->ifn = gimple_call_internal_fn (call);
3541 info->decl = NULL_TREE;
3542 info->base = gimple_call_arg (call, 0);
3543 info->offset = gimple_call_arg (call, 1);
3544 info->offset_dt = vect_unknown_def_type;
3545 info->offset_vectype = NULL_TREE;
3546 info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
3547 info->element_type = TREE_TYPE (vectype);
3548 info->memory_type = TREE_TYPE (DR_REF (dr));
3551 /* Return true if a non-affine read or write in STMT_INFO is suitable for a
3552 gather load or scatter store. Describe the operation in *INFO if so. */
3554 bool
3555 vect_check_gather_scatter (stmt_vec_info stmt_info, loop_vec_info loop_vinfo,
3556 gather_scatter_info *info)
3558 HOST_WIDE_INT scale = 1;
3559 poly_int64 pbitpos, pbitsize;
3560 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3561 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3562 tree offtype = NULL_TREE;
3563 tree decl = NULL_TREE, base, off;
3564 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3565 tree memory_type = TREE_TYPE (DR_REF (dr));
3566 machine_mode pmode;
3567 int punsignedp, reversep, pvolatilep = 0;
3568 internal_fn ifn;
3569 tree element_type;
3570 bool masked_p = false;
3572 /* See whether this is already a call to a gather/scatter internal function.
3573 If not, see whether it's a masked load or store. */
3574 gcall *call = dyn_cast <gcall *> (stmt_info->stmt);
3575 if (call && gimple_call_internal_p (call))
3577 ifn = gimple_call_internal_fn (call);
3578 if (internal_gather_scatter_fn_p (ifn))
3580 vect_describe_gather_scatter_call (stmt_info, info);
3581 return true;
3583 masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE);
3586 /* True if we should aim to use internal functions rather than
3587 built-in functions. */
3588 bool use_ifn_p = (DR_IS_READ (dr)
3589 ? supports_vec_gather_load_p ()
3590 : supports_vec_scatter_store_p ());
3592 base = DR_REF (dr);
3593 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3594 see if we can use the def stmt of the address. */
3595 if (masked_p
3596 && TREE_CODE (base) == MEM_REF
3597 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3598 && integer_zerop (TREE_OPERAND (base, 1))
3599 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3601 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3602 if (is_gimple_assign (def_stmt)
3603 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3604 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3607 /* The gather and scatter builtins need address of the form
3608 loop_invariant + vector * {1, 2, 4, 8}
3609 or
3610 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3611 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3612 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3613 multiplications and additions in it. To get a vector, we need
3614 a single SSA_NAME that will be defined in the loop and will
3615 contain everything that is not loop invariant and that can be
3616 vectorized. The following code attempts to find such a preexisting
3617 SSA_NAME OFF and put the loop invariants into a tree BASE
3618 that can be gimplified before the loop. */
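/* Illustrative example (an assumed typical case, not part of the
   original comment): for a loop such as

     for (i = 0; i < n; i++)
       res += p[idx[i]];

   the code below would typically end up with OFF being the SSA_NAME
   holding idx[i] (possibly after stripping a widening conversion),
   SCALE the element size peeled off a multiplication, and BASE the
   loop-invariant pointer P plus any constant byte offset.  */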
3619 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3620 &punsignedp, &reversep, &pvolatilep);
3621 if (reversep)
3622 return false;
3624 poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
3626 if (TREE_CODE (base) == MEM_REF)
3628 if (!integer_zerop (TREE_OPERAND (base, 1)))
3630 if (off == NULL_TREE)
3631 off = wide_int_to_tree (sizetype, mem_ref_offset (base));
3632 else
3633 off = size_binop (PLUS_EXPR, off,
3634 fold_convert (sizetype, TREE_OPERAND (base, 1)));
3636 base = TREE_OPERAND (base, 0);
3638 else
3639 base = build_fold_addr_expr (base);
3641 if (off == NULL_TREE)
3642 off = size_zero_node;
3644 /* If BASE is not loop invariant: when OFF is 0, start with just the
3645 constant offset in the loop-invariant BASE and continue with BASE
3646 as OFF; otherwise give up.
3647 We could handle the latter case by gimplifying the addition of BASE + OFF
3648 into some SSA_NAME and using that as OFF, but for now punt. */
3649 if (!expr_invariant_in_loop_p (loop, base))
3651 if (!integer_zerop (off))
3652 return false;
3653 off = base;
3654 base = size_int (pbytepos);
3656 /* Otherwise put base + constant offset into the loop invariant BASE
3657 and continue with OFF. */
3658 else
3660 base = fold_convert (sizetype, base);
3661 base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
3664 /* OFF at this point may be either a SSA_NAME or some tree expression
3665 from get_inner_reference. Try to peel off loop invariants from it
3666 into BASE as long as possible. */
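/* Each round of the loop below inspects the operation defining OFF:
   loop-invariant addends are folded into BASE, a constant multiplier
   becomes SCALE, conversions that keep the precision are stripped, and
   a widening conversion ends the walk by recording the narrower type
   in OFFTYPE.  Anything else stops the peeling.  */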
3667 STRIP_NOPS (off);
3668 while (offtype == NULL_TREE)
3670 enum tree_code code;
3671 tree op0, op1, add = NULL_TREE;
3673 if (TREE_CODE (off) == SSA_NAME)
3675 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3677 if (expr_invariant_in_loop_p (loop, off))
3678 return false;
3680 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3681 break;
3683 op0 = gimple_assign_rhs1 (def_stmt);
3684 code = gimple_assign_rhs_code (def_stmt);
3685 op1 = gimple_assign_rhs2 (def_stmt);
3687 else
3689 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3690 return false;
3691 code = TREE_CODE (off);
3692 extract_ops_from_tree (off, &code, &op0, &op1);
3694 switch (code)
3696 case POINTER_PLUS_EXPR:
3697 case PLUS_EXPR:
3698 if (expr_invariant_in_loop_p (loop, op0))
3700 add = op0;
3701 off = op1;
3702 do_add:
3703 add = fold_convert (sizetype, add);
3704 if (scale != 1)
3705 add = size_binop (MULT_EXPR, add, size_int (scale));
3706 base = size_binop (PLUS_EXPR, base, add);
3707 continue;
3709 if (expr_invariant_in_loop_p (loop, op1))
3711 add = op1;
3712 off = op0;
3713 goto do_add;
3715 break;
3716 case MINUS_EXPR:
3717 if (expr_invariant_in_loop_p (loop, op1))
3719 add = fold_convert (sizetype, op1);
3720 add = size_binop (MINUS_EXPR, size_zero_node, add);
3721 off = op0;
3722 goto do_add;
3724 break;
3725 case MULT_EXPR:
3726 if (scale == 1 && tree_fits_shwi_p (op1))
3728 int new_scale = tree_to_shwi (op1);
3729 /* Only treat this as a scaling operation if the target
3730 supports it. */
3731 if (use_ifn_p
3732 && !vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p,
3733 vectype, memory_type, 1,
3734 TYPE_SIGN (TREE_TYPE (op0)),
3735 new_scale, &ifn,
3736 &element_type))
3737 break;
3738 scale = new_scale;
3739 off = op0;
3740 continue;
3742 break;
3743 case SSA_NAME:
3744 off = op0;
3745 continue;
3746 CASE_CONVERT:
3747 if (!POINTER_TYPE_P (TREE_TYPE (op0))
3748 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3749 break;
3750 if (TYPE_PRECISION (TREE_TYPE (op0))
3751 == TYPE_PRECISION (TREE_TYPE (off)))
3753 off = op0;
3754 continue;
3757 /* The internal functions need the offset to be the same width
3758 as the elements of VECTYPE. Don't include operations that
3759 cast the offset from that width to a different width. */
3760 if (use_ifn_p
3761 && (int_size_in_bytes (TREE_TYPE (vectype))
3762 == int_size_in_bytes (TREE_TYPE (off))))
3763 break;
3765 if (TYPE_PRECISION (TREE_TYPE (op0))
3766 < TYPE_PRECISION (TREE_TYPE (off)))
3768 off = op0;
3769 offtype = TREE_TYPE (off);
3770 STRIP_NOPS (off);
3771 continue;
3773 break;
3774 default:
3775 break;
3777 break;
3780 /* If at the end OFF still isn't a SSA_NAME or isn't
3781 defined in the loop, punt. */
3782 if (TREE_CODE (off) != SSA_NAME
3783 || expr_invariant_in_loop_p (loop, off))
3784 return false;
3786 if (offtype == NULL_TREE)
3787 offtype = TREE_TYPE (off);
3789 if (use_ifn_p)
3791 if (!vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p, vectype,
3792 memory_type, TYPE_PRECISION (offtype),
3793 TYPE_SIGN (offtype), scale, &ifn,
3794 &element_type))
3795 return false;
3797 else
3799 if (DR_IS_READ (dr))
3801 if (targetm.vectorize.builtin_gather)
3802 decl = targetm.vectorize.builtin_gather (vectype, offtype, scale);
3804 else
3806 if (targetm.vectorize.builtin_scatter)
3807 decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale);
3810 if (!decl)
3811 return false;
3813 ifn = IFN_LAST;
3814 element_type = TREE_TYPE (vectype);
3817 info->ifn = ifn;
3818 info->decl = decl;
3819 info->base = base;
3820 info->offset = off;
3821 info->offset_dt = vect_unknown_def_type;
3822 info->offset_vectype = NULL_TREE;
3823 info->scale = scale;
3824 info->element_type = element_type;
3825 info->memory_type = memory_type;
3826 return true;
3829 /* Find the data references in STMT, analyze them with respect to LOOP and
3830 append them to DATAREFS. Return an opt_result failure if the datarefs
3831 in this stmt cannot be handled. */
3833 opt_result
3834 vect_find_stmt_data_reference (loop_p loop, gimple *stmt,
3835 vec<data_reference_p> *datarefs)
3837 /* We can ignore clobbers for dataref analysis - they are removed during
3838 loop vectorization and BB vectorization checks dependences with a
3839 stmt walk. */
3840 if (gimple_clobber_p (stmt))
3841 return opt_result::success ();
3843 if (gimple_has_volatile_ops (stmt))
3844 return opt_result::failure_at (stmt, "not vectorized: volatile type: %G",
3845 stmt);
3847 if (stmt_can_throw_internal (stmt))
3848 return opt_result::failure_at (stmt,
3849 "not vectorized:"
3850 " statement can throw an exception: %G",
3851 stmt);
3853 auto_vec<data_reference_p, 2> refs;
3854 opt_result res = find_data_references_in_stmt (loop, stmt, &refs);
3855 if (!res)
3856 return res;
3858 if (refs.is_empty ())
3859 return opt_result::success ();
3861 if (refs.length () > 1)
3862 return opt_result::failure_at (stmt,
3863 "not vectorized:"
3864 " more than one data ref in stmt: %G", stmt);
3866 if (gcall *call = dyn_cast <gcall *> (stmt))
3867 if (!gimple_call_internal_p (call)
3868 || (gimple_call_internal_fn (call) != IFN_MASK_LOAD
3869 && gimple_call_internal_fn (call) != IFN_MASK_STORE))
3870 return opt_result::failure_at (stmt,
3871 "not vectorized: dr in a call %G", stmt);
3873 data_reference_p dr = refs.pop ();
3874 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
3875 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
3876 return opt_result::failure_at (stmt,
3877 "not vectorized:"
3878 " statement is bitfield access %G", stmt);
3880 if (DR_BASE_ADDRESS (dr)
3881 && TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
3882 return opt_result::failure_at (stmt,
3883 "not vectorized:"
3884 " base addr of dr is a constant\n");
3886 /* Check whether this may be a SIMD lane access and adjust the
3887 DR to make it easier for us to handle it. */
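/* Illustrative shape of such an access (an assumption about the common
   case, not part of the original comment): inside an OpenMP simd loop a
   privatized array is accessed as tmp[lane], where LANE is the result
   of an IFN_GOMP_SIMD_LANE call; its DR_OFFSET has the form
   lane * element_size, which the code below rewrites into a zero offset
   plus a step of element_size and marks as a simd-lane access.  */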
3888 if (loop
3889 && loop->simduid
3890 && (!DR_BASE_ADDRESS (dr)
3891 || !DR_OFFSET (dr)
3892 || !DR_INIT (dr)
3893 || !DR_STEP (dr)))
3895 struct data_reference *newdr
3896 = create_data_ref (NULL, loop_containing_stmt (stmt), DR_REF (dr), stmt,
3897 DR_IS_READ (dr), DR_IS_CONDITIONAL_IN_STMT (dr));
3898 if (DR_BASE_ADDRESS (newdr)
3899 && DR_OFFSET (newdr)
3900 && DR_INIT (newdr)
3901 && DR_STEP (newdr)
3902 && integer_zerop (DR_STEP (newdr)))
3904 tree off = DR_OFFSET (newdr);
3905 STRIP_NOPS (off);
3906 if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
3907 && TREE_CODE (off) == MULT_EXPR
3908 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
3910 tree step = TREE_OPERAND (off, 1);
3911 off = TREE_OPERAND (off, 0);
3912 STRIP_NOPS (off);
3913 if (CONVERT_EXPR_P (off)
3914 && (TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off, 0)))
3915 < TYPE_PRECISION (TREE_TYPE (off))))
3916 off = TREE_OPERAND (off, 0);
3917 if (TREE_CODE (off) == SSA_NAME)
3919 gimple *def = SSA_NAME_DEF_STMT (off);
3920 tree reft = TREE_TYPE (DR_REF (newdr));
3921 if (is_gimple_call (def)
3922 && gimple_call_internal_p (def)
3923 && (gimple_call_internal_fn (def) == IFN_GOMP_SIMD_LANE))
3925 tree arg = gimple_call_arg (def, 0);
3926 gcc_assert (TREE_CODE (arg) == SSA_NAME);
3927 arg = SSA_NAME_VAR (arg);
3928 if (arg == loop->simduid
3929 /* For now. */
3930 && tree_int_cst_equal (TYPE_SIZE_UNIT (reft), step))
3932 DR_OFFSET (newdr) = ssize_int (0);
3933 DR_STEP (newdr) = step;
3934 DR_OFFSET_ALIGNMENT (newdr) = BIGGEST_ALIGNMENT;
3935 DR_STEP_ALIGNMENT (newdr)
3936 = highest_pow2_factor (step);
3937 /* Mark as simd-lane access. */
3938 newdr->aux = (void *)-1;
3939 free_data_ref (dr);
3940 datarefs->safe_push (newdr);
3941 return opt_result::success ();
3947 free_data_ref (newdr);
3950 datarefs->safe_push (dr);
3951 return opt_result::success ();
3954 /* Function vect_analyze_data_refs.
3956 Find all the data references in the loop or basic block.
3958 The general structure of the analysis of data refs in the vectorizer is as
3959 follows:
3960 1- vect_analyze_data_refs(loop/bb): call
3961 compute_data_dependences_for_loop/bb to find and analyze all data-refs
3962 in the loop/bb and their dependences.
3963 2- vect_analyze_dependences(): apply dependence testing using ddrs.
3964 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
3965 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
3969 opt_result
3970 vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf)
3972 struct loop *loop = NULL;
3973 unsigned int i;
3974 struct data_reference *dr;
3975 tree scalar_type;
3977 DUMP_VECT_SCOPE ("vect_analyze_data_refs");
3979 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3980 loop = LOOP_VINFO_LOOP (loop_vinfo);
3982 /* Go through the data-refs, check that the analysis succeeded. Update
3983 pointer from stmt_vec_info struct to DR and vectype. */
3985 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
3986 FOR_EACH_VEC_ELT (datarefs, i, dr)
3988 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
3989 poly_uint64 vf;
3991 gcc_assert (DR_REF (dr));
3992 stmt_vec_info stmt_info = vinfo->lookup_stmt (DR_STMT (dr));
3993 gcc_assert (!stmt_info->dr_aux.dr);
3994 stmt_info->dr_aux.dr = dr;
3995 stmt_info->dr_aux.stmt = stmt_info;
3997 /* Check that analysis of the data-ref succeeded. */
3998 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
3999 || !DR_STEP (dr))
4001 bool maybe_gather
4002 = DR_IS_READ (dr)
4003 && !TREE_THIS_VOLATILE (DR_REF (dr))
4004 && (targetm.vectorize.builtin_gather != NULL
4005 || supports_vec_gather_load_p ());
4006 bool maybe_scatter
4007 = DR_IS_WRITE (dr)
4008 && !TREE_THIS_VOLATILE (DR_REF (dr))
4009 && (targetm.vectorize.builtin_scatter != NULL
4010 || supports_vec_scatter_store_p ());
4012 /* If target supports vector gather loads or scatter stores,
4013 see if they can be used. */
4014 if (is_a <loop_vec_info> (vinfo)
4015 && !nested_in_vect_loop_p (loop, stmt_info))
4017 if (maybe_gather || maybe_scatter)
4019 if (maybe_gather)
4020 gatherscatter = GATHER;
4021 else
4022 gatherscatter = SCATTER;
4026 if (gatherscatter == SG_NONE)
4028 if (dump_enabled_p ())
4029 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4030 "not vectorized: data ref analysis "
4031 "failed %G", stmt_info->stmt);
4032 if (is_a <bb_vec_info> (vinfo))
4034 /* In BB vectorization the ref can still participate
4035 in dependence analysis, we just can't vectorize it. */
4036 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4037 continue;
4039 return opt_result::failure_at (stmt_info->stmt,
4040 "not vectorized:"
4041 " data ref analysis failed: %G",
4042 stmt_info->stmt);
4046 /* See if this was detected as SIMD lane access. */
4047 if (dr->aux == (void *)-1)
4049 if (nested_in_vect_loop_p (loop, stmt_info))
4050 return opt_result::failure_at (stmt_info->stmt,
4051 "not vectorized:"
4052 " data ref analysis failed: %G",
4053 stmt_info->stmt);
4054 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
4057 tree base = get_base_address (DR_REF (dr));
4058 if (base && VAR_P (base) && DECL_NONALIASED (base))
4060 if (dump_enabled_p ())
4061 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4062 "not vectorized: base object not addressable "
4063 "for stmt: %G", stmt_info->stmt);
4064 if (is_a <bb_vec_info> (vinfo))
4066 /* In BB vectorization the ref can still participate
4067 in dependence analysis, we just can't vectorize it. */
4068 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4069 continue;
4071 return opt_result::failure_at (stmt_info->stmt,
4072 "not vectorized: base object not"
4073 " addressable for stmt: %G",
4074 stmt_info->stmt);
4077 if (is_a <loop_vec_info> (vinfo)
4078 && DR_STEP (dr)
4079 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
4081 if (nested_in_vect_loop_p (loop, stmt_info))
4082 return opt_result::failure_at (stmt_info->stmt,
4083 "not vectorized:"
4084 "not suitable for strided load %G",
4085 stmt_info->stmt);
4086 STMT_VINFO_STRIDED_P (stmt_info) = true;
4089 /* Update DR field in stmt_vec_info struct. */
4091 /* If the dataref is in an inner-loop of the loop that is considered
4092 for vectorization, we also want to analyze the access relative to
4093 the outer-loop (DR contains information only relative to the
4094 inner-most enclosing loop). We do that by building a reference to the
4095 first location accessed by the inner-loop, and analyze it relative to
4096 the outer-loop. */
4097 if (loop && nested_in_vect_loop_p (loop, stmt_info))
4099 /* Build a reference to the first location accessed by the
4100 inner loop: *(BASE + INIT + OFFSET). By construction,
4101 this address must be invariant in the inner loop, so we
4102 can consider it as being used in the outer loop. */
4103 tree base = unshare_expr (DR_BASE_ADDRESS (dr));
4104 tree offset = unshare_expr (DR_OFFSET (dr));
4105 tree init = unshare_expr (DR_INIT (dr));
4106 tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
4107 init, offset);
4108 tree init_addr = fold_build_pointer_plus (base, init_offset);
4109 tree init_ref = build_fold_indirect_ref (init_addr);
4111 if (dump_enabled_p ())
4112 dump_printf_loc (MSG_NOTE, vect_location,
4113 "analyze in outer loop: %T\n", init_ref);
4115 opt_result res
4116 = dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
4117 init_ref, loop, stmt_info->stmt);
4118 if (!res)
4119 /* dr_analyze_innermost already explained the failure. */
4120 return res;
4122 if (dump_enabled_p ())
4123 dump_printf_loc (MSG_NOTE, vect_location,
4124 "\touter base_address: %T\n"
4125 "\touter offset from base address: %T\n"
4126 "\touter constant offset from base address: %T\n"
4127 "\touter step: %T\n"
4128 "\touter base alignment: %d\n\n"
4129 "\touter base misalignment: %d\n"
4130 "\touter offset alignment: %d\n"
4131 "\touter step alignment: %d\n",
4132 STMT_VINFO_DR_BASE_ADDRESS (stmt_info),
4133 STMT_VINFO_DR_OFFSET (stmt_info),
4134 STMT_VINFO_DR_INIT (stmt_info),
4135 STMT_VINFO_DR_STEP (stmt_info),
4136 STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info),
4137 STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info),
4138 STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info),
4139 STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
4142 /* Set vectype for STMT. */
4143 scalar_type = TREE_TYPE (DR_REF (dr));
4144 STMT_VINFO_VECTYPE (stmt_info)
4145 = get_vectype_for_scalar_type (scalar_type);
4146 if (!STMT_VINFO_VECTYPE (stmt_info))
4148 if (dump_enabled_p ())
4150 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4151 "not vectorized: no vectype for stmt: %G",
4152 stmt_info->stmt);
4153 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
4154 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
4155 scalar_type);
4156 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
4159 if (is_a <bb_vec_info> (vinfo))
4161 /* No vector type is fine, the ref can still participate
4162 in dependence analysis, we just can't vectorize it. */
4163 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4164 continue;
4166 return opt_result::failure_at (stmt_info->stmt,
4167 "not vectorized:"
4168 " no vectype for stmt: %G"
4169 " scalar_type: %T\n",
4170 stmt_info->stmt, scalar_type);
4172 else
4174 if (dump_enabled_p ())
4175 dump_printf_loc (MSG_NOTE, vect_location,
4176 "got vectype for stmt: %G%T\n",
4177 stmt_info->stmt, STMT_VINFO_VECTYPE (stmt_info));
4180 /* Adjust the minimal vectorization factor according to the
4181 vector type. */
4182 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
4183 *min_vf = upper_bound (*min_vf, vf);
4185 if (gatherscatter != SG_NONE)
4187 gather_scatter_info gs_info;
4188 if (!vect_check_gather_scatter (stmt_info,
4189 as_a <loop_vec_info> (vinfo),
4190 &gs_info)
4191 || !get_vectype_for_scalar_type (TREE_TYPE (gs_info.offset)))
4192 return opt_result::failure_at
4193 (stmt_info->stmt,
4194 (gatherscatter == GATHER) ?
4195 "not vectorized: not suitable for gather load %G" :
4196 "not vectorized: not suitable for scatter store %G",
4197 stmt_info->stmt);
4198 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
4202 /* We used to stop processing and prune the list here. Verify we no
4203 longer need to. */
4204 gcc_assert (i == datarefs.length ());
4206 return opt_result::success ();
4210 /* Function vect_get_new_vect_var.
4212 Returns a name for a new variable. The current naming scheme picks a
4213 prefix ("vect_", "vectp_", "stmp_" or "mask_") according to VAR_KIND
4214 for vectorizer-generated variables and appends NAME to it if
4215 provided. */
4217 tree
4218 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
4220 const char *prefix;
4221 tree new_vect_var;
4223 switch (var_kind)
4225 case vect_simple_var:
4226 prefix = "vect";
4227 break;
4228 case vect_scalar_var:
4229 prefix = "stmp";
4230 break;
4231 case vect_mask_var:
4232 prefix = "mask";
4233 break;
4234 case vect_pointer_var:
4235 prefix = "vectp";
4236 break;
4237 default:
4238 gcc_unreachable ();
4241 if (name)
4243 char* tmp = concat (prefix, "_", name, NULL);
4244 new_vect_var = create_tmp_reg (type, tmp);
4245 free (tmp);
4247 else
4248 new_vect_var = create_tmp_reg (type, prefix);
4250 return new_vect_var;
4253 /* Like vect_get_new_vect_var but return an SSA name. */
4255 tree
4256 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
4258 const char *prefix;
4259 tree new_vect_var;
4261 switch (var_kind)
4263 case vect_simple_var:
4264 prefix = "vect";
4265 break;
4266 case vect_scalar_var:
4267 prefix = "stmp";
4268 break;
4269 case vect_pointer_var:
4270 prefix = "vectp";
4271 break;
4272 default:
4273 gcc_unreachable ();
4276 if (name)
4278 char* tmp = concat (prefix, "_", name, NULL);
4279 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4280 free (tmp);
4282 else
4283 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4285 return new_vect_var;
4288 /* Duplicate ptr info and set alignment/misalignment on NAME from DR_INFO. */
4290 static void
4291 vect_duplicate_ssa_name_ptr_info (tree name, dr_vec_info *dr_info)
4293 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr_info->dr));
4294 int misalign = DR_MISALIGNMENT (dr_info);
4295 if (misalign == DR_MISALIGNMENT_UNKNOWN)
4296 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4297 else
4298 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name),
4299 DR_TARGET_ALIGNMENT (dr_info), misalign);
4302 /* Function vect_create_addr_base_for_vector_ref.
4304 Create an expression that computes the address of the first memory location
4305 that will be accessed for a data reference.
4307 Input:
4308 STMT_INFO: The statement containing the data reference.
4309 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4310 OFFSET: Optional. If supplied, it is added to the initial address.
4311 LOOP: Specify relative to which loop-nest should the address be computed.
4312 For example, when the dataref is in an inner-loop nested in an
4313 outer-loop that is now being vectorized, LOOP can be either the
4314 outer-loop, or the inner-loop. The first memory location accessed
4315 by the following dataref ('in' points to short):
4317 for (i=0; i<N; i++)
4318 for (j=0; j<M; j++)
4319 s += in[i+j]
4321 is as follows:
4322 if LOOP=i_loop: &in (relative to i_loop)
4323 if LOOP=j_loop: &in+i*2B (relative to j_loop)
4324 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
4325 initial address. Unlike OFFSET, which is a number of elements to
4326 be added, BYTE_OFFSET is measured in bytes.
4328 Output:
4329 1. Return an SSA_NAME whose value is the address of the memory location of
4330 the first vector of the data reference.
4331 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4332 these statement(s) which define the returned SSA_NAME.
4334 FORNOW: We are only handling array accesses with step 1. */
4336 tree
4337 vect_create_addr_base_for_vector_ref (stmt_vec_info stmt_info,
4338 gimple_seq *new_stmt_list,
4339 tree offset,
4340 tree byte_offset)
4342 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
4343 struct data_reference *dr = dr_info->dr;
4344 const char *base_name;
4345 tree addr_base;
4346 tree dest;
4347 gimple_seq seq = NULL;
4348 tree vect_ptr_type;
4349 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
4350 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4351 innermost_loop_behavior *drb = vect_dr_behavior (dr_info);
4353 tree data_ref_base = unshare_expr (drb->base_address);
4354 tree base_offset = unshare_expr (drb->offset);
4355 tree init = unshare_expr (drb->init);
4357 if (loop_vinfo)
4358 base_name = get_name (data_ref_base);
4359 else
4361 base_offset = ssize_int (0);
4362 init = ssize_int (0);
4363 base_name = get_name (DR_REF (dr));
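/* In the basic-block vectorization case there is no loop to compute the
   address relative to, so the analyzed offset and init are dropped and
   the address of the scalar reference itself is taken further below.  */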
4366 /* Create base_offset */
4367 base_offset = size_binop (PLUS_EXPR,
4368 fold_convert (sizetype, base_offset),
4369 fold_convert (sizetype, init));
4371 if (offset)
4373 offset = fold_build2 (MULT_EXPR, sizetype,
4374 fold_convert (sizetype, offset), step);
4375 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4376 base_offset, offset);
4378 if (byte_offset)
4380 byte_offset = fold_convert (sizetype, byte_offset);
4381 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4382 base_offset, byte_offset);
4385 /* base + base_offset */
4386 if (loop_vinfo)
4387 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4388 else
4390 addr_base = build1 (ADDR_EXPR,
4391 build_pointer_type (TREE_TYPE (DR_REF (dr))),
4392 unshare_expr (DR_REF (dr)));
4395 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4396 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4397 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4398 gimple_seq_add_seq (new_stmt_list, seq);
4400 if (DR_PTR_INFO (dr)
4401 && TREE_CODE (addr_base) == SSA_NAME
4402 && !SSA_NAME_PTR_INFO (addr_base))
4404 vect_duplicate_ssa_name_ptr_info (addr_base, dr_info);
4405 if (offset || byte_offset)
4406 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4409 if (dump_enabled_p ())
4410 dump_printf_loc (MSG_NOTE, vect_location, "created %T\n", addr_base);
4412 return addr_base;
4416 /* Function vect_create_data_ref_ptr.
4418 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4419 location accessed in the loop by STMT_INFO, along with the def-use update
4420 chain to appropriately advance the pointer through the loop iterations.
4421 Also set aliasing information for the pointer. This pointer is used by
4422 the callers to this function to create a memory reference expression for
4423 vector load/store access.
4425 Input:
4426 1. STMT_INFO: a stmt that references memory. Expected to be of the form
4427 GIMPLE_ASSIGN <name, data-ref> or
4428 GIMPLE_ASSIGN <data-ref, name>.
4429 2. AGGR_TYPE: the type of the reference, which should be either a vector
4430 or an array.
4431 3. AT_LOOP: the loop where the vector memref is to be created.
4432 4. OFFSET (optional): an offset to be added to the initial address accessed
4433 by the data-ref in STMT_INFO.
4434 5. BSI: location where the new stmts are to be placed if there is no loop
4435 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4436 pointing to the initial address.
4437 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4438 to the initial address accessed by the data-ref in STMT_INFO. This is
4439 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4440 in bytes.
4441 8. IV_STEP (optional, defaults to NULL): the amount that should be added
4442 to the IV during each iteration of the loop. NULL says to move
4443 by one copy of AGGR_TYPE up or down, depending on the step of the
4444 data reference.
4446 Output:
4447 1. Declare a new ptr to vector_type, and have it point to the base of the
4448 data reference (initial address accessed by the data reference).
4449 For example, for vector of type V8HI, the following code is generated:
4451 v8hi *ap;
4452 ap = (v8hi *)initial_address;
4454 if OFFSET is not supplied:
4455 initial_address = &a[init];
4456 if OFFSET is supplied:
4457 initial_address = &a[init + OFFSET];
4458 if BYTE_OFFSET is supplied:
4459 initial_address = &a[init] + BYTE_OFFSET;
4461 Return the initial_address in INITIAL_ADDRESS.
4463 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4464 update the pointer in each iteration of the loop.
4466 Return the increment stmt that updates the pointer in PTR_INCR.
4468 3. Return the pointer. */
4470 tree
4471 vect_create_data_ref_ptr (stmt_vec_info stmt_info, tree aggr_type,
4472 struct loop *at_loop, tree offset,
4473 tree *initial_address, gimple_stmt_iterator *gsi,
4474 gimple **ptr_incr, bool only_init,
4475 tree byte_offset, tree iv_step)
4477 const char *base_name;
4478 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4479 struct loop *loop = NULL;
4480 bool nested_in_vect_loop = false;
4481 struct loop *containing_loop = NULL;
4482 tree aggr_ptr_type;
4483 tree aggr_ptr;
4484 tree new_temp;
4485 gimple_seq new_stmt_list = NULL;
4486 edge pe = NULL;
4487 basic_block new_bb;
4488 tree aggr_ptr_init;
4489 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
4490 struct data_reference *dr = dr_info->dr;
4491 tree aptr;
4492 gimple_stmt_iterator incr_gsi;
4493 bool insert_after;
4494 tree indx_before_incr, indx_after_incr;
4495 gimple *incr;
4496 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4498 gcc_assert (iv_step != NULL_TREE
4499 || TREE_CODE (aggr_type) == ARRAY_TYPE
4500 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4502 if (loop_vinfo)
4504 loop = LOOP_VINFO_LOOP (loop_vinfo);
4505 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
4506 containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
4507 pe = loop_preheader_edge (loop);
4509 else
4511 gcc_assert (bb_vinfo);
4512 only_init = true;
4513 *ptr_incr = NULL;
4516 /* Create an expression for the first address accessed by this load
4517 in LOOP. */
4518 base_name = get_name (DR_BASE_ADDRESS (dr));
4520 if (dump_enabled_p ())
4522 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4523 dump_printf_loc (MSG_NOTE, vect_location,
4524 "create %s-pointer variable to type: %T",
4525 get_tree_code_name (TREE_CODE (aggr_type)),
4526 aggr_type);
4527 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4528 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4529 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4530 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4531 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4532 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4533 else
4534 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4535 dump_printf (MSG_NOTE, "%T\n", DR_BASE_OBJECT (dr));
4538 /* (1) Create the new aggregate-pointer variable.
4539 Vector and array types inherit the alias set of their component
4540 type by default so we need to use a ref-all pointer if the data
4541 reference does not conflict with the created aggregated data
4542 reference because it is not addressable. */
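/* Illustrative case (an assumption, not part of the original comment):
   if the scalar accesses have an alias set that does not conflict with
   that of the vector type built here, a plain vector pointer would let
   alias analysis wrongly disambiguate the new aggregate accesses
   against the original scalar ones, hence the ref-all pointer.  */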
4543 bool need_ref_all = false;
4544 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4545 get_alias_set (DR_REF (dr))))
4546 need_ref_all = true;
4547 /* Likewise for any of the data references in the stmt group. */
4548 else if (DR_GROUP_SIZE (stmt_info) > 1)
4550 stmt_vec_info sinfo = DR_GROUP_FIRST_ELEMENT (stmt_info);
4553 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4554 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4555 get_alias_set (DR_REF (sdr))))
4557 need_ref_all = true;
4558 break;
4560 sinfo = DR_GROUP_NEXT_ELEMENT (sinfo);
4562 while (sinfo);
4564 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4565 need_ref_all);
4566 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4569 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4570 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4571 def-use update cycles for the pointer: one relative to the outer-loop
4572 (LOOP), which is what steps (3) and (4) below do. The other is relative
4573 to the inner-loop (which is the inner-most loop containing the dataref),
4574 and this is done by step (5) below.
4576 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4577 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4578 redundant. Steps (3),(4) create the following:
4580 vp0 = &base_addr;
4581 LOOP: vp1 = phi(vp0,vp2)
4584 vp2 = vp1 + step
4585 goto LOOP
4587 If there is an inner-loop nested in loop, then step (5) will also be
4588 applied, and an additional update in the inner-loop will be created:
4590 vp0 = &base_addr;
4591 LOOP: vp1 = phi(vp0,vp2)
4593 inner: vp3 = phi(vp1,vp4)
4594 vp4 = vp3 + inner_step
4595 if () goto inner
4597 vp2 = vp1 + step
4598 if () goto LOOP */
4600 /* (2) Calculate the initial address of the aggregate-pointer, and set
4601 the aggregate-pointer to point to it before the loop. */
4603 /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader. */
4605 new_temp = vect_create_addr_base_for_vector_ref (stmt_info, &new_stmt_list,
4606 offset, byte_offset);
4607 if (new_stmt_list)
4609 if (pe)
4611 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4612 gcc_assert (!new_bb);
4614 else
4615 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4618 *initial_address = new_temp;
4619 aggr_ptr_init = new_temp;
4621 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4622 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4623 inner-loop nested in LOOP (during outer-loop vectorization). */
4625 /* No update in loop is required. */
4626 if (only_init && (!loop_vinfo || at_loop == loop))
4627 aptr = aggr_ptr_init;
4628 else
4630 /* Accesses to invariant addresses should be handled specially
4631 by the caller. */
4632 tree step = vect_dr_behavior (dr_info)->step;
4633 gcc_assert (!integer_zerop (step));
4635 if (iv_step == NULL_TREE)
4637 /* The step of the aggregate pointer is the type size,
4638 negated for downward accesses. */
4639 iv_step = TYPE_SIZE_UNIT (aggr_type);
4640 if (tree_int_cst_sgn (step) == -1)
4641 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4644 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4646 create_iv (aggr_ptr_init,
4647 fold_convert (aggr_ptr_type, iv_step),
4648 aggr_ptr, loop, &incr_gsi, insert_after,
4649 &indx_before_incr, &indx_after_incr);
4650 incr = gsi_stmt (incr_gsi);
4651 loop_vinfo->add_stmt (incr);
4653 /* Copy the points-to information if it exists. */
4654 if (DR_PTR_INFO (dr))
4656 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
4657 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
4659 if (ptr_incr)
4660 *ptr_incr = incr;
4662 aptr = indx_before_incr;
4665 if (!nested_in_vect_loop || only_init)
4666 return aptr;
4669 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4670 nested in LOOP, if one exists.
4672 gcc_assert (nested_in_vect_loop);
4673 if (!only_init)
4675 standard_iv_increment_position (containing_loop, &incr_gsi,
4676 &insert_after);
4677 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4678 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4679 &indx_after_incr);
4680 incr = gsi_stmt (incr_gsi);
4681 loop_vinfo->add_stmt (incr);
4683 /* Copy the points-to information if it exists. */
4684 if (DR_PTR_INFO (dr))
4686 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
4687 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
4689 if (ptr_incr)
4690 *ptr_incr = incr;
4692 return indx_before_incr;
4694 else
4695 gcc_unreachable ();
4699 /* Function bump_vector_ptr
4701 Increment a pointer (to a vector type) by vector-size. If requested,
4702 i.e. if PTR-INCR is given, then also connect the new increment stmt
4703 to the existing def-use update-chain of the pointer, by modifying
4704 the PTR_INCR as illustrated below:
4706 The pointer def-use update-chain before this function:
4707 DATAREF_PTR = phi (p_0, p_2)
4708 ....
4709 PTR_INCR: p_2 = DATAREF_PTR + step
4711 The pointer def-use update-chain after this function:
4712 DATAREF_PTR = phi (p_0, p_2)
4713 ....
4714 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4715 ....
4716 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4718 Input:
4719 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4720 in the loop.
4721 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4722 the loop. The increment amount across iterations is expected
4723 to be vector_size.
4724 BSI - location where the new update stmt is to be placed.
4725 STMT_INFO - the original scalar memory-access stmt that is being vectorized.
4726 BUMP - optional. The offset by which to bump the pointer. If not given,
4727 the offset is assumed to be vector_size.
4729 Output: Return NEW_DATAREF_PTR as illustrated above.
4733 tree
4734 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4735 stmt_vec_info stmt_info, tree bump)
4737 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4738 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4739 tree update = TYPE_SIZE_UNIT (vectype);
4740 gassign *incr_stmt;
4741 ssa_op_iter iter;
4742 use_operand_p use_p;
4743 tree new_dataref_ptr;
4745 if (bump)
4746 update = bump;
4748 if (TREE_CODE (dataref_ptr) == SSA_NAME)
4749 new_dataref_ptr = copy_ssa_name (dataref_ptr);
4750 else
4751 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4752 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4753 dataref_ptr, update);
4754 vect_finish_stmt_generation (stmt_info, incr_stmt, gsi);
4756 /* Copy the points-to information if it exists. */
4757 if (DR_PTR_INFO (dr))
4759 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4760 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4763 if (!ptr_incr)
4764 return new_dataref_ptr;
4766 /* Update the vector-pointer's cross-iteration increment. */
4767 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4769 tree use = USE_FROM_PTR (use_p);
4771 if (use == dataref_ptr)
4772 SET_USE (use_p, new_dataref_ptr);
4773 else
4774 gcc_assert (operand_equal_p (use, update, 0));
4777 return new_dataref_ptr;
4781 /* Copy memory reference info such as base/clique from the SRC reference
4782 to the DEST MEM_REF. */
4784 void
4785 vect_copy_ref_info (tree dest, tree src)
4787 if (TREE_CODE (dest) != MEM_REF)
4788 return;
4790 tree src_base = src;
4791 while (handled_component_p (src_base))
4792 src_base = TREE_OPERAND (src_base, 0);
4793 if (TREE_CODE (src_base) != MEM_REF
4794 && TREE_CODE (src_base) != TARGET_MEM_REF)
4795 return;
4797 MR_DEPENDENCE_CLIQUE (dest) = MR_DEPENDENCE_CLIQUE (src_base);
4798 MR_DEPENDENCE_BASE (dest) = MR_DEPENDENCE_BASE (src_base);
4802 /* Function vect_create_destination_var.
4804 Create a new temporary of type VECTYPE. */
4806 tree
4807 vect_create_destination_var (tree scalar_dest, tree vectype)
4809 tree vec_dest;
4810 const char *name;
4811 char *new_name;
4812 tree type;
4813 enum vect_var_kind kind;
4815 kind = vectype
4816 ? VECTOR_BOOLEAN_TYPE_P (vectype)
4817 ? vect_mask_var
4818 : vect_simple_var
4819 : vect_scalar_var;
4820 type = vectype ? vectype : TREE_TYPE (scalar_dest);
4822 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
4824 name = get_name (scalar_dest);
4825 if (name)
4826 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
4827 else
4828 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
4829 vec_dest = vect_get_new_vect_var (type, kind, new_name);
4830 free (new_name);
4832 return vec_dest;
4835 /* Function vect_grouped_store_supported.
4837 Returns TRUE if interleave high and interleave low permutations
4838 are supported, and FALSE otherwise. */
4840 bool
4841 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
4843 machine_mode mode = TYPE_MODE (vectype);
4845 /* vect_permute_store_chain requires the group size to be equal to 3 or
4846 be a power of two. */
4847 if (count != 3 && exact_log2 (count) == -1)
4849 if (dump_enabled_p ())
4850 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4851 "the size of the group of accesses"
4852 " is not a power of 2 or not eqaul to 3\n");
4853 return false;
4856 /* Check that the permutation is supported. */
4857 if (VECTOR_MODE_P (mode))
4859 unsigned int i;
4860 if (count == 3)
4862 unsigned int j0 = 0, j1 = 0, j2 = 0;
4863 unsigned int i, j;
4865 unsigned int nelt;
4866 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
4868 if (dump_enabled_p ())
4869 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4870 "cannot handle groups of 3 stores for"
4871 " variable-length vectors\n");
4872 return false;
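/* For a group of three stores, two permutations are needed per output
   vector: one interleaving elements of the first two input vectors and
   one blending in elements of the third.  The masks built below mirror
   those generated by vect_permute_store_chain; each must be supported
   for all three output vectors.  */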
4875 vec_perm_builder sel (nelt, nelt, 1);
4876 sel.quick_grow (nelt);
4877 vec_perm_indices indices;
4878 for (j = 0; j < 3; j++)
4880 int nelt0 = ((3 - j) * nelt) % 3;
4881 int nelt1 = ((3 - j) * nelt + 1) % 3;
4882 int nelt2 = ((3 - j) * nelt + 2) % 3;
4883 for (i = 0; i < nelt; i++)
4885 if (3 * i + nelt0 < nelt)
4886 sel[3 * i + nelt0] = j0++;
4887 if (3 * i + nelt1 < nelt)
4888 sel[3 * i + nelt1] = nelt + j1++;
4889 if (3 * i + nelt2 < nelt)
4890 sel[3 * i + nelt2] = 0;
4892 indices.new_vector (sel, 2, nelt);
4893 if (!can_vec_perm_const_p (mode, indices))
4895 if (dump_enabled_p ())
4896 dump_printf (MSG_MISSED_OPTIMIZATION,
4897 "permutation op not supported by target.\n");
4898 return false;
4901 for (i = 0; i < nelt; i++)
4903 if (3 * i + nelt0 < nelt)
4904 sel[3 * i + nelt0] = 3 * i + nelt0;
4905 if (3 * i + nelt1 < nelt)
4906 sel[3 * i + nelt1] = 3 * i + nelt1;
4907 if (3 * i + nelt2 < nelt)
4908 sel[3 * i + nelt2] = nelt + j2++;
4910 indices.new_vector (sel, 2, nelt);
4911 if (!can_vec_perm_const_p (mode, indices))
4913 if (dump_enabled_p ())
4914 dump_printf (MSG_MISSED_OPTIMIZATION,
4915 "permutation op not supported by target.\n");
4916 return false;
4919 return true;
4921 else
4923 /* If length is not equal to 3 then only power of 2 is supported. */
4924 gcc_assert (pow2p_hwi (count));
4925 poly_uint64 nelt = GET_MODE_NUNITS (mode);
4927 /* The encoding has 2 interleaved stepped patterns. */
4928 vec_perm_builder sel (nelt, 2, 3);
4929 sel.quick_grow (6);
4930 for (i = 0; i < 3; i++)
4932 sel[i * 2] = i;
4933 sel[i * 2 + 1] = i + nelt;
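/* For example (illustrative), with nelt == 8 this encodes the
   interleave-high mask {0, 8, 1, 9, 2, 10, 3, 11}; adding nelt/2 to
   each element below yields the interleave-low mask
   {4, 12, 5, 13, 6, 14, 7, 15}.  */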
4935 vec_perm_indices indices (sel, 2, nelt);
4936 if (can_vec_perm_const_p (mode, indices))
4938 for (i = 0; i < 6; i++)
4939 sel[i] += exact_div (nelt, 2);
4940 indices.new_vector (sel, 2, nelt);
4941 if (can_vec_perm_const_p (mode, indices))
4942 return true;
4947 if (dump_enabled_p ())
4948 dump_printf (MSG_MISSED_OPTIMIZATION,
4949 "permutation op not supported by target.\n");
4950 return false;
4954 /* Return TRUE if vec_{mask_}store_lanes is available for COUNT vectors of
4955 type VECTYPE. MASKED_P says whether the masked form is needed. */
4957 bool
4958 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
4959 bool masked_p)
4961 if (masked_p)
4962 return vect_lanes_optab_supported_p ("vec_mask_store_lanes",
4963 vec_mask_store_lanes_optab,
4964 vectype, count);
4965 else
4966 return vect_lanes_optab_supported_p ("vec_store_lanes",
4967 vec_store_lanes_optab,
4968 vectype, count);
4972 /* Function vect_permute_store_chain.
4974 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4975 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
4976 the data correctly for the stores. Return the final references for stores
4977 in RESULT_CHAIN.
4979 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4980 The input is 4 vectors each containing 8 elements. We assign a number to
4981 each element, the input sequence is:
4983 1st vec: 0 1 2 3 4 5 6 7
4984 2nd vec: 8 9 10 11 12 13 14 15
4985 3rd vec: 16 17 18 19 20 21 22 23
4986 4th vec: 24 25 26 27 28 29 30 31
4988 The output sequence should be:
4990 1st vec: 0 8 16 24 1 9 17 25
4991 2nd vec: 2 10 18 26 3 11 19 27
4992 3rd vec: 4 12 20 28 5 13 21 29
4993 4th vec: 6 14 22 30 7 15 23 31
4995 i.e., we interleave the contents of the four vectors in their order.
4997 We use interleave_high/low instructions to create such output. The input of
4998 each interleave_high/low operation is two vectors:
4999 1st vec 2nd vec
5000 0 1 2 3 4 5 6 7
5001 the even elements of the result vector are obtained left-to-right from the
5002 high/low elements of the first vector. The odd elements of the result are
5003 obtained left-to-right from the high/low elements of the second vector.
5004 The output of interleave_high will be: 0 4 1 5
5005 and of interleave_low: 2 6 3 7
5008 The permutation is done in log LENGTH stages. In each stage interleave_high
5009 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5010 where the first argument is taken from the first half of DR_CHAIN and the
5011 second argument from its second half.
5012 In our example,
5014 I1: interleave_high (1st vec, 3rd vec)
5015 I2: interleave_low (1st vec, 3rd vec)
5016 I3: interleave_high (2nd vec, 4th vec)
5017 I4: interleave_low (2nd vec, 4th vec)
5019 The output for the first stage is:
5021 I1: 0 16 1 17 2 18 3 19
5022 I2: 4 20 5 21 6 22 7 23
5023 I3: 8 24 9 25 10 26 11 27
5024 I4: 12 28 13 29 14 30 15 31
5026 The output of the second stage, i.e. the final result is:
5028 I1: 0 8 16 24 1 9 17 25
5029 I2: 2 10 18 26 3 11 19 27
5030 I3: 4 12 20 28 5 13 21 29
5031 I4: 6 14 22 30 7 15 23 31. */
5033 void
5034 vect_permute_store_chain (vec<tree> dr_chain,
5035 unsigned int length,
5036 stmt_vec_info stmt_info,
5037 gimple_stmt_iterator *gsi,
5038 vec<tree> *result_chain)
5040 tree vect1, vect2, high, low;
5041 gimple *perm_stmt;
5042 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5043 tree perm_mask_low, perm_mask_high;
5044 tree data_ref;
5045 tree perm3_mask_low, perm3_mask_high;
5046 unsigned int i, j, n, log_length = exact_log2 (length);
5048 result_chain->quick_grow (length);
5049 memcpy (result_chain->address (), dr_chain.address (),
5050 length * sizeof (tree));
5052 if (length == 3)
5054 /* vect_grouped_store_supported ensures that this is constant. */
5055 unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5056 unsigned int j0 = 0, j1 = 0, j2 = 0;
5058 vec_perm_builder sel (nelt, nelt, 1);
5059 sel.quick_grow (nelt);
5060 vec_perm_indices indices;
5061 for (j = 0; j < 3; j++)
5063 int nelt0 = ((3 - j) * nelt) % 3;
5064 int nelt1 = ((3 - j) * nelt + 1) % 3;
5065 int nelt2 = ((3 - j) * nelt + 2) % 3;
5067 for (i = 0; i < nelt; i++)
5069 if (3 * i + nelt0 < nelt)
5070 sel[3 * i + nelt0] = j0++;
5071 if (3 * i + nelt1 < nelt)
5072 sel[3 * i + nelt1] = nelt + j1++;
5073 if (3 * i + nelt2 < nelt)
5074 sel[3 * i + nelt2] = 0;
5076 indices.new_vector (sel, 2, nelt);
5077 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5079 for (i = 0; i < nelt; i++)
5081 if (3 * i + nelt0 < nelt)
5082 sel[3 * i + nelt0] = 3 * i + nelt0;
5083 if (3 * i + nelt1 < nelt)
5084 sel[3 * i + nelt1] = 3 * i + nelt1;
5085 if (3 * i + nelt2 < nelt)
5086 sel[3 * i + nelt2] = nelt + j2++;
5088 indices.new_vector (sel, 2, nelt);
5089 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5091 vect1 = dr_chain[0];
5092 vect2 = dr_chain[1];
5094 /* Create interleaving stmt:
5095 low = VEC_PERM_EXPR <vect1, vect2,
5096 {j, nelt, *, j + 1, nelt + j + 1, *,
5097 j + 2, nelt + j + 2, *, ...}> */
5098 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5099 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5100 vect2, perm3_mask_low);
5101 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5103 vect1 = data_ref;
5104 vect2 = dr_chain[2];
5105 /* Create interleaving stmt:
5106 low = VEC_PERM_EXPR <vect1, vect2,
5107 {0, 1, nelt + j, 3, 4, nelt + j + 1,
5108 6, 7, nelt + j + 2, ...}> */
5109 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5110 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5111 vect2, perm3_mask_high);
5112 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5113 (*result_chain)[j] = data_ref;
5116 else
5118 /* If length is not equal to 3 then only power of 2 is supported. */
5119 gcc_assert (pow2p_hwi (length));
5121 /* The encoding has 2 interleaved stepped patterns. */
5122 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5123 vec_perm_builder sel (nelt, 2, 3);
5124 sel.quick_grow (6);
5125 for (i = 0; i < 3; i++)
5127 sel[i * 2] = i;
5128 sel[i * 2 + 1] = i + nelt;
5130 vec_perm_indices indices (sel, 2, nelt);
5131 perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5133 for (i = 0; i < 6; i++)
5134 sel[i] += exact_div (nelt, 2);
5135 indices.new_vector (sel, 2, nelt);
5136 perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5138 for (i = 0, n = log_length; i < n; i++)
5140 for (j = 0; j < length/2; j++)
5142 vect1 = dr_chain[j];
5143 vect2 = dr_chain[j+length/2];
5145 /* Create interleaving stmt:
5146 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
5147 ...}> */
5148 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
5149 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
5150 vect2, perm_mask_high);
5151 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5152 (*result_chain)[2*j] = high;
5154 /* Create interleaving stmt:
5155 low = VEC_PERM_EXPR <vect1, vect2,
5156 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
5157 ...}> */
5158 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
5159 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
5160 vect2, perm_mask_low);
5161 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5162 (*result_chain)[2*j+1] = low;
5164 memcpy (dr_chain.address (), result_chain->address (),
5165 length * sizeof (tree));
5170 /* Function vect_setup_realignment
5172 This function is called when vectorizing an unaligned load using
5173 the dr_explicit_realign[_optimized] scheme.
5174 This function generates the following code at the loop prolog:
5176 p = initial_addr;
5177 x msq_init = *(floor(p)); # prolog load
5178 realignment_token = call target_builtin;
5179 loop:
5180 x msq = phi (msq_init, ---)
5182 The stmts marked with x are generated only for the case of
5183 dr_explicit_realign_optimized.
5185 The code above sets up a new (vector) pointer, pointing to the first
5186 location accessed by STMT_INFO, and a "floor-aligned" load using that
5187 pointer. It also generates code to compute the "realignment-token"
5188 (if the relevant target hook was defined), and creates a phi-node at the
5189 loop-header bb whose arguments are the result of the prolog-load (created
5190 by this function) and the result of a load that takes place in the loop
5191 (to be created by the caller to this function).
5193 For the case of dr_explicit_realign_optimized:
5194 The caller to this function uses the phi-result (msq) to create the
5195 realignment code inside the loop, and sets up the missing phi argument,
5196 as follows:
5197 loop:
5198 msq = phi (msq_init, lsq)
5199 lsq = *(floor(p')); # load in loop
5200 result = realign_load (msq, lsq, realignment_token);
5202 For the case of dr_explicit_realign:
5203 loop:
5204 msq = *(floor(p)); # load in loop
5205 p' = p + (VS-1);
5206 lsq = *(floor(p')); # load in loop
5207 result = realign_load (msq, lsq, realignment_token);
5209 Input:
5210 STMT_INFO - (scalar) load stmt to be vectorized. This load accesses
5211 a memory location that may be unaligned.
5212 BSI - place where new code is to be inserted.
5213 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5214 is used.
5216 Output:
5217 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5218 target hook, if defined.
5219 Return value - the result of the loop-header phi node. */
5221 tree
5222 vect_setup_realignment (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
5223 tree *realignment_token,
5224 enum dr_alignment_support alignment_support_scheme,
5225 tree init_addr,
5226 struct loop **at_loop)
5228 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5229 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5230 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
5231 struct data_reference *dr = dr_info->dr;
5232 struct loop *loop = NULL;
5233 edge pe = NULL;
5234 tree scalar_dest = gimple_assign_lhs (stmt_info->stmt);
5235 tree vec_dest;
5236 gimple *inc;
5237 tree ptr;
5238 tree data_ref;
5239 basic_block new_bb;
5240 tree msq_init = NULL_TREE;
5241 tree new_temp;
5242 gphi *phi_stmt;
5243 tree msq = NULL_TREE;
5244 gimple_seq stmts = NULL;
5245 bool compute_in_loop = false;
5246 bool nested_in_vect_loop = false;
5247 struct loop *containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
5248 struct loop *loop_for_initial_load = NULL;
5250 if (loop_vinfo)
5252 loop = LOOP_VINFO_LOOP (loop_vinfo);
5253 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
5256 gcc_assert (alignment_support_scheme == dr_explicit_realign
5257 || alignment_support_scheme == dr_explicit_realign_optimized);
5259 /* We need to generate three things:
5260 1. the misalignment computation
5261 2. the extra vector load (for the optimized realignment scheme).
5262 3. the phi node for the two vectors from which the realignment is
5263 done (for the optimized realignment scheme). */
5265 /* 1. Determine where to generate the misalignment computation.
5267 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5268 calculation will be generated by this function, outside the loop (in the
5269 preheader). Otherwise, INIT_ADDR has already been computed for us by the
5270 caller, inside the loop.
5272 Background: If the misalignment remains fixed throughout the iterations of
5273 the loop, then both realignment schemes are applicable, and also the
5274 misalignment computation can be done outside LOOP. This is because we are
5275 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5276 are a multiple of VS (the Vector Size), and therefore the misalignment in
5277 different vectorized LOOP iterations is always the same.
5278 The problem arises only if the memory access is in an inner-loop nested
5279 inside LOOP, which is now being vectorized using outer-loop vectorization.
5280 This is the only case when the misalignment of the memory access may not
5281 remain fixed throughout the iterations of the inner-loop (as explained in
5282 detail in vect_supportable_dr_alignment). In this case, not only is the
5283 optimized realignment scheme not applicable, but also the misalignment
5284 computation (and generation of the realignment token that is passed to
5285 REALIGN_LOAD) has to be done inside the loop.
5287 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5288 or not, which in turn determines if the misalignment is computed inside
5289 the inner-loop, or outside LOOP. */
5291 if (init_addr != NULL_TREE || !loop_vinfo)
5293 compute_in_loop = true;
5294 gcc_assert (alignment_support_scheme == dr_explicit_realign);
5298 /* 2. Determine where to generate the extra vector load.
5300 For the optimized realignment scheme, instead of generating two vector
5301 loads in each iteration, we generate a single extra vector load in the
5302 preheader of the loop, and in each iteration reuse the result of the
5303 vector load from the previous iteration. In case the memory access is in
5304 an inner-loop nested inside LOOP, which is now being vectorized using
5305 outer-loop vectorization, we need to determine whether this initial vector
5306 load should be generated at the preheader of the inner-loop, or can be
5307 generated at the preheader of LOOP. If the memory access has no evolution
5308 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5309 to be generated inside LOOP (in the preheader of the inner-loop). */
5311 if (nested_in_vect_loop)
5313 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5314 bool invariant_in_outerloop =
5315 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5316 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5318 else
5319 loop_for_initial_load = loop;
5320 if (at_loop)
5321 *at_loop = loop_for_initial_load;
5323 if (loop_for_initial_load)
5324 pe = loop_preheader_edge (loop_for_initial_load);
5326 /* 3. For the case of the optimized realignment, create the first vector
5327 load at the loop preheader. */
5329 if (alignment_support_scheme == dr_explicit_realign_optimized)
5331 /* Create msq_init = *(floor(p1)) in the loop preheader */
5332 gassign *new_stmt;
5334 gcc_assert (!compute_in_loop);
5335 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5336 ptr = vect_create_data_ref_ptr (stmt_info, vectype,
5337 loop_for_initial_load, NULL_TREE,
5338 &init_addr, NULL, &inc, true);
5339 if (TREE_CODE (ptr) == SSA_NAME)
5340 new_temp = copy_ssa_name (ptr);
5341 else
5342 new_temp = make_ssa_name (TREE_TYPE (ptr));
5343 unsigned int align = DR_TARGET_ALIGNMENT (dr_info);
5344 new_stmt = gimple_build_assign
5345 (new_temp, BIT_AND_EXPR, ptr,
5346 build_int_cst (TREE_TYPE (ptr), -(HOST_WIDE_INT) align));
5347 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5348 gcc_assert (!new_bb);
5349 data_ref
5350 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5351 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5352 vect_copy_ref_info (data_ref, DR_REF (dr));
5353 new_stmt = gimple_build_assign (vec_dest, data_ref);
5354 new_temp = make_ssa_name (vec_dest, new_stmt);
5355 gimple_assign_set_lhs (new_stmt, new_temp);
5356 if (pe)
5358 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5359 gcc_assert (!new_bb);
5361 else
5362 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5364 msq_init = gimple_assign_lhs (new_stmt);
5367 /* 4. Create realignment token using a target builtin, if available.
5368 It is done either inside the containing loop, or before LOOP (as
5369 determined above). */
5371 if (targetm.vectorize.builtin_mask_for_load)
5373 gcall *new_stmt;
5374 tree builtin_decl;
5376 /* Compute INIT_ADDR - the initial address accessed by this memref. */
5377 if (!init_addr)
5379 /* Generate the INIT_ADDR computation outside LOOP. */
5380 init_addr = vect_create_addr_base_for_vector_ref (stmt_info, &stmts,
5381 NULL_TREE);
5382 if (loop)
5384 pe = loop_preheader_edge (loop);
5385 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5386 gcc_assert (!new_bb);
5388 else
5389 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5392 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5393 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5394 vec_dest =
5395 vect_create_destination_var (scalar_dest,
5396 gimple_call_return_type (new_stmt));
5397 new_temp = make_ssa_name (vec_dest, new_stmt);
5398 gimple_call_set_lhs (new_stmt, new_temp);
5400 if (compute_in_loop)
5401 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5402 else
5404 /* Generate the misalignment computation outside LOOP. */
5405 pe = loop_preheader_edge (loop);
5406 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5407 gcc_assert (!new_bb);
5410 *realignment_token = gimple_call_lhs (new_stmt);
5412 /* The result of the CALL_EXPR to this builtin is determined from
5413 the value of the parameter, and no global variables are touched,
5414 which makes the builtin a "const" function. Requiring the
5415 builtin to have the "const" attribute makes it unnecessary
5416 to call mark_call_clobbered. */
5417 gcc_assert (TREE_READONLY (builtin_decl));
5420 if (alignment_support_scheme == dr_explicit_realign)
5421 return msq;
5423 gcc_assert (!compute_in_loop);
5424 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5427 /* 5. Create msq = phi <msq_init, lsq> in loop */
5429 pe = loop_preheader_edge (containing_loop);
5430 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5431 msq = make_ssa_name (vec_dest);
5432 phi_stmt = create_phi_node (msq, containing_loop->header);
5433 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5435 return msq;
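/* Illustrative sketch, not part of GCC: a plain-C model of what the
   dr_explicit_realign scheme computes for a single unaligned load of
   VS_MODEL ints.  Two loads from "floor"-aligned addresses are combined
   using the element misalignment, which plays the role of the realignment
   token.  All names (VS_MODEL, model_realign_load) are invented for the
   example, and the underlying buffer is assumed to extend far enough for
   the second aligned load.  */
#if 0  /* Illustration only; never compiled.  */
#include <stdint.h>

#define VS_MODEL 4

static void
model_realign_load (const int *p, int out[VS_MODEL])
{
  /* floor (p): round the address down to a vector-sized boundary.  */
  uintptr_t addr = (uintptr_t) p;
  const int *msq
    = (const int *) (addr & ~(uintptr_t) (VS_MODEL * sizeof (int) - 1));
  /* Element misalignment of P; this is what the realignment token encodes.  */
  unsigned int mis = (unsigned int) ((addr / sizeof (int)) % VS_MODEL);
  /* lsq = *(floor (p + VS - 1)): when P is misaligned this is the next
     aligned chunk; it is never read when MIS == 0.  */
  const int *lsq = msq + VS_MODEL;

  /* result = realign_load (msq, lsq, token): tail of MSQ, then head of LSQ.  */
  for (unsigned int i = 0; i < VS_MODEL; i++)
    out[i] = (mis + i < VS_MODEL) ? msq[mis + i] : lsq[mis + i - VS_MODEL];
}
#endif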
5439 /* Function vect_grouped_load_supported.
5441 COUNT is the size of the load group (the number of statements plus the
5442 number of gaps). SINGLE_ELEMENT_P is true if there is actually
5443 only one statement, with a gap of COUNT - 1.
5445 Returns true if a suitable permute exists. */
5447 bool
5448 vect_grouped_load_supported (tree vectype, bool single_element_p,
5449 unsigned HOST_WIDE_INT count)
5451 machine_mode mode = TYPE_MODE (vectype);
5453 /* If this is single-element interleaving with an element distance
5454 that leaves unused vector loads around, punt - we would at least create
5455 very sub-optimal code in that case (and blow up memory,
5456 see PR65518). */
5457 if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype)))
5459 if (dump_enabled_p ())
5460 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5461 "single-element interleaving not supported "
5462 "for not adjacent vector loads\n");
5463 return false;
5466 /* vect_permute_load_chain requires the group size to be equal to 3 or
5467 be a power of two. */
5468 if (count != 3 && exact_log2 (count) == -1)
5470 if (dump_enabled_p ())
5471 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5472 "the size of the group of accesses"
5473 " is not a power of 2 or not equal to 3\n");
5474 return false;
5477 /* Check that the permutation is supported. */
5478 if (VECTOR_MODE_P (mode))
5480 unsigned int i, j;
5481 if (count == 3)
5483 unsigned int nelt;
5484 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5486 if (dump_enabled_p ())
5487 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5488 "cannot handle groups of 3 loads for"
5489 " variable-length vectors\n");
5490 return false;
5493 vec_perm_builder sel (nelt, nelt, 1);
5494 sel.quick_grow (nelt);
5495 vec_perm_indices indices;
5496 unsigned int k;
5497 for (k = 0; k < 3; k++)
5499 for (i = 0; i < nelt; i++)
5500 if (3 * i + k < 2 * nelt)
5501 sel[i] = 3 * i + k;
5502 else
5503 sel[i] = 0;
5504 indices.new_vector (sel, 2, nelt);
5505 if (!can_vec_perm_const_p (mode, indices))
5507 if (dump_enabled_p ())
5508 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5509 "shuffle of 3 loads is not supported by"
5510 " target\n");
5511 return false;
5513 for (i = 0, j = 0; i < nelt; i++)
5514 if (3 * i + k < 2 * nelt)
5515 sel[i] = i;
5516 else
5517 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5518 indices.new_vector (sel, 2, nelt);
5519 if (!can_vec_perm_const_p (mode, indices))
5521 if (dump_enabled_p ())
5522 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5523 "shuffle of 3 loads is not supported by"
5524 " target\n");
5525 return false;
5528 return true;
5530 else
5532 /* If length is not equal to 3 then only power of 2 is supported. */
5533 gcc_assert (pow2p_hwi (count));
5534 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5536 /* The encoding has a single stepped pattern. */
5537 vec_perm_builder sel (nelt, 1, 3);
5538 sel.quick_grow (3);
5539 for (i = 0; i < 3; i++)
5540 sel[i] = i * 2;
5541 vec_perm_indices indices (sel, 2, nelt);
5542 if (can_vec_perm_const_p (mode, indices))
5544 for (i = 0; i < 3; i++)
5545 sel[i] = i * 2 + 1;
5546 indices.new_vector (sel, 2, nelt);
5547 if (can_vec_perm_const_p (mode, indices))
5548 return true;
5553 if (dump_enabled_p ())
5554 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5555 "extract even/odd not supported by target\n");
5556 return false;
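/* Illustrative sketch, not part of GCC: the two selectors checked above for
   COUNT == 3, written out in plain C.  Applying the first permute to
   (v0, v1) and then the second permute to (that result, v2) extracts
   elements K, K+3, K+6, ... of the three-vector group.  For NELT == 8 and
   K == 0 the selectors are {0,3,6,9,12,15,0,0} and {0,1,2,3,4,5,10,13}.
   The function name is invented for the example.  */
#if 0  /* Illustration only; never compiled.  */
static void
model_load3_selectors (unsigned int nelt, unsigned int k,
                       unsigned int *sel1, unsigned int *sel2)
{
  /* First permute: gather the in-range elements 3*i + k from the first two
     vectors; out-of-range slots are "don't care" (0 here).  */
  for (unsigned int i = 0; i < nelt; i++)
    sel1[i] = (3 * i + k < 2 * nelt) ? 3 * i + k : 0;

  /* Second permute: keep what was already gathered and pull the remaining
     elements of the sequence from the third vector.  */
  for (unsigned int i = 0, j = 0; i < nelt; i++)
    sel2[i] = (3 * i + k < 2 * nelt) ? i : nelt + ((nelt + k) % 3) + 3 * j++;
}
#endif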
5559 /* Return TRUE if vec_{masked_}load_lanes is available for COUNT vectors of
5560 type VECTYPE. MASKED_P says whether the masked form is needed. */
5562 bool
5563 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5564 bool masked_p)
5566 if (masked_p)
5567 return vect_lanes_optab_supported_p ("vec_mask_load_lanes",
5568 vec_mask_load_lanes_optab,
5569 vectype, count);
5570 else
5571 return vect_lanes_optab_supported_p ("vec_load_lanes",
5572 vec_load_lanes_optab,
5573 vectype, count);
5576 /* Function vect_permute_load_chain.
5578 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5579 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5580 the input data correctly. Return the final references for loads in
5581 RESULT_CHAIN.
5583 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5584 The input is 4 vectors each containing 8 elements. We assign a number to each
5585 element, the input sequence is:
5587 1st vec: 0 1 2 3 4 5 6 7
5588 2nd vec: 8 9 10 11 12 13 14 15
5589 3rd vec: 16 17 18 19 20 21 22 23
5590 4th vec: 24 25 26 27 28 29 30 31
5592 The output sequence should be:
5594 1st vec: 0 4 8 12 16 20 24 28
5595 2nd vec: 1 5 9 13 17 21 25 29
5596 3rd vec: 2 6 10 14 18 22 26 30
5597 4th vec: 3 7 11 15 19 23 27 31
5599 i.e., the first output vector should contain the first elements of each
5600 interleaving group, etc.
5602 We use extract_even/odd instructions to create such output. The input of
5603 each extract_even/odd operation is two vectors
5604 1st vec 2nd vec
5605 0 1 2 3 4 5 6 7
5607 and the output is the vector of extracted even/odd elements. The output of
5608 extract_even will be: 0 2 4 6
5609 and of extract_odd: 1 3 5 7
5612 The permutation is done in log LENGTH stages. In each stage extract_even
5613 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5614 their order. In our example,
5616 E1: extract_even (1st vec, 2nd vec)
5617 E2: extract_odd (1st vec, 2nd vec)
5618 E3: extract_even (3rd vec, 4th vec)
5619 E4: extract_odd (3rd vec, 4th vec)
5621 The output for the first stage will be:
5623 E1: 0 2 4 6 8 10 12 14
5624 E2: 1 3 5 7 9 11 13 15
5625 E3: 16 18 20 22 24 26 28 30
5626 E4: 17 19 21 23 25 27 29 31
5628 In order to proceed and create the correct sequence for the next stage (or
5629 for the correct output, if the second stage is the last one, as in our
5630 example), we first put the output of the extract_even operation and then
5631 the output of extract_odd into RESULT_CHAIN (which is then copied to DR_CHAIN).
5632 The input for the second stage is:
5634 1st vec (E1): 0 2 4 6 8 10 12 14
5635 2nd vec (E3): 16 18 20 22 24 26 28 30
5636 3rd vec (E2): 1 3 5 7 9 11 13 15
5637 4th vec (E4): 17 19 21 23 25 27 29 31
5639 The output of the second stage:
5641 E1: 0 4 8 12 16 20 24 28
5642 E2: 2 6 10 14 18 22 26 30
5643 E3: 1 5 9 13 17 21 25 29
5644 E4: 3 7 11 15 19 23 27 31
5646 And RESULT_CHAIN after reordering:
5648 1st vec (E1): 0 4 8 12 16 20 24 28
5649 2nd vec (E3): 1 5 9 13 17 21 25 29
5650 3rd vec (E2): 2 6 10 14 18 22 26 30
5651 4th vec (E4): 3 7 11 15 19 23 27 31. */
5653 static void
5654 vect_permute_load_chain (vec<tree> dr_chain,
5655 unsigned int length,
5656 stmt_vec_info stmt_info,
5657 gimple_stmt_iterator *gsi,
5658 vec<tree> *result_chain)
5660 tree data_ref, first_vect, second_vect;
5661 tree perm_mask_even, perm_mask_odd;
5662 tree perm3_mask_low, perm3_mask_high;
5663 gimple *perm_stmt;
5664 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5665 unsigned int i, j, log_length = exact_log2 (length);
5667 result_chain->quick_grow (length);
5668 memcpy (result_chain->address (), dr_chain.address (),
5669 length * sizeof (tree));
5671 if (length == 3)
5673 /* vect_grouped_load_supported ensures that this is constant. */
5674 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5675 unsigned int k;
5677 vec_perm_builder sel (nelt, nelt, 1);
5678 sel.quick_grow (nelt);
5679 vec_perm_indices indices;
5680 for (k = 0; k < 3; k++)
5682 for (i = 0; i < nelt; i++)
5683 if (3 * i + k < 2 * nelt)
5684 sel[i] = 3 * i + k;
5685 else
5686 sel[i] = 0;
5687 indices.new_vector (sel, 2, nelt);
5688 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5690 for (i = 0, j = 0; i < nelt; i++)
5691 if (3 * i + k < 2 * nelt)
5692 sel[i] = i;
5693 else
5694 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5695 indices.new_vector (sel, 2, nelt);
5696 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5698 first_vect = dr_chain[0];
5699 second_vect = dr_chain[1];
5701 /* Create interleaving stmt (low part of):
5702 low = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5703 ...}> */
5704 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5705 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5706 second_vect, perm3_mask_low);
5707 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5709 /* Create interleaving stmt (high part of):
5710 high = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5711 ...}> */
5712 first_vect = data_ref;
5713 second_vect = dr_chain[2];
5714 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5715 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5716 second_vect, perm3_mask_high);
5717 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5718 (*result_chain)[k] = data_ref;
5721 else
5723 /* If length is not equal to 3 then only power of 2 is supported. */
5724 gcc_assert (pow2p_hwi (length));
5726 /* The encoding has a single stepped pattern. */
5727 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5728 vec_perm_builder sel (nelt, 1, 3);
5729 sel.quick_grow (3);
5730 for (i = 0; i < 3; ++i)
5731 sel[i] = i * 2;
5732 vec_perm_indices indices (sel, 2, nelt);
5733 perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);
5735 for (i = 0; i < 3; ++i)
5736 sel[i] = i * 2 + 1;
5737 indices.new_vector (sel, 2, nelt);
5738 perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
5740 for (i = 0; i < log_length; i++)
5742 for (j = 0; j < length; j += 2)
5744 first_vect = dr_chain[j];
5745 second_vect = dr_chain[j+1];
5747 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5748 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5749 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5750 first_vect, second_vect,
5751 perm_mask_even);
5752 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5753 (*result_chain)[j/2] = data_ref;
5755 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5756 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5757 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5758 first_vect, second_vect,
5759 perm_mask_odd);
5760 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5761 (*result_chain)[j/2+length/2] = data_ref;
5763 memcpy (dr_chain.address (), result_chain->address (),
5764 length * sizeof (tree));
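/* Illustrative sketch, not part of GCC: the power-of-two branch above,
   modelled on plain integer arrays for LENGTH == 4 and NELT == 8, so the
   worked example in the comment before vect_permute_load_chain can be
   reproduced directly.  Starting from four input vectors whose elements are
   numbered 0..31, two passes of extract_even/extract_odd leave
   chain[0..3] = {0,4,8,...}, {1,5,9,...}, {2,6,10,...}, {3,7,11,...}.
   The function name is invented for the example.  */
#if 0  /* Illustration only; never compiled.  */
static void
model_permute_load_chain (int chain[4][8])
{
  const unsigned int length = 4, nelt = 8, log_length = 2;
  int result[4][8];

  for (unsigned int stage = 0; stage < log_length; stage++)
    {
      for (unsigned int j = 0; j < length; j += 2)
        for (unsigned int i = 0; i < nelt; i++)
          {
            /* extract_even (chain[j], chain[j+1]) -> result[j/2]  */
            result[j / 2][i] = (i < nelt / 2
                                ? chain[j][2 * i]
                                : chain[j + 1][2 * (i - nelt / 2)]);
            /* extract_odd (chain[j], chain[j+1]) -> result[j/2 + length/2]  */
            result[j / 2 + length / 2][i]
              = (i < nelt / 2
                 ? chain[j][2 * i + 1]
                 : chain[j + 1][2 * (i - nelt / 2) + 1]);
          }
      /* Feed this stage's output into the next stage, as the memcpy above
         does with DR_CHAIN.  */
      for (unsigned int j = 0; j < length; j++)
        for (unsigned int i = 0; i < nelt; i++)
          chain[j][i] = result[j][i];
    }
}
#endif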
5769 /* Function vect_shift_permute_load_chain.
5771 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
5772 sequence of stmts to reorder the input data accordingly.
5773 Return the final references for loads in RESULT_CHAIN.
5774 Return true if successful, false otherwise.
5776 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5777 The input is 3 vectors each containing 8 elements. We assign a
5778 number to each element, the input sequence is:
5780 1st vec: 0 1 2 3 4 5 6 7
5781 2nd vec: 8 9 10 11 12 13 14 15
5782 3rd vec: 16 17 18 19 20 21 22 23
5784 The output sequence should be:
5786 1st vec: 0 3 6 9 12 15 18 21
5787 2nd vec: 1 4 7 10 13 16 19 22
5788 3rd vec: 2 5 8 11 14 17 20 23
5790 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
5792 First we shuffle all 3 vectors to get correct elements order:
5794 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
5795 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
5796 3rd vec: (16 19 22) (17 20 23) (18 21)
5798 Next we unite and shift vector 3 times:
5800 1st step:
5801 shift right by 6 the concatenation of:
5802 "1st vec" and "2nd vec"
5803 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
5804 "2nd vec" and "3rd vec"
5805 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
5806 "3rd vec" and "1st vec"
5807 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
5808 | New vectors |
5810 So that now new vectors are:
5812 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
5813 2nd vec: (10 13) (16 19 22) (17 20 23)
5814 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
5816 2nd step:
5817 shift right by 5 the concatenation of:
5818 "1st vec" and "3rd vec"
5819 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
5820 "2nd vec" and "1st vec"
5821 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
5822 "3rd vec" and "2nd vec"
5823 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
5824 | New vectors |
5826 So that now new vectors are:
5828 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
5829 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
5830 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
5832 3rd step:
5833 shift right by 5 the concatenation of:
5834 "1st vec" and "1st vec"
5835 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
5836 shift right by 3 the concatenation of:
5837 "2nd vec" and "2nd vec"
5838 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
5839 | New vectors |
5841 So that now all vectors are READY:
5842 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
5843 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
5844 3rd vec: ( 1 4 7) (10 13) (16 19 22)
5846 This algorithm is faster than the one in vect_permute_load_chain if:
5847 1. "shift of a concatination" is faster than general permutation.
5848 This is usually so.
5849 2. The TARGET machine can't execute vector instructions in parallel.
5850 This is because each step of the algorithm depends on the previous one.
5851 The algorithm in vect_permute_load_chain is much more parallel.
5853 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
5856 static bool
5857 vect_shift_permute_load_chain (vec<tree> dr_chain,
5858 unsigned int length,
5859 stmt_vec_info stmt_info,
5860 gimple_stmt_iterator *gsi,
5861 vec<tree> *result_chain)
5863 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
5864 tree perm2_mask1, perm2_mask2, perm3_mask;
5865 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
5866 gimple *perm_stmt;
5868 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5869 unsigned int i;
5870 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5872 unsigned HOST_WIDE_INT nelt, vf;
5873 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt)
5874 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
5875 /* Not supported for variable-length vectors. */
5876 return false;
5878 vec_perm_builder sel (nelt, nelt, 1);
5879 sel.quick_grow (nelt);
5881 result_chain->quick_grow (length);
5882 memcpy (result_chain->address (), dr_chain.address (),
5883 length * sizeof (tree));
5885 if (pow2p_hwi (length) && vf > 4)
5887 unsigned int j, log_length = exact_log2 (length);
5888 for (i = 0; i < nelt / 2; ++i)
5889 sel[i] = i * 2;
5890 for (i = 0; i < nelt / 2; ++i)
5891 sel[nelt / 2 + i] = i * 2 + 1;
5892 vec_perm_indices indices (sel, 2, nelt);
5893 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5895 if (dump_enabled_p ())
5896 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5897 "shuffle of 2 fields structure is not \
5898 supported by target\n");
5899 return false;
5901 perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);
5903 for (i = 0; i < nelt / 2; ++i)
5904 sel[i] = i * 2 + 1;
5905 for (i = 0; i < nelt / 2; ++i)
5906 sel[nelt / 2 + i] = i * 2;
5907 indices.new_vector (sel, 2, nelt);
5908 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5910 if (dump_enabled_p ())
5911 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5912 "shuffle of 2 fields structure is not \
5913 supported by target\n");
5914 return false;
5916 perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);
5918 /* Generating permutation constant to shift all elements.
5919 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
5920 for (i = 0; i < nelt; i++)
5921 sel[i] = nelt / 2 + i;
5922 indices.new_vector (sel, 2, nelt);
5923 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5925 if (dump_enabled_p ())
5926 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5927 "shift permutation is not supported by target\n");
5928 return false;
5930 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
5932 /* Generating permutation constant to select vector from 2.
5933 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
5934 for (i = 0; i < nelt / 2; i++)
5935 sel[i] = i;
5936 for (i = nelt / 2; i < nelt; i++)
5937 sel[i] = nelt + i;
5938 indices.new_vector (sel, 2, nelt);
5939 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5941 if (dump_enabled_p ())
5942 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5943 "select is not supported by target\n");
5944 return false;
5946 select_mask = vect_gen_perm_mask_checked (vectype, indices);
5948 for (i = 0; i < log_length; i++)
5950 for (j = 0; j < length; j += 2)
5952 first_vect = dr_chain[j];
5953 second_vect = dr_chain[j + 1];
5955 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5956 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5957 first_vect, first_vect,
5958 perm2_mask1);
5959 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5960 vect[0] = data_ref;
5962 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5963 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5964 second_vect, second_vect,
5965 perm2_mask2);
5966 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5967 vect[1] = data_ref;
5969 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
5970 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5971 vect[0], vect[1], shift1_mask);
5972 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5973 (*result_chain)[j/2 + length/2] = data_ref;
5975 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
5976 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5977 vect[0], vect[1], select_mask);
5978 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5979 (*result_chain)[j/2] = data_ref;
5981 memcpy (dr_chain.address (), result_chain->address (),
5982 length * sizeof (tree));
5984 return true;
5986 if (length == 3 && vf > 2)
5988 unsigned int k = 0, l = 0;
5990 /* Generating permutation constant to get all elements in right order.
5991 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
5992 for (i = 0; i < nelt; i++)
5994 if (3 * k + (l % 3) >= nelt)
5996 k = 0;
5997 l += (3 - (nelt % 3));
5999 sel[i] = 3 * k + (l % 3);
6000 k++;
6002 vec_perm_indices indices (sel, 2, nelt);
6003 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6005 if (dump_enabled_p ())
6006 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6007 "shuffle of 3 fields structure is not \
6008 supported by target\n");
6009 return false;
6011 perm3_mask = vect_gen_perm_mask_checked (vectype, indices);
6013 /* Generating permutation constant to shift all elements.
6014 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
6015 for (i = 0; i < nelt; i++)
6016 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
6017 indices.new_vector (sel, 2, nelt);
6018 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6020 if (dump_enabled_p ())
6021 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6022 "shift permutation is not supported by target\n");
6023 return false;
6025 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6027 /* Generating permutation constant to shift all elements.
6028 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6029 for (i = 0; i < nelt; i++)
6030 sel[i] = 2 * (nelt / 3) + 1 + i;
6031 indices.new_vector (sel, 2, nelt);
6032 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6034 if (dump_enabled_p ())
6035 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6036 "shift permutation is not supported by target\n");
6037 return false;
6039 shift2_mask = vect_gen_perm_mask_checked (vectype, indices);
6041 /* Generating permutation constant to shift all elements.
6042 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
6043 for (i = 0; i < nelt; i++)
6044 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
6045 indices.new_vector (sel, 2, nelt);
6046 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6048 if (dump_enabled_p ())
6049 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6050 "shift permutation is not supported by target\n");
6051 return false;
6053 shift3_mask = vect_gen_perm_mask_checked (vectype, indices);
6055 /* Generating permutation constant to shift all elements.
6056 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6057 for (i = 0; i < nelt; i++)
6058 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
6059 indices.new_vector (sel, 2, nelt);
6060 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6062 if (dump_enabled_p ())
6063 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6064 "shift permutation is not supported by target\n");
6065 return false;
6067 shift4_mask = vect_gen_perm_mask_checked (vectype, indices);
6069 for (k = 0; k < 3; k++)
6071 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
6072 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6073 dr_chain[k], dr_chain[k],
6074 perm3_mask);
6075 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6076 vect[k] = data_ref;
6079 for (k = 0; k < 3; k++)
6081 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
6082 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6083 vect[k % 3], vect[(k + 1) % 3],
6084 shift1_mask);
6085 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6086 vect_shift[k] = data_ref;
6089 for (k = 0; k < 3; k++)
6091 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
6092 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6093 vect_shift[(4 - k) % 3],
6094 vect_shift[(3 - k) % 3],
6095 shift2_mask);
6096 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6097 vect[k] = data_ref;
6100 (*result_chain)[3 - (nelt % 3)] = vect[2];
6102 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
6103 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
6104 vect[0], shift3_mask);
6105 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6106 (*result_chain)[nelt % 3] = data_ref;
6108 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
6109 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
6110 vect[1], shift4_mask);
6111 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6112 (*result_chain)[0] = data_ref;
6113 return true;
6115 return false;
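/* Illustrative sketch, not part of GCC: the selector-building loop of the
   LENGTH == 3 case above, extracted into plain C.  It builds the "get all
   elements in right order" shuffle; for NELT == 8 it yields
   {0, 3, 6, 1, 4, 7, 2, 5}, matching the comment in the code.  The function
   name is invented for the example.  */
#if 0  /* Illustration only; never compiled.  */
static void
model_perm3_selector (unsigned int nelt, unsigned int *sel)
{
  unsigned int k = 0, l = 0;

  for (unsigned int i = 0; i < nelt; i++)
    {
      /* Wrap around to the next "column" once 3*k + (l % 3) runs past the
         vector length.  */
      if (3 * k + (l % 3) >= nelt)
        {
          k = 0;
          l += 3 - (nelt % 3);
        }
      sel[i] = 3 * k + (l % 3);
      k++;
    }
}
#endif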
6118 /* Function vect_transform_grouped_load.
6120 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
6121 to perform their permutation and ascribe the resulting vectorized statements to
6122 the scalar statements.
6125 void
6126 vect_transform_grouped_load (stmt_vec_info stmt_info, vec<tree> dr_chain,
6127 int size, gimple_stmt_iterator *gsi)
6129 machine_mode mode;
6130 vec<tree> result_chain = vNULL;
6132 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
6133 RESULT_CHAIN is the output of vect_permute_load_chain; it contains permuted
6134 vectors that are ready for vector computation. */
6135 result_chain.create (size);
6137 /* If the reassociation width for the vector type is 2 or greater, the target
6138 machine can execute 2 or more vector instructions in parallel. Otherwise try
6139 to get the chain for the load group using vect_shift_permute_load_chain. */
6140 mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info));
6141 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
6142 || pow2p_hwi (size)
6143 || !vect_shift_permute_load_chain (dr_chain, size, stmt_info,
6144 gsi, &result_chain))
6145 vect_permute_load_chain (dr_chain, size, stmt_info, gsi, &result_chain);
6146 vect_record_grouped_load_vectors (stmt_info, result_chain);
6147 result_chain.release ();
6150 /* RESULT_CHAIN contains the output of a group of grouped loads that were
6151 generated as part of the vectorization of STMT_INFO. Assign the statement
6152 for each vector to the associated scalar statement. */
6154 void
6155 vect_record_grouped_load_vectors (stmt_vec_info stmt_info,
6156 vec<tree> result_chain)
6158 vec_info *vinfo = stmt_info->vinfo;
6159 stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
6160 unsigned int i, gap_count;
6161 tree tmp_data_ref;
6163 /* Put a permuted data-ref in the VECTORIZED_STMT field.
6164 Since we scan the chain starting from its first node, their order
6165 corresponds to the order of data-refs in RESULT_CHAIN. */
6166 stmt_vec_info next_stmt_info = first_stmt_info;
6167 gap_count = 1;
6168 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
6170 if (!next_stmt_info)
6171 break;
6173 /* Skip the gaps. Loads created for the gaps will be removed by the dead
6174 code elimination pass later. No need to check for the first stmt in
6175 the group, since it always exists.
6176 DR_GROUP_GAP is the number of steps in elements from the previous
6177 access (if there is no gap DR_GROUP_GAP is 1). We skip loads that
6178 correspond to the gaps. */
6179 if (next_stmt_info != first_stmt_info
6180 && gap_count < DR_GROUP_GAP (next_stmt_info))
6182 gap_count++;
6183 continue;
6186 while (next_stmt_info)
6188 stmt_vec_info new_stmt_info = vinfo->lookup_def (tmp_data_ref);
6189 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
6190 copies, and we put the new vector statement in the first available
6191 RELATED_STMT. */
6192 if (!STMT_VINFO_VEC_STMT (next_stmt_info))
6193 STMT_VINFO_VEC_STMT (next_stmt_info) = new_stmt_info;
6194 else
6196 if (!DR_GROUP_SAME_DR_STMT (next_stmt_info))
6198 stmt_vec_info prev_stmt_info
6199 = STMT_VINFO_VEC_STMT (next_stmt_info);
6200 stmt_vec_info rel_stmt_info
6201 = STMT_VINFO_RELATED_STMT (prev_stmt_info);
6202 while (rel_stmt_info)
6204 prev_stmt_info = rel_stmt_info;
6205 rel_stmt_info = STMT_VINFO_RELATED_STMT (rel_stmt_info);
6208 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info;
6212 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
6213 gap_count = 1;
6214 /* If NEXT_STMT_INFO accesses the same DR as the previous statement,
6215 put the same TMP_DATA_REF as its vectorized statement; otherwise
6216 get the next data-ref from RESULT_CHAIN. */
6217 if (!next_stmt_info || !DR_GROUP_SAME_DR_STMT (next_stmt_info))
6218 break;
6223 /* Function vect_can_force_dr_alignment_p.
6225 Returns whether the alignment of a DECL can be forced to be aligned
6226 on ALIGNMENT bit boundary. */
6228 bool
6229 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
6231 if (!VAR_P (decl))
6232 return false;
6234 if (decl_in_symtab_p (decl)
6235 && !symtab_node::get (decl)->can_increase_alignment_p ())
6236 return false;
6238 if (TREE_STATIC (decl))
6239 return (alignment <= MAX_OFILE_ALIGNMENT);
6240 else
6241 return (alignment <= MAX_STACK_ALIGNMENT);
6245 /* Return whether the data reference DR_INFO is supported with respect to its
6246 alignment.
6247 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
6248 if it is aligned, i.e., check if it is possible to vectorize it with different
6249 alignment. */
6251 enum dr_alignment_support
6252 vect_supportable_dr_alignment (dr_vec_info *dr_info,
6253 bool check_aligned_accesses)
6255 data_reference *dr = dr_info->dr;
6256 stmt_vec_info stmt_info = dr_info->stmt;
6257 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6258 machine_mode mode = TYPE_MODE (vectype);
6259 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6260 struct loop *vect_loop = NULL;
6261 bool nested_in_vect_loop = false;
6263 if (aligned_access_p (dr_info) && !check_aligned_accesses)
6264 return dr_aligned;
6266 /* For now assume all conditional loads/stores support unaligned
6267 access without any special code. */
6268 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
6269 if (gimple_call_internal_p (stmt)
6270 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
6271 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
6272 return dr_unaligned_supported;
6274 if (loop_vinfo)
6276 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
6277 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt_info);
6280 /* Possibly unaligned access. */
6282 /* We can choose between using the implicit realignment scheme (generating
6283 a misaligned_move stmt) and the explicit realignment scheme (generating
6284 aligned loads with a REALIGN_LOAD). There are two variants to the
6285 explicit realignment scheme: optimized, and unoptimized.
6286 We can optimize the realignment only if the step between consecutive
6287 vector loads is equal to the vector size. Since the vector memory
6288 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
6289 is guaranteed that the misalignment amount remains the same throughout the
6290 execution of the vectorized loop. Therefore, we can create the
6291 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
6292 at the loop preheader.
6294 However, in the case of outer-loop vectorization, when vectorizing a
6295 memory access in the inner-loop nested within the LOOP that is now being
6296 vectorized, while it is guaranteed that the misalignment of the
6297 vectorized memory access will remain the same in different outer-loop
6298 iterations, it is *not* guaranteed that it will remain the same throughout
6299 the execution of the inner-loop. This is because the inner-loop advances
6300 with the original scalar step (and not in steps of VS). If the inner-loop
6301 step happens to be a multiple of VS, then the misalignment remains fixed
6302 and we can use the optimized realignment scheme. For example:
6304 for (i=0; i<N; i++)
6305 for (j=0; j<M; j++)
6306 s += a[i+j];
6308 When vectorizing the i-loop in the above example, the step between
6309 consecutive vector loads is 1, and so the misalignment does not remain
6310 fixed across the execution of the inner-loop, and the realignment cannot
6311 be optimized (as illustrated in the following pseudo vectorized loop):
6313 for (i=0; i<N; i+=4)
6314 for (j=0; j<M; j++){
6315 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
6316 // when j is {0,1,2,3,4,5,6,7,...} respectively.
6317 // (assuming that we start from an aligned address).
6320 We therefore have to use the unoptimized realignment scheme:
6322 for (i=0; i<N; i+=4)
6323 for (j=k; j<M; j+=4)
6324 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
6325 // that the misalignment of the initial address is
6326 // 0).
6328 The loop can then be vectorized as follows:
6330 for (k=0; k<4; k++){
6331 rt = get_realignment_token (&vp[k]);
6332 for (i=0; i<N; i+=4){
6333 v1 = vp[i+k];
6334 for (j=k; j<M; j+=4){
6335 v2 = vp[i+j+VS-1];
6336 va = REALIGN_LOAD <v1,v2,rt>;
6337 vs += va;
6338 v1 = v2;
6341 } */
6343 if (DR_IS_READ (dr))
6345 bool is_packed = false;
6346 tree type = (TREE_TYPE (DR_REF (dr)));
6348 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
6349 && (!targetm.vectorize.builtin_mask_for_load
6350 || targetm.vectorize.builtin_mask_for_load ()))
6352 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6354 /* If we are doing SLP then the accesses need not have the
6355 same alignment; instead it depends on the SLP group size. */
6356 if (loop_vinfo
6357 && STMT_SLP_TYPE (stmt_info)
6358 && !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6359 * (DR_GROUP_SIZE
6360 (DR_GROUP_FIRST_ELEMENT (stmt_info))),
6361 TYPE_VECTOR_SUBPARTS (vectype)))
6363 else if (!loop_vinfo
6364 || (nested_in_vect_loop
6365 && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
6366 GET_MODE_SIZE (TYPE_MODE (vectype)))))
6367 return dr_explicit_realign;
6368 else
6369 return dr_explicit_realign_optimized;
6371 if (!known_alignment_for_access_p (dr_info))
6372 is_packed = not_size_aligned (DR_REF (dr));
6374 if (targetm.vectorize.support_vector_misalignment
6375 (mode, type, DR_MISALIGNMENT (dr_info), is_packed))
6376 /* Can't software pipeline the loads, but can at least do them. */
6377 return dr_unaligned_supported;
6379 else
6381 bool is_packed = false;
6382 tree type = (TREE_TYPE (DR_REF (dr)));
6384 if (!known_alignment_for_access_p (dr_info))
6385 is_packed = not_size_aligned (DR_REF (dr));
6387 if (targetm.vectorize.support_vector_misalignment
6388 (mode, type, DR_MISALIGNMENT (dr_info), is_packed))
6389 return dr_unaligned_supported;
6392 /* Unsupported. */
6393 return dr_unaligned_unsupported;
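/* Illustrative sketch, not part of GCC: the misalignment pattern behind the
   outer-loop discussion above.  Assuming the array base is aligned to a
   VS-element boundary, the element misalignment of &a[i + j] is
   (i + j) % VS.  The vectorized outer i-loop advances by VS, so for a fixed
   j the misalignment is the same in every outer iteration, but as j advances
   by the scalar step inside the inner loop it cycles 0, 1, 2, ..., which is
   why only the unoptimized explicit realignment scheme applies there.  The
   function name is invented for the example.  */
#if 0  /* Illustration only; never compiled.  */
static unsigned int
model_element_misalignment (unsigned int i, unsigned int j, unsigned int vs)
{
  /* Element misalignment of &a[i + j] for a VS-element-aligned base.  */
  return (i + j) % vs;
}
#endif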