gcc/tree-vect-data-refs.c
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2018 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "memmodel.h"
32 #include "tm_p.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "cgraph.h"
36 #include "dumpfile.h"
37 #include "alias.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "tree-eh.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop.h"
47 #include "cfgloop.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
50 #include "expr.h"
51 #include "builtins.h"
52 #include "params.h"
53 #include "tree-cfg.h"
54 #include "tree-hash-traits.h"
55 #include "vec-perm-indices.h"
56 #include "internal-fn.h"
58 /* Return true if load- or store-lanes optab OPTAB is implemented for
59 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
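/* For illustration (a sketch, not part of the original sources): an
   interleaved access group such as

     for (i = 0; i < n; i++)
       {
         x[i] = a[2 * i];
         y[i] = a[2 * i + 1];
       }

   would typically be queried with COUNT == 2 and VECTYPE the vector type
   chosen for the elements of A; the optab is usable only if the target
   has an array (or integer) mode holding two such vectors and implements
   the conversion for it.  */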
61 static bool
62 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
63 tree vectype, unsigned HOST_WIDE_INT count)
65 machine_mode mode, array_mode;
66 bool limit_p;
68 mode = TYPE_MODE (vectype);
69 if (!targetm.array_mode (mode, count).exists (&array_mode))
71 poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
72 limit_p = !targetm.array_mode_supported_p (mode, count);
73 if (!int_mode_for_size (bits, limit_p).exists (&array_mode))
75 if (dump_enabled_p ())
76 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
77 "no array mode for %s[%wu]\n",
78 GET_MODE_NAME (mode), count);
79 return false;
83 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
85 if (dump_enabled_p ())
86 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
87 "cannot use %s<%s><%s>\n", name,
88 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
89 return false;
92 if (dump_enabled_p ())
93 dump_printf_loc (MSG_NOTE, vect_location,
94 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
95 GET_MODE_NAME (mode));
97 return true;
101 /* Return the smallest scalar part of STMT_INFO.
102 This is used to determine the vectype of the stmt. We generally set the
103 vectype according to the type of the result (lhs). For stmts whose
104 result-type is different than the type of the arguments (e.g., demotion,
105 promotion), vectype will be reset appropriately (later). Note that we have
106 to visit the smallest datatype in this function, because that determines the
107 VF. If the smallest datatype in the loop is present only as the rhs of a
108 promotion operation - we'd miss it.
109 Such a case, where a variable of this datatype does not appear in the lhs
110 anywhere in the loop, can only occur if it's an invariant: e.g.:
111 'int_x = (int) short_inv', which we'd expect to have been optimized away by
112 invariant motion. However, we cannot rely on invariant motion to always
113 take invariants out of the loop, and so in the case of promotion we also
114 have to check the rhs.
115 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
116 types. */
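/* For illustration, with a hypothetical loop

     short *s; int *p;
     for (i = 0; i < n; i++)
       p[i] = (int) s[i];

   the widening statement has an 'int' result but a 'short' rhs, so this
   function returns the 'short' type with *LHS_SIZE_UNIT == 4 and
   *RHS_SIZE_UNIT == 2; it is the smaller type that bounds the VF.  */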
118 tree
119 vect_get_smallest_scalar_type (stmt_vec_info stmt_info,
120 HOST_WIDE_INT *lhs_size_unit,
121 HOST_WIDE_INT *rhs_size_unit)
123 tree scalar_type = gimple_expr_type (stmt_info->stmt);
124 HOST_WIDE_INT lhs, rhs;
126 /* During the analysis phase, this function is called on arbitrary
127 statements that might not have scalar results. */
128 if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
129 return scalar_type;
131 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
133 gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
134 if (assign
135 && (gimple_assign_cast_p (assign)
136 || gimple_assign_rhs_code (assign) == DOT_PROD_EXPR
137 || gimple_assign_rhs_code (assign) == WIDEN_SUM_EXPR
138 || gimple_assign_rhs_code (assign) == WIDEN_MULT_EXPR
139 || gimple_assign_rhs_code (assign) == WIDEN_LSHIFT_EXPR
140 || gimple_assign_rhs_code (assign) == FLOAT_EXPR))
142 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (assign));
144 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
145 if (rhs < lhs)
146 scalar_type = rhs_type;
149 *lhs_size_unit = lhs;
150 *rhs_size_unit = rhs;
151 return scalar_type;
155 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
156 tested at run-time. Return TRUE if DDR was successfully inserted.
157 Return false if versioning is not supported. */
159 static opt_result
160 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
162 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
164 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
165 return opt_result::failure_at (vect_location,
166 "will not create alias checks, as"
167 " --param vect-max-version-for-alias-checks"
168 " == 0\n");
170 opt_result res
171 = runtime_alias_check_p (ddr, loop,
172 optimize_loop_nest_for_speed_p (loop));
173 if (!res)
174 return res;
176 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
177 return opt_result::success ();
180 /* Record that loop LOOP_VINFO needs to check that VALUE is nonzero. */
182 static void
183 vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value)
185 vec<tree> checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo);
186 for (unsigned int i = 0; i < checks.length(); ++i)
187 if (checks[i] == value)
188 return;
190 if (dump_enabled_p ())
191 dump_printf_loc (MSG_NOTE, vect_location,
192 "need run-time check that %T is nonzero\n",
193 value);
194 LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value);
197 /* Return true if we know that the order of vectorized DR_INFO_A and
198 vectorized DR_INFO_B will be the same as the order of DR_INFO_A and
199 DR_INFO_B. At least one of the accesses is a write. */
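/* For illustration (a sketch): given the access group

     ... = a[i];       <-- load 1
     a[i] = ...;       <-- store
     ... = a[i+1];     <-- load 2

   both loads are emitted at the position of load 2 while the store stays
   put, so the original load-1-before-store order is not preserved and
   this function returns false for that pair.  */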
201 static bool
202 vect_preserves_scalar_order_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b)
204 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
205 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
207 /* Single statements are always kept in their original order. */
208 if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
209 && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
210 return true;
212 /* STMT_A and STMT_B belong to overlapping groups. All loads in a
213 group are emitted at the position of the last scalar load and all
214 stores in a group are emitted at the position of the last scalar store.
215 Compute that position and check whether the resulting order matches
216 the current one. */
217 stmt_vec_info last_a = DR_GROUP_FIRST_ELEMENT (stmtinfo_a);
218 if (last_a)
219 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (last_a); s;
220 s = DR_GROUP_NEXT_ELEMENT (s))
221 last_a = get_later_stmt (last_a, s);
222 else
223 last_a = stmtinfo_a;
224 stmt_vec_info last_b = DR_GROUP_FIRST_ELEMENT (stmtinfo_b);
225 if (last_b)
226 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (last_b); s;
227 s = DR_GROUP_NEXT_ELEMENT (s))
228 last_b = get_later_stmt (last_b, s);
229 else
230 last_b = stmtinfo_b;
231 return ((get_later_stmt (last_a, last_b) == last_a)
232 == (get_later_stmt (stmtinfo_a, stmtinfo_b) == stmtinfo_a));
235 /* A subroutine of vect_analyze_data_ref_dependence. Handle
236 DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
237 distances. These distances are conservatively correct but they don't
238 reflect a guaranteed dependence.
240 Return true if this function does all the work necessary to avoid
241 an alias or false if the caller should use the dependence distances
242 to limit the vectorization factor in the usual way. LOOP_DEPTH is
243 the depth of the loop described by LOOP_VINFO and the other arguments
244 are as for vect_analyze_data_ref_dependence. */
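/* For illustration (a hypothetical example): with

     #pragma omp simd safelen(8)
     for (i = 0; i < n; i++)
       p[i] = q[i + 4];

   where P and Q may refer to the same array, the conservative distance
   of 4 is covered by the user-asserted safelen of 8, so independence is
   assumed and *MAX_VF is capped at 8 rather than adding a runtime alias
   check.  */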
246 static bool
247 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
248 loop_vec_info loop_vinfo,
249 int loop_depth, unsigned int *max_vf)
251 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
252 lambda_vector dist_v;
253 unsigned int i;
254 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
256 int dist = dist_v[loop_depth];
257 if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
259 /* If the user asserted safelen >= DIST consecutive iterations
260 can be executed concurrently, assume independence.
262 ??? An alternative would be to add the alias check even
263 in this case, and vectorize the fallback loop with the
264 maximum VF set to safelen. However, if the user has
265 explicitly given a length, it's less likely that that
266 would be a win. */
267 if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
269 if ((unsigned int) loop->safelen < *max_vf)
270 *max_vf = loop->safelen;
271 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
272 continue;
275 /* For dependence distances of 2 or more, we have the option
276 of limiting VF or checking for an alias at runtime.
277 Prefer to check at runtime if we can, to avoid limiting
278 the VF unnecessarily when the bases are in fact independent.
280 Note that the alias checks will be removed if the VF ends up
281 being small enough. */
282 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
283 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
284 return (!STMT_VINFO_GATHER_SCATTER_P (dr_info_a->stmt)
285 && !STMT_VINFO_GATHER_SCATTER_P (dr_info_b->stmt)
286 && vect_mark_for_runtime_alias_test (ddr, loop_vinfo));
289 return true;
293 /* Function vect_analyze_data_ref_dependence.
295 FIXME: I needed to change the sense of the returned flag.
297 Return FALSE if there (might) exist a dependence between a memory-reference
298 DRA and a memory-reference DRB. When versioning for alias may check a
299 dependence at run-time, return TRUE. Adjust *MAX_VF according to
300 the data dependence. */
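/* For illustration, in a loop like

     for (i = 0; i < n; i++)
       a[i + 3] = a[i] + 1;

   the distance between the store to a[i + 3] and the later read of the
   same element is 3, so *MAX_VF is reduced to 3 below; a distance that
   is at least the VF no longer constrains vectorization.  */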
302 static opt_result
303 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
304 loop_vec_info loop_vinfo,
305 unsigned int *max_vf)
307 unsigned int i;
308 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
309 struct data_reference *dra = DDR_A (ddr);
310 struct data_reference *drb = DDR_B (ddr);
311 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (dra);
312 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (drb);
313 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
314 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
315 lambda_vector dist_v;
316 unsigned int loop_depth;
318 /* In loop analysis all data references should be vectorizable. */
319 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
320 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
321 gcc_unreachable ();
323 /* Independent data accesses. */
324 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
325 return opt_result::success ();
327 if (dra == drb
328 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
329 return opt_result::success ();
331 /* We do not have to consider dependences between accesses that belong
332 to the same group, unless the stride could be smaller than the
333 group size. */
334 if (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
335 && (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
336 == DR_GROUP_FIRST_ELEMENT (stmtinfo_b))
337 && !STMT_VINFO_STRIDED_P (stmtinfo_a))
338 return opt_result::success ();
340 /* Even if we have an anti-dependence then, as the vectorized loop covers at
341 least two scalar iterations, there is always also a true dependence.
342 As the vectorizer does not re-order loads and stores we can ignore
343 the anti-dependence if TBAA can disambiguate both DRs similar to the
344 case with known negative distance anti-dependences (positive
345 distance anti-dependences would violate TBAA constraints). */
346 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
347 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
348 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
349 get_alias_set (DR_REF (drb))))
350 return opt_result::success ();
352 /* Unknown data dependence. */
353 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
355 /* If user asserted safelen consecutive iterations can be
356 executed concurrently, assume independence. */
357 if (loop->safelen >= 2)
359 if ((unsigned int) loop->safelen < *max_vf)
360 *max_vf = loop->safelen;
361 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
362 return opt_result::success ();
365 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
366 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
367 return opt_result::failure_at
368 (stmtinfo_a->stmt,
369 "versioning for alias not supported for: "
370 "can't determine dependence between %T and %T\n",
371 DR_REF (dra), DR_REF (drb));
373 if (dump_enabled_p ())
374 dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
375 "versioning for alias required: "
376 "can't determine dependence between %T and %T\n",
377 DR_REF (dra), DR_REF (drb));
379 /* Add to list of ddrs that need to be tested at run-time. */
380 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
383 /* Known data dependence. */
384 if (DDR_NUM_DIST_VECTS (ddr) == 0)
386 /* If user asserted safelen consecutive iterations can be
387 executed concurrently, assume independence. */
388 if (loop->safelen >= 2)
390 if ((unsigned int) loop->safelen < *max_vf)
391 *max_vf = loop->safelen;
392 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
393 return opt_result::success ();
396 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
397 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
398 return opt_result::failure_at
399 (stmtinfo_a->stmt,
400 "versioning for alias not supported for: "
401 "bad dist vector for %T and %T\n",
402 DR_REF (dra), DR_REF (drb));
404 if (dump_enabled_p ())
405 dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
406 "versioning for alias required: "
407 "bad dist vector for %T and %T\n",
408 DR_REF (dra), DR_REF (drb));
409 /* Add to list of ddrs that need to be tested at run-time. */
410 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
413 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
415 if (DDR_COULD_BE_INDEPENDENT_P (ddr)
416 && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
417 loop_depth, max_vf))
418 return opt_result::success ();
420 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
422 int dist = dist_v[loop_depth];
424 if (dump_enabled_p ())
425 dump_printf_loc (MSG_NOTE, vect_location,
426 "dependence distance = %d.\n", dist);
428 if (dist == 0)
430 if (dump_enabled_p ())
431 dump_printf_loc (MSG_NOTE, vect_location,
432 "dependence distance == 0 between %T and %T\n",
433 DR_REF (dra), DR_REF (drb));
435 /* When we perform grouped accesses and perform implicit CSE
436 by detecting equal accesses and doing disambiguation with
437 runtime alias tests like for
438 .. = a[i];
439 .. = a[i+1];
440 a[i] = ..;
441 a[i+1] = ..;
442 *p = ..;
443 .. = a[i];
444 .. = a[i+1];
445 where we will end up loading { a[i], a[i+1] } once, make
446 sure that inserting group loads before the first load and
447 stores after the last store will do the right thing.
448 Similar for groups like
449 a[i] = ...;
450 ... = a[i];
451 a[i+1] = ...;
452 where loads from the group interleave with the store. */
453 if (!vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
454 return opt_result::failure_at (stmtinfo_a->stmt,
455 "READ_WRITE dependence"
456 " in interleaving.\n");
458 if (loop->safelen < 2)
460 tree indicator = dr_zero_step_indicator (dra);
461 if (!indicator || integer_zerop (indicator))
462 return opt_result::failure_at (stmtinfo_a->stmt,
463 "access also has a zero step\n");
464 else if (TREE_CODE (indicator) != INTEGER_CST)
465 vect_check_nonzero_value (loop_vinfo, indicator);
467 continue;
470 if (dist > 0 && DDR_REVERSED_P (ddr))
472 /* If DDR_REVERSED_P the order of the data-refs in DDR was
473 reversed (to make distance vector positive), and the actual
474 distance is negative. */
475 if (dump_enabled_p ())
476 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
477 "dependence distance negative.\n");
478 /* Record a negative dependence distance to later limit the
479 amount of stmt copying / unrolling we can perform.
480 Only need to handle read-after-write dependence. */
481 if (DR_IS_READ (drb)
482 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
483 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
484 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
485 continue;
488 unsigned int abs_dist = abs (dist);
489 if (abs_dist >= 2 && abs_dist < *max_vf)
491 /* The dependence distance requires reduction of the maximal
492 vectorization factor. */
493 *max_vf = abs (dist);
494 if (dump_enabled_p ())
495 dump_printf_loc (MSG_NOTE, vect_location,
496 "adjusting maximal vectorization factor to %i\n",
497 *max_vf);
500 if (abs_dist >= *max_vf)
502 /* Dependence distance does not create dependence, as far as
503 vectorization is concerned, in this case. */
504 if (dump_enabled_p ())
505 dump_printf_loc (MSG_NOTE, vect_location,
506 "dependence distance >= VF.\n");
507 continue;
510 return opt_result::failure_at (stmtinfo_a->stmt,
511 "not vectorized, possible dependence "
512 "between data-refs %T and %T\n",
513 DR_REF (dra), DR_REF (drb));
516 return opt_result::success ();
519 /* Function vect_analyze_data_ref_dependences.
521 Examine all the data references in the loop, and make sure there do not
522 exist any data dependences between them. Set *MAX_VF according to
523 the maximum vectorization factor the data dependences allow. */
525 opt_result
526 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
527 unsigned int *max_vf)
529 unsigned int i;
530 struct data_dependence_relation *ddr;
532 DUMP_VECT_SCOPE ("vect_analyze_data_ref_dependences");
534 if (!LOOP_VINFO_DDRS (loop_vinfo).exists ())
536 LOOP_VINFO_DDRS (loop_vinfo)
537 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
538 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
539 /* We need read-read dependences to compute
540 STMT_VINFO_SAME_ALIGN_REFS. */
541 bool res = compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
542 &LOOP_VINFO_DDRS (loop_vinfo),
543 LOOP_VINFO_LOOP_NEST (loop_vinfo),
544 true);
545 gcc_assert (res);
548 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
550 /* For epilogues we either have no aliases or alias versioning
551 was applied to the original loop. Therefore we may just get max_vf
552 using the VF of the original loop. */
553 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
554 *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
555 else
556 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
558 opt_result res
559 = vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf);
560 if (!res)
561 return res;
564 return opt_result::success ();
568 /* Function vect_slp_analyze_data_ref_dependence.
570 Return TRUE if there (might) exist a dependence between a memory-reference
571 DRA and a memory-reference DRB for VINFO. Unknown dependences cannot be
572 resolved by a runtime alias check here, so they are treated as real
573 dependences. Return FALSE if the accesses are known to be independent. */
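/* For illustration (a sketch): in straight-line code such as

     a[0] = x;
     t = *p;
     a[1] = y;

   vectorizing the store group means sinking the store to a[0] past the
   load through P; if that dependence comes back as "don't know" it is
   simply reported as a dependence here and the callers give up on the
   SLP instance.  */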
575 static bool
576 vect_slp_analyze_data_ref_dependence (vec_info *vinfo,
577 struct data_dependence_relation *ddr)
579 struct data_reference *dra = DDR_A (ddr);
580 struct data_reference *drb = DDR_B (ddr);
581 dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
582 dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
584 /* We need to check dependences of statements marked as unvectorizable
585 as well; they can still prohibit vectorization. */
587 /* Independent data accesses. */
588 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
589 return false;
591 if (dra == drb)
592 return false;
594 /* Read-read is OK. */
595 if (DR_IS_READ (dra) && DR_IS_READ (drb))
596 return false;
598 /* If dra and drb are part of the same interleaving chain consider
599 them independent. */
600 if (STMT_VINFO_GROUPED_ACCESS (dr_info_a->stmt)
601 && (DR_GROUP_FIRST_ELEMENT (dr_info_a->stmt)
602 == DR_GROUP_FIRST_ELEMENT (dr_info_b->stmt)))
603 return false;
605 /* Unknown data dependence. */
606 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
608 if (dump_enabled_p ())
609 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
610 "can't determine dependence between %T and %T\n",
611 DR_REF (dra), DR_REF (drb));
613 else if (dump_enabled_p ())
614 dump_printf_loc (MSG_NOTE, vect_location,
615 "determined dependence between %T and %T\n",
616 DR_REF (dra), DR_REF (drb));
618 return true;
622 /* Analyze dependences involved in the transform of SLP NODE. STORES
623 contain the vector of scalar stores of this instance if we are
624 disambiguating the loads. */
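/* For illustration (a sketch): for the SLP load group in

     x0 = a[0];
     *p = t;
     x1 = a[1];

   the load of a[0] has to be sunk to the position of the load of a[1],
   so the walk below checks whether the intervening store through P may
   clobber a[0], using its recorded data reference when there is one and
   falling back to the alias oracle otherwise.  */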
626 static bool
627 vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
628 vec<stmt_vec_info> stores,
629 stmt_vec_info last_store_info)
631 /* This walks over all stmts involved in the SLP load/store done
632 in NODE verifying we can sink them up to the last stmt in the
633 group. */
634 stmt_vec_info last_access_info = vect_find_last_scalar_stmt_in_slp (node);
635 vec_info *vinfo = last_access_info->vinfo;
636 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
638 stmt_vec_info access_info = SLP_TREE_SCALAR_STMTS (node)[k];
639 if (access_info == last_access_info)
640 continue;
641 data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
642 ao_ref ref;
643 bool ref_initialized_p = false;
644 for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
645 gsi_stmt (gsi) != last_access_info->stmt; gsi_next (&gsi))
647 gimple *stmt = gsi_stmt (gsi);
648 if (! gimple_vuse (stmt)
649 || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
650 continue;
652 /* If we couldn't record a (single) data reference for this
653 stmt we have to resort to the alias oracle. */
654 stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
655 data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
656 if (!dr_b)
658 /* We are moving a store or sinking a load - this means
659 we cannot use TBAA for disambiguation. */
660 if (!ref_initialized_p)
661 ao_ref_init (&ref, DR_REF (dr_a));
662 if (stmt_may_clobber_ref_p_1 (stmt, &ref, false)
663 || ref_maybe_used_by_stmt_p (stmt, &ref, false))
664 return false;
665 continue;
668 bool dependent = false;
669 /* If we run into a store of this same instance (we've just
670 marked those) then delay dependence checking until we run
671 into the last store because this is where it will have
672 been sunk to (and we verify if we can do that as well). */
673 if (gimple_visited_p (stmt))
675 if (stmt_info != last_store_info)
676 continue;
677 unsigned i;
678 stmt_vec_info store_info;
679 FOR_EACH_VEC_ELT (stores, i, store_info)
681 data_reference *store_dr = STMT_VINFO_DATA_REF (store_info);
682 ddr_p ddr = initialize_data_dependence_relation
683 (dr_a, store_dr, vNULL);
684 dependent
685 = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
686 free_dependence_relation (ddr);
687 if (dependent)
688 break;
691 else
693 ddr_p ddr = initialize_data_dependence_relation (dr_a,
694 dr_b, vNULL);
695 dependent = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
696 free_dependence_relation (ddr);
698 if (dependent)
699 return false;
702 return true;
706 /* Function vect_slp_analyze_instance_dependence.
708 Examine all the data references in SLP instance INSTANCE, and make sure
709 there do not exist any data dependences that would prevent the scalar
710 loads and stores from being sunk to their vectorized insert locations. */
712 bool
713 vect_slp_analyze_instance_dependence (slp_instance instance)
715 DUMP_VECT_SCOPE ("vect_slp_analyze_instance_dependence");
717 /* The stores of this instance are at the root of the SLP tree. */
718 slp_tree store = SLP_INSTANCE_TREE (instance);
719 if (! STMT_VINFO_DATA_REF (SLP_TREE_SCALAR_STMTS (store)[0]))
720 store = NULL;
722 /* Verify we can sink stores to the vectorized stmt insert location. */
723 stmt_vec_info last_store_info = NULL;
724 if (store)
726 if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
727 return false;
729 /* Mark stores in this instance and remember the last one. */
730 last_store_info = vect_find_last_scalar_stmt_in_slp (store);
731 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
732 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, true);
735 bool res = true;
737 /* Verify we can sink loads to the vectorized stmt insert location,
738 special-casing stores of this instance. */
739 slp_tree load;
740 unsigned int i;
741 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
742 if (! vect_slp_analyze_node_dependences (instance, load,
743 store
744 ? SLP_TREE_SCALAR_STMTS (store)
745 : vNULL, last_store_info))
747 res = false;
748 break;
751 /* Unset the visited flag. */
752 if (store)
753 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
754 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, false);
756 return res;
759 /* Record the base alignment guarantee given by DRB, which occurs
760 in STMT_INFO. */
762 static void
763 vect_record_base_alignment (stmt_vec_info stmt_info,
764 innermost_loop_behavior *drb)
766 vec_info *vinfo = stmt_info->vinfo;
767 bool existed;
768 innermost_loop_behavior *&entry
769 = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
770 if (!existed || entry->base_alignment < drb->base_alignment)
772 entry = drb;
773 if (dump_enabled_p ())
774 dump_printf_loc (MSG_NOTE, vect_location,
775 "recording new base alignment for %T\n"
776 " alignment: %d\n"
777 " misalignment: %d\n"
778 " based on: %G",
779 drb->base_address,
780 drb->base_alignment,
781 drb->base_misalignment,
782 stmt_info->stmt);
786 /* If the region we're going to vectorize is reached, all unconditional
787 data references occur at least once. We can therefore pool the base
788 alignment guarantees from each unconditional reference. Do this by
789 going through all the data references in VINFO and checking whether
790 the containing statement makes the reference unconditionally. If so,
791 record the alignment of the base address in VINFO so that it can be
792 used for all other references with the same base. */
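/* For illustration (a hypothetical example): in

     for (i = 0; i < n; i++)
       {
         s += q[i];
         if (c[i])
           s += q[i + 1];
       }

   the guarantee obtained from the unconditional load of q[i] is recorded
   once for the base address of Q and can then be reused when computing
   the misalignment of the conditional access to q[i + 1].  */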
794 void
795 vect_record_base_alignments (vec_info *vinfo)
797 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
798 struct loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
799 data_reference *dr;
800 unsigned int i;
801 FOR_EACH_VEC_ELT (vinfo->shared->datarefs, i, dr)
803 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
804 stmt_vec_info stmt_info = dr_info->stmt;
805 if (!DR_IS_CONDITIONAL_IN_STMT (dr)
806 && STMT_VINFO_VECTORIZABLE (stmt_info)
807 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
809 vect_record_base_alignment (stmt_info, &DR_INNERMOST (dr));
811 /* If DR is nested in the loop that is being vectorized, we can also
812 record the alignment of the base wrt the outer loop. */
813 if (loop && nested_in_vect_loop_p (loop, stmt_info))
814 vect_record_base_alignment
815 (stmt_info, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
820 /* Return the target alignment for the vectorized form of DR_INFO. */
822 static poly_uint64
823 vect_calculate_target_alignment (dr_vec_info *dr_info)
825 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
826 return targetm.vectorize.preferred_vector_alignment (vectype);
829 /* Function vect_compute_data_ref_alignment
831 Compute the misalignment of the data reference DR_INFO.
833 Output:
834 1. DR_MISALIGNMENT (DR_INFO) is defined.
836 FOR NOW: No analysis is actually performed. Misalignment is calculated
837 only for trivial cases. TODO. */
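/* For illustration (a worked example, assuming a 16-byte preferred vector
   alignment): a reference p[i] with 4-byte elements, a base known to be
   16-byte aligned and a constant start offset (DR_INIT) of 8 bytes ends
   up with DR_MISALIGNMENT == 8; peeling (16 - 8) / 4 == 2 scalar
   iterations would then make the access aligned.  */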
839 static void
840 vect_compute_data_ref_alignment (dr_vec_info *dr_info)
842 stmt_vec_info stmt_info = dr_info->stmt;
843 vec_base_alignments *base_alignments = &stmt_info->vinfo->base_alignments;
844 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
845 struct loop *loop = NULL;
846 tree ref = DR_REF (dr_info->dr);
847 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
849 if (dump_enabled_p ())
850 dump_printf_loc (MSG_NOTE, vect_location,
851 "vect_compute_data_ref_alignment:\n");
853 if (loop_vinfo)
854 loop = LOOP_VINFO_LOOP (loop_vinfo);
856 /* Initialize misalignment to unknown. */
857 SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
859 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
860 return;
862 innermost_loop_behavior *drb = vect_dr_behavior (dr_info);
863 bool step_preserves_misalignment_p;
865 poly_uint64 vector_alignment
866 = exact_div (vect_calculate_target_alignment (dr_info), BITS_PER_UNIT);
867 DR_TARGET_ALIGNMENT (dr_info) = vector_alignment;
869 unsigned HOST_WIDE_INT vect_align_c;
870 if (!vector_alignment.is_constant (&vect_align_c))
871 return;
873 /* No step for BB vectorization. */
874 if (!loop)
876 gcc_assert (integer_zerop (drb->step));
877 step_preserves_misalignment_p = true;
880 /* In case the dataref is in an inner-loop of the loop that is being
881 vectorized (LOOP), we use the base and misalignment information
882 relative to the outer-loop (LOOP). This is ok only if the misalignment
883 stays the same throughout the execution of the inner-loop, which is why
884 we have to check that the stride of the dataref in the inner-loop is
885 evenly divisible by the vector alignment. */
886 else if (nested_in_vect_loop_p (loop, stmt_info))
888 step_preserves_misalignment_p
889 = (DR_STEP_ALIGNMENT (dr_info->dr) % vect_align_c) == 0;
891 if (dump_enabled_p ())
893 if (step_preserves_misalignment_p)
894 dump_printf_loc (MSG_NOTE, vect_location,
895 "inner step divides the vector alignment.\n");
896 else
897 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
898 "inner step doesn't divide the vector"
899 " alignment.\n");
903 /* Similarly we can only use base and misalignment information relative to
904 an innermost loop if the misalignment stays the same throughout the
905 execution of the loop. As above, this is the case if the stride of
906 the dataref is evenly divisible by the alignment. */
907 else
909 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
910 step_preserves_misalignment_p
911 = multiple_p (DR_STEP_ALIGNMENT (dr_info->dr) * vf, vect_align_c);
913 if (!step_preserves_misalignment_p && dump_enabled_p ())
914 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
915 "step doesn't divide the vector alignment.\n");
918 unsigned int base_alignment = drb->base_alignment;
919 unsigned int base_misalignment = drb->base_misalignment;
921 /* Calculate the maximum of the pooled base address alignment and the
922 alignment that we can compute for DR itself. */
923 innermost_loop_behavior **entry = base_alignments->get (drb->base_address);
924 if (entry && base_alignment < (*entry)->base_alignment)
926 base_alignment = (*entry)->base_alignment;
927 base_misalignment = (*entry)->base_misalignment;
930 if (drb->offset_alignment < vect_align_c
931 || !step_preserves_misalignment_p
932 /* We need to know whether the step wrt the vectorized loop is
933 negative when computing the starting misalignment below. */
934 || TREE_CODE (drb->step) != INTEGER_CST)
936 if (dump_enabled_p ())
937 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
938 "Unknown alignment for access: %T\n", ref);
939 return;
942 if (base_alignment < vect_align_c)
944 unsigned int max_alignment;
945 tree base = get_base_for_alignment (drb->base_address, &max_alignment);
946 if (max_alignment < vect_align_c
947 || !vect_can_force_dr_alignment_p (base,
948 vect_align_c * BITS_PER_UNIT))
950 if (dump_enabled_p ())
951 dump_printf_loc (MSG_NOTE, vect_location,
952 "can't force alignment of ref: %T\n", ref);
953 return;
956 /* Force the alignment of the decl.
957 NOTE: This is the only change to the code we make during
958 the analysis phase, before deciding to vectorize the loop. */
959 if (dump_enabled_p ())
960 dump_printf_loc (MSG_NOTE, vect_location,
961 "force alignment of %T\n", ref);
963 dr_info->base_decl = base;
964 dr_info->base_misaligned = true;
965 base_misalignment = 0;
967 poly_int64 misalignment
968 = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();
970 /* If this is a backward running DR then first access in the larger
971 vectype actually is N-1 elements before the address in the DR.
972 Adjust misalign accordingly. */
973 if (tree_int_cst_sgn (drb->step) < 0)
974 /* PLUS because STEP is negative. */
975 misalignment += ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
976 * TREE_INT_CST_LOW (drb->step));
978 unsigned int const_misalignment;
979 if (!known_misalignment (misalignment, vect_align_c, &const_misalignment))
981 if (dump_enabled_p ())
982 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
983 "Non-constant misalignment for access: %T\n", ref);
984 return;
987 SET_DR_MISALIGNMENT (dr_info, const_misalignment);
989 if (dump_enabled_p ())
990 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
991 "misalign = %d bytes of ref %T\n",
992 DR_MISALIGNMENT (dr_info), ref);
994 return;
997 /* Function vect_update_misalignment_for_peel.
998 Sets DR_INFO's misalignment
999 - to 0 if it has the same alignment as DR_PEEL_INFO,
1000 - to the misalignment computed using NPEEL if DR_INFO's misalignment is known,
1001 - to -1 (unknown) otherwise.
1003 DR_INFO - the data reference whose misalignment is to be adjusted.
1004 DR_PEEL_INFO - the data reference whose misalignment is being made
1005 zero in the vector loop by the peel.
1006 NPEEL - the number of iterations in the peel loop if the misalignment
1007 of DR_PEEL_INFO is known at compile time. */
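/* For illustration (a worked example): with DR_MISALIGNMENT == 4, a
   4-byte DR_STEP, a 16-byte target alignment and NPEEL == 3, the updated
   misalignment is (4 + 3 * 4) & 15 == 0, i.e. the peel happens to align
   this reference as well.  */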
1009 static void
1010 vect_update_misalignment_for_peel (dr_vec_info *dr_info,
1011 dr_vec_info *dr_peel_info, int npeel)
1013 unsigned int i;
1014 vec<dr_p> same_aligned_drs;
1015 struct data_reference *current_dr;
1016 stmt_vec_info peel_stmt_info = dr_peel_info->stmt;
1018 /* It can be assumed that if dr_info has the same alignment as dr_peel,
1019 it is aligned in the vector loop. */
1020 same_aligned_drs = STMT_VINFO_SAME_ALIGN_REFS (peel_stmt_info);
1021 FOR_EACH_VEC_ELT (same_aligned_drs, i, current_dr)
1023 if (current_dr != dr_info->dr)
1024 continue;
1025 gcc_assert (!known_alignment_for_access_p (dr_info)
1026 || !known_alignment_for_access_p (dr_peel_info)
1027 || (DR_MISALIGNMENT (dr_info)
1028 == DR_MISALIGNMENT (dr_peel_info)));
1029 SET_DR_MISALIGNMENT (dr_info, 0);
1030 return;
1033 unsigned HOST_WIDE_INT alignment;
1034 if (DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment)
1035 && known_alignment_for_access_p (dr_info)
1036 && known_alignment_for_access_p (dr_peel_info))
1038 int misal = DR_MISALIGNMENT (dr_info);
1039 misal += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
1040 misal &= alignment - 1;
1041 SET_DR_MISALIGNMENT (dr_info, misal);
1042 return;
1045 if (dump_enabled_p ())
1046 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1047 "to unknown (-1).\n");
1048 SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
1052 /* Function verify_data_ref_alignment
1054 Return TRUE if DR_INFO can be handled with respect to alignment. */
1056 static opt_result
1057 verify_data_ref_alignment (dr_vec_info *dr_info)
1059 enum dr_alignment_support supportable_dr_alignment
1060 = vect_supportable_dr_alignment (dr_info, false);
1061 if (!supportable_dr_alignment)
1062 return opt_result::failure_at
1063 (dr_info->stmt->stmt,
1064 DR_IS_READ (dr_info->dr)
1065 ? "not vectorized: unsupported unaligned load: %T\n"
1066 : "not vectorized: unsupported unaligned store: %T\n",
1067 DR_REF (dr_info->dr));
1069 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
1070 dump_printf_loc (MSG_NOTE, vect_location,
1071 "Vectorizing an unaligned access.\n");
1073 return opt_result::success ();
1076 /* Function vect_verify_datarefs_alignment
1078 Return TRUE if all data references in the loop can be
1079 handled with respect to alignment. */
1081 opt_result
1082 vect_verify_datarefs_alignment (loop_vec_info vinfo)
1084 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
1085 struct data_reference *dr;
1086 unsigned int i;
1088 FOR_EACH_VEC_ELT (datarefs, i, dr)
1090 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
1091 stmt_vec_info stmt_info = dr_info->stmt;
1093 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1094 continue;
1096 /* For interleaving, only the alignment of the first access matters. */
1097 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1098 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1099 continue;
1101 /* Strided accesses perform only component accesses, alignment is
1102 irrelevant for them. */
1103 if (STMT_VINFO_STRIDED_P (stmt_info)
1104 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1105 continue;
1107 opt_result res = verify_data_ref_alignment (dr_info);
1108 if (!res)
1109 return res;
1112 return opt_result::success ();
1115 /* Given a memory reference EXP, return whether its alignment is less
1116 than its size. */
1118 static bool
1119 not_size_aligned (tree exp)
1121 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1122 return true;
1124 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1125 > get_object_alignment (exp));
1128 /* Function vector_alignment_reachable_p
1130 Return true if vector alignment for DR_INFO is reachable by peeling
1131 a few loop iterations. Return false otherwise. */
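/* For illustration (a hypothetical example): for a group of size 2 with
   4-element vectors and a misalignment of one element, the prologue
   would have to peel 4 - 1 == 3 scalar iterations, which is not a
   multiple of the group size, so alignment is not considered reachable
   by peeling in the grouped case below.  */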
1133 static bool
1134 vector_alignment_reachable_p (dr_vec_info *dr_info)
1136 stmt_vec_info stmt_info = dr_info->stmt;
1137 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1139 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1141 /* For interleaved access we peel only if number of iterations in
1142 the prolog loop ({VF - misalignment}) is a multiple of the
1143 number of the interleaved accesses. */
1144 int elem_size, mis_in_elements;
1146 /* FORNOW: handle only known alignment. */
1147 if (!known_alignment_for_access_p (dr_info))
1148 return false;
1150 poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1151 poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
1152 elem_size = vector_element_size (vector_size, nelements);
1153 mis_in_elements = DR_MISALIGNMENT (dr_info) / elem_size;
1155 if (!multiple_p (nelements - mis_in_elements, DR_GROUP_SIZE (stmt_info)))
1156 return false;
1159 /* If misalignment is known at the compile time then allow peeling
1160 only if natural alignment is reachable through peeling. */
1161 if (known_alignment_for_access_p (dr_info) && !aligned_access_p (dr_info))
1163 HOST_WIDE_INT elmsize =
1164 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1165 if (dump_enabled_p ())
1167 dump_printf_loc (MSG_NOTE, vect_location,
1168 "data size = %wd. misalignment = %d.\n", elmsize,
1169 DR_MISALIGNMENT (dr_info));
1171 if (DR_MISALIGNMENT (dr_info) % elmsize)
1173 if (dump_enabled_p ())
1174 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1175 "data size does not divide the misalignment.\n");
1176 return false;
1180 if (!known_alignment_for_access_p (dr_info))
1182 tree type = TREE_TYPE (DR_REF (dr_info->dr));
1183 bool is_packed = not_size_aligned (DR_REF (dr_info->dr));
1184 if (dump_enabled_p ())
1185 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1186 "Unknown misalignment, %snaturally aligned\n",
1187 is_packed ? "not " : "");
1188 return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1191 return true;
1195 /* Calculate the cost of the memory access represented by DR_INFO. */
1197 static void
1198 vect_get_data_access_cost (dr_vec_info *dr_info,
1199 unsigned int *inside_cost,
1200 unsigned int *outside_cost,
1201 stmt_vector_for_cost *body_cost_vec,
1202 stmt_vector_for_cost *prologue_cost_vec)
1204 stmt_vec_info stmt_info = dr_info->stmt;
1205 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1206 int ncopies;
1208 if (PURE_SLP_STMT (stmt_info))
1209 ncopies = 1;
1210 else
1211 ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1213 if (DR_IS_READ (dr_info->dr))
1214 vect_get_load_cost (stmt_info, ncopies, true, inside_cost, outside_cost,
1215 prologue_cost_vec, body_cost_vec, false);
1216 else
1217 vect_get_store_cost (stmt_info, ncopies, inside_cost, body_cost_vec);
1219 if (dump_enabled_p ())
1220 dump_printf_loc (MSG_NOTE, vect_location,
1221 "vect_get_data_access_cost: inside_cost = %d, "
1222 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1226 typedef struct _vect_peel_info
1228 dr_vec_info *dr_info;
1229 int npeel;
1230 unsigned int count;
1231 } *vect_peel_info;
1233 typedef struct _vect_peel_extended_info
1235 struct _vect_peel_info peel_info;
1236 unsigned int inside_cost;
1237 unsigned int outside_cost;
1238 } *vect_peel_extended_info;
1241 /* Peeling hashtable helpers. */
1243 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1245 static inline hashval_t hash (const _vect_peel_info *);
1246 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1249 inline hashval_t
1250 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1252 return (hashval_t) peel_info->npeel;
1255 inline bool
1256 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1258 return (a->npeel == b->npeel);
1262 /* Insert DR_INFO into peeling hash table with NPEEL as key. */
1264 static void
1265 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1266 loop_vec_info loop_vinfo, dr_vec_info *dr_info,
1267 int npeel)
1269 struct _vect_peel_info elem, *slot;
1270 _vect_peel_info **new_slot;
1271 bool supportable_dr_alignment
1272 = vect_supportable_dr_alignment (dr_info, true);
1274 elem.npeel = npeel;
1275 slot = peeling_htab->find (&elem);
1276 if (slot)
1277 slot->count++;
1278 else
1280 slot = XNEW (struct _vect_peel_info);
1281 slot->npeel = npeel;
1282 slot->dr_info = dr_info;
1283 slot->count = 1;
1284 new_slot = peeling_htab->find_slot (slot, INSERT);
1285 *new_slot = slot;
1288 if (!supportable_dr_alignment
1289 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1290 slot->count += VECT_MAX_COST;
1294 /* Traverse peeling hash table to find peeling option that aligns maximum
1295 number of data accesses. */
1297 static int
1298 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1299 _vect_peel_extended_info *max)
1301 vect_peel_info elem = *slot;
1303 if (elem->count > max->peel_info.count
1304 || (elem->count == max->peel_info.count
1305 && max->peel_info.npeel > elem->npeel))
1307 max->peel_info.npeel = elem->npeel;
1308 max->peel_info.count = elem->count;
1309 max->peel_info.dr_info = elem->dr_info;
1312 return 1;
1315 /* Get the costs of peeling NPEEL iterations for LOOP_VINFO, checking
1316 data access costs for all data refs. If UNKNOWN_MISALIGNMENT is true,
1317 we assume DR0_INFO's misalignment will be zero after peeling. */
1319 static void
1320 vect_get_peeling_costs_all_drs (loop_vec_info loop_vinfo,
1321 dr_vec_info *dr0_info,
1322 unsigned int *inside_cost,
1323 unsigned int *outside_cost,
1324 stmt_vector_for_cost *body_cost_vec,
1325 stmt_vector_for_cost *prologue_cost_vec,
1326 unsigned int npeel,
1327 bool unknown_misalignment)
1329 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1330 unsigned i;
1331 data_reference *dr;
1333 FOR_EACH_VEC_ELT (datarefs, i, dr)
1335 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1336 stmt_vec_info stmt_info = dr_info->stmt;
1337 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1338 continue;
1340 /* For interleaving, only the alignment of the first access
1341 matters. */
1342 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1343 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1344 continue;
1346 /* Strided accesses perform only component accesses, alignment is
1347 irrelevant for them. */
1348 if (STMT_VINFO_STRIDED_P (stmt_info)
1349 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1350 continue;
1352 int save_misalignment;
1353 save_misalignment = DR_MISALIGNMENT (dr_info);
1354 if (npeel == 0)
1355 ;
1356 else if (unknown_misalignment && dr_info == dr0_info)
1357 SET_DR_MISALIGNMENT (dr_info, 0);
1358 else
1359 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
1360 vect_get_data_access_cost (dr_info, inside_cost, outside_cost,
1361 body_cost_vec, prologue_cost_vec);
1362 SET_DR_MISALIGNMENT (dr_info, save_misalignment);
1366 /* Traverse peeling hash table and calculate cost for each peeling option.
1367 Find the one with the lowest cost. */
1369 static int
1370 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1371 _vect_peel_extended_info *min)
1373 vect_peel_info elem = *slot;
1374 int dummy;
1375 unsigned int inside_cost = 0, outside_cost = 0;
1376 stmt_vec_info stmt_info = elem->dr_info->stmt;
1377 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1378 stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1379 epilogue_cost_vec;
1381 prologue_cost_vec.create (2);
1382 body_cost_vec.create (2);
1383 epilogue_cost_vec.create (2);
1385 vect_get_peeling_costs_all_drs (loop_vinfo, elem->dr_info, &inside_cost,
1386 &outside_cost, &body_cost_vec,
1387 &prologue_cost_vec, elem->npeel, false);
1389 body_cost_vec.release ();
1391 outside_cost += vect_get_known_peeling_cost
1392 (loop_vinfo, elem->npeel, &dummy,
1393 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1394 &prologue_cost_vec, &epilogue_cost_vec);
1396 /* Prologue and epilogue costs are added to the target model later.
1397 These costs depend only on the scalar iteration cost, the
1398 number of peeling iterations finally chosen, and the number of
1399 misaligned statements. So discard the information found here. */
1400 prologue_cost_vec.release ();
1401 epilogue_cost_vec.release ();
1403 if (inside_cost < min->inside_cost
1404 || (inside_cost == min->inside_cost
1405 && outside_cost < min->outside_cost))
1407 min->inside_cost = inside_cost;
1408 min->outside_cost = outside_cost;
1409 min->peel_info.dr_info = elem->dr_info;
1410 min->peel_info.npeel = elem->npeel;
1411 min->peel_info.count = elem->count;
1414 return 1;
1418 /* Choose best peeling option by traversing peeling hash table and either
1419 choosing an option with the lowest cost (if cost model is enabled) or the
1420 option that aligns as many accesses as possible. */
1422 static struct _vect_peel_extended_info
1423 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1424 loop_vec_info loop_vinfo)
1426 struct _vect_peel_extended_info res;
1428 res.peel_info.dr_info = NULL;
1430 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1432 res.inside_cost = INT_MAX;
1433 res.outside_cost = INT_MAX;
1434 peeling_htab->traverse <_vect_peel_extended_info *,
1435 vect_peeling_hash_get_lowest_cost> (&res);
1437 else
1439 res.peel_info.count = 0;
1440 peeling_htab->traverse <_vect_peel_extended_info *,
1441 vect_peeling_hash_get_most_frequent> (&res);
1442 res.inside_cost = 0;
1443 res.outside_cost = 0;
1446 return res;
1449 /* Return true if the new peeling NPEEL is supported. */
1451 static bool
1452 vect_peeling_supportable (loop_vec_info loop_vinfo, dr_vec_info *dr0_info,
1453 unsigned npeel)
1455 unsigned i;
1456 struct data_reference *dr = NULL;
1457 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1458 enum dr_alignment_support supportable_dr_alignment;
1460 /* Ensure that all data refs can be vectorized after the peel. */
1461 FOR_EACH_VEC_ELT (datarefs, i, dr)
1463 int save_misalignment;
1465 if (dr == dr0_info->dr)
1466 continue;
1468 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1469 stmt_vec_info stmt_info = dr_info->stmt;
1470 /* For interleaving, only the alignment of the first access
1471 matters. */
1472 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1473 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1474 continue;
1476 /* Strided accesses perform only component accesses, alignment is
1477 irrelevant for them. */
1478 if (STMT_VINFO_STRIDED_P (stmt_info)
1479 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1480 continue;
1482 save_misalignment = DR_MISALIGNMENT (dr_info);
1483 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
1484 supportable_dr_alignment
1485 = vect_supportable_dr_alignment (dr_info, false);
1486 SET_DR_MISALIGNMENT (dr_info, save_misalignment);
1488 if (!supportable_dr_alignment)
1489 return false;
1492 return true;
1495 /* Function vect_enhance_data_refs_alignment
1497 This pass will use loop versioning and loop peeling in order to enhance
1498 the alignment of data references in the loop.
1500 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1501 original loop is to be vectorized. Any other loops that are created by
1502 the transformations performed in this pass - are not supposed to be
1503 vectorized. This restriction will be relaxed.
1505 This pass will require a cost model to guide it whether to apply peeling
1506 or versioning or a combination of the two. For example, the scheme that
1507 Intel uses when given a loop with several memory accesses is as follows:
1508 choose one memory access ('p') whose alignment you want to force by doing
1509 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1510 other accesses are not necessarily aligned, or (2) use loop versioning to
1511 generate one loop in which all accesses are aligned, and another loop in
1512 which only 'p' is necessarily aligned.
1514 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1515 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Ximmin Tian, International
1516 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1518 Devising a cost model is the most critical aspect of this work. It will
1519 guide us on which access to peel for, whether to use loop versioning, how
1520 many versions to create, etc. The cost model will probably consist of
1521 generic considerations as well as target specific considerations (on
1522 powerpc for example, misaligned stores are more painful than misaligned
1523 loads).
1525 Here are the general steps involved in alignment enhancements:
1527 -- original loop, before alignment analysis:
1528 for (i=0; i<N; i++){
1529 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1530 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1533 -- After vect_compute_data_refs_alignment:
1534 for (i=0; i<N; i++){
1535 x = q[i]; # DR_MISALIGNMENT(q) = 3
1536 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1539 -- Possibility 1: we do loop versioning:
1540 if (p is aligned) {
1541 for (i=0; i<N; i++){ # loop 1A
1542 x = q[i]; # DR_MISALIGNMENT(q) = 3
1543 p[i] = y; # DR_MISALIGNMENT(p) = 0
1546 else {
1547 for (i=0; i<N; i++){ # loop 1B
1548 x = q[i]; # DR_MISALIGNMENT(q) = 3
1549 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1553 -- Possibility 2: we do loop peeling:
1554 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1555 x = q[i];
1556 p[i] = y;
1558 for (i = 3; i < N; i++){ # loop 2A
1559 x = q[i]; # DR_MISALIGNMENT(q) = 0
1560 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1563 -- Possibility 3: combination of loop peeling and versioning:
1564 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1565 x = q[i];
1566 p[i] = y;
1568 if (p is aligned) {
1569 for (i = 3; i<N; i++){ # loop 3A
1570 x = q[i]; # DR_MISALIGNMENT(q) = 0
1571 p[i] = y; # DR_MISALIGNMENT(p) = 0
1574 else {
1575 for (i = 3; i<N; i++){ # loop 3B
1576 x = q[i]; # DR_MISALIGNMENT(q) = 0
1577 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1581 These loops are later passed to loop_transform to be vectorized. The
1582 vectorizer will use the alignment information to guide the transformation
1583 (whether to generate regular loads/stores, or with special handling for
1584 misalignment). */
1586 opt_result
1587 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1589 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1590 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1591 enum dr_alignment_support supportable_dr_alignment;
1592 dr_vec_info *first_store = NULL;
1593 dr_vec_info *dr0_info = NULL;
1594 struct data_reference *dr;
1595 unsigned int i, j;
1596 bool do_peeling = false;
1597 bool do_versioning = false;
1598 unsigned int npeel = 0;
1599 bool one_misalignment_known = false;
1600 bool one_misalignment_unknown = false;
1601 bool one_dr_unsupportable = false;
1602 dr_vec_info *unsupportable_dr_info = NULL;
1603 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1604 unsigned possible_npeel_number = 1;
1605 tree vectype;
1606 unsigned int mis, same_align_drs_max = 0;
1607 hash_table<peel_info_hasher> peeling_htab (1);
1609 DUMP_VECT_SCOPE ("vect_enhance_data_refs_alignment");
1611 /* Reset data so we can safely be called multiple times. */
1612 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1613 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1615 /* While cost model enhancements are expected in the future, the high level
1616 view of the code at this time is as follows:
1618 A) If there is a misaligned access then see if peeling to align
1619 this access can make all data references satisfy
1620 vect_supportable_dr_alignment. If so, update data structures
1621 as needed and return true.
1623 B) If peeling wasn't possible and there is a data reference with an
1624 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1625 then see if loop versioning checks can be used to make all data
1626 references satisfy vect_supportable_dr_alignment. If so, update
1627 data structures as needed and return true.
1629 C) If neither peeling nor versioning were successful then return false if
1630 any data reference does not satisfy vect_supportable_dr_alignment.
1632 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1634 Note, Possibility 3 above (which is peeling and versioning together) is not
1635 being done at this time. */
1637 /* (1) Peeling to force alignment. */
1639 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1640 Considerations:
1641 + How many accesses will become aligned due to the peeling
1642 - How many accesses will become unaligned due to the peeling,
1643 and the cost of misaligned accesses.
1644 - The cost of peeling (the extra runtime checks, the increase
1645 in code size). */
1647 FOR_EACH_VEC_ELT (datarefs, i, dr)
1649 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1650 stmt_vec_info stmt_info = dr_info->stmt;
1652 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1653 continue;
1655 /* For interleaving, only the alignment of the first access
1656 matters. */
1657 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1658 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1659 continue;
1661 /* For scatter-gather or invariant accesses there is nothing
1662 to enhance. */
1663 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1664 || integer_zerop (DR_STEP (dr)))
1665 continue;
1667 /* Strided accesses perform only component accesses, alignment is
1668 irrelevant for them. */
1669 if (STMT_VINFO_STRIDED_P (stmt_info)
1670 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1671 continue;
1673 supportable_dr_alignment = vect_supportable_dr_alignment (dr_info, true);
1674 do_peeling = vector_alignment_reachable_p (dr_info);
1675 if (do_peeling)
1677 if (known_alignment_for_access_p (dr_info))
1679 unsigned int npeel_tmp = 0;
1680 bool negative = tree_int_cst_compare (DR_STEP (dr),
1681 size_zero_node) < 0;
1683 vectype = STMT_VINFO_VECTYPE (stmt_info);
1684 /* If known_alignment_for_access_p then we have set
1685 DR_MISALIGNMENT, which is only done if we know it at compile
1686 time, so it is safe to assume the target alignment is constant. */
1688 unsigned int target_align =
1689 DR_TARGET_ALIGNMENT (dr_info).to_constant ();
1690 unsigned int dr_size = vect_get_scalar_dr_size (dr_info);
1691 mis = (negative
1692 ? DR_MISALIGNMENT (dr_info)
1693 : -DR_MISALIGNMENT (dr_info));
1694 if (DR_MISALIGNMENT (dr_info) != 0)
1695 npeel_tmp = (mis & (target_align - 1)) / dr_size;
1697 /* For multiple types, it is possible that the bigger type access
1698 will have more than one peeling option. E.g., a loop with two
1699 types: one of size (vector size / 4), and the other one of
1700 size (vector size / 8). The vectorization factor will be 8. If both
1701 accesses are misaligned by 3, the first one needs one scalar
1702 iteration to be aligned, and the second one needs 5. But the
1703 first one will be aligned also by peeling 5 scalar
1704 iterations, and in that case both accesses will be aligned.
1705 Hence, except for the immediate peeling amount, we also want
1706 to try to add full vector size, while we don't exceed
1707 vectorization factor.
1708 We do this automatically for cost model, since we calculate
1709 cost for every peeling option. */
1710 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1712 poly_uint64 nscalars = (STMT_SLP_TYPE (stmt_info)
1713 ? vf * DR_GROUP_SIZE (stmt_info) : vf);
1714 possible_npeel_number
1715 = vect_get_num_vectors (nscalars, vectype);
1717 /* NPEEL_TMP is 0 when there is no misalignment, but also
1718 allow peeling NELEMENTS. */
1719 if (DR_MISALIGNMENT (dr_info) == 0)
1720 possible_npeel_number++;
1723 /* Save info about DR in the hash table. Also include peeling
1724 amounts according to the explanation above. */
1725 for (j = 0; j < possible_npeel_number; j++)
1727 vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1728 dr_info, npeel_tmp);
1729 npeel_tmp += target_align / dr_size;
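/* Continuing the illustration above (target_align 16, dr_size 4,
   initial npeel_tmp 2): the candidate peeling amounts recorded are
   2, 6, 10, ..., one per iteration, POSSIBLE_NPEEL_NUMBER in total.  */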
1732 one_misalignment_known = true;
1734 else
1736 /* If we don't know any misalignment values, we prefer
1737 peeling for the data-ref that has the maximum number of data-refs
1738 with the same alignment, unless the target prefers to align
1739 stores over loads. */
1740 unsigned same_align_drs
1741 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1742 if (!dr0_info
1743 || same_align_drs_max < same_align_drs)
1745 same_align_drs_max = same_align_drs;
1746 dr0_info = dr_info;
1748 /* For data-refs with the same number of related
1749 accesses prefer the one where the misalign
1750 computation will be invariant in the outermost loop. */
1751 else if (same_align_drs_max == same_align_drs)
1753 struct loop *ivloop0, *ivloop;
1754 ivloop0 = outermost_invariant_loop_for_expr
1755 (loop, DR_BASE_ADDRESS (dr0_info->dr));
1756 ivloop = outermost_invariant_loop_for_expr
1757 (loop, DR_BASE_ADDRESS (dr));
1758 if ((ivloop && !ivloop0)
1759 || (ivloop && ivloop0
1760 && flow_loop_nested_p (ivloop, ivloop0)))
1761 dr0_info = dr_info;
1764 one_misalignment_unknown = true;
1766 /* Check for data refs with unsupportable alignment that
1767 can be peeled. */
1768 if (!supportable_dr_alignment)
1770 one_dr_unsupportable = true;
1771 unsupportable_dr_info = dr_info;
1774 if (!first_store && DR_IS_WRITE (dr))
1775 first_store = dr_info;
1778 else
1780 if (!aligned_access_p (dr_info))
1782 if (dump_enabled_p ())
1783 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1784 "vector alignment may not be reachable\n");
1785 break;
1790 /* Check if we can possibly peel the loop. */
1791 if (!vect_can_advance_ivs_p (loop_vinfo)
1792 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1793 || loop->inner)
1794 do_peeling = false;
1796 struct _vect_peel_extended_info peel_for_known_alignment;
1797 struct _vect_peel_extended_info peel_for_unknown_alignment;
1798 struct _vect_peel_extended_info best_peel;
1800 peel_for_unknown_alignment.inside_cost = INT_MAX;
1801 peel_for_unknown_alignment.outside_cost = INT_MAX;
1802 peel_for_unknown_alignment.peel_info.count = 0;
1804 if (do_peeling
1805 && one_misalignment_unknown)
1807 /* Check whether the target prefers to align stores over loads, i.e.,
1808 whether misaligned stores are more expensive than misaligned loads
1809 (taking drs with the same alignment into account). */
1810 unsigned int load_inside_cost = 0;
1811 unsigned int load_outside_cost = 0;
1812 unsigned int store_inside_cost = 0;
1813 unsigned int store_outside_cost = 0;
1814 unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;
1816 stmt_vector_for_cost dummy;
1817 dummy.create (2);
1818 vect_get_peeling_costs_all_drs (loop_vinfo, dr0_info,
1819 &load_inside_cost,
1820 &load_outside_cost,
1821 &dummy, &dummy, estimated_npeels, true);
1822 dummy.release ();
1824 if (first_store)
1826 dummy.create (2);
1827 vect_get_peeling_costs_all_drs (loop_vinfo, first_store,
1828 &store_inside_cost,
1829 &store_outside_cost,
1830 &dummy, &dummy,
1831 estimated_npeels, true);
1832 dummy.release ();
1834 else
1836 store_inside_cost = INT_MAX;
1837 store_outside_cost = INT_MAX;
1840 if (load_inside_cost > store_inside_cost
1841 || (load_inside_cost == store_inside_cost
1842 && load_outside_cost > store_outside_cost))
1844 dr0_info = first_store;
1845 peel_for_unknown_alignment.inside_cost = store_inside_cost;
1846 peel_for_unknown_alignment.outside_cost = store_outside_cost;
1848 else
1850 peel_for_unknown_alignment.inside_cost = load_inside_cost;
1851 peel_for_unknown_alignment.outside_cost = load_outside_cost;
1854 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1855 prologue_cost_vec.create (2);
1856 epilogue_cost_vec.create (2);
1858 int dummy2;
1859 peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
1860 (loop_vinfo, estimated_npeels, &dummy2,
1861 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1862 &prologue_cost_vec, &epilogue_cost_vec);
1864 prologue_cost_vec.release ();
1865 epilogue_cost_vec.release ();
1867 peel_for_unknown_alignment.peel_info.count = 1
1868 + STMT_VINFO_SAME_ALIGN_REFS (dr0_info->stmt).length ();
1871 peel_for_unknown_alignment.peel_info.npeel = 0;
1872 peel_for_unknown_alignment.peel_info.dr_info = dr0_info;
1874 best_peel = peel_for_unknown_alignment;
1876 peel_for_known_alignment.inside_cost = INT_MAX;
1877 peel_for_known_alignment.outside_cost = INT_MAX;
1878 peel_for_known_alignment.peel_info.count = 0;
1879 peel_for_known_alignment.peel_info.dr_info = NULL;
1881 if (do_peeling && one_misalignment_known)
1883 /* Peeling is possible, but there is no data access that is not supported
1884 unless aligned. So we try to choose the best possible peeling from
1885 the hash table. */
1886 peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
1887 (&peeling_htab, loop_vinfo);
1890 /* Compare costs of peeling for known and unknown alignment. */
1891 if (peel_for_known_alignment.peel_info.dr_info != NULL
1892 && peel_for_unknown_alignment.inside_cost
1893 >= peel_for_known_alignment.inside_cost)
1895 best_peel = peel_for_known_alignment;
1897 /* If the best peeling for known alignment has NPEEL == 0, perform no
1898 peeling at all unless there is an unsupportable dr that we can
1899 align. */
1900 if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
1901 do_peeling = false;
1904 /* If there is an unsupportable data ref, prefer this over all choices so far
1905 since we'd otherwise have to discard a chosen peeling unless it happened to
1906 align the unsupportable data ref. */
1907 if (one_dr_unsupportable)
1908 dr0_info = unsupportable_dr_info;
1909 else if (do_peeling)
1911 /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
1912 TODO: Use nopeel_outside_cost or get rid of it? */
1913 unsigned nopeel_inside_cost = 0;
1914 unsigned nopeel_outside_cost = 0;
1916 stmt_vector_for_cost dummy;
1917 dummy.create (2);
1918 vect_get_peeling_costs_all_drs (loop_vinfo, NULL, &nopeel_inside_cost,
1919 &nopeel_outside_cost, &dummy, &dummy,
1920 0, false);
1921 dummy.release ();
1923 /* Add epilogue costs. As we do not peel for alignment here, no prologue
1924 costs will be recorded. */
1925 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1926 prologue_cost_vec.create (2);
1927 epilogue_cost_vec.create (2);
1929 int dummy2;
1930 nopeel_outside_cost += vect_get_known_peeling_cost
1931 (loop_vinfo, 0, &dummy2,
1932 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1933 &prologue_cost_vec, &epilogue_cost_vec);
1935 prologue_cost_vec.release ();
1936 epilogue_cost_vec.release ();
1938 npeel = best_peel.peel_info.npeel;
1939 dr0_info = best_peel.peel_info.dr_info;
1941 /* If not peeling at all is no more expensive than the best peeling we
1942 have found so far, don't perform any peeling. */
1943 if (nopeel_inside_cost <= best_peel.inside_cost)
1944 do_peeling = false;
1947 if (do_peeling)
1949 stmt_vec_info stmt_info = dr0_info->stmt;
1950 vectype = STMT_VINFO_VECTYPE (stmt_info);
1952 if (known_alignment_for_access_p (dr0_info))
1954 bool negative = tree_int_cst_compare (DR_STEP (dr0_info->dr),
1955 size_zero_node) < 0;
1956 if (!npeel)
1958 /* Since it's known at compile time, compute the number of
1959 iterations in the peeled loop (the peeling factor) for use in
1960 updating DR_MISALIGNMENT values. The peeling factor is the
1961 vectorization factor minus the misalignment as an element
1962 count. */
1963 mis = (negative
1964 ? DR_MISALIGNMENT (dr0_info)
1965 : -DR_MISALIGNMENT (dr0_info));
1966 /* If known_alignment_for_access_p then we have set
1967 DR_MISALIGNMENT, which is only done if we know it at compile
1968 time, so it is safe to assume the target alignment is constant. */
1970 unsigned int target_align =
1971 DR_TARGET_ALIGNMENT (dr0_info).to_constant ();
1972 npeel = ((mis & (target_align - 1))
1973 / vect_get_scalar_dr_size (dr0_info));
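/* Made-up numbers for illustration: 8-byte elements, a 32-byte target
   alignment (so VF 4) and DR_MISALIGNMENT 16 with a positive step give
   MIS = -16 and npeel = (-16 & 31) / 8 = 2, i.e. VF minus a
   misalignment of two elements.  */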
1976 /* For interleaved data access every iteration accesses all the
1977 members of the group, therefore we divide the number of iterations
1978 by the group size. */
1979 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1980 npeel /= DR_GROUP_SIZE (stmt_info);
1982 if (dump_enabled_p ())
1983 dump_printf_loc (MSG_NOTE, vect_location,
1984 "Try peeling by %d\n", npeel);
1987 /* Ensure that all datarefs can be vectorized after the peel. */
1988 if (!vect_peeling_supportable (loop_vinfo, dr0_info, npeel))
1989 do_peeling = false;
1991 /* Check if all datarefs are supportable and log. */
1992 if (do_peeling && known_alignment_for_access_p (dr0_info) && npeel == 0)
1994 opt_result stat = vect_verify_datarefs_alignment (loop_vinfo);
1995 if (!stat)
1996 do_peeling = false;
1997 else
1998 return stat;
2001 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
2002 if (do_peeling)
2004 unsigned max_allowed_peel
2005 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
2006 if (max_allowed_peel != (unsigned)-1)
2008 unsigned max_peel = npeel;
2009 if (max_peel == 0)
2011 poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr0_info);
2012 unsigned HOST_WIDE_INT target_align_c;
2013 if (target_align.is_constant (&target_align_c))
2014 max_peel =
2015 target_align_c / vect_get_scalar_dr_size (dr0_info) - 1;
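/* E.g. (illustrative): with a 16-byte target alignment and 4-byte
   elements, at most 16 / 4 - 1 = 3 peeled iterations can ever be
   needed, so that is the worst case checked against the param.  */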
2016 else
2018 do_peeling = false;
2019 if (dump_enabled_p ())
2020 dump_printf_loc (MSG_NOTE, vect_location,
2021 "Disable peeling, max peels set and vector"
2022 " alignment unknown\n");
2025 if (max_peel > max_allowed_peel)
2027 do_peeling = false;
2028 if (dump_enabled_p ())
2029 dump_printf_loc (MSG_NOTE, vect_location,
2030 "Disable peeling, max peels reached: %d\n", max_peel);
2035 /* Cost model #2 - if peeling may result in a remaining loop not
2036 iterating enough to be vectorized then do not peel. Since this
2037 is a cost heuristic rather than a correctness decision, use the
2038 most likely runtime value for variable vectorization factors. */
2039 if (do_peeling
2040 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2042 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2043 unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
2044 if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
2045 < assumed_vf + max_peel)
2046 do_peeling = false;
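/* E.g. (illustrative): assumed_vf 4 and npeel 0 give max_peel 3; with
   only 6 known scalar iterations, 6 < 4 + 3 and peeling is rejected
   because too little of the loop would remain to vectorize.  */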
2049 if (do_peeling)
2051 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2052 If the misalignment of DR_i is identical to that of dr0 then set
2053 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
2054 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2055 by the peeling factor times the element size of DR_i (MOD the
2056 vectorization factor times the size). Otherwise, the
2057 misalignment of DR_i must be set to unknown. */
2058 FOR_EACH_VEC_ELT (datarefs, i, dr)
2059 if (dr != dr0_info->dr)
2061 /* Strided accesses perform only component accesses, alignment
2062 is irrelevant for them. */
2063 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2064 stmt_info = dr_info->stmt;
2065 if (STMT_VINFO_STRIDED_P (stmt_info)
2066 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2067 continue;
2069 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
2072 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0_info;
2073 if (npeel)
2074 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2075 else
2076 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2077 = DR_MISALIGNMENT (dr0_info);
2078 SET_DR_MISALIGNMENT (dr0_info, 0);
2079 if (dump_enabled_p ())
2081 dump_printf_loc (MSG_NOTE, vect_location,
2082 "Alignment of access forced using peeling.\n");
2083 dump_printf_loc (MSG_NOTE, vect_location,
2084 "Peeling for alignment will be applied.\n");
2087 /* The inside-loop cost will be accounted for in vectorizable_load
2088 and vectorizable_store correctly with adjusted alignments.
2089 Drop the body_cst_vec on the floor here. */
2090 opt_result stat = vect_verify_datarefs_alignment (loop_vinfo);
2091 gcc_assert (stat);
2092 return stat;
2096 /* (2) Versioning to force alignment. */
2098 /* Try versioning if:
2099 1) optimize loop for speed
2100 2) there is at least one unsupported misaligned data ref with an unknown
2101 misalignment, and
2102 3) all misaligned data refs with a known misalignment are supported, and
2103 4) the number of runtime alignment checks is within reason. */
2105 do_versioning =
2106 optimize_loop_nest_for_speed_p (loop)
2107 && (!loop->inner); /* FORNOW */
2109 if (do_versioning)
2111 FOR_EACH_VEC_ELT (datarefs, i, dr)
2113 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2114 stmt_vec_info stmt_info = dr_info->stmt;
2116 /* For interleaving, only the alignment of the first access
2117 matters. */
2118 if (aligned_access_p (dr_info)
2119 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2120 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info))
2121 continue;
2123 if (STMT_VINFO_STRIDED_P (stmt_info))
2125 /* Strided loads perform only component accesses, alignment is
2126 irrelevant for them. */
2127 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
2128 continue;
2129 do_versioning = false;
2130 break;
2133 supportable_dr_alignment
2134 = vect_supportable_dr_alignment (dr_info, false);
2136 if (!supportable_dr_alignment)
2138 int mask;
2139 tree vectype;
2141 if (known_alignment_for_access_p (dr_info)
2142 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2143 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
2145 do_versioning = false;
2146 break;
2149 vectype = STMT_VINFO_VECTYPE (stmt_info);
2150 gcc_assert (vectype);
2152 /* At present we don't support versioning for alignment
2153 with variable VF, since there's no guarantee that the
2154 VF is a power of two. We could relax this if we added
2155 a way of enforcing a power-of-two size. */
2156 unsigned HOST_WIDE_INT size;
2157 if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
2159 do_versioning = false;
2160 break;
2163 /* The rightmost bits of an aligned address must be zeros.
2164 Construct the mask needed for this test. For example,
2165 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2166 mask must be 15 = 0xf. */
2167 mask = size - 1;
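/* The guard of the versioned loop then checks that the recorded
   addresses have these low bits clear, conceptually
   ((addr_1 | addr_2 | ...) & mask) == 0; see
   vect_create_cond_for_align_checks (sketch of the idea only).  */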
2169 /* FORNOW: use the same mask to test all potentially unaligned
2170 references in the loop. The vectorizer currently supports
2171 a single vector size, see the reference to
2172 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
2173 vectorization factor is computed. */
2174 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
2175 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
2176 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2177 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (stmt_info);
2181 /* Versioning requires at least one misaligned data reference. */
2182 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2183 do_versioning = false;
2184 else if (!do_versioning)
2185 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2188 if (do_versioning)
2190 vec<stmt_vec_info> may_misalign_stmts
2191 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2192 stmt_vec_info stmt_info;
2194 /* It can now be assumed that the data references in the statements
2195 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2196 of the loop being vectorized. */
2197 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
2199 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
2200 SET_DR_MISALIGNMENT (dr_info, 0);
2201 if (dump_enabled_p ())
2202 dump_printf_loc (MSG_NOTE, vect_location,
2203 "Alignment of access forced using versioning.\n");
2206 if (dump_enabled_p ())
2207 dump_printf_loc (MSG_NOTE, vect_location,
2208 "Versioning for alignment will be applied.\n");
2210 /* Peeling and versioning can't be done together at this time. */
2211 gcc_assert (! (do_peeling && do_versioning));
2213 opt_result stat = vect_verify_datarefs_alignment (loop_vinfo);
2214 gcc_assert (stat);
2215 return stat;
2218 /* This point is reached if neither peeling nor versioning is being done. */
2219 gcc_assert (! (do_peeling || do_versioning));
2221 opt_result stat = vect_verify_datarefs_alignment (loop_vinfo);
2222 return stat;
2226 /* Function vect_find_same_alignment_drs.
2228 Update group and alignment relations in VINFO according to the chosen
2229 vectorization factor. */
2231 static void
2232 vect_find_same_alignment_drs (vec_info *vinfo, data_dependence_relation *ddr)
2234 struct data_reference *dra = DDR_A (ddr);
2235 struct data_reference *drb = DDR_B (ddr);
2236 dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
2237 dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
2238 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
2239 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
2241 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
2242 return;
2244 if (dra == drb)
2245 return;
2247 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
2248 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
2249 return;
2251 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)
2252 || !operand_equal_p (DR_OFFSET (dra), DR_OFFSET (drb), 0)
2253 || !operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2254 return;
2256 /* Two references with distance zero have the same alignment. */
2257 poly_offset_int diff = (wi::to_poly_offset (DR_INIT (dra))
2258 - wi::to_poly_offset (DR_INIT (drb)));
2259 if (maybe_ne (diff, 0))
2261 /* Get the wider of the two alignments. */
2262 poly_uint64 align_a =
2263 exact_div (vect_calculate_target_alignment (dr_info_a),
2264 BITS_PER_UNIT);
2265 poly_uint64 align_b =
2266 exact_div (vect_calculate_target_alignment (dr_info_b),
2267 BITS_PER_UNIT);
2268 unsigned HOST_WIDE_INT align_a_c, align_b_c;
2269 if (!align_a.is_constant (&align_a_c)
2270 || !align_b.is_constant (&align_b_c))
2271 return;
2273 unsigned HOST_WIDE_INT max_align = MAX (align_a_c, align_b_c);
2275 /* Require the gap to be a multiple of the larger vector alignment. */
2276 if (!multiple_p (diff, max_align))
2277 return;
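/* E.g. (illustrative): with 4-byte elements and a 16-byte vector
   alignment, a gap of 16 bytes (a[i] versus a[i+4]) passes this test,
   while a gap of 4 bytes (a[i] versus a[i+1]) does not.  */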
2280 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2281 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
2282 if (dump_enabled_p ())
2283 dump_printf_loc (MSG_NOTE, vect_location,
2284 "accesses have the same alignment: %T and %T\n",
2285 DR_REF (dra), DR_REF (drb));
2289 /* Function vect_analyze_data_refs_alignment
2291 Analyze the alignment of the data-references in the loop.
2292 Return FALSE if a data reference is found that cannot be vectorized. */
2294 opt_result
2295 vect_analyze_data_refs_alignment (loop_vec_info vinfo)
2297 DUMP_VECT_SCOPE ("vect_analyze_data_refs_alignment");
2299 /* Mark groups of data references with same alignment using
2300 data dependence information. */
2301 vec<ddr_p> ddrs = vinfo->shared->ddrs;
2302 struct data_dependence_relation *ddr;
2303 unsigned int i;
2305 FOR_EACH_VEC_ELT (ddrs, i, ddr)
2306 vect_find_same_alignment_drs (vinfo, ddr);
2308 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
2309 struct data_reference *dr;
2311 vect_record_base_alignments (vinfo);
2312 FOR_EACH_VEC_ELT (datarefs, i, dr)
2314 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
2315 if (STMT_VINFO_VECTORIZABLE (dr_info->stmt))
2316 vect_compute_data_ref_alignment (dr_info);
2319 return opt_result::success ();
2323 /* Analyze alignment of DRs of stmts in NODE. */
2325 static bool
2326 vect_slp_analyze_and_verify_node_alignment (slp_tree node)
2328 /* We vectorize from the first scalar stmt in the node unless
2329 the node is permuted, in which case we start from the first
2330 element in the group. */
2331 stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2332 dr_vec_info *first_dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2333 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2334 first_stmt_info = DR_GROUP_FIRST_ELEMENT (first_stmt_info);
2336 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2337 vect_compute_data_ref_alignment (dr_info);
2338 /* For creating the data-ref pointer we need alignment of the
2339 first element anyway. */
2340 if (dr_info != first_dr_info)
2341 vect_compute_data_ref_alignment (first_dr_info);
2342 if (! verify_data_ref_alignment (dr_info))
2344 if (dump_enabled_p ())
2345 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2346 "not vectorized: bad data alignment in basic "
2347 "block.\n");
2348 return false;
2351 return true;
2354 /* Function vect_slp_analyze_and_verify_instance_alignment
2356 Analyze the alignment of the data-references in the SLP instance.
2357 Return FALSE if a data reference is found that cannot be vectorized. */
2359 bool
2360 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
2362 DUMP_VECT_SCOPE ("vect_slp_analyze_and_verify_instance_alignment");
2364 slp_tree node;
2365 unsigned i;
2366 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2367 if (! vect_slp_analyze_and_verify_node_alignment (node))
2368 return false;
2370 node = SLP_INSTANCE_TREE (instance);
2371 if (STMT_VINFO_DATA_REF (SLP_TREE_SCALAR_STMTS (node)[0])
2372 && ! vect_slp_analyze_and_verify_node_alignment
2373 (SLP_INSTANCE_TREE (instance)))
2374 return false;
2376 return true;
2380 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2381 accesses of legal size, step, etc. Detect gaps, single element
2382 interleaving, and other special cases. Set grouped access info.
2383 Collect groups of strided stores for further use in SLP analysis.
2384 Worker for vect_analyze_group_access. */
2386 static bool
2387 vect_analyze_group_access_1 (dr_vec_info *dr_info)
2389 data_reference *dr = dr_info->dr;
2390 tree step = DR_STEP (dr);
2391 tree scalar_type = TREE_TYPE (DR_REF (dr));
2392 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2393 stmt_vec_info stmt_info = dr_info->stmt;
2394 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2395 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2396 HOST_WIDE_INT dr_step = -1;
2397 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2398 bool slp_impossible = false;
2400 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2401 size of the interleaving group (including gaps). */
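/* For example (illustrative), 4-byte loads from a[3*i] and a[3*i+1]
   have DR_STEP 12, so GROUPSIZE is 3 with a one-element gap at
   a[3*i+2].  */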
2402 if (tree_fits_shwi_p (step))
2404 dr_step = tree_to_shwi (step);
2405 /* Check that STEP is a multiple of type size. Otherwise there is
2406 a non-element-sized gap at the end of the group which we
2407 cannot represent in DR_GROUP_GAP or DR_GROUP_SIZE.
2408 ??? As we can handle non-constant step fine here we should
2409 simply remove uses of DR_GROUP_GAP between the last and first
2410 element and instead rely on DR_STEP. DR_GROUP_SIZE then would
2411 simply not include that gap. */
2412 if ((dr_step % type_size) != 0)
2414 if (dump_enabled_p ())
2415 dump_printf_loc (MSG_NOTE, vect_location,
2416 "Step %T is not a multiple of the element size"
2417 " for %T\n",
2418 step, DR_REF (dr));
2419 return false;
2421 groupsize = absu_hwi (dr_step) / type_size;
2423 else
2424 groupsize = 0;
2426 /* A non-consecutive access is possible only if it is part of an interleaving group. */
2427 if (!DR_GROUP_FIRST_ELEMENT (stmt_info))
2429 /* Check whether this DR is part of an interleaving group and is the
2430 single element of the group that is accessed in the loop. */
2432 /* Gaps are supported only for loads. STEP must be a multiple of the type
2433 size. */
2434 if (DR_IS_READ (dr)
2435 && (dr_step % type_size) == 0
2436 && groupsize > 0)
2438 DR_GROUP_FIRST_ELEMENT (stmt_info) = stmt_info;
2439 DR_GROUP_SIZE (stmt_info) = groupsize;
2440 DR_GROUP_GAP (stmt_info) = groupsize - 1;
2441 if (dump_enabled_p ())
2442 dump_printf_loc (MSG_NOTE, vect_location,
2443 "Detected single element interleaving %T"
2444 " step %T\n",
2445 DR_REF (dr), step);
2447 return true;
2450 if (dump_enabled_p ())
2451 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2452 "not consecutive access %G", stmt_info->stmt);
2454 if (bb_vinfo)
2456 /* Mark the statement as unvectorizable. */
2457 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
2458 return true;
2461 if (dump_enabled_p ())
2462 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2463 STMT_VINFO_STRIDED_P (stmt_info) = true;
2464 return true;
2467 if (DR_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info)
2469 /* First stmt in the interleaving chain. Check the chain. */
2470 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2471 struct data_reference *data_ref = dr;
2472 unsigned int count = 1;
2473 tree prev_init = DR_INIT (data_ref);
2474 stmt_vec_info prev = stmt_info;
2475 HOST_WIDE_INT diff, gaps = 0;
2477 /* By construction, all group members have INTEGER_CST DR_INITs. */
2478 while (next)
2480 /* Skip same data-refs. If two or more stmts share the same
2481 data-ref (supported only for loads), we vectorize only the first
2482 stmt, and the rest get their vectorized loads from the first
2483 one. */
2484 if (!tree_int_cst_compare (DR_INIT (data_ref),
2485 DR_INIT (STMT_VINFO_DATA_REF (next))))
2487 if (DR_IS_WRITE (data_ref))
2489 if (dump_enabled_p ())
2490 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2491 "Two store stmts share the same dr.\n");
2492 return false;
2495 if (dump_enabled_p ())
2496 dump_printf_loc (MSG_NOTE, vect_location,
2497 "Two or more load stmts share the same dr.\n");
2499 /* For loads, use the same data-ref load. */
2500 DR_GROUP_SAME_DR_STMT (next) = prev;
2502 prev = next;
2503 next = DR_GROUP_NEXT_ELEMENT (next);
2504 continue;
2507 prev = next;
2508 data_ref = STMT_VINFO_DATA_REF (next);
2510 /* All group members have the same STEP by construction. */
2511 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2513 /* Check that the distance between two accesses is equal to the type
2514 size. Otherwise, we have gaps. */
2515 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2516 - TREE_INT_CST_LOW (prev_init)) / type_size;
2517 if (diff != 1)
2519 /* FORNOW: SLP of accesses with gaps is not supported. */
2520 slp_impossible = true;
2521 if (DR_IS_WRITE (data_ref))
2523 if (dump_enabled_p ())
2524 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2525 "interleaved store with gaps\n");
2526 return false;
2529 gaps += diff - 1;
2532 last_accessed_element += diff;
2534 /* Store the gap from the previous member of the group. If there is no
2535 gap in the access, DR_GROUP_GAP is always 1. */
2536 DR_GROUP_GAP (next) = diff;
2538 prev_init = DR_INIT (data_ref);
2539 next = DR_GROUP_NEXT_ELEMENT (next);
2540 /* Count the number of data-refs in the chain. */
2541 count++;
2544 if (groupsize == 0)
2545 groupsize = count + gaps;
2547 /* This could be UINT_MAX, but since we generate code for such groups in a
2548 very inefficient way we have to cap it earlier. See PR78699 for an example. */
2549 if (groupsize > 4096)
2551 if (dump_enabled_p ())
2552 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2553 "group is too large\n");
2554 return false;
2557 /* Check that the size of the interleaving is equal to count for stores,
2558 i.e., that there are no gaps. */
2559 if (groupsize != count
2560 && !DR_IS_READ (dr))
2562 groupsize = count;
2563 STMT_VINFO_STRIDED_P (stmt_info) = true;
2566 /* If there is a gap after the last load in the group it is the
2567 difference between the groupsize and the last accessed
2568 element.
2569 When there is no gap, this difference should be 0. */
2570 DR_GROUP_GAP (stmt_info) = groupsize - last_accessed_element;
2572 DR_GROUP_SIZE (stmt_info) = groupsize;
2573 if (dump_enabled_p ())
2575 dump_printf_loc (MSG_NOTE, vect_location,
2576 "Detected interleaving ");
2577 if (DR_IS_READ (dr))
2578 dump_printf (MSG_NOTE, "load ");
2579 else if (STMT_VINFO_STRIDED_P (stmt_info))
2580 dump_printf (MSG_NOTE, "strided store ");
2581 else
2582 dump_printf (MSG_NOTE, "store ");
2583 dump_printf (MSG_NOTE, "of size %u\n",
2584 (unsigned)groupsize);
2585 dump_printf_loc (MSG_NOTE, vect_location, "\t%G", stmt_info->stmt);
2586 next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2587 while (next)
2589 if (DR_GROUP_GAP (next) != 1)
2590 dump_printf_loc (MSG_NOTE, vect_location,
2591 "\t<gap of %d elements>\n",
2592 DR_GROUP_GAP (next) - 1);
2593 dump_printf_loc (MSG_NOTE, vect_location, "\t%G", next->stmt);
2594 next = DR_GROUP_NEXT_ELEMENT (next);
2596 if (DR_GROUP_GAP (stmt_info) != 0)
2597 dump_printf_loc (MSG_NOTE, vect_location,
2598 "\t<gap of %d elements>\n",
2599 DR_GROUP_GAP (stmt_info));
2602 /* SLP: create an SLP data structure for every interleaving group of
2603 stores for further analysis in vect_analyze_slp. */
2604 if (DR_IS_WRITE (dr) && !slp_impossible)
2606 if (loop_vinfo)
2607 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt_info);
2608 if (bb_vinfo)
2609 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
2613 return true;
2616 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2617 accesses of legal size, step, etc. Detect gaps, single element
2618 interleaving, and other special cases. Set grouped access info.
2619 Collect groups of strided stores for further use in SLP analysis. */
2621 static bool
2622 vect_analyze_group_access (dr_vec_info *dr_info)
2624 if (!vect_analyze_group_access_1 (dr_info))
2626 /* Dissolve the group if present. */
2627 stmt_vec_info stmt_info = DR_GROUP_FIRST_ELEMENT (dr_info->stmt);
2628 while (stmt_info)
2630 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2631 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2632 DR_GROUP_NEXT_ELEMENT (stmt_info) = NULL;
2633 stmt_info = next;
2635 return false;
2637 return true;
2640 /* Analyze the access pattern of the data-reference DR_INFO.
2641 In case of non-consecutive accesses call vect_analyze_group_access() to
2642 analyze groups of accesses. */
2644 static bool
2645 vect_analyze_data_ref_access (dr_vec_info *dr_info)
2647 data_reference *dr = dr_info->dr;
2648 tree step = DR_STEP (dr);
2649 tree scalar_type = TREE_TYPE (DR_REF (dr));
2650 stmt_vec_info stmt_info = dr_info->stmt;
2651 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2652 struct loop *loop = NULL;
2654 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
2655 return true;
2657 if (loop_vinfo)
2658 loop = LOOP_VINFO_LOOP (loop_vinfo);
2660 if (loop_vinfo && !step)
2662 if (dump_enabled_p ())
2663 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2664 "bad data-ref access in loop\n");
2665 return false;
2668 /* Allow loads with zero step in inner-loop vectorization. */
2669 if (loop_vinfo && integer_zerop (step))
2671 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2672 if (!nested_in_vect_loop_p (loop, stmt_info))
2673 return DR_IS_READ (dr);
2674 /* Allow references with zero step for outer loops marked
2675 with pragma omp simd only - it guarantees absence of
2676 loop-carried dependencies between inner loop iterations. */
2677 if (loop->safelen < 2)
2679 if (dump_enabled_p ())
2680 dump_printf_loc (MSG_NOTE, vect_location,
2681 "zero step in inner loop of nest\n");
2682 return false;
2686 if (loop && nested_in_vect_loop_p (loop, stmt_info))
2688 /* Interleaved accesses are not yet supported within outer-loop
2689 vectorization for references in the inner-loop. */
2690 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2692 /* For the rest of the analysis we use the outer-loop step. */
2693 step = STMT_VINFO_DR_STEP (stmt_info);
2694 if (integer_zerop (step))
2696 if (dump_enabled_p ())
2697 dump_printf_loc (MSG_NOTE, vect_location,
2698 "zero step in outer loop.\n");
2699 return DR_IS_READ (dr);
2703 /* Consecutive? */
2704 if (TREE_CODE (step) == INTEGER_CST)
2706 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2707 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2708 || (dr_step < 0
2709 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2711 /* Mark that it is not interleaving. */
2712 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2713 return true;
2717 if (loop && nested_in_vect_loop_p (loop, stmt_info))
2719 if (dump_enabled_p ())
2720 dump_printf_loc (MSG_NOTE, vect_location,
2721 "grouped access in outer loop.\n");
2722 return false;
2726 /* Assume this is a DR handled by the non-constant strided load case. */
2727 if (TREE_CODE (step) != INTEGER_CST)
2728 return (STMT_VINFO_STRIDED_P (stmt_info)
2729 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2730 || vect_analyze_group_access (dr_info)));
2732 /* Not a consecutive access - check if it's part of an interleaving group. */
2733 return vect_analyze_group_access (dr_info);
2736 /* Compare two data-references DRA and DRB, ordering them so that
2737 references which may form a group end up next to each other. */
2739 static int
2740 dr_group_sort_cmp (const void *dra_, const void *drb_)
2742 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2743 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2744 int cmp;
2746 /* Stabilize sort. */
2747 if (dra == drb)
2748 return 0;
2750 /* DRs in different loops never belong to the same group. */
2751 loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father;
2752 loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father;
2753 if (loopa != loopb)
2754 return loopa->num < loopb->num ? -1 : 1;
2756 /* Ordering of DRs according to base. */
2757 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2758 DR_BASE_ADDRESS (drb));
2759 if (cmp != 0)
2760 return cmp;
2762 /* And according to DR_OFFSET. */
2763 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2764 if (cmp != 0)
2765 return cmp;
2767 /* Put reads before writes. */
2768 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2769 return DR_IS_READ (dra) ? -1 : 1;
2771 /* Then sort by access size. */
2772 cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2773 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2774 if (cmp != 0)
2775 return cmp;
2777 /* And then by step. */
2778 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2779 if (cmp != 0)
2780 return cmp;
2782 /* Then sort by DR_INIT. In case of identical DRs, sort by stmt UID. */
2783 cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
2784 if (cmp == 0)
2785 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2786 return cmp;
2789 /* If OP is the result of a conversion, return the unconverted value,
2790 otherwise return null. */
2792 static tree
2793 strip_conversion (tree op)
2795 if (TREE_CODE (op) != SSA_NAME)
2796 return NULL_TREE;
2797 gimple *stmt = SSA_NAME_DEF_STMT (op);
2798 if (!is_gimple_assign (stmt)
2799 || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt)))
2800 return NULL_TREE;
2801 return gimple_assign_rhs1 (stmt);
2804 /* Return true if vectorizable_* routines can handle statements STMT1_INFO
2805 and STMT2_INFO being in a single group. */
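/* For example, two .MASK_LOAD calls whose mask arguments are the same
   SSA name (possibly behind a conversion) can form a group, while a
   .MASK_LOAD paired with a .MASK_STORE, or two calls with unrelated
   masks, cannot.  */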
2807 static bool
2808 can_group_stmts_p (stmt_vec_info stmt1_info, stmt_vec_info stmt2_info)
2810 if (gimple_assign_single_p (stmt1_info->stmt))
2811 return gimple_assign_single_p (stmt2_info->stmt);
2813 gcall *call1 = dyn_cast <gcall *> (stmt1_info->stmt);
2814 if (call1 && gimple_call_internal_p (call1))
2816 /* Check for two masked loads or two masked stores. */
2817 gcall *call2 = dyn_cast <gcall *> (stmt2_info->stmt);
2818 if (!call2 || !gimple_call_internal_p (call2))
2819 return false;
2820 internal_fn ifn = gimple_call_internal_fn (call1);
2821 if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE)
2822 return false;
2823 if (ifn != gimple_call_internal_fn (call2))
2824 return false;
2826 /* Check that the masks are the same. Cope with casts of masks,
2827 like those created by build_mask_conversion. */
2828 tree mask1 = gimple_call_arg (call1, 2);
2829 tree mask2 = gimple_call_arg (call2, 2);
2830 if (!operand_equal_p (mask1, mask2, 0))
2832 mask1 = strip_conversion (mask1);
2833 if (!mask1)
2834 return false;
2835 mask2 = strip_conversion (mask2);
2836 if (!mask2)
2837 return false;
2838 if (!operand_equal_p (mask1, mask2, 0))
2839 return false;
2841 return true;
2844 return false;
2847 /* Function vect_analyze_data_ref_accesses.
2849 Analyze the access pattern of all the data references in the loop.
2851 FORNOW: the only access pattern that is considered vectorizable is a
2852 simple step 1 (consecutive) access.
2854 FORNOW: handle only arrays and pointer accesses. */
2856 opt_result
2857 vect_analyze_data_ref_accesses (vec_info *vinfo)
2859 unsigned int i;
2860 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
2861 struct data_reference *dr;
2863 DUMP_VECT_SCOPE ("vect_analyze_data_ref_accesses");
2865 if (datarefs.is_empty ())
2866 return opt_result::success ();
2868 /* Sort the array of datarefs to make building the interleaving chains
2869 linear. Don't modify the original vector's order, it is needed for
2870 determining what dependencies are reversed. */
2871 vec<data_reference_p> datarefs_copy = datarefs.copy ();
2872 datarefs_copy.qsort (dr_group_sort_cmp);
2873 hash_set<stmt_vec_info> to_fixup;
2875 /* Build the interleaving chains. */
2876 for (i = 0; i < datarefs_copy.length () - 1;)
2878 data_reference_p dra = datarefs_copy[i];
2879 dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
2880 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
2881 stmt_vec_info lastinfo = NULL;
2882 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
2883 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a))
2885 ++i;
2886 continue;
2888 for (i = i + 1; i < datarefs_copy.length (); ++i)
2890 data_reference_p drb = datarefs_copy[i];
2891 dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
2892 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
2893 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b)
2894 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
2895 break;
2897 /* ??? Imperfect sorting (non-compatible types, non-modulo
2898 accesses, same accesses) can lead to a group being artificially
2899 split here as we don't just skip over those. If it really
2900 matters we can push those to a worklist and re-iterate
2901 over them. Then we can just skip ahead to the next DR here. */
2903 /* DRs in a different loop should not be put into the same
2904 interleaving group. */
2905 if (gimple_bb (DR_STMT (dra))->loop_father
2906 != gimple_bb (DR_STMT (drb))->loop_father)
2907 break;
2909 /* Check that the data-refs have the same first location (except init),
2910 that they are both either stores or loads (not one load and one store),
2911 and that their statement forms can be grouped (see can_group_stmts_p). */
2912 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2913 || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2914 DR_BASE_ADDRESS (drb)) != 0
2915 || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
2916 || !can_group_stmts_p (stmtinfo_a, stmtinfo_b))
2917 break;
2919 /* Check that the data-refs have the same constant size. */
2920 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2921 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2922 if (!tree_fits_uhwi_p (sza)
2923 || !tree_fits_uhwi_p (szb)
2924 || !tree_int_cst_equal (sza, szb))
2925 break;
2927 /* Check that the data-refs have the same step. */
2928 if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
2929 break;
2931 /* Check the types are compatible.
2932 ??? We don't distinguish this during sorting. */
2933 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2934 TREE_TYPE (DR_REF (drb))))
2935 break;
2937 /* Check that the DR_INITs are compile-time constants. */
2938 if (TREE_CODE (DR_INIT (dra)) != INTEGER_CST
2939 || TREE_CODE (DR_INIT (drb)) != INTEGER_CST)
2940 break;
2942 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
2943 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2944 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2945 HOST_WIDE_INT init_prev
2946 = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]));
2947 gcc_assert (init_a <= init_b
2948 && init_a <= init_prev
2949 && init_prev <= init_b);
2951 /* Do not place the same access in the interleaving chain twice. */
2952 if (init_b == init_prev)
2954 gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]))
2955 < gimple_uid (DR_STMT (drb)));
2956 /* Simply link in duplicates and fix up the chain below. */
2958 else
2960 /* If init_b == init_a + the size of the type * k, we have an
2961 interleaving, and DRA is accessed before DRB. */
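/* E.g. (illustrative), 4-byte loads with DR_INITs 0, 4 and 12 still
   form one chain, the missing element at offset 8 becoming a gap;
   for stores the adjacency check below splits the chain instead.  */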
2962 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
2963 if (type_size_a == 0
2964 || (init_b - init_a) % type_size_a != 0)
2965 break;
2967 /* If we have a store, the accesses are adjacent. This splits
2968 groups into chunks we support (we don't support vectorization
2969 of stores with gaps). */
2970 if (!DR_IS_READ (dra) && init_b - init_prev != type_size_a)
2971 break;
2973 /* If the step is constant and non-zero, and no greater than the
2974 difference between the data-refs' inits, split the group here;
2975 this splits groups into suitable sizes. */
2976 if (tree_fits_shwi_p (DR_STEP (dra)))
2978 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
2979 if (step != 0 && step <= (init_b - init_a))
2980 break;
2984 if (dump_enabled_p ())
2985 dump_printf_loc (MSG_NOTE, vect_location,
2986 DR_IS_READ (dra)
2987 ? "Detected interleaving load %T and %T\n"
2988 : "Detected interleaving store %T and %T\n",
2989 DR_REF (dra), DR_REF (drb));
2991 /* Link the found element into the group list. */
2992 if (!DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
2994 DR_GROUP_FIRST_ELEMENT (stmtinfo_a) = stmtinfo_a;
2995 lastinfo = stmtinfo_a;
2997 DR_GROUP_FIRST_ELEMENT (stmtinfo_b) = stmtinfo_a;
2998 DR_GROUP_NEXT_ELEMENT (lastinfo) = stmtinfo_b;
2999 lastinfo = stmtinfo_b;
3001 if (init_b == init_prev
3002 && !to_fixup.add (DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3003 && dump_enabled_p ())
3004 dump_printf_loc (MSG_NOTE, vect_location,
3005 "Queuing group with duplicate access for fixup\n");
3009 /* Fix up groups with duplicate entries by splitting them. */
3010 while (1)
3012 hash_set<stmt_vec_info>::iterator it = to_fixup.begin ();
3013 if (!(it != to_fixup.end ()))
3014 break;
3015 stmt_vec_info grp = *it;
3016 to_fixup.remove (grp);
3018 /* Find the earliest duplicate group member. */
3019 unsigned first_duplicate = -1u;
3020 stmt_vec_info next, g = grp;
3021 while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3023 if ((DR_INIT (STMT_VINFO_DR_INFO (next)->dr)
3024 == DR_INIT (STMT_VINFO_DR_INFO (g)->dr))
3025 && gimple_uid (STMT_VINFO_STMT (next)) < first_duplicate)
3026 first_duplicate = gimple_uid (STMT_VINFO_STMT (next));
3027 g = next;
3029 if (first_duplicate == -1U)
3030 continue;
3032 /* Then move all stmts after the first duplicate to a new group.
3033 Note this is a heuristic but one with the property that *it
3034 is fixed up completely. */
3035 g = grp;
3036 stmt_vec_info newgroup = NULL, ng = grp;
3037 while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3039 if (gimple_uid (STMT_VINFO_STMT (next)) >= first_duplicate)
3041 DR_GROUP_NEXT_ELEMENT (g) = DR_GROUP_NEXT_ELEMENT (next);
3042 if (!newgroup)
3043 newgroup = next;
3044 else
3045 DR_GROUP_NEXT_ELEMENT (ng) = next;
3046 ng = next;
3047 DR_GROUP_FIRST_ELEMENT (ng) = newgroup;
3049 else
3050 g = DR_GROUP_NEXT_ELEMENT (g);
3052 DR_GROUP_NEXT_ELEMENT (ng) = NULL;
3054 /* Fixup the new group which still may contain duplicates. */
3055 to_fixup.add (newgroup);
3058 FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
3060 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
3061 if (STMT_VINFO_VECTORIZABLE (dr_info->stmt)
3062 && !vect_analyze_data_ref_access (dr_info))
3064 if (dump_enabled_p ())
3065 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3066 "not vectorized: complicated access pattern.\n");
3068 if (is_a <bb_vec_info> (vinfo))
3070 /* Mark the statement as not vectorizable. */
3071 STMT_VINFO_VECTORIZABLE (dr_info->stmt) = false;
3072 continue;
3074 else
3076 datarefs_copy.release ();
3077 return opt_result::failure_at (dr_info->stmt->stmt,
3078 "not vectorized:"
3079 " complicated access pattern.\n");
3084 datarefs_copy.release ();
3085 return opt_result::success ();
3088 /* Function vect_vfa_segment_size.
3090 Input:
3091 DR_INFO: The data reference.
3092 LENGTH_FACTOR: segment length to consider.
3094 Return a value suitable for the dr_with_seg_len::seg_len field.
3095 This is the "distance travelled" by the pointer from the first
3096 iteration in the segment to the last. Note that it does not include
3097 the size of the access; in effect it only describes the first byte. */
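/* For instance (illustrative numbers), a data reference with DR_STEP 4
   and LENGTH_FACTOR 8 gets a segment length of 4 * (8 - 1) = 28 bytes.  */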
3099 static tree
3100 vect_vfa_segment_size (dr_vec_info *dr_info, tree length_factor)
3102 length_factor = size_binop (MINUS_EXPR,
3103 fold_convert (sizetype, length_factor),
3104 size_one_node);
3105 return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr_info->dr)),
3106 length_factor);
3109 /* Return a value that, when added to abs (vect_vfa_segment_size (DR_INFO)),
3110 gives the worst-case number of bytes covered by the segment. */
3112 static unsigned HOST_WIDE_INT
3113 vect_vfa_access_size (dr_vec_info *dr_info)
3115 stmt_vec_info stmt_vinfo = dr_info->stmt;
3116 tree ref_type = TREE_TYPE (DR_REF (dr_info->dr));
3117 unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type));
3118 unsigned HOST_WIDE_INT access_size = ref_size;
3119 if (DR_GROUP_FIRST_ELEMENT (stmt_vinfo))
3121 gcc_assert (DR_GROUP_FIRST_ELEMENT (stmt_vinfo) == stmt_vinfo);
3122 access_size *= DR_GROUP_SIZE (stmt_vinfo) - DR_GROUP_GAP (stmt_vinfo);
3124 if (STMT_VINFO_VEC_STMT (stmt_vinfo)
3125 && (vect_supportable_dr_alignment (dr_info, false)
3126 == dr_explicit_realign_optimized))
3128 /* We might access a full vector's worth. */
3129 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
3130 access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size;
3132 return access_size;
3135 /* Get the minimum alignment for all the scalar accesses that DR_INFO
3136 describes. */
3138 static unsigned int
3139 vect_vfa_align (dr_vec_info *dr_info)
3141 return TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr_info->dr)));
3144 /* Function vect_compile_time_alias.
3146 Given data references A and B with equal base and offset, see whether
3147 the alias relation can be decided at compilation time. Return 1 if
3148 it can and the references alias, 0 if it can and the references do
3149 not alias, and -1 if we cannot decide at compile time. SEGMENT_LENGTH_A,
3150 SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
3151 of dr_with_seg_len::{seg_len,access_size} for A and B. */
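/* As an illustration (made-up numbers): accesses at DR_INITs 0 and 64,
   both with positive step, segment length 32 and access size 4 cover
   [0, 36) and [64, 100); the ranges cannot overlap, so 0 is returned.  */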
3153 static int
3154 vect_compile_time_alias (dr_vec_info *a, dr_vec_info *b,
3155 tree segment_length_a, tree segment_length_b,
3156 unsigned HOST_WIDE_INT access_size_a,
3157 unsigned HOST_WIDE_INT access_size_b)
3159 poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a->dr));
3160 poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b->dr));
3161 poly_uint64 const_length_a;
3162 poly_uint64 const_length_b;
3164 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3165 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3166 [a, a+12) */
3167 if (tree_int_cst_compare (DR_STEP (a->dr), size_zero_node) < 0)
3169 const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
3170 offset_a = (offset_a + access_size_a) - const_length_a;
3172 else
3173 const_length_a = tree_to_poly_uint64 (segment_length_a);
3174 if (tree_int_cst_compare (DR_STEP (b->dr), size_zero_node) < 0)
3176 const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
3177 offset_b = (offset_b + access_size_b) - const_length_b;
3179 else
3180 const_length_b = tree_to_poly_uint64 (segment_length_b);
3182 const_length_a += access_size_a;
3183 const_length_b += access_size_b;
3185 if (ranges_known_overlap_p (offset_a, const_length_a,
3186 offset_b, const_length_b))
3187 return 1;
3189 if (!ranges_maybe_overlap_p (offset_a, const_length_a,
3190 offset_b, const_length_b))
3191 return 0;
3193 return -1;
3196 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3197 in DDR is >= VF. */
3199 static bool
3200 dependence_distance_ge_vf (data_dependence_relation *ddr,
3201 unsigned int loop_depth, poly_uint64 vf)
3203 if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3204 || DDR_NUM_DIST_VECTS (ddr) == 0)
3205 return false;
3207 /* If the dependence is exact, we should have limited the VF instead. */
3208 gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3210 unsigned int i;
3211 lambda_vector dist_v;
3212 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3214 HOST_WIDE_INT dist = dist_v[loop_depth];
3215 if (dist != 0
3216 && !(dist > 0 && DDR_REVERSED_P (ddr))
3217 && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
3218 return false;
3221 if (dump_enabled_p ())
3222 dump_printf_loc (MSG_NOTE, vect_location,
3223 "dependence distance between %T and %T is >= VF\n",
3224 DR_REF (DDR_A (ddr)), DR_REF (DDR_B (ddr)));
3226 return true;
3229 /* Dump LOWER_BOUND using flags DUMP_KIND. Dumps are known to be enabled. */
3231 static void
3232 dump_lower_bound (dump_flags_t dump_kind, const vec_lower_bound &lower_bound)
3234 dump_printf (dump_kind, "%s (%T) >= ",
3235 lower_bound.unsigned_p ? "unsigned" : "abs",
3236 lower_bound.expr);
3237 dump_dec (dump_kind, lower_bound.min_value);
3240 /* Record that the vectorized loop requires the vec_lower_bound described
3241 by EXPR, UNSIGNED_P and MIN_VALUE. */
3243 static void
3244 vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p,
3245 poly_uint64 min_value)
3247 vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3248 for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3249 if (operand_equal_p (lower_bounds[i].expr, expr, 0))
3251 unsigned_p &= lower_bounds[i].unsigned_p;
3252 min_value = upper_bound (lower_bounds[i].min_value, min_value);
3253 if (lower_bounds[i].unsigned_p != unsigned_p
3254 || maybe_lt (lower_bounds[i].min_value, min_value))
3256 lower_bounds[i].unsigned_p = unsigned_p;
3257 lower_bounds[i].min_value = min_value;
3258 if (dump_enabled_p ())
3260 dump_printf_loc (MSG_NOTE, vect_location,
3261 "updating run-time check to ");
3262 dump_lower_bound (MSG_NOTE, lower_bounds[i]);
3263 dump_printf (MSG_NOTE, "\n");
3266 return;
3269 vec_lower_bound lower_bound (expr, unsigned_p, min_value);
3270 if (dump_enabled_p ())
3272 dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that ");
3273 dump_lower_bound (MSG_NOTE, lower_bound);
3274 dump_printf (MSG_NOTE, "\n");
3276 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (lower_bound);
3279 /* Return true if it's unlikely that the step of the vectorized form of DR_INFO
3280 will span fewer than GAP bytes. */
3282 static bool
3283 vect_small_gap_p (loop_vec_info loop_vinfo, dr_vec_info *dr_info,
3284 poly_int64 gap)
3286 stmt_vec_info stmt_info = dr_info->stmt;
3287 HOST_WIDE_INT count
3288 = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
3289 if (DR_GROUP_FIRST_ELEMENT (stmt_info))
3290 count *= DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info));
3291 return (estimated_poly_value (gap)
3292 <= count * vect_get_scalar_dr_size (dr_info));
3295 /* Return true if we know that there is no alias between DR_INFO_A and
3296 DR_INFO_B when abs (DR_STEP (DR_INFO_A->dr)) >= N for some N.
3297 When returning true, set *LOWER_BOUND_OUT to this N. */
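/* For example (illustrative), scalar accesses to a[i] and a[i+2] with
   4-byte elements give N = 8 + 4 = 12: once abs (DR_STEP) is at least
   12 bytes the two access streams cannot overlap.  */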
3299 static bool
3300 vectorizable_with_step_bound_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b,
3301 poly_uint64 *lower_bound_out)
3303 /* Check that there is a constant gap of known sign between DR_A
3304 and DR_B. */
3305 data_reference *dr_a = dr_info_a->dr;
3306 data_reference *dr_b = dr_info_b->dr;
3307 poly_int64 init_a, init_b;
3308 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0)
3309 || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0)
3310 || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)
3311 || !poly_int_tree_p (DR_INIT (dr_a), &init_a)
3312 || !poly_int_tree_p (DR_INIT (dr_b), &init_b)
3313 || !ordered_p (init_a, init_b))
3314 return false;
3316 /* Sort DR_A and DR_B by the address they access. */
3317 if (maybe_lt (init_b, init_a))
3319 std::swap (init_a, init_b);
3320 std::swap (dr_info_a, dr_info_b);
3321 std::swap (dr_a, dr_b);
3324 /* If the two accesses could be dependent within a scalar iteration,
3325 make sure that we'd retain their order. */
3326 if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_info_a), init_b)
3327 && !vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
3328 return false;
3330 /* There is no alias if abs (DR_STEP) is greater than or equal to
3331 the bytes spanned by the combination of the two accesses. */
3332 *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_info_b) - init_a;
3333 return true;
3336 /* Function vect_prune_runtime_alias_test_list.
3338 Prune a list of ddrs to be tested at run-time by versioning for alias.
3339 Merge several alias checks into one if possible.
3340 Return FALSE if the resulting list of ddrs is longer than allowed by
3341 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
3343 opt_result
3344 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3346 typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3347 hash_set <tree_pair_hash> compared_objects;
3349 vec<ddr_p> may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3350 vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3351 = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3352 vec<vec_object_pair> &check_unequal_addrs
3353 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3354 poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3355 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3357 ddr_p ddr;
3358 unsigned int i;
3359 tree length_factor;
3361 DUMP_VECT_SCOPE ("vect_prune_runtime_alias_test_list");
3363 /* Step values are irrelevant for aliasing if the number of vector
3364 iterations is equal to the number of scalar iterations (which can
3365 happen for fully-SLP loops). */
3366 bool ignore_step_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U);
3368 if (!ignore_step_p)
3370 /* Convert the checks for nonzero steps into bound tests. */
3371 tree value;
3372 FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value)
3373 vect_check_lower_bound (loop_vinfo, value, true, 1);
3376 if (may_alias_ddrs.is_empty ())
3377 return opt_result::success ();
3379 comp_alias_ddrs.create (may_alias_ddrs.length ());
3381 unsigned int loop_depth
3382 = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3383 LOOP_VINFO_LOOP_NEST (loop_vinfo));
3385 /* First, we collect all data ref pairs for aliasing checks. */
3386 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3388 int comp_res;
3389 poly_uint64 lower_bound;
3390 tree segment_length_a, segment_length_b;
3391 unsigned HOST_WIDE_INT access_size_a, access_size_b;
3392 unsigned int align_a, align_b;
3394 /* Ignore the alias if the VF we chose ended up being no greater
3395 than the dependence distance. */
3396 if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3397 continue;
3399 if (DDR_OBJECT_A (ddr))
3401 vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3402 if (!compared_objects.add (new_pair))
3404 if (dump_enabled_p ())
3405 dump_printf_loc (MSG_NOTE, vect_location,
3406 "checking that %T and %T"
3407 " have different addresses\n",
3408 new_pair.first, new_pair.second);
3409 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3411 continue;
3414 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
3415 stmt_vec_info stmt_info_a = dr_info_a->stmt;
3417 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
3418 stmt_vec_info stmt_info_b = dr_info_b->stmt;
3420 /* Skip the pair if inter-iteration dependencies are irrelevant
3421 and intra-iteration dependencies are guaranteed to be honored. */
3422 if (ignore_step_p
3423 && (vect_preserves_scalar_order_p (dr_info_a, dr_info_b)
3424 || vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3425 &lower_bound)))
3427 if (dump_enabled_p ())
3428 dump_printf_loc (MSG_NOTE, vect_location,
3429 "no need for alias check between "
3430 "%T and %T when VF is 1\n",
3431 DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
3432 continue;
3435 /* See whether we can handle the alias using a bounds check on
3436 the step, and whether that's likely to be the best approach.
3437 (It might not be, for example, if the minimum step is much larger
3438 than the number of bytes handled by one vector iteration.) */
3439 if (!ignore_step_p
3440 && TREE_CODE (DR_STEP (dr_info_a->dr)) != INTEGER_CST
3441 && vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3442 &lower_bound)
3443 && (vect_small_gap_p (loop_vinfo, dr_info_a, lower_bound)
3444 || vect_small_gap_p (loop_vinfo, dr_info_b, lower_bound)))
3446 bool unsigned_p = dr_known_forward_stride_p (dr_info_a->dr);
3447 if (dump_enabled_p ())
3449 dump_printf_loc (MSG_NOTE, vect_location, "no alias between "
3450 "%T and %T when the step %T is outside ",
3451 DR_REF (dr_info_a->dr),
3452 DR_REF (dr_info_b->dr),
3453 DR_STEP (dr_info_a->dr));
3454 if (unsigned_p)
3455 dump_printf (MSG_NOTE, "[0");
3456 else
3458 dump_printf (MSG_NOTE, "(");
3459 dump_dec (MSG_NOTE, poly_int64 (-lower_bound));
3461 dump_printf (MSG_NOTE, ", ");
3462 dump_dec (MSG_NOTE, lower_bound);
3463 dump_printf (MSG_NOTE, ")\n");
3465 vect_check_lower_bound (loop_vinfo, DR_STEP (dr_info_a->dr),
3466 unsigned_p, lower_bound);
3467 continue;
3470 stmt_vec_info dr_group_first_a = DR_GROUP_FIRST_ELEMENT (stmt_info_a);
3471 if (dr_group_first_a)
3473 stmt_info_a = dr_group_first_a;
3474 dr_info_a = STMT_VINFO_DR_INFO (stmt_info_a);
3477 stmt_vec_info dr_group_first_b = DR_GROUP_FIRST_ELEMENT (stmt_info_b);
3478 if (dr_group_first_b)
3480 stmt_info_b = dr_group_first_b;
3481 dr_info_b = STMT_VINFO_DR_INFO (stmt_info_b);
3484 if (ignore_step_p)
3486 segment_length_a = size_zero_node;
3487 segment_length_b = size_zero_node;
3489 else
3491 if (!operand_equal_p (DR_STEP (dr_info_a->dr),
3492 DR_STEP (dr_info_b->dr), 0))
3493 length_factor = scalar_loop_iters;
3494 else
3495 length_factor = size_int (vect_factor);
3496 segment_length_a = vect_vfa_segment_size (dr_info_a, length_factor);
3497 segment_length_b = vect_vfa_segment_size (dr_info_b, length_factor);
3499 access_size_a = vect_vfa_access_size (dr_info_a);
3500 access_size_b = vect_vfa_access_size (dr_info_b);
3501 align_a = vect_vfa_align (dr_info_a);
3502 align_b = vect_vfa_align (dr_info_b);
3504 comp_res = data_ref_compare_tree (DR_BASE_ADDRESS (dr_info_a->dr),
3505 DR_BASE_ADDRESS (dr_info_b->dr));
3506 if (comp_res == 0)
3507 comp_res = data_ref_compare_tree (DR_OFFSET (dr_info_a->dr),
3508 DR_OFFSET (dr_info_b->dr));
3510 /* See whether the alias is known at compilation time. */
3511 if (comp_res == 0
3512 && TREE_CODE (DR_STEP (dr_info_a->dr)) == INTEGER_CST
3513 && TREE_CODE (DR_STEP (dr_info_b->dr)) == INTEGER_CST
3514 && poly_int_tree_p (segment_length_a)
3515 && poly_int_tree_p (segment_length_b))
3517 int res = vect_compile_time_alias (dr_info_a, dr_info_b,
3518 segment_length_a,
3519 segment_length_b,
3520 access_size_a,
3521 access_size_b);
3522 if (res >= 0 && dump_enabled_p ())
3524 dump_printf_loc (MSG_NOTE, vect_location,
3525 "can tell at compile time that %T and %T",
3526 DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
3527 if (res == 0)
3528 dump_printf (MSG_NOTE, " do not alias\n");
3529 else
3530 dump_printf (MSG_NOTE, " alias\n");
3533 if (res == 0)
3534 continue;
3536 if (res == 1)
3537 return opt_result::failure_at (stmt_info_b->stmt,
3538 "not vectorized:"
3539 " compilation time alias: %G%G",
3540 stmt_info_a->stmt,
3541 stmt_info_b->stmt);
3544 dr_with_seg_len_pair_t dr_with_seg_len_pair
3545 (dr_with_seg_len (dr_info_a->dr, segment_length_a,
3546 access_size_a, align_a),
3547 dr_with_seg_len (dr_info_b->dr, segment_length_b,
3548 access_size_b, align_b));
3550 /* Canonicalize pairs by sorting the two DR members. */
3551 if (comp_res > 0)
3552 std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
3554 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3557 prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);
3559 unsigned int count = (comp_alias_ddrs.length ()
3560 + check_unequal_addrs.length ());
3562 if (dump_enabled_p ())
3563 dump_printf_loc (MSG_NOTE, vect_location,
3564 "improved number of alias checks from %d to %d\n",
3565 may_alias_ddrs.length (), count);
3566 if ((int) count > PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
3567 return opt_result::failure_at
3568 (vect_location,
3569 "number of versioning for alias "
3570 "run-time tests exceeds %d "
3571 "(--param vect-max-version-for-alias-checks)\n",
3572 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
3574 return opt_result::success ();
3577 /* Check whether we can use an internal function for a gather load
3578 or scatter store. READ_P is true for loads and false for stores.
3579 MASKED_P is true if the load or store is conditional. MEMORY_TYPE is
3580 the type of the memory elements being loaded or stored. OFFSET_BITS
3581 is the number of bits in each scalar offset and OFFSET_SIGN is the
3582 sign of the offset. SCALE is the amount by which the offset should
3583 be multiplied *after* it has been converted to address width.
3585 Return true if the function is supported, storing the function
3586 id in *IFN_OUT and the type of a vector element in *ELEMENT_TYPE_OUT. */
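/* For example (illustrative only): an unconditional gather of SImode
   elements with SImode offsets is queried as IFN_GATHER_LOAD, its
   conditional form as IFN_MASK_GATHER_LOAD, and the store-side analogues
   as IFN_SCATTER_STORE / IFN_MASK_SCATTER_STORE; the final answer comes
   from internal_gather_scatter_fn_supported_p below.  */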
3588 bool
3589 vect_gather_scatter_fn_p (bool read_p, bool masked_p, tree vectype,
3590 tree memory_type, unsigned int offset_bits,
3591 signop offset_sign, int scale,
3592 internal_fn *ifn_out, tree *element_type_out)
3594 unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
3595 unsigned int element_bits = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (vectype)));
3596 if (offset_bits > element_bits)
3597 /* Internal functions require the offset to be the same width as
3598 the vector elements. We can extend narrower offsets, but it isn't
3599 safe to truncate wider offsets. */
3600 return false;
3602 if (element_bits != memory_bits)
3603 /* For now the vector elements must be the same width as the
3604 memory elements. */
3605 return false;
3607 /* Work out which function we need. */
3608 internal_fn ifn;
3609 if (read_p)
3610 ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
3611 else
3612 ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
3614 /* Test whether the target supports this combination. */
3615 if (!internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
3616 offset_sign, scale))
3617 return false;
3619 *ifn_out = ifn;
3620 *element_type_out = TREE_TYPE (vectype);
3621 return true;
3624 /* STMT_INFO is a call to an internal gather load or scatter store function.
3625 Describe the operation in INFO. */
3627 static void
3628 vect_describe_gather_scatter_call (stmt_vec_info stmt_info,
3629 gather_scatter_info *info)
3631 gcall *call = as_a <gcall *> (stmt_info->stmt);
3632 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3633 data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3635 info->ifn = gimple_call_internal_fn (call);
3636 info->decl = NULL_TREE;
3637 info->base = gimple_call_arg (call, 0);
3638 info->offset = gimple_call_arg (call, 1);
3639 info->offset_dt = vect_unknown_def_type;
3640 info->offset_vectype = NULL_TREE;
3641 info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
3642 info->element_type = TREE_TYPE (vectype);
3643 info->memory_type = TREE_TYPE (DR_REF (dr));
3646 /* Return true if a non-affine read or write in STMT_INFO is suitable for a
3647 gather load or scatter store. Describe the operation in *INFO if so. */
3649 bool
3650 vect_check_gather_scatter (stmt_vec_info stmt_info, loop_vec_info loop_vinfo,
3651 gather_scatter_info *info)
3653 HOST_WIDE_INT scale = 1;
3654 poly_int64 pbitpos, pbitsize;
3655 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3656 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3657 tree offtype = NULL_TREE;
3658 tree decl = NULL_TREE, base, off;
3659 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3660 tree memory_type = TREE_TYPE (DR_REF (dr));
3661 machine_mode pmode;
3662 int punsignedp, reversep, pvolatilep = 0;
3663 internal_fn ifn;
3664 tree element_type;
3665 bool masked_p = false;
3667 /* See whether this is already a call to a gather/scatter internal function.
3668 If not, see whether it's a masked load or store. */
3669 gcall *call = dyn_cast <gcall *> (stmt_info->stmt);
3670 if (call && gimple_call_internal_p (call))
3672 ifn = gimple_call_internal_fn (call);
3673 if (internal_gather_scatter_fn_p (ifn))
3675 vect_describe_gather_scatter_call (stmt_info, info);
3676 return true;
3678 masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE);
3681 /* True if we should aim to use internal functions rather than
3682 built-in functions. */
3683 bool use_ifn_p = (DR_IS_READ (dr)
3684 ? supports_vec_gather_load_p ()
3685 : supports_vec_scatter_store_p ());
3687 base = DR_REF (dr);
3688 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3689 see if we can use the def stmt of the address. */
3690 if (masked_p
3691 && TREE_CODE (base) == MEM_REF
3692 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3693 && integer_zerop (TREE_OPERAND (base, 1))
3694 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3696 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3697 if (is_gimple_assign (def_stmt)
3698 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3699 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3702 /* The gather and scatter builtins need an address of the form
3703 loop_invariant + vector * {1, 2, 4, 8}
3705 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3706 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3707 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3708 multiplications and additions in it. To get a vector, we need
3709 a single SSA_NAME that will be defined in the loop and will
3710 contain everything that is not loop invariant and that can be
3711 vectorized. The following code attempts to find such a preexisting
3712 SSA_NAME OFF and put the loop invariants into a tree BASE
3713 that can be gimplified before the loop. */
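/* As a purely illustrative example, for a read such as

     ... = a[k[i]];

   with A loop invariant and K[I] computed in the loop, we would like to
   end up (roughly) with BASE holding the invariant &a part, OFF being the
   SSA_NAME that carries k[i], and SCALE being the element size, matching
   the builtin/internal-function form above.  */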
3714 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3715 &punsignedp, &reversep, &pvolatilep);
3716 if (reversep)
3717 return false;
3719 poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
3721 if (TREE_CODE (base) == MEM_REF)
3723 if (!integer_zerop (TREE_OPERAND (base, 1)))
3725 if (off == NULL_TREE)
3726 off = wide_int_to_tree (sizetype, mem_ref_offset (base));
3727 else
3728 off = size_binop (PLUS_EXPR, off,
3729 fold_convert (sizetype, TREE_OPERAND (base, 1)));
3731 base = TREE_OPERAND (base, 0);
3733 else
3734 base = build_fold_addr_expr (base);
3736 if (off == NULL_TREE)
3737 off = size_zero_node;
3739 /* If BASE is not loop invariant, then if OFF is zero we start with just
3740 the constant offset in the loop-invariant BASE and continue with BASE
3741 as OFF; otherwise we give up.
3742 We could handle that case by gimplifying the addition of base + off
3743 into some SSA_NAME and using that as OFF, but for now punt. */
3744 if (!expr_invariant_in_loop_p (loop, base))
3746 if (!integer_zerop (off))
3747 return false;
3748 off = base;
3749 base = size_int (pbytepos);
3751 /* Otherwise put base + constant offset into the loop invariant BASE
3752 and continue with OFF. */
3753 else
3755 base = fold_convert (sizetype, base);
3756 base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
3759 /* OFF at this point may be either a SSA_NAME or some tree expression
3760 from get_inner_reference. Try to peel off loop invariants from it
3761 into BASE as long as possible. */
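/* Illustrative example with made-up names: if OFF is currently
   (sizetype) _idx * 4 + 16 and _idx is defined in the loop, the loop
   below folds the invariant 16 into BASE, may turn the multiplication by
   4 into SCALE when the target supports that scale, and leaves
   OFF = _idx.  */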
3762 STRIP_NOPS (off);
3763 while (offtype == NULL_TREE)
3765 enum tree_code code;
3766 tree op0, op1, add = NULL_TREE;
3768 if (TREE_CODE (off) == SSA_NAME)
3770 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3772 if (expr_invariant_in_loop_p (loop, off))
3773 return false;
3775 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3776 break;
3778 op0 = gimple_assign_rhs1 (def_stmt);
3779 code = gimple_assign_rhs_code (def_stmt);
3780 op1 = gimple_assign_rhs2 (def_stmt);
3782 else
3784 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3785 return false;
3786 code = TREE_CODE (off);
3787 extract_ops_from_tree (off, &code, &op0, &op1);
3789 switch (code)
3791 case POINTER_PLUS_EXPR:
3792 case PLUS_EXPR:
3793 if (expr_invariant_in_loop_p (loop, op0))
3795 add = op0;
3796 off = op1;
3797 do_add:
3798 add = fold_convert (sizetype, add);
3799 if (scale != 1)
3800 add = size_binop (MULT_EXPR, add, size_int (scale));
3801 base = size_binop (PLUS_EXPR, base, add);
3802 continue;
3804 if (expr_invariant_in_loop_p (loop, op1))
3806 add = op1;
3807 off = op0;
3808 goto do_add;
3810 break;
3811 case MINUS_EXPR:
3812 if (expr_invariant_in_loop_p (loop, op1))
3814 add = fold_convert (sizetype, op1);
3815 add = size_binop (MINUS_EXPR, size_zero_node, add);
3816 off = op0;
3817 goto do_add;
3819 break;
3820 case MULT_EXPR:
3821 if (scale == 1 && tree_fits_shwi_p (op1))
3823 int new_scale = tree_to_shwi (op1);
3824 /* Only treat this as a scaling operation if the target
3825 supports it. */
3826 if (use_ifn_p
3827 && !vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p,
3828 vectype, memory_type, 1,
3829 TYPE_SIGN (TREE_TYPE (op0)),
3830 new_scale, &ifn,
3831 &element_type))
3832 break;
3833 scale = new_scale;
3834 off = op0;
3835 continue;
3837 break;
3838 case SSA_NAME:
3839 off = op0;
3840 continue;
3841 CASE_CONVERT:
3842 if (!POINTER_TYPE_P (TREE_TYPE (op0))
3843 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3844 break;
3845 if (TYPE_PRECISION (TREE_TYPE (op0))
3846 == TYPE_PRECISION (TREE_TYPE (off)))
3848 off = op0;
3849 continue;
3852 /* The internal functions need the offset to be the same width
3853 as the elements of VECTYPE. Don't include operations that
3854 cast the offset from that width to a different width. */
3855 if (use_ifn_p
3856 && (int_size_in_bytes (TREE_TYPE (vectype))
3857 == int_size_in_bytes (TREE_TYPE (off))))
3858 break;
3860 if (TYPE_PRECISION (TREE_TYPE (op0))
3861 < TYPE_PRECISION (TREE_TYPE (off)))
3863 off = op0;
3864 offtype = TREE_TYPE (off);
3865 STRIP_NOPS (off);
3866 continue;
3868 break;
3869 default:
3870 break;
3872 break;
3875 /* If at the end OFF still isn't a SSA_NAME or isn't
3876 defined in the loop, punt. */
3877 if (TREE_CODE (off) != SSA_NAME
3878 || expr_invariant_in_loop_p (loop, off))
3879 return false;
3881 if (offtype == NULL_TREE)
3882 offtype = TREE_TYPE (off);
3884 if (use_ifn_p)
3886 if (!vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p, vectype,
3887 memory_type, TYPE_PRECISION (offtype),
3888 TYPE_SIGN (offtype), scale, &ifn,
3889 &element_type))
3890 return false;
3892 else
3894 if (DR_IS_READ (dr))
3896 if (targetm.vectorize.builtin_gather)
3897 decl = targetm.vectorize.builtin_gather (vectype, offtype, scale);
3899 else
3901 if (targetm.vectorize.builtin_scatter)
3902 decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale);
3905 if (!decl)
3906 return false;
3908 ifn = IFN_LAST;
3909 element_type = TREE_TYPE (vectype);
3912 info->ifn = ifn;
3913 info->decl = decl;
3914 info->base = base;
3915 info->offset = off;
3916 info->offset_dt = vect_unknown_def_type;
3917 info->offset_vectype = NULL_TREE;
3918 info->scale = scale;
3919 info->element_type = element_type;
3920 info->memory_type = memory_type;
3921 return true;
3924 /* Find the data references in STMT, analyze them with respect to LOOP and
3925 append them to DATAREFS. Return false if datarefs in this stmt cannot
3926 be handled. */
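/* For instance, a call is only accepted here when it is an IFN_MASK_LOAD
   or IFN_MASK_STORE internal call carrying a single data reference; other
   calls with data refs, volatile accesses, throwing stmts, bit-field
   accesses and refs whose base address is a constant are all rejected by
   the checks below.  */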
3928 opt_result
3929 vect_find_stmt_data_reference (loop_p loop, gimple *stmt,
3930 vec<data_reference_p> *datarefs)
3932 /* We can ignore clobbers for dataref analysis - they are removed during
3933 loop vectorization and BB vectorization checks dependences with a
3934 stmt walk. */
3935 if (gimple_clobber_p (stmt))
3936 return opt_result::success ();
3938 if (gimple_has_volatile_ops (stmt))
3939 return opt_result::failure_at (stmt, "not vectorized: volatile type: %G",
3940 stmt);
3942 if (stmt_can_throw_internal (cfun, stmt))
3943 return opt_result::failure_at (stmt,
3944 "not vectorized:"
3945 " statement can throw an exception: %G",
3946 stmt);
3948 auto_vec<data_reference_p, 2> refs;
3949 opt_result res = find_data_references_in_stmt (loop, stmt, &refs);
3950 if (!res)
3951 return res;
3953 if (refs.is_empty ())
3954 return opt_result::success ();
3956 if (refs.length () > 1)
3957 return opt_result::failure_at (stmt,
3958 "not vectorized:"
3959 " more than one data ref in stmt: %G", stmt);
3961 if (gcall *call = dyn_cast <gcall *> (stmt))
3962 if (!gimple_call_internal_p (call)
3963 || (gimple_call_internal_fn (call) != IFN_MASK_LOAD
3964 && gimple_call_internal_fn (call) != IFN_MASK_STORE))
3965 return opt_result::failure_at (stmt,
3966 "not vectorized: dr in a call %G", stmt);
3968 data_reference_p dr = refs.pop ();
3969 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
3970 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
3971 return opt_result::failure_at (stmt,
3972 "not vectorized:"
3973 " statement is bitfield access %G", stmt);
3975 if (DR_BASE_ADDRESS (dr)
3976 && TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
3977 return opt_result::failure_at (stmt,
3978 "not vectorized:"
3979 " base addr of dr is a constant\n");
3981 /* Check whether this may be a SIMD lane access and adjust the
3982 DR to make it easier for us to handle it. */
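/* Illustrative sketch: a simd-lane access typically has an offset of the
   form (<cast>) _lane * <elt_size>, where _lane comes from an
   IFN_GOMP_SIMD_LANE call for loop->simduid.  When that pattern is
   recognized below, the DR is rewritten to offset 0 and step <elt_size>
   and flagged via newdr->aux == (void *)-1.  */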
3983 if (loop
3984 && loop->simduid
3985 && (!DR_BASE_ADDRESS (dr)
3986 || !DR_OFFSET (dr)
3987 || !DR_INIT (dr)
3988 || !DR_STEP (dr)))
3990 struct data_reference *newdr
3991 = create_data_ref (NULL, loop_containing_stmt (stmt), DR_REF (dr), stmt,
3992 DR_IS_READ (dr), DR_IS_CONDITIONAL_IN_STMT (dr));
3993 if (DR_BASE_ADDRESS (newdr)
3994 && DR_OFFSET (newdr)
3995 && DR_INIT (newdr)
3996 && DR_STEP (newdr)
3997 && integer_zerop (DR_STEP (newdr)))
3999 tree off = DR_OFFSET (newdr);
4000 STRIP_NOPS (off);
4001 if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
4002 && TREE_CODE (off) == MULT_EXPR
4003 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
4005 tree step = TREE_OPERAND (off, 1);
4006 off = TREE_OPERAND (off, 0);
4007 STRIP_NOPS (off);
4008 if (CONVERT_EXPR_P (off)
4009 && (TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off, 0)))
4010 < TYPE_PRECISION (TREE_TYPE (off))))
4011 off = TREE_OPERAND (off, 0);
4012 if (TREE_CODE (off) == SSA_NAME)
4014 gimple *def = SSA_NAME_DEF_STMT (off);
4015 tree reft = TREE_TYPE (DR_REF (newdr));
4016 if (is_gimple_call (def)
4017 && gimple_call_internal_p (def)
4018 && (gimple_call_internal_fn (def) == IFN_GOMP_SIMD_LANE))
4020 tree arg = gimple_call_arg (def, 0);
4021 gcc_assert (TREE_CODE (arg) == SSA_NAME);
4022 arg = SSA_NAME_VAR (arg);
4023 if (arg == loop->simduid
4024 /* For now. */
4025 && tree_int_cst_equal (TYPE_SIZE_UNIT (reft), step))
4027 DR_OFFSET (newdr) = ssize_int (0);
4028 DR_STEP (newdr) = step;
4029 DR_OFFSET_ALIGNMENT (newdr) = BIGGEST_ALIGNMENT;
4030 DR_STEP_ALIGNMENT (newdr)
4031 = highest_pow2_factor (step);
4032 /* Mark as simd-lane access. */
4033 newdr->aux = (void *)-1;
4034 free_data_ref (dr);
4035 datarefs->safe_push (newdr);
4036 return opt_result::success ();
4042 free_data_ref (newdr);
4045 datarefs->safe_push (dr);
4046 return opt_result::success ();
4049 /* Function vect_analyze_data_refs.
4051 Find all the data references in the loop or basic block.
4053 The general structure of the analysis of data refs in the vectorizer is as
4054 follows:
4055 1- vect_analyze_data_refs(loop/bb): call
4056 compute_data_dependences_for_loop/bb to find and analyze all data-refs
4057 in the loop/bb and their dependences.
4058 2- vect_analyze_dependences(): apply dependence testing using ddrs.
4059 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
4060 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
4064 opt_result
4065 vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf)
4067 struct loop *loop = NULL;
4068 unsigned int i;
4069 struct data_reference *dr;
4070 tree scalar_type;
4072 DUMP_VECT_SCOPE ("vect_analyze_data_refs");
4074 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4075 loop = LOOP_VINFO_LOOP (loop_vinfo);
4077 /* Go through the data-refs, check that the analysis succeeded. Update
4078 pointer from stmt_vec_info struct to DR and vectype. */
4080 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
4081 FOR_EACH_VEC_ELT (datarefs, i, dr)
4083 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
4084 poly_uint64 vf;
4086 gcc_assert (DR_REF (dr));
4087 stmt_vec_info stmt_info = vinfo->lookup_stmt (DR_STMT (dr));
4088 gcc_assert (!stmt_info->dr_aux.dr);
4089 stmt_info->dr_aux.dr = dr;
4090 stmt_info->dr_aux.stmt = stmt_info;
4092 /* Check that analysis of the data-ref succeeded. */
4093 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
4094 || !DR_STEP (dr))
4096 bool maybe_gather
4097 = DR_IS_READ (dr)
4098 && !TREE_THIS_VOLATILE (DR_REF (dr))
4099 && (targetm.vectorize.builtin_gather != NULL
4100 || supports_vec_gather_load_p ());
4101 bool maybe_scatter
4102 = DR_IS_WRITE (dr)
4103 && !TREE_THIS_VOLATILE (DR_REF (dr))
4104 && (targetm.vectorize.builtin_scatter != NULL
4105 || supports_vec_scatter_store_p ());
4107 /* If the target supports vector gather loads or scatter stores,
4108 see whether they can be used. */
4109 if (is_a <loop_vec_info> (vinfo)
4110 && !nested_in_vect_loop_p (loop, stmt_info))
4112 if (maybe_gather || maybe_scatter)
4114 if (maybe_gather)
4115 gatherscatter = GATHER;
4116 else
4117 gatherscatter = SCATTER;
4121 if (gatherscatter == SG_NONE)
4123 if (dump_enabled_p ())
4124 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4125 "not vectorized: data ref analysis "
4126 "failed %G", stmt_info->stmt);
4127 if (is_a <bb_vec_info> (vinfo))
4129 /* In BB vectorization the ref can still participate
4130 in dependence analysis, we just can't vectorize it. */
4131 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4132 continue;
4134 return opt_result::failure_at (stmt_info->stmt,
4135 "not vectorized:"
4136 " data ref analysis failed: %G",
4137 stmt_info->stmt);
4141 /* See if this was detected as SIMD lane access. */
4142 if (dr->aux == (void *)-1)
4144 if (nested_in_vect_loop_p (loop, stmt_info))
4145 return opt_result::failure_at (stmt_info->stmt,
4146 "not vectorized:"
4147 " data ref analysis failed: %G",
4148 stmt_info->stmt);
4149 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
4152 tree base = get_base_address (DR_REF (dr));
4153 if (base && VAR_P (base) && DECL_NONALIASED (base))
4155 if (dump_enabled_p ())
4156 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4157 "not vectorized: base object not addressable "
4158 "for stmt: %G", stmt_info->stmt);
4159 if (is_a <bb_vec_info> (vinfo))
4161 /* In BB vectorization the ref can still participate
4162 in dependence analysis, we just can't vectorize it. */
4163 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4164 continue;
4166 return opt_result::failure_at (stmt_info->stmt,
4167 "not vectorized: base object not"
4168 " addressable for stmt: %G",
4169 stmt_info->stmt);
4172 if (is_a <loop_vec_info> (vinfo)
4173 && DR_STEP (dr)
4174 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
4176 if (nested_in_vect_loop_p (loop, stmt_info))
4177 return opt_result::failure_at (stmt_info->stmt,
4178 "not vectorized:"
4179 "not suitable for strided load %G",
4180 stmt_info->stmt);
4181 STMT_VINFO_STRIDED_P (stmt_info) = true;
4184 /* Update DR field in stmt_vec_info struct. */
4186 /* If the dataref is in an inner-loop of the loop that is considered
4187 for vectorization, we also want to analyze the access relative to
4188 the outer-loop (DR contains information only relative to the
4189 inner-most enclosing loop). We do that by building a reference to the
4190 first location accessed by the inner-loop, and analyze it relative to
4191 the outer-loop. */
4192 if (loop && nested_in_vect_loop_p (loop, stmt_info))
4194 /* Build a reference to the first location accessed by the
4195 inner loop: *(BASE + INIT + OFFSET). By construction,
4196 this address must be invariant in the inner loop, so we
4197 can consider it as being used in the outer loop. */
4198 tree base = unshare_expr (DR_BASE_ADDRESS (dr));
4199 tree offset = unshare_expr (DR_OFFSET (dr));
4200 tree init = unshare_expr (DR_INIT (dr));
4201 tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
4202 init, offset);
4203 tree init_addr = fold_build_pointer_plus (base, init_offset);
4204 tree init_ref = build_fold_indirect_ref (init_addr);
4206 if (dump_enabled_p ())
4207 dump_printf_loc (MSG_NOTE, vect_location,
4208 "analyze in outer loop: %T\n", init_ref);
4210 opt_result res
4211 = dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
4212 init_ref, loop, stmt_info->stmt);
4213 if (!res)
4214 /* dr_analyze_innermost already explained the failure. */
4215 return res;
4217 if (dump_enabled_p ())
4218 dump_printf_loc (MSG_NOTE, vect_location,
4219 "\touter base_address: %T\n"
4220 "\touter offset from base address: %T\n"
4221 "\touter constant offset from base address: %T\n"
4222 "\touter step: %T\n"
4223 "\touter base alignment: %d\n\n"
4224 "\touter base misalignment: %d\n"
4225 "\touter offset alignment: %d\n"
4226 "\touter step alignment: %d\n",
4227 STMT_VINFO_DR_BASE_ADDRESS (stmt_info),
4228 STMT_VINFO_DR_OFFSET (stmt_info),
4229 STMT_VINFO_DR_INIT (stmt_info),
4230 STMT_VINFO_DR_STEP (stmt_info),
4231 STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info),
4232 STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info),
4233 STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info),
4234 STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
4237 /* Set vectype for STMT. */
4238 scalar_type = TREE_TYPE (DR_REF (dr));
4239 STMT_VINFO_VECTYPE (stmt_info)
4240 = get_vectype_for_scalar_type (scalar_type);
4241 if (!STMT_VINFO_VECTYPE (stmt_info))
4243 if (dump_enabled_p ())
4245 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4246 "not vectorized: no vectype for stmt: %G",
4247 stmt_info->stmt);
4248 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
4249 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
4250 scalar_type);
4251 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
4254 if (is_a <bb_vec_info> (vinfo))
4256 /* No vector type is fine, the ref can still participate
4257 in dependence analysis, we just can't vectorize it. */
4258 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4259 continue;
4261 return opt_result::failure_at (stmt_info->stmt,
4262 "not vectorized:"
4263 " no vectype for stmt: %G"
4264 " scalar_type: %T\n",
4265 stmt_info->stmt, scalar_type);
4267 else
4269 if (dump_enabled_p ())
4270 dump_printf_loc (MSG_NOTE, vect_location,
4271 "got vectype for stmt: %G%T\n",
4272 stmt_info->stmt, STMT_VINFO_VECTYPE (stmt_info));
4275 /* Adjust the minimal vectorization factor according to the
4276 vector type. */
4277 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
4278 *min_vf = upper_bound (*min_vf, vf);
4280 if (gatherscatter != SG_NONE)
4282 gather_scatter_info gs_info;
4283 if (!vect_check_gather_scatter (stmt_info,
4284 as_a <loop_vec_info> (vinfo),
4285 &gs_info)
4286 || !get_vectype_for_scalar_type (TREE_TYPE (gs_info.offset)))
4287 return opt_result::failure_at
4288 (stmt_info->stmt,
4289 (gatherscatter == GATHER) ?
4290 "not vectorized: not suitable for gather load %G" :
4291 "not vectorized: not suitable for scatter store %G",
4292 stmt_info->stmt);
4293 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
4297 /* We used to stop processing and prune the list here. Verify we no
4298 longer need to. */
4299 gcc_assert (i == datarefs.length ());
4301 return opt_result::success ();
4305 /* Function vect_get_new_vect_var.
4307 Returns a name for a new variable. The current naming scheme prepends
4308 a prefix ("vect", "stmp", "mask" or "vectp", depending on the value of
4309 VAR_KIND) to the names of vectorizer-generated variables, joining it to
4310 NAME with an underscore when NAME is provided. */
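/* E.g. (illustrative), calling this with VAR_KIND == vect_pointer_var and
   NAME == "a" asks create_tmp_reg for a temporary based on the name
   "vectp_a", while a NULL NAME falls back to the bare "vectp" prefix.  */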
4312 tree
4313 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
4315 const char *prefix;
4316 tree new_vect_var;
4318 switch (var_kind)
4320 case vect_simple_var:
4321 prefix = "vect";
4322 break;
4323 case vect_scalar_var:
4324 prefix = "stmp";
4325 break;
4326 case vect_mask_var:
4327 prefix = "mask";
4328 break;
4329 case vect_pointer_var:
4330 prefix = "vectp";
4331 break;
4332 default:
4333 gcc_unreachable ();
4336 if (name)
4338 char* tmp = concat (prefix, "_", name, NULL);
4339 new_vect_var = create_tmp_reg (type, tmp);
4340 free (tmp);
4342 else
4343 new_vect_var = create_tmp_reg (type, prefix);
4345 return new_vect_var;
4348 /* Like vect_get_new_vect_var but return an SSA name. */
4350 tree
4351 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
4353 const char *prefix;
4354 tree new_vect_var;
4356 switch (var_kind)
4358 case vect_simple_var:
4359 prefix = "vect";
4360 break;
4361 case vect_scalar_var:
4362 prefix = "stmp";
4363 break;
4364 case vect_pointer_var:
4365 prefix = "vectp";
4366 break;
4367 default:
4368 gcc_unreachable ();
4371 if (name)
4373 char* tmp = concat (prefix, "_", name, NULL);
4374 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4375 free (tmp);
4377 else
4378 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4380 return new_vect_var;
4383 /* Duplicate ptr info and set alignment/misalignment on NAME from DR_INFO. */
4385 static void
4386 vect_duplicate_ssa_name_ptr_info (tree name, dr_vec_info *dr_info)
4388 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr_info->dr));
4389 int misalign = DR_MISALIGNMENT (dr_info);
4390 if (misalign == DR_MISALIGNMENT_UNKNOWN)
4391 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4392 else
4393 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name),
4394 known_alignment (DR_TARGET_ALIGNMENT (dr_info)),
4395 misalign);
4398 /* Function vect_create_addr_base_for_vector_ref.
4400 Create an expression that computes the address of the first memory location
4401 that will be accessed for a data reference.
4403 Input:
4404 STMT_INFO: The statement containing the data reference.
4405 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4406 OFFSET: Optional. If supplied, it is added to the initial address.
4407 LOOP: Specify relative to which loop-nest the address should be computed.
4408 For example, when the dataref is in an inner-loop nested in an
4409 outer-loop that is now being vectorized, LOOP can be either the
4410 outer-loop, or the inner-loop. The first memory location accessed
4411 by the following dataref ('in' points to short):
4413 for (i=0; i<N; i++)
4414 for (j=0; j<M; j++)
4415 s += in[i+j]
4417 is as follows:
4418 if LOOP=i_loop: &in (relative to i_loop)
4419 if LOOP=j_loop: &in+i*2B (relative to j_loop)
4420 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
4421 initial address. Unlike OFFSET, which is number of elements to
4422 be added, BYTE_OFFSET is measured in bytes.
4424 Output:
4425 1. Return an SSA_NAME whose value is the address of the memory location of
4426 the first vector of the data reference.
4427 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4428 these statement(s) which define the returned SSA_NAME.
4430 FORNOW: We are only handling array accesses with step 1. */
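/* Roughly, in the loop-vinfo case the address built below (before
   gimplification) is

     addr_base = base_address + (offset + init + OFFSET * step + BYTE_OFFSET)

   where base_address, offset and init come from the data-ref's innermost
   behavior, step is the size of one scalar element, and the OFFSET and
   BYTE_OFFSET terms are present only when those arguments are given.  */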
4432 tree
4433 vect_create_addr_base_for_vector_ref (stmt_vec_info stmt_info,
4434 gimple_seq *new_stmt_list,
4435 tree offset,
4436 tree byte_offset)
4438 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
4439 struct data_reference *dr = dr_info->dr;
4440 const char *base_name;
4441 tree addr_base;
4442 tree dest;
4443 gimple_seq seq = NULL;
4444 tree vect_ptr_type;
4445 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
4446 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4447 innermost_loop_behavior *drb = vect_dr_behavior (dr_info);
4449 tree data_ref_base = unshare_expr (drb->base_address);
4450 tree base_offset = unshare_expr (drb->offset);
4451 tree init = unshare_expr (drb->init);
4453 if (loop_vinfo)
4454 base_name = get_name (data_ref_base);
4455 else
4457 base_offset = ssize_int (0);
4458 init = ssize_int (0);
4459 base_name = get_name (DR_REF (dr));
4462 /* Create base_offset */
4463 base_offset = size_binop (PLUS_EXPR,
4464 fold_convert (sizetype, base_offset),
4465 fold_convert (sizetype, init));
4467 if (offset)
4469 offset = fold_build2 (MULT_EXPR, sizetype,
4470 fold_convert (sizetype, offset), step);
4471 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4472 base_offset, offset);
4474 if (byte_offset)
4476 byte_offset = fold_convert (sizetype, byte_offset);
4477 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4478 base_offset, byte_offset);
4481 /* base + base_offset */
4482 if (loop_vinfo)
4483 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4484 else
4486 addr_base = build1 (ADDR_EXPR,
4487 build_pointer_type (TREE_TYPE (DR_REF (dr))),
4488 unshare_expr (DR_REF (dr)));
4491 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4492 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4493 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4494 gimple_seq_add_seq (new_stmt_list, seq);
4496 if (DR_PTR_INFO (dr)
4497 && TREE_CODE (addr_base) == SSA_NAME
4498 && !SSA_NAME_PTR_INFO (addr_base))
4500 vect_duplicate_ssa_name_ptr_info (addr_base, dr_info);
4501 if (offset || byte_offset)
4502 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4505 if (dump_enabled_p ())
4506 dump_printf_loc (MSG_NOTE, vect_location, "created %T\n", addr_base);
4508 return addr_base;
4512 /* Function vect_create_data_ref_ptr.
4514 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4515 location accessed in the loop by STMT_INFO, along with the def-use update
4516 chain to appropriately advance the pointer through the loop iterations.
4517 Also set aliasing information for the pointer. This pointer is used by
4518 the callers to this function to create a memory reference expression for
4519 vector load/store access.
4521 Input:
4522 1. STMT_INFO: a stmt that references memory. Expected to be of the form
4523 GIMPLE_ASSIGN <name, data-ref> or
4524 GIMPLE_ASSIGN <data-ref, name>.
4525 2. AGGR_TYPE: the type of the reference, which should be either a vector
4526 or an array.
4527 3. AT_LOOP: the loop where the vector memref is to be created.
4528 4. OFFSET (optional): an offset to be added to the initial address accessed
4529 by the data-ref in STMT_INFO.
4530 5. BSI: location where the new stmts are to be placed if there is no loop
4531 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4532 pointing to the initial address.
4533 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4534 to the initial address accessed by the data-ref in STMT_INFO. This is
4535 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4536 in bytes.
4537 8. IV_STEP (optional, defaults to NULL): the amount that should be added
4538 to the IV during each iteration of the loop. NULL says to move
4539 by one copy of AGGR_TYPE up or down, depending on the step of the
4540 data reference.
4542 Output:
4543 1. Declare a new ptr to vector_type, and have it point to the base of the
4544 data reference (initial address accessed by the data reference).
4545 For example, for vector of type V8HI, the following code is generated:
4547 v8hi *ap;
4548 ap = (v8hi *)initial_address;
4550 if OFFSET is not supplied:
4551 initial_address = &a[init];
4552 if OFFSET is supplied:
4553 initial_address = &a[init + OFFSET];
4554 if BYTE_OFFSET is supplied:
4555 initial_address = &a[init] + BYTE_OFFSET;
4557 Return the initial_address in INITIAL_ADDRESS.
4559 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4560 update the pointer in each iteration of the loop.
4562 Return the increment stmt that updates the pointer in PTR_INCR.
4564 3. Return the pointer. */
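/* Illustrative note: when IV_STEP is not given, step (3) below advances
   the pointer by TYPE_SIZE_UNIT (AGGR_TYPE) each iteration, negating that
   amount when the data reference steps downwards.  */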
4566 tree
4567 vect_create_data_ref_ptr (stmt_vec_info stmt_info, tree aggr_type,
4568 struct loop *at_loop, tree offset,
4569 tree *initial_address, gimple_stmt_iterator *gsi,
4570 gimple **ptr_incr, bool only_init,
4571 tree byte_offset, tree iv_step)
4573 const char *base_name;
4574 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4575 struct loop *loop = NULL;
4576 bool nested_in_vect_loop = false;
4577 struct loop *containing_loop = NULL;
4578 tree aggr_ptr_type;
4579 tree aggr_ptr;
4580 tree new_temp;
4581 gimple_seq new_stmt_list = NULL;
4582 edge pe = NULL;
4583 basic_block new_bb;
4584 tree aggr_ptr_init;
4585 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
4586 struct data_reference *dr = dr_info->dr;
4587 tree aptr;
4588 gimple_stmt_iterator incr_gsi;
4589 bool insert_after;
4590 tree indx_before_incr, indx_after_incr;
4591 gimple *incr;
4592 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4594 gcc_assert (iv_step != NULL_TREE
4595 || TREE_CODE (aggr_type) == ARRAY_TYPE
4596 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4598 if (loop_vinfo)
4600 loop = LOOP_VINFO_LOOP (loop_vinfo);
4601 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
4602 containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
4603 pe = loop_preheader_edge (loop);
4605 else
4607 gcc_assert (bb_vinfo);
4608 only_init = true;
4609 *ptr_incr = NULL;
4612 /* Create an expression for the first address accessed by this load
4613 in LOOP. */
4614 base_name = get_name (DR_BASE_ADDRESS (dr));
4616 if (dump_enabled_p ())
4618 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4619 dump_printf_loc (MSG_NOTE, vect_location,
4620 "create %s-pointer variable to type: %T",
4621 get_tree_code_name (TREE_CODE (aggr_type)),
4622 aggr_type);
4623 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4624 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4625 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4626 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4627 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4628 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4629 else
4630 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4631 dump_printf (MSG_NOTE, "%T\n", DR_BASE_OBJECT (dr));
4634 /* (1) Create the new aggregate-pointer variable.
4635 Vector and array types inherit the alias set of their component
4636 type by default so we need to use a ref-all pointer if the data
4637 reference does not conflict with the created aggregated data
4638 reference because it is not addressable. */
4639 bool need_ref_all = false;
4640 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4641 get_alias_set (DR_REF (dr))))
4642 need_ref_all = true;
4643 /* Likewise for any of the data references in the stmt group. */
4644 else if (DR_GROUP_SIZE (stmt_info) > 1)
4646 stmt_vec_info sinfo = DR_GROUP_FIRST_ELEMENT (stmt_info);
4649 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4650 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4651 get_alias_set (DR_REF (sdr))))
4653 need_ref_all = true;
4654 break;
4656 sinfo = DR_GROUP_NEXT_ELEMENT (sinfo);
4658 while (sinfo);
4660 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4661 need_ref_all);
4662 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4665 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4666 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4667 def-use update cycles for the pointer: one relative to the outer-loop
4668 (LOOP), which is what steps (2) and (3) below do. The other is relative
4669 to the inner-loop (which is the inner-most loop containing the dataref),
4670 and this is done by step (4) below.
4672 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4673 inner-most loop, and so steps (2),(3) work the same, and step (4) is
4674 redundant. Steps (2),(3) create the following:
4676 vp0 = &base_addr;
4677 LOOP: vp1 = phi(vp0,vp2)
4680 vp2 = vp1 + step
4681 goto LOOP
4683 If there is an inner-loop nested in loop, then step (4) will also be
4684 applied, and an additional update in the inner-loop will be created:
4686 vp0 = &base_addr;
4687 LOOP: vp1 = phi(vp0,vp2)
4689 inner: vp3 = phi(vp1,vp4)
4690 vp4 = vp3 + inner_step
4691 if () goto inner
4693 vp2 = vp1 + step
4694 if () goto LOOP */
4696 /* (2) Calculate the initial address of the aggregate-pointer, and set
4697 the aggregate-pointer to point to it before the loop. */
4699 /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader. */
4701 new_temp = vect_create_addr_base_for_vector_ref (stmt_info, &new_stmt_list,
4702 offset, byte_offset);
4703 if (new_stmt_list)
4705 if (pe)
4707 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4708 gcc_assert (!new_bb);
4710 else
4711 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4714 *initial_address = new_temp;
4715 aggr_ptr_init = new_temp;
4717 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4718 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4719 inner-loop nested in LOOP (during outer-loop vectorization). */
4721 /* No update in loop is required. */
4722 if (only_init && (!loop_vinfo || at_loop == loop))
4723 aptr = aggr_ptr_init;
4724 else
4726 /* Accesses to invariant addresses should be handled specially
4727 by the caller. */
4728 tree step = vect_dr_behavior (dr_info)->step;
4729 gcc_assert (!integer_zerop (step));
4731 if (iv_step == NULL_TREE)
4733 /* The step of the aggregate pointer is the type size,
4734 negated for downward accesses. */
4735 iv_step = TYPE_SIZE_UNIT (aggr_type);
4736 if (tree_int_cst_sgn (step) == -1)
4737 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4740 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4742 create_iv (aggr_ptr_init,
4743 fold_convert (aggr_ptr_type, iv_step),
4744 aggr_ptr, loop, &incr_gsi, insert_after,
4745 &indx_before_incr, &indx_after_incr);
4746 incr = gsi_stmt (incr_gsi);
4747 loop_vinfo->add_stmt (incr);
4749 /* Copy the points-to information if it exists. */
4750 if (DR_PTR_INFO (dr))
4752 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
4753 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
4755 if (ptr_incr)
4756 *ptr_incr = incr;
4758 aptr = indx_before_incr;
4761 if (!nested_in_vect_loop || only_init)
4762 return aptr;
4765 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4766 nested in LOOP, if it exists. */
4768 gcc_assert (nested_in_vect_loop);
4769 if (!only_init)
4771 standard_iv_increment_position (containing_loop, &incr_gsi,
4772 &insert_after);
4773 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4774 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4775 &indx_after_incr);
4776 incr = gsi_stmt (incr_gsi);
4777 loop_vinfo->add_stmt (incr);
4779 /* Copy the points-to information if it exists. */
4780 if (DR_PTR_INFO (dr))
4782 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
4783 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
4785 if (ptr_incr)
4786 *ptr_incr = incr;
4788 return indx_before_incr;
4790 else
4791 gcc_unreachable ();
4795 /* Function bump_vector_ptr
4797 Increment a pointer (to a vector type) by vector-size. If requested,
4798 i.e. if PTR-INCR is given, then also connect the new increment stmt
4799 to the existing def-use update-chain of the pointer, by modifying
4800 the PTR_INCR as illustrated below:
4802 The pointer def-use update-chain before this function:
4803 DATAREF_PTR = phi (p_0, p_2)
4804 ....
4805 PTR_INCR: p_2 = DATAREF_PTR + step
4807 The pointer def-use update-chain after this function:
4808 DATAREF_PTR = phi (p_0, p_2)
4809 ....
4810 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4811 ....
4812 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4814 Input:
4815 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4816 in the loop.
4817 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4818 the loop. The increment amount across iterations is expected
4819 to be vector_size.
4820 BSI - location where the new update stmt is to be placed.
4821 STMT_INFO - the original scalar memory-access stmt that is being vectorized.
4822 BUMP - optional. The offset by which to bump the pointer. If not given,
4823 the offset is assumed to be vector_size.
4825 Output: Return NEW_DATAREF_PTR as illustrated above.
4829 tree
4830 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4831 stmt_vec_info stmt_info, tree bump)
4833 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4834 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4835 tree update = TYPE_SIZE_UNIT (vectype);
4836 gassign *incr_stmt;
4837 ssa_op_iter iter;
4838 use_operand_p use_p;
4839 tree new_dataref_ptr;
4841 if (bump)
4842 update = bump;
4844 if (TREE_CODE (dataref_ptr) == SSA_NAME)
4845 new_dataref_ptr = copy_ssa_name (dataref_ptr);
4846 else
4847 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4848 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4849 dataref_ptr, update);
4850 vect_finish_stmt_generation (stmt_info, incr_stmt, gsi);
4852 /* Copy the points-to information if it exists. */
4853 if (DR_PTR_INFO (dr))
4855 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4856 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4859 if (!ptr_incr)
4860 return new_dataref_ptr;
4862 /* Update the vector-pointer's cross-iteration increment. */
4863 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4865 tree use = USE_FROM_PTR (use_p);
4867 if (use == dataref_ptr)
4868 SET_USE (use_p, new_dataref_ptr);
4869 else
4870 gcc_assert (operand_equal_p (use, update, 0));
4873 return new_dataref_ptr;
4877 /* Copy memory reference info such as base/clique from the SRC reference
4878 to the DEST MEM_REF. */
4880 void
4881 vect_copy_ref_info (tree dest, tree src)
4883 if (TREE_CODE (dest) != MEM_REF)
4884 return;
4886 tree src_base = src;
4887 while (handled_component_p (src_base))
4888 src_base = TREE_OPERAND (src_base, 0);
4889 if (TREE_CODE (src_base) != MEM_REF
4890 && TREE_CODE (src_base) != TARGET_MEM_REF)
4891 return;
4893 MR_DEPENDENCE_CLIQUE (dest) = MR_DEPENDENCE_CLIQUE (src_base);
4894 MR_DEPENDENCE_BASE (dest) = MR_DEPENDENCE_BASE (src_base);
4898 /* Function vect_create_destination_var.
4900 Create a new temporary of type VECTYPE. */
4902 tree
4903 vect_create_destination_var (tree scalar_dest, tree vectype)
4905 tree vec_dest;
4906 const char *name;
4907 char *new_name;
4908 tree type;
4909 enum vect_var_kind kind;
4911 kind = vectype
4912 ? VECTOR_BOOLEAN_TYPE_P (vectype)
4913 ? vect_mask_var
4914 : vect_simple_var
4915 : vect_scalar_var;
4916 type = vectype ? vectype : TREE_TYPE (scalar_dest);
4918 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
4920 name = get_name (scalar_dest);
4921 if (name)
4922 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
4923 else
4924 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
4925 vec_dest = vect_get_new_vect_var (type, kind, new_name);
4926 free (new_name);
4928 return vec_dest;
4931 /* Function vect_grouped_store_supported.
4933 Returns TRUE if interleave high and interleave low permutations
4934 are supported, and FALSE otherwise. */
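/* For example (illustrative), for a power-of-two COUNT only the two
   interleaving masks used by vect_permute_store_chain are needed:
   { 0, nelt, 1, nelt + 1, ... } for the high halves and the same pattern
   shifted by nelt/2 for the low halves; for COUNT == 3 the two three-way
   shuffle masks built below are tested instead.  */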
4936 bool
4937 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
4939 machine_mode mode = TYPE_MODE (vectype);
4941 /* vect_permute_store_chain requires the group size to be equal to 3 or
4942 be a power of two. */
4943 if (count != 3 && exact_log2 (count) == -1)
4945 if (dump_enabled_p ())
4946 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4947 "the size of the group of accesses"
4948 " is not a power of 2 or not equal to 3\n");
4949 return false;
4952 /* Check that the permutation is supported. */
4953 if (VECTOR_MODE_P (mode))
4955 unsigned int i;
4956 if (count == 3)
4958 unsigned int j0 = 0, j1 = 0, j2 = 0;
4959 unsigned int i, j;
4961 unsigned int nelt;
4962 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
4964 if (dump_enabled_p ())
4965 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4966 "cannot handle groups of 3 stores for"
4967 " variable-length vectors\n");
4968 return false;
4971 vec_perm_builder sel (nelt, nelt, 1);
4972 sel.quick_grow (nelt);
4973 vec_perm_indices indices;
4974 for (j = 0; j < 3; j++)
4976 int nelt0 = ((3 - j) * nelt) % 3;
4977 int nelt1 = ((3 - j) * nelt + 1) % 3;
4978 int nelt2 = ((3 - j) * nelt + 2) % 3;
4979 for (i = 0; i < nelt; i++)
4981 if (3 * i + nelt0 < nelt)
4982 sel[3 * i + nelt0] = j0++;
4983 if (3 * i + nelt1 < nelt)
4984 sel[3 * i + nelt1] = nelt + j1++;
4985 if (3 * i + nelt2 < nelt)
4986 sel[3 * i + nelt2] = 0;
4988 indices.new_vector (sel, 2, nelt);
4989 if (!can_vec_perm_const_p (mode, indices))
4991 if (dump_enabled_p ())
4992 dump_printf (MSG_MISSED_OPTIMIZATION,
4993 "permutation op not supported by target.\n");
4994 return false;
4997 for (i = 0; i < nelt; i++)
4999 if (3 * i + nelt0 < nelt)
5000 sel[3 * i + nelt0] = 3 * i + nelt0;
5001 if (3 * i + nelt1 < nelt)
5002 sel[3 * i + nelt1] = 3 * i + nelt1;
5003 if (3 * i + nelt2 < nelt)
5004 sel[3 * i + nelt2] = nelt + j2++;
5006 indices.new_vector (sel, 2, nelt);
5007 if (!can_vec_perm_const_p (mode, indices))
5009 if (dump_enabled_p ())
5010 dump_printf (MSG_MISSED_OPTIMIZATION,
5011 "permutation op not supported by target.\n");
5012 return false;
5015 return true;
5017 else
5019 /* If length is not equal to 3 then only a power of 2 is supported. */
5020 gcc_assert (pow2p_hwi (count));
5021 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5023 /* The encoding has 2 interleaved stepped patterns. */
5024 vec_perm_builder sel (nelt, 2, 3);
5025 sel.quick_grow (6);
5026 for (i = 0; i < 3; i++)
5028 sel[i * 2] = i;
5029 sel[i * 2 + 1] = i + nelt;
5031 vec_perm_indices indices (sel, 2, nelt);
5032 if (can_vec_perm_const_p (mode, indices))
5034 for (i = 0; i < 6; i++)
5035 sel[i] += exact_div (nelt, 2);
5036 indices.new_vector (sel, 2, nelt);
5037 if (can_vec_perm_const_p (mode, indices))
5038 return true;
5043 if (dump_enabled_p ())
5044 dump_printf (MSG_MISSED_OPTIMIZATION,
5045 "permutation op not supported by target.\n");
5046 return false;
5050 /* Return TRUE if vec_{mask_}store_lanes is available for COUNT vectors of
5051 type VECTYPE. MASKED_P says whether the masked form is needed. */
5053 bool
5054 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5055 bool masked_p)
5057 if (masked_p)
5058 return vect_lanes_optab_supported_p ("vec_mask_store_lanes",
5059 vec_mask_store_lanes_optab,
5060 vectype, count);
5061 else
5062 return vect_lanes_optab_supported_p ("vec_store_lanes",
5063 vec_store_lanes_optab,
5064 vectype, count);
5068 /* Function vect_permute_store_chain.
5070 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
5071 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
5072 the data correctly for the stores. Return the final references for stores
5073 in RESULT_CHAIN.
5075 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5076 The input is 4 vectors each containing 8 elements. We assign a number to
5077 each element, the input sequence is:
5079 1st vec: 0 1 2 3 4 5 6 7
5080 2nd vec: 8 9 10 11 12 13 14 15
5081 3rd vec: 16 17 18 19 20 21 22 23
5082 4th vec: 24 25 26 27 28 29 30 31
5084 The output sequence should be:
5086 1st vec: 0 8 16 24 1 9 17 25
5087 2nd vec: 2 10 18 26 3 11 19 27
5088 3rd vec: 4 12 20 28 5 13 21 29
5089 4th vec: 6 14 22 30 7 15 23 31
5091 i.e., we interleave the contents of the four vectors in their order.
5093 We use interleave_high/low instructions to create such output. The input of
5094 each interleave_high/low operation is two vectors:
5095 1st vec 2nd vec
5096 0 1 2 3 4 5 6 7
5097 the even elements of the result vector are obtained left-to-right from the
5098 high/low elements of the first vector. The odd elements of the result are
5099 obtained left-to-right from the high/low elements of the second vector.
5100 The output of interleave_high will be: 0 4 1 5
5101 and of interleave_low: 2 6 3 7
5104 The permutation is done in log LENGTH stages. In each stage interleave_high
5105 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5106 where the first argument is taken from the first half of DR_CHAIN and the
5107 second argument from its second half.
5108 In our example,
5110 I1: interleave_high (1st vec, 3rd vec)
5111 I2: interleave_low (1st vec, 3rd vec)
5112 I3: interleave_high (2nd vec, 4th vec)
5113 I4: interleave_low (2nd vec, 4th vec)
5115 The output for the first stage is:
5117 I1: 0 16 1 17 2 18 3 19
5118 I2: 4 20 5 21 6 22 7 23
5119 I3: 8 24 9 25 10 26 11 27
5120 I4: 12 28 13 29 14 30 15 31
5122 The output of the second stage, i.e. the final result is:
5124 I1: 0 8 16 24 1 9 17 25
5125 I2: 2 10 18 26 3 11 19 27
5126 I3: 4 12 20 28 5 13 21 29
5127 I4: 6 14 22 30 7 15 23 31. */
5129 void
5130 vect_permute_store_chain (vec<tree> dr_chain,
5131 unsigned int length,
5132 stmt_vec_info stmt_info,
5133 gimple_stmt_iterator *gsi,
5134 vec<tree> *result_chain)
5136 tree vect1, vect2, high, low;
5137 gimple *perm_stmt;
5138 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5139 tree perm_mask_low, perm_mask_high;
5140 tree data_ref;
5141 tree perm3_mask_low, perm3_mask_high;
5142 unsigned int i, j, n, log_length = exact_log2 (length);
5144 result_chain->quick_grow (length);
5145 memcpy (result_chain->address (), dr_chain.address (),
5146 length * sizeof (tree));
5148 if (length == 3)
5150 /* vect_grouped_store_supported ensures that this is constant. */
5151 unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5152 unsigned int j0 = 0, j1 = 0, j2 = 0;
5154 vec_perm_builder sel (nelt, nelt, 1);
5155 sel.quick_grow (nelt);
5156 vec_perm_indices indices;
5157 for (j = 0; j < 3; j++)
5159 int nelt0 = ((3 - j) * nelt) % 3;
5160 int nelt1 = ((3 - j) * nelt + 1) % 3;
5161 int nelt2 = ((3 - j) * nelt + 2) % 3;
5163 for (i = 0; i < nelt; i++)
5165 if (3 * i + nelt0 < nelt)
5166 sel[3 * i + nelt0] = j0++;
5167 if (3 * i + nelt1 < nelt)
5168 sel[3 * i + nelt1] = nelt + j1++;
5169 if (3 * i + nelt2 < nelt)
5170 sel[3 * i + nelt2] = 0;
5172 indices.new_vector (sel, 2, nelt);
5173 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5175 for (i = 0; i < nelt; i++)
5177 if (3 * i + nelt0 < nelt)
5178 sel[3 * i + nelt0] = 3 * i + nelt0;
5179 if (3 * i + nelt1 < nelt)
5180 sel[3 * i + nelt1] = 3 * i + nelt1;
5181 if (3 * i + nelt2 < nelt)
5182 sel[3 * i + nelt2] = nelt + j2++;
5184 indices.new_vector (sel, 2, nelt);
5185 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5187 vect1 = dr_chain[0];
5188 vect2 = dr_chain[1];
5190 /* Create interleaving stmt:
5191 low = VEC_PERM_EXPR <vect1, vect2,
5192 {j, nelt, *, j + 1, nelt + j + 1, *,
5193 j + 2, nelt + j + 2, *, ...}> */
5194 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5195 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5196 vect2, perm3_mask_low);
5197 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5199 vect1 = data_ref;
5200 vect2 = dr_chain[2];
5201 /* Create interleaving stmt:
5202 low = VEC_PERM_EXPR <vect1, vect2,
5203 {0, 1, nelt + j, 3, 4, nelt + j + 1,
5204 6, 7, nelt + j + 2, ...}> */
5205 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5206 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5207 vect2, perm3_mask_high);
5208 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5209 (*result_chain)[j] = data_ref;
5212 else
5214 /* If length is not equal to 3 then only a power of 2 is supported. */
5215 gcc_assert (pow2p_hwi (length));
5217 /* The encoding has 2 interleaved stepped patterns. */
5218 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5219 vec_perm_builder sel (nelt, 2, 3);
5220 sel.quick_grow (6);
5221 for (i = 0; i < 3; i++)
5223 sel[i * 2] = i;
5224 sel[i * 2 + 1] = i + nelt;
5226 vec_perm_indices indices (sel, 2, nelt);
5227 perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5229 for (i = 0; i < 6; i++)
5230 sel[i] += exact_div (nelt, 2);
5231 indices.new_vector (sel, 2, nelt);
5232 perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5234 for (i = 0, n = log_length; i < n; i++)
5236 for (j = 0; j < length/2; j++)
5238 vect1 = dr_chain[j];
5239 vect2 = dr_chain[j+length/2];
5241 /* Create interleaving stmt:
5242 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
5243 ...}> */
5244 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
5245 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
5246 vect2, perm_mask_high);
5247 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5248 (*result_chain)[2*j] = high;
5250 /* Create interleaving stmt:
5251 low = VEC_PERM_EXPR <vect1, vect2,
5252 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
5253 ...}> */
5254 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
5255 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
5256 vect2, perm_mask_low);
5257 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5258 (*result_chain)[2*j+1] = low;
5260 memcpy (dr_chain.address (), result_chain->address (),
5261 length * sizeof (tree));
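/* Editor's illustration, not part of the original pass: for a 4-element
   vector the two masks built above expand to {0, 4, 1, 5} (perm_mask_high)
   and {2, 6, 3, 7} (perm_mask_low).  A minimal scalar sketch of one such
   interleaving stage, assuming plain int elements and an even NELT; the
   helper name is hypothetical:

     static void
     interleave_step (const int *v1, const int *v2, int nelt,
                      int *high, int *low)
     {
       for (int i = 0; i < nelt / 2; i++)
         {
           high[2 * i] = v1[i];                  // selector index i
           high[2 * i + 1] = v2[i];              // selector index nelt + i
           low[2 * i] = v1[nelt / 2 + i];        // selector index nelt/2 + i
           low[2 * i + 1] = v2[nelt / 2 + i];    // selector index nelt + nelt/2 + i
         }
     }

   Applying log2 (LENGTH) such stages to the pairs in DR_CHAIN produces the
   fully interleaved order required for the grouped store.  */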
5266 /* Function vect_setup_realignment
5268 This function is called when vectorizing an unaligned load using
5269 the dr_explicit_realign[_optimized] scheme.
5270 This function generates the following code at the loop prolog:
5272 p = initial_addr;
5273 x msq_init = *(floor(p)); # prolog load
5274 realignment_token = call target_builtin;
5275 loop:
5276 x msq = phi (msq_init, ---)
5278 The stmts marked with x are generated only for the case of
5279 dr_explicit_realign_optimized.
5281 The code above sets up a new (vector) pointer, pointing to the first
5282 location accessed by STMT_INFO, and a "floor-aligned" load using that
5283 pointer. It also generates code to compute the "realignment-token"
5284 (if the relevant target hook was defined), and creates a phi-node at the
5285 loop-header bb whose arguments are the result of the prolog-load (created
5286 by this function) and the result of a load that takes place in the loop
5287 (to be created by the caller to this function).
5289 For the case of dr_explicit_realign_optimized:
5290 The caller to this function uses the phi-result (msq) to create the
5291 realignment code inside the loop, and sets up the missing phi argument,
5292 as follows:
5293 loop:
5294 msq = phi (msq_init, lsq)
5295 lsq = *(floor(p')); # load in loop
5296 result = realign_load (msq, lsq, realignment_token);
5298 For the case of dr_explicit_realign:
5299 loop:
5300 msq = *(floor(p)); # load in loop
5301 p' = p + (VS-1);
5302 lsq = *(floor(p')); # load in loop
5303 result = realign_load (msq, lsq, realignment_token);
5305 Input:
5306 STMT_INFO - (scalar) load stmt to be vectorized. This load accesses
5307 a memory location that may be unaligned.
5308 GSI - place where new code is to be inserted.
5309 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5310 is used.
5312 Output:
5313 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5314 target hook, if defined.
5315 Return value - the result of the loop-header phi node. */
5317 tree
5318 vect_setup_realignment (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
5319 tree *realignment_token,
5320 enum dr_alignment_support alignment_support_scheme,
5321 tree init_addr,
5322 struct loop **at_loop)
5324 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5325 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5326 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
5327 struct data_reference *dr = dr_info->dr;
5328 struct loop *loop = NULL;
5329 edge pe = NULL;
5330 tree scalar_dest = gimple_assign_lhs (stmt_info->stmt);
5331 tree vec_dest;
5332 gimple *inc;
5333 tree ptr;
5334 tree data_ref;
5335 basic_block new_bb;
5336 tree msq_init = NULL_TREE;
5337 tree new_temp;
5338 gphi *phi_stmt;
5339 tree msq = NULL_TREE;
5340 gimple_seq stmts = NULL;
5341 bool compute_in_loop = false;
5342 bool nested_in_vect_loop = false;
5343 struct loop *containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
5344 struct loop *loop_for_initial_load = NULL;
5346 if (loop_vinfo)
5348 loop = LOOP_VINFO_LOOP (loop_vinfo);
5349 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
5352 gcc_assert (alignment_support_scheme == dr_explicit_realign
5353 || alignment_support_scheme == dr_explicit_realign_optimized);
5355 /* We need to generate three things:
5356 1. the misalignment computation
5357 2. the extra vector load (for the optimized realignment scheme).
5358 3. the phi node for the two vectors from which the realignment is
5359 done (for the optimized realignment scheme). */
5361 /* 1. Determine where to generate the misalignment computation.
5363 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5364 calculation will be generated by this function, outside the loop (in the
5365 preheader). Otherwise, INIT_ADDR had already been computed for us by the
5366 caller, inside the loop.
5368 Background: If the misalignment remains fixed throughout the iterations of
5369 the loop, then both realignment schemes are applicable, and also the
5370 misalignment computation can be done outside LOOP. This is because we are
5371 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5372 are a multiple of VS (the Vector Size), and therefore the misalignment in
5373 different vectorized LOOP iterations is always the same.
5374 The problem arises only if the memory access is in an inner-loop nested
5375 inside LOOP, which is now being vectorized using outer-loop vectorization.
5376 This is the only case when the misalignment of the memory access may not
5377 remain fixed throughout the iterations of the inner-loop (as explained in
5378 detail in vect_supportable_dr_alignment). In this case, not only is the
5379 optimized realignment scheme not applicable, but also the misalignment
5380 computation (and generation of the realignment token that is passed to
5381 REALIGN_LOAD) have to be done inside the loop.
5383 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5384 or not, which in turn determines if the misalignment is computed inside
5385 the inner-loop, or outside LOOP. */
5387 if (init_addr != NULL_TREE || !loop_vinfo)
5389 compute_in_loop = true;
5390 gcc_assert (alignment_support_scheme == dr_explicit_realign);
5394 /* 2. Determine where to generate the extra vector load.
5396 For the optimized realignment scheme, instead of generating two vector
5397 loads in each iteration, we generate a single extra vector load in the
5398 preheader of the loop, and in each iteration reuse the result of the
5399 vector load from the previous iteration. In case the memory access is in
5400 an inner-loop nested inside LOOP, which is now being vectorized using
5401 outer-loop vectorization, we need to determine whether this initial vector
5402 load should be generated at the preheader of the inner-loop, or can be
5403 generated at the preheader of LOOP. If the memory access has no evolution
5404 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5405 to be generated inside LOOP (in the preheader of the inner-loop). */
5407 if (nested_in_vect_loop)
5409 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5410 bool invariant_in_outerloop =
5411 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5412 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5414 else
5415 loop_for_initial_load = loop;
5416 if (at_loop)
5417 *at_loop = loop_for_initial_load;
5419 if (loop_for_initial_load)
5420 pe = loop_preheader_edge (loop_for_initial_load);
5422 /* 3. For the case of the optimized realignment, create the first vector
5423 load at the loop preheader. */
5425 if (alignment_support_scheme == dr_explicit_realign_optimized)
5427 /* Create msq_init = *(floor(p1)) in the loop preheader */
5428 gassign *new_stmt;
5430 gcc_assert (!compute_in_loop);
5431 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5432 ptr = vect_create_data_ref_ptr (stmt_info, vectype,
5433 loop_for_initial_load, NULL_TREE,
5434 &init_addr, NULL, &inc, true);
5435 if (TREE_CODE (ptr) == SSA_NAME)
5436 new_temp = copy_ssa_name (ptr);
5437 else
5438 new_temp = make_ssa_name (TREE_TYPE (ptr));
5439 poly_uint64 align = DR_TARGET_ALIGNMENT (dr_info);
5440 tree type = TREE_TYPE (ptr);
5441 new_stmt = gimple_build_assign
5442 (new_temp, BIT_AND_EXPR, ptr,
5443 fold_build2 (MINUS_EXPR, type,
5444 build_int_cst (type, 0),
5445 build_int_cst (type, align)));
5446 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5447 gcc_assert (!new_bb);
5448 data_ref
5449 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5450 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5451 vect_copy_ref_info (data_ref, DR_REF (dr));
5452 new_stmt = gimple_build_assign (vec_dest, data_ref);
5453 new_temp = make_ssa_name (vec_dest, new_stmt);
5454 gimple_assign_set_lhs (new_stmt, new_temp);
5455 if (pe)
5457 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5458 gcc_assert (!new_bb);
5460 else
5461 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5463 msq_init = gimple_assign_lhs (new_stmt);
5466 /* 4. Create realignment token using a target builtin, if available.
5467 It is done either inside the containing loop, or before LOOP (as
5468 determined above). */
5470 if (targetm.vectorize.builtin_mask_for_load)
5472 gcall *new_stmt;
5473 tree builtin_decl;
5475 /* Compute INIT_ADDR - the initial address accessed by this memref. */
5476 if (!init_addr)
5478 /* Generate the INIT_ADDR computation outside LOOP. */
5479 init_addr = vect_create_addr_base_for_vector_ref (stmt_info, &stmts,
5480 NULL_TREE);
5481 if (loop)
5483 pe = loop_preheader_edge (loop);
5484 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5485 gcc_assert (!new_bb);
5487 else
5488 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5491 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5492 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5493 vec_dest =
5494 vect_create_destination_var (scalar_dest,
5495 gimple_call_return_type (new_stmt));
5496 new_temp = make_ssa_name (vec_dest, new_stmt);
5497 gimple_call_set_lhs (new_stmt, new_temp);
5499 if (compute_in_loop)
5500 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5501 else
5503 /* Generate the misalignment computation outside LOOP. */
5504 pe = loop_preheader_edge (loop);
5505 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5506 gcc_assert (!new_bb);
5509 *realignment_token = gimple_call_lhs (new_stmt);
5511 /* The result of the CALL_EXPR to this builtin is determined from
5512 the value of the parameter and no global variables are touched,
5513 which makes the builtin a "const" function. Requiring the
5514 builtin to have the "const" attribute makes it unnecessary
5515 to call mark_call_clobbered. */
5516 gcc_assert (TREE_READONLY (builtin_decl));
5519 if (alignment_support_scheme == dr_explicit_realign)
5520 return msq;
5522 gcc_assert (!compute_in_loop);
5523 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5526 /* 5. Create msq = phi <msq_init, lsq> in loop */
5528 pe = loop_preheader_edge (containing_loop);
5529 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5530 msq = make_ssa_name (vec_dest);
5531 phi_stmt = create_phi_node (msq, containing_loop->header);
5532 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5534 return msq;
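/* Editor's illustration, not part of the original function: a scalar model
   of what the generated REALIGN_LOAD sequence computes at run time, under
   the assumption of byte-sized elements and a power-of-two vector size VS
   (all names below are hypothetical):

     #include <stdint.h>

     static void
     realign_load_model (const unsigned char *addr, int vs,
                         unsigned char *result)
     {
       const unsigned char *msq_base
         = (const unsigned char *) ((uintptr_t) addr & -(uintptr_t) vs);
       const unsigned char *lsq_base = msq_base + vs;  // next aligned vector
       int token = addr - msq_base;                    // realignment amount
       for (int i = 0; i < vs; i++)
         result[i] = (token + i < vs) ? msq_base[token + i]
                                      : lsq_base[token + i - vs];
     }

   MSQ_BASE models the prolog load msq_init (or the in-loop msq for
   dr_explicit_realign), LSQ_BASE the in-loop lsq, and TOKEN plays the role
   of the realignment token returned through REALIGNMENT_TOKEN.  */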
5538 /* Function vect_grouped_load_supported.
5540 COUNT is the size of the load group (the number of statements plus the
5541 number of gaps). SINGLE_ELEMENT_P is true if there is actually
5542 only one statement, with a gap of COUNT - 1.
5544 Returns true if a suitable permute exists. */
5546 bool
5547 vect_grouped_load_supported (tree vectype, bool single_element_p,
5548 unsigned HOST_WIDE_INT count)
5550 machine_mode mode = TYPE_MODE (vectype);
5552 /* If this is single-element interleaving with an element distance
5553 that leaves unused vector loads around, punt - we at least create
5554 very sub-optimal code in that case (and blow up memory,
5555 see PR65518). */
5556 if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype)))
5558 if (dump_enabled_p ())
5559 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5560 "single-element interleaving not supported "
5561 "for not adjacent vector loads\n");
5562 return false;
5565 /* vect_permute_load_chain requires the group size to be equal to 3 or
5566 be a power of two. */
5567 if (count != 3 && exact_log2 (count) == -1)
5569 if (dump_enabled_p ())
5570 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5571 "the size of the group of accesses"
5572 " is not a power of 2 or not equal to 3\n");
5573 return false;
5576 /* Check that the permutation is supported. */
5577 if (VECTOR_MODE_P (mode))
5579 unsigned int i, j;
5580 if (count == 3)
5582 unsigned int nelt;
5583 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5585 if (dump_enabled_p ())
5586 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5587 "cannot handle groups of 3 loads for"
5588 " variable-length vectors\n");
5589 return false;
5592 vec_perm_builder sel (nelt, nelt, 1);
5593 sel.quick_grow (nelt);
5594 vec_perm_indices indices;
5595 unsigned int k;
5596 for (k = 0; k < 3; k++)
5598 for (i = 0; i < nelt; i++)
5599 if (3 * i + k < 2 * nelt)
5600 sel[i] = 3 * i + k;
5601 else
5602 sel[i] = 0;
5603 indices.new_vector (sel, 2, nelt);
5604 if (!can_vec_perm_const_p (mode, indices))
5606 if (dump_enabled_p ())
5607 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5608 "shuffle of 3 loads is not supported by"
5609 " target\n");
5610 return false;
5612 for (i = 0, j = 0; i < nelt; i++)
5613 if (3 * i + k < 2 * nelt)
5614 sel[i] = i;
5615 else
5616 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5617 indices.new_vector (sel, 2, nelt);
5618 if (!can_vec_perm_const_p (mode, indices))
5620 if (dump_enabled_p ())
5621 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5622 "shuffle of 3 loads is not supported by"
5623 " target\n");
5624 return false;
5627 return true;
5629 else
5631 /* If the length is not 3, only powers of 2 are supported. */
5632 gcc_assert (pow2p_hwi (count));
5633 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5635 /* The encoding has a single stepped pattern. */
5636 vec_perm_builder sel (nelt, 1, 3);
5637 sel.quick_grow (3);
5638 for (i = 0; i < 3; i++)
5639 sel[i] = i * 2;
5640 vec_perm_indices indices (sel, 2, nelt);
5641 if (can_vec_perm_const_p (mode, indices))
5643 for (i = 0; i < 3; i++)
5644 sel[i] = i * 2 + 1;
5645 indices.new_vector (sel, 2, nelt);
5646 if (can_vec_perm_const_p (mode, indices))
5647 return true;
5652 if (dump_enabled_p ())
5653 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5654 "extract even/odd not supported by target\n");
5655 return false;
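/* Editor's illustration, not part of the original function: a standalone
   sketch that expands the two selectors checked above for a group of three
   loads, given NELT and K (the helper is hypothetical; it mirrors the two
   loops in the count == 3 case):

     static void
     group3_load_sels (int nelt, int k, int *sel_low, int *sel_high)
     {
       int j = 0;
       for (int i = 0; i < nelt; i++)
         sel_low[i] = (3 * i + k < 2 * nelt) ? 3 * i + k : 0;
       for (int i = 0; i < nelt; i++)
         sel_high[i] = (3 * i + k < 2 * nelt)
                       ? i : nelt + ((nelt + k) % 3) + 3 * j++;
     }

   For a 4-element vector and K == 0 this gives {0, 3, 6, 0} and
   {0, 1, 2, 5}; selector values >= NELT pick elements from the second
   operand of the corresponding VEC_PERM_EXPR.  */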
5658 /* Return TRUE if vec_{masked_}load_lanes is available for COUNT vectors of
5659 type VECTYPE. MASKED_P says whether the masked form is needed. */
5661 bool
5662 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5663 bool masked_p)
5665 if (masked_p)
5666 return vect_lanes_optab_supported_p ("vec_mask_load_lanes",
5667 vec_mask_load_lanes_optab,
5668 vectype, count);
5669 else
5670 return vect_lanes_optab_supported_p ("vec_load_lanes",
5671 vec_load_lanes_optab,
5672 vectype, count);
5675 /* Function vect_permute_load_chain.
5677 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5678 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5679 the input data correctly. Return the final references for loads in
5680 RESULT_CHAIN.
5682 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5683 The input is 4 vectors each containing 8 elements. We assign a number to each
5684 element, the input sequence is:
5686 1st vec: 0 1 2 3 4 5 6 7
5687 2nd vec: 8 9 10 11 12 13 14 15
5688 3rd vec: 16 17 18 19 20 21 22 23
5689 4th vec: 24 25 26 27 28 29 30 31
5691 The output sequence should be:
5693 1st vec: 0 4 8 12 16 20 24 28
5694 2nd vec: 1 5 9 13 17 21 25 29
5695 3rd vec: 2 6 10 14 18 22 26 30
5696 4th vec: 3 7 11 15 19 23 27 31
5698 i.e., the first output vector should contain the first elements of each
5699 interleaving group, etc.
5701 We use extract_even/odd instructions to create such output. The input of
5702 each extract_even/odd operation is two vectors
5703 1st vec 2nd vec
5704 0 1 2 3 4 5 6 7
5706 and the output is the vector of extracted even/odd elements. The output of
5707 extract_even will be: 0 2 4 6
5708 and of extract_odd: 1 3 5 7
5711 The permutation is done in log LENGTH stages. In each stage extract_even
5712 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5713 their order. In our example,
5715 E1: extract_even (1st vec, 2nd vec)
5716 E2: extract_odd (1st vec, 2nd vec)
5717 E3: extract_even (3rd vec, 4th vec)
5718 E4: extract_odd (3rd vec, 4th vec)
5720 The output for the first stage will be:
5722 E1: 0 2 4 6 8 10 12 14
5723 E2: 1 3 5 7 9 11 13 15
5724 E3: 16 18 20 22 24 26 28 30
5725 E4: 17 19 21 23 25 27 29 31
5727 In order to proceed and create the correct sequence for the next stage (or
5728 for the correct output, if the second stage is the last one, as in our
5729 example), we first put the output of extract_even operation and then the
5730 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5731 The input for the second stage is:
5733 1st vec (E1): 0 2 4 6 8 10 12 14
5734 2nd vec (E3): 16 18 20 22 24 26 28 30
5735 3rd vec (E2): 1 3 5 7 9 11 13 15
5736 4th vec (E4): 17 19 21 23 25 27 29 31
5738 The output of the second stage:
5740 E1: 0 4 8 12 16 20 24 28
5741 E2: 2 6 10 14 18 22 26 30
5742 E3: 1 5 9 13 17 21 25 29
5743 E4: 3 7 11 15 19 23 27 31
5745 And RESULT_CHAIN after reordering:
5747 1st vec (E1): 0 4 8 12 16 20 24 28
5748 2nd vec (E3): 1 5 9 13 17 21 25 29
5749 3rd vec (E2): 2 6 10 14 18 22 26 30
5750 4th vec (E4): 3 7 11 15 19 23 27 31. */
5752 static void
5753 vect_permute_load_chain (vec<tree> dr_chain,
5754 unsigned int length,
5755 stmt_vec_info stmt_info,
5756 gimple_stmt_iterator *gsi,
5757 vec<tree> *result_chain)
5759 tree data_ref, first_vect, second_vect;
5760 tree perm_mask_even, perm_mask_odd;
5761 tree perm3_mask_low, perm3_mask_high;
5762 gimple *perm_stmt;
5763 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5764 unsigned int i, j, log_length = exact_log2 (length);
5766 result_chain->quick_grow (length);
5767 memcpy (result_chain->address (), dr_chain.address (),
5768 length * sizeof (tree));
5770 if (length == 3)
5772 /* vect_grouped_load_supported ensures that this is constant. */
5773 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5774 unsigned int k;
5776 vec_perm_builder sel (nelt, nelt, 1);
5777 sel.quick_grow (nelt);
5778 vec_perm_indices indices;
5779 for (k = 0; k < 3; k++)
5781 for (i = 0; i < nelt; i++)
5782 if (3 * i + k < 2 * nelt)
5783 sel[i] = 3 * i + k;
5784 else
5785 sel[i] = 0;
5786 indices.new_vector (sel, 2, nelt);
5787 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5789 for (i = 0, j = 0; i < nelt; i++)
5790 if (3 * i + k < 2 * nelt)
5791 sel[i] = i;
5792 else
5793 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5794 indices.new_vector (sel, 2, nelt);
5795 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5797 first_vect = dr_chain[0];
5798 second_vect = dr_chain[1];
5800 /* Create interleaving stmt (low part of):
5801 low = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5802 ...}> */
5803 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5804 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5805 second_vect, perm3_mask_low);
5806 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5808 /* Create interleaving stmt (high part of):
5809 high = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5810 ...}> */
5811 first_vect = data_ref;
5812 second_vect = dr_chain[2];
5813 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5814 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5815 second_vect, perm3_mask_high);
5816 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5817 (*result_chain)[k] = data_ref;
5820 else
5822 /* If the length is not 3, only powers of 2 are supported. */
5823 gcc_assert (pow2p_hwi (length));
5825 /* The encoding has a single stepped pattern. */
5826 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5827 vec_perm_builder sel (nelt, 1, 3);
5828 sel.quick_grow (3);
5829 for (i = 0; i < 3; ++i)
5830 sel[i] = i * 2;
5831 vec_perm_indices indices (sel, 2, nelt);
5832 perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);
5834 for (i = 0; i < 3; ++i)
5835 sel[i] = i * 2 + 1;
5836 indices.new_vector (sel, 2, nelt);
5837 perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
5839 for (i = 0; i < log_length; i++)
5841 for (j = 0; j < length; j += 2)
5843 first_vect = dr_chain[j];
5844 second_vect = dr_chain[j+1];
5846 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5847 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5848 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5849 first_vect, second_vect,
5850 perm_mask_even);
5851 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5852 (*result_chain)[j/2] = data_ref;
5854 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5855 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5856 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5857 first_vect, second_vect,
5858 perm_mask_odd);
5859 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5860 (*result_chain)[j/2+length/2] = data_ref;
5862 memcpy (dr_chain.address (), result_chain->address (),
5863 length * sizeof (tree));
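/* Editor's illustration, not part of the original pass: a scalar model of
   one extract-even/extract-odd stage, matching the two masks built above
   (plain int elements; the helper name is hypothetical):

     static void
     extract_even_odd_step (const int *v1, const int *v2, int nelt,
                            int *even, int *odd)
     {
       for (int i = 0; i < nelt; i++)
         {
           // Selector 2*i (resp. 2*i + 1) indexes the concatenation v1 ++ v2.
           even[i] = (2 * i < nelt) ? v1[2 * i] : v2[2 * i - nelt];
           odd[i] = (2 * i + 1 < nelt) ? v1[2 * i + 1] : v2[2 * i + 1 - nelt];
         }
     }

   Running log2 (LENGTH) such stages over the pairs in DR_CHAIN, storing the
   even results in the first half of RESULT_CHAIN and the odd results in the
   second half, reproduces the reordering worked through in the comment
   before this function.  */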
5868 /* Function vect_shift_permute_load_chain.
5870 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
5871 sequence of stmts to reorder the input data accordingly.
5872 Return the final references for loads in RESULT_CHAIN.
5873 Return true if successful, false otherwise.
5875 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5876 The input is 3 vectors each containing 8 elements. We assign a
5877 number to each element, the input sequence is:
5879 1st vec: 0 1 2 3 4 5 6 7
5880 2nd vec: 8 9 10 11 12 13 14 15
5881 3rd vec: 16 17 18 19 20 21 22 23
5883 The output sequence should be:
5885 1st vec: 0 3 6 9 12 15 18 21
5886 2nd vec: 1 4 7 10 13 16 19 22
5887 3rd vec: 2 5 8 11 14 17 20 23
5889 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
5891 First we shuffle all 3 vectors to get correct elements order:
5893 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
5894 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
5895 3rd vec: (16 19 22) (17 20 23) (18 21)
5897 Next we concatenate and shift the vectors 3 times:
5899 1st step:
5900 shift right by 6 the concatenation of:
5901 "1st vec" and "2nd vec"
5902 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
5903 "2nd vec" and "3rd vec"
5904 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
5905 "3rd vec" and "1st vec"
5906 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
5907 | New vectors |
5909 So that now new vectors are:
5911 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
5912 2nd vec: (10 13) (16 19 22) (17 20 23)
5913 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
5915 2nd step:
5916 shift right by 5 the concatenation of:
5917 "1st vec" and "3rd vec"
5918 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
5919 "2nd vec" and "1st vec"
5920 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
5921 "3rd vec" and "2nd vec"
5922 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
5923 | New vectors |
5925 So that now new vectors are:
5927 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
5928 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
5929 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
5931 3rd step:
5932 shift right by 5 the concatenation of:
5933 "1st vec" and "1st vec"
5934 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
5935 shift right by 3 the concatenation of:
5936 "2nd vec" and "2nd vec"
5937 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
5938 | New vectors |
5940 So that now all vectors are READY:
5941 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
5942 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
5943 3rd vec: ( 1 4 7) (10 13) (16 19 22)
5945 This algorithm is faster than the one in vect_permute_load_chain if:
5946 1. "shift of a concatenation" is faster than a general permutation.
5947 This is usually the case.
5948 2. The TARGET machine cannot execute vector instructions in parallel.
5949 This matters because each step of this algorithm depends on the previous
5950 one, while the algorithm in vect_permute_load_chain is much more parallel.
5952 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
5955 static bool
5956 vect_shift_permute_load_chain (vec<tree> dr_chain,
5957 unsigned int length,
5958 stmt_vec_info stmt_info,
5959 gimple_stmt_iterator *gsi,
5960 vec<tree> *result_chain)
5962 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
5963 tree perm2_mask1, perm2_mask2, perm3_mask;
5964 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
5965 gimple *perm_stmt;
5967 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5968 unsigned int i;
5969 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5971 unsigned HOST_WIDE_INT nelt, vf;
5972 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt)
5973 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
5974 /* Not supported for variable-length vectors. */
5975 return false;
5977 vec_perm_builder sel (nelt, nelt, 1);
5978 sel.quick_grow (nelt);
5980 result_chain->quick_grow (length);
5981 memcpy (result_chain->address (), dr_chain.address (),
5982 length * sizeof (tree));
5984 if (pow2p_hwi (length) && vf > 4)
5986 unsigned int j, log_length = exact_log2 (length);
5987 for (i = 0; i < nelt / 2; ++i)
5988 sel[i] = i * 2;
5989 for (i = 0; i < nelt / 2; ++i)
5990 sel[nelt / 2 + i] = i * 2 + 1;
5991 vec_perm_indices indices (sel, 2, nelt);
5992 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5994 if (dump_enabled_p ())
5995 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5996 "shuffle of 2 fields structure is not \
5997 supported by target\n");
5998 return false;
6000 perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);
6002 for (i = 0; i < nelt / 2; ++i)
6003 sel[i] = i * 2 + 1;
6004 for (i = 0; i < nelt / 2; ++i)
6005 sel[nelt / 2 + i] = i * 2;
6006 indices.new_vector (sel, 2, nelt);
6007 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6009 if (dump_enabled_p ())
6010 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6011 "shuffle of 2 fields structure is not \
6012 supported by target\n");
6013 return false;
6015 perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);
6017 /* Generating permutation constant to shift all elements.
6018 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
6019 for (i = 0; i < nelt; i++)
6020 sel[i] = nelt / 2 + i;
6021 indices.new_vector (sel, 2, nelt);
6022 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6024 if (dump_enabled_p ())
6025 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6026 "shift permutation is not supported by target\n");
6027 return false;
6029 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6031 /* Generating permutation constant to select vector from 2.
6032 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
6033 for (i = 0; i < nelt / 2; i++)
6034 sel[i] = i;
6035 for (i = nelt / 2; i < nelt; i++)
6036 sel[i] = nelt + i;
6037 indices.new_vector (sel, 2, nelt);
6038 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6040 if (dump_enabled_p ())
6041 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6042 "select is not supported by target\n");
6043 return false;
6045 select_mask = vect_gen_perm_mask_checked (vectype, indices);
6047 for (i = 0; i < log_length; i++)
6049 for (j = 0; j < length; j += 2)
6051 first_vect = dr_chain[j];
6052 second_vect = dr_chain[j + 1];
6054 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6055 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6056 first_vect, first_vect,
6057 perm2_mask1);
6058 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6059 vect[0] = data_ref;
6061 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6062 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6063 second_vect, second_vect,
6064 perm2_mask2);
6065 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6066 vect[1] = data_ref;
6068 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
6069 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6070 vect[0], vect[1], shift1_mask);
6071 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6072 (*result_chain)[j/2 + length/2] = data_ref;
6074 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
6075 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6076 vect[0], vect[1], select_mask);
6077 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6078 (*result_chain)[j/2] = data_ref;
6080 memcpy (dr_chain.address (), result_chain->address (),
6081 length * sizeof (tree));
6083 return true;
6085 if (length == 3 && vf > 2)
6087 unsigned int k = 0, l = 0;
6089 /* Generating permutation constant to get all elements in right order.
6090 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
6091 for (i = 0; i < nelt; i++)
6093 if (3 * k + (l % 3) >= nelt)
6095 k = 0;
6096 l += (3 - (nelt % 3));
6098 sel[i] = 3 * k + (l % 3);
6099 k++;
6101 vec_perm_indices indices (sel, 2, nelt);
6102 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6104 if (dump_enabled_p ())
6105 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6106 "shuffle of 3 fields structure is not \
6107 supported by target\n");
6108 return false;
6110 perm3_mask = vect_gen_perm_mask_checked (vectype, indices);
6112 /* Generating permutation constant to shift all elements.
6113 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
6114 for (i = 0; i < nelt; i++)
6115 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
6116 indices.new_vector (sel, 2, nelt);
6117 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6119 if (dump_enabled_p ())
6120 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6121 "shift permutation is not supported by target\n");
6122 return false;
6124 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6126 /* Generating permutation constant to shift all elements.
6127 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6128 for (i = 0; i < nelt; i++)
6129 sel[i] = 2 * (nelt / 3) + 1 + i;
6130 indices.new_vector (sel, 2, nelt);
6131 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6133 if (dump_enabled_p ())
6134 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6135 "shift permutation is not supported by target\n");
6136 return false;
6138 shift2_mask = vect_gen_perm_mask_checked (vectype, indices);
6140 /* Generating permutation constant to shift all elements.
6141 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
6142 for (i = 0; i < nelt; i++)
6143 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
6144 indices.new_vector (sel, 2, nelt);
6145 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6147 if (dump_enabled_p ())
6148 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6149 "shift permutation is not supported by target\n");
6150 return false;
6152 shift3_mask = vect_gen_perm_mask_checked (vectype, indices);
6154 /* Generating permutation constant to shift all elements.
6155 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6156 for (i = 0; i < nelt; i++)
6157 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
6158 indices.new_vector (sel, 2, nelt);
6159 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6161 if (dump_enabled_p ())
6162 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6163 "shift permutation is not supported by target\n");
6164 return false;
6166 shift4_mask = vect_gen_perm_mask_checked (vectype, indices);
6168 for (k = 0; k < 3; k++)
6170 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
6171 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6172 dr_chain[k], dr_chain[k],
6173 perm3_mask);
6174 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6175 vect[k] = data_ref;
6178 for (k = 0; k < 3; k++)
6180 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
6181 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6182 vect[k % 3], vect[(k + 1) % 3],
6183 shift1_mask);
6184 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6185 vect_shift[k] = data_ref;
6188 for (k = 0; k < 3; k++)
6190 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
6191 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6192 vect_shift[(4 - k) % 3],
6193 vect_shift[(3 - k) % 3],
6194 shift2_mask);
6195 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6196 vect[k] = data_ref;
6199 (*result_chain)[3 - (nelt % 3)] = vect[2];
6201 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
6202 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
6203 vect[0], shift3_mask);
6204 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6205 (*result_chain)[nelt % 3] = data_ref;
6207 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
6208 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
6209 vect[1], shift4_mask);
6210 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6211 (*result_chain)[0] = data_ref;
6212 return true;
6214 return false;
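/* Editor's illustration, not part of the original pass: both branches above
   lean on a "shift of a concatenation", i.e. a VEC_PERM_EXPR whose selector
   is {S, S+1, ..., S+NELT-1} over the 2*NELT-element concatenation of its
   operands.  A scalar model, assuming plain int elements (the helper name
   is hypothetical):

     static void
     shift_concat (const int *v1, const int *v2, int nelt, int shift,
                   int *result)
     {
       for (int i = 0; i < nelt; i++)
         {
           int s = shift + i;   // index into the concatenation v1 ++ v2
           result[i] = (s < nelt) ? v1[s] : v2[s - nelt];
         }
     }

   In the length == 3 branch, shift1_mask corresponds to
   SHIFT == 2 * (nelt / 3) + (nelt % 3) (6 for an 8-element vector) and
   shift2_mask to SHIFT == 2 * (nelt / 3) + 1 (5 for 8 elements), matching
   the "shift right by 6" and "shift right by 5" steps in the comment
   before this function.  */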
6217 /* Function vect_transform_grouped_load.
6219 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
6220 to perform their permutation and assign the resulting vectorized statements to
6221 the scalar statements.
6224 void
6225 vect_transform_grouped_load (stmt_vec_info stmt_info, vec<tree> dr_chain,
6226 int size, gimple_stmt_iterator *gsi)
6228 machine_mode mode;
6229 vec<tree> result_chain = vNULL;
6231 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
6232 RESULT_CHAIN is the output of vect_permute_load_chain; it contains permuted
6233 vectors that are ready for vector computation. */
6234 result_chain.create (size);
6236 /* If the reassociation width for the vector type is 2 or greater, the target
6237 machine can execute 2 or more vector instructions in parallel. Otherwise
6238 try to get the chain for the load group using vect_shift_permute_load_chain. */
6239 mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info));
6240 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
6241 || pow2p_hwi (size)
6242 || !vect_shift_permute_load_chain (dr_chain, size, stmt_info,
6243 gsi, &result_chain))
6244 vect_permute_load_chain (dr_chain, size, stmt_info, gsi, &result_chain);
6245 vect_record_grouped_load_vectors (stmt_info, result_chain);
6246 result_chain.release ();
6249 /* RESULT_CHAIN contains the output of a group of grouped loads that were
6250 generated as part of the vectorization of STMT_INFO. Assign the statement
6251 for each vector to the associated scalar statement. */
6253 void
6254 vect_record_grouped_load_vectors (stmt_vec_info stmt_info,
6255 vec<tree> result_chain)
6257 vec_info *vinfo = stmt_info->vinfo;
6258 stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
6259 unsigned int i, gap_count;
6260 tree tmp_data_ref;
6262 /* Put a permuted data-ref in the VECTORIZED_STMT field.
6263 Since we scan the chain starting from its first node, their order
6264 corresponds to the order of data-refs in RESULT_CHAIN. */
6265 stmt_vec_info next_stmt_info = first_stmt_info;
6266 gap_count = 1;
6267 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
6269 if (!next_stmt_info)
6270 break;
6272 /* Skip the gaps. Loads created for the gaps will be removed by dead
6273 code elimination pass later. No need to check for the first stmt in
6274 the group, since it always exists.
6275 DR_GROUP_GAP is the number of steps in elements from the previous
6276 access (if there is no gap DR_GROUP_GAP is 1). We skip loads that
6277 correspond to the gaps. */
6278 if (next_stmt_info != first_stmt_info
6279 && gap_count < DR_GROUP_GAP (next_stmt_info))
6281 gap_count++;
6282 continue;
6285 while (next_stmt_info)
6287 stmt_vec_info new_stmt_info = vinfo->lookup_def (tmp_data_ref);
6288 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
6289 copies, and we put the new vector statement in the first available
6290 RELATED_STMT. */
6291 if (!STMT_VINFO_VEC_STMT (next_stmt_info))
6292 STMT_VINFO_VEC_STMT (next_stmt_info) = new_stmt_info;
6293 else
6295 if (!DR_GROUP_SAME_DR_STMT (next_stmt_info))
6297 stmt_vec_info prev_stmt_info
6298 = STMT_VINFO_VEC_STMT (next_stmt_info);
6299 stmt_vec_info rel_stmt_info
6300 = STMT_VINFO_RELATED_STMT (prev_stmt_info);
6301 while (rel_stmt_info)
6303 prev_stmt_info = rel_stmt_info;
6304 rel_stmt_info = STMT_VINFO_RELATED_STMT (rel_stmt_info);
6307 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info;
6311 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
6312 gap_count = 1;
6313 /* If NEXT_STMT_INFO accesses the same DR as the previous statement,
6314 put the same TMP_DATA_REF as its vectorized statement; otherwise
6315 get the next data-ref from RESULT_CHAIN. */
6316 if (!next_stmt_info || !DR_GROUP_SAME_DR_STMT (next_stmt_info))
6317 break;
6322 /* Function vect_can_force_dr_alignment_p.
6324 Returns whether the alignment of a DECL can be forced to be aligned
6325 on an ALIGNMENT-bit boundary. */
6327 bool
6328 vect_can_force_dr_alignment_p (const_tree decl, poly_uint64 alignment)
6330 if (!VAR_P (decl))
6331 return false;
6333 if (decl_in_symtab_p (decl)
6334 && !symtab_node::get (decl)->can_increase_alignment_p ())
6335 return false;
6337 if (TREE_STATIC (decl))
6338 return (known_le (alignment,
6339 (unsigned HOST_WIDE_INT) MAX_OFILE_ALIGNMENT));
6340 else
6341 return (known_le (alignment, (unsigned HOST_WIDE_INT) MAX_STACK_ALIGNMENT));
6345 /* Return whether the data reference DR_INFO is supported with respect to its
6346 alignment.
6347 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
6348 if it is aligned, i.e., check if it is possible to vectorize it with a different
6349 alignment. */
6351 enum dr_alignment_support
6352 vect_supportable_dr_alignment (dr_vec_info *dr_info,
6353 bool check_aligned_accesses)
6355 data_reference *dr = dr_info->dr;
6356 stmt_vec_info stmt_info = dr_info->stmt;
6357 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6358 machine_mode mode = TYPE_MODE (vectype);
6359 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6360 struct loop *vect_loop = NULL;
6361 bool nested_in_vect_loop = false;
6363 if (aligned_access_p (dr_info) && !check_aligned_accesses)
6364 return dr_aligned;
6366 /* For now assume all conditional loads/stores support unaligned
6367 access without any special code. */
6368 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
6369 if (gimple_call_internal_p (stmt)
6370 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
6371 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
6372 return dr_unaligned_supported;
6374 if (loop_vinfo)
6376 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
6377 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt_info);
6380 /* Possibly unaligned access. */
6382 /* We can choose between using the implicit realignment scheme (generating
6383 a misaligned_move stmt) and the explicit realignment scheme (generating
6384 aligned loads with a REALIGN_LOAD). There are two variants to the
6385 explicit realignment scheme: optimized, and unoptimized.
6386 We can optimize the realignment only if the step between consecutive
6387 vector loads is equal to the vector size. Since the vector memory
6388 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
6389 is guaranteed that the misalignment amount remains the same throughout the
6390 execution of the vectorized loop. Therefore, we can create the
6391 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
6392 at the loop preheader.
6394 However, in the case of outer-loop vectorization, when vectorizing a
6395 memory access in the inner-loop nested within the LOOP that is now being
6396 vectorized, while it is guaranteed that the misalignment of the
6397 vectorized memory access will remain the same in different outer-loop
6398 iterations, it is *not* guaranteed that it will remain the same throughout
6399 the execution of the inner-loop. This is because the inner-loop advances
6400 with the original scalar step (and not in steps of VS). If the inner-loop
6401 step happens to be a multiple of VS, then the misalignment remains fixed
6402 and we can use the optimized realignment scheme. For example:
6404 for (i=0; i<N; i++)
6405 for (j=0; j<M; j++)
6406 s += a[i+j];
6408 When vectorizing the i-loop in the above example, the step between
6409 consecutive vector loads is 1, and so the misalignment does not remain
6410 fixed across the execution of the inner-loop, and the realignment cannot
6411 be optimized (as illustrated in the following pseudo vectorized loop):
6413 for (i=0; i<N; i+=4)
6414 for (j=0; j<M; j++){
6415 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
6416 // when j is {0,1,2,3,4,5,6,7,...} respectively.
6417 // (assuming that we start from an aligned address).
6420 We therefore have to use the unoptimized realignment scheme:
6422 for (i=0; i<N; i+=4)
6423 for (j=k; j<M; j+=4)
6424 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
6425 // that the misalignment of the initial address is
6426 // 0).
6428 The loop can then be vectorized as follows:
6430 for (k=0; k<4; k++){
6431 rt = get_realignment_token (&vp[k]);
6432 for (i=0; i<N; i+=4){
6433 v1 = vp[i+k];
6434 for (j=k; j<M; j+=4){
6435 v2 = vp[i+j+VS-1];
6436 va = REALIGN_LOAD <v1,v2,rt>;
6437 vs += va;
6438 v1 = v2;
6441 } */
6443 if (DR_IS_READ (dr))
6445 bool is_packed = false;
6446 tree type = (TREE_TYPE (DR_REF (dr)));
6448 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
6449 && (!targetm.vectorize.builtin_mask_for_load
6450 || targetm.vectorize.builtin_mask_for_load ()))
6452 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6454 /* If we are doing SLP then the accesses need not have the
6455 same alignment, instead it depends on the SLP group size. */
6456 if (loop_vinfo
6457 && STMT_SLP_TYPE (stmt_info)
6458 && !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6459 * (DR_GROUP_SIZE
6460 (DR_GROUP_FIRST_ELEMENT (stmt_info))),
6461 TYPE_VECTOR_SUBPARTS (vectype)))
6463 else if (!loop_vinfo
6464 || (nested_in_vect_loop
6465 && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
6466 GET_MODE_SIZE (TYPE_MODE (vectype)))))
6467 return dr_explicit_realign;
6468 else
6469 return dr_explicit_realign_optimized;
6471 if (!known_alignment_for_access_p (dr_info))
6472 is_packed = not_size_aligned (DR_REF (dr));
6474 if (targetm.vectorize.support_vector_misalignment
6475 (mode, type, DR_MISALIGNMENT (dr_info), is_packed))
6476 /* Can't software pipeline the loads, but can at least do them. */
6477 return dr_unaligned_supported;
6479 else
6481 bool is_packed = false;
6482 tree type = (TREE_TYPE (DR_REF (dr)));
6484 if (!known_alignment_for_access_p (dr_info))
6485 is_packed = not_size_aligned (DR_REF (dr));
6487 if (targetm.vectorize.support_vector_misalignment
6488 (mode, type, DR_MISALIGNMENT (dr_info), is_packed))
6489 return dr_unaligned_supported;
6492 /* Unsupported. */
6493 return dr_unaligned_unsupported;
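/* Editor's illustration, not a quotation of the actual callers: code that
   uses this function typically branches on the returned classification
   along these lines (a sketch only):

     enum dr_alignment_support scheme
       = vect_supportable_dr_alignment (dr_info, false);
     switch (scheme)
       {
       case dr_aligned:
       case dr_unaligned_supported:
         // Emit an ordinary (possibly misaligned) vector access.
         break;
       case dr_explicit_realign:
       case dr_explicit_realign_optimized:
         // Loads only: emit REALIGN_LOAD, using vect_setup_realignment
         // to create the prolog code and realignment token.
         break;
       case dr_unaligned_unsupported:
       default:
         // Give up on vectorizing this data reference.
         break;
       }
   */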