[33/46] Use stmt_vec_infos instead of vec_info/gimple stmt pairs
[official-gcc.git] / gcc / tree-vect-data-refs.c
blob f3cf4404bbd75c332833124f5d2dcbe0c25c9719
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2018 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "memmodel.h"
32 #include "tm_p.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "cgraph.h"
36 #include "dumpfile.h"
37 #include "alias.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "tree-eh.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop.h"
47 #include "cfgloop.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
50 #include "expr.h"
51 #include "builtins.h"
52 #include "params.h"
53 #include "tree-cfg.h"
54 #include "tree-hash-traits.h"
55 #include "vec-perm-indices.h"
56 #include "internal-fn.h"
58 /* Return true if load- or store-lanes optab OPTAB is implemented for
59 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
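/* As a rough illustration (hypothetical types and target): asking this
   function about vec_load_lanes_optab with a V4SI VECTYPE and COUNT == 3
   amounts to checking whether the target provides an array mode holding
   three V4SI vectors (via TARGET_ARRAY_MODE or a 384-bit integer mode)
   and whether a vec_load_lanes pattern exists for that pair of modes, as
   targets with load-multiple instructions such as AArch64's LD3 do.  */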
61 static bool
62 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
63 tree vectype, unsigned HOST_WIDE_INT count)
65 machine_mode mode, array_mode;
66 bool limit_p;
68 mode = TYPE_MODE (vectype);
69 if (!targetm.array_mode (mode, count).exists (&array_mode))
71 poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
72 limit_p = !targetm.array_mode_supported_p (mode, count);
73 if (!int_mode_for_size (bits, limit_p).exists (&array_mode))
75 if (dump_enabled_p ())
76 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
77 "no array mode for %s["
78 HOST_WIDE_INT_PRINT_DEC "]\n",
79 GET_MODE_NAME (mode), count);
80 return false;
84 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
86 if (dump_enabled_p ())
87 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
88 "cannot use %s<%s><%s>\n", name,
89 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
90 return false;
93 if (dump_enabled_p ())
94 dump_printf_loc (MSG_NOTE, vect_location,
95 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
96 GET_MODE_NAME (mode));
98 return true;
102 /* Return the smallest scalar part of STMT_INFO.
103 This is used to determine the vectype of the stmt. We generally set the
104 vectype according to the type of the result (lhs). For stmts whose
105 result-type is different than the type of the arguments (e.g., demotion,
106 promotion), vectype will be reset appropriately (later). Note that we have
107 to visit the smallest datatype in this function, because that determines the
108 VF. If the smallest datatype in the loop is present only as the rhs of a
109 promotion operation, we'd miss it.
110 Such a case, where a variable of this datatype does not appear in the lhs
111 anywhere in the loop, can only occur if it's an invariant: e.g.:
112 'int_x = (int) short_inv', which we'd expect to have been optimized away by
113 invariant motion. However, we cannot rely on invariant motion to always
114 take invariants out of the loop, and so in the case of promotion we also
115 have to check the rhs.
116 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
117 types. */
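/* For example (illustrative only), for the widening statement

     int_x = (int) short_y;

   the lhs type has size 4 while the rhs type has size 2, so this
   function returns 'short int' and sets *LHS_SIZE_UNIT to 4 and
   *RHS_SIZE_UNIT to 2.  */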
119 tree
120 vect_get_smallest_scalar_type (stmt_vec_info stmt_info,
121 HOST_WIDE_INT *lhs_size_unit,
122 HOST_WIDE_INT *rhs_size_unit)
124 tree scalar_type = gimple_expr_type (stmt_info->stmt);
125 HOST_WIDE_INT lhs, rhs;
127 /* During the analysis phase, this function is called on arbitrary
128 statements that might not have scalar results. */
129 if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
130 return scalar_type;
132 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
134 gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
135 if (assign
136 && (gimple_assign_cast_p (assign)
137 || gimple_assign_rhs_code (assign) == DOT_PROD_EXPR
138 || gimple_assign_rhs_code (assign) == WIDEN_SUM_EXPR
139 || gimple_assign_rhs_code (assign) == WIDEN_MULT_EXPR
140 || gimple_assign_rhs_code (assign) == WIDEN_LSHIFT_EXPR
141 || gimple_assign_rhs_code (assign) == FLOAT_EXPR))
143 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (assign));
145 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
146 if (rhs < lhs)
147 scalar_type = rhs_type;
150 *lhs_size_unit = lhs;
151 *rhs_size_unit = rhs;
152 return scalar_type;
156 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
157 tested at run-time. Return TRUE if DDR was successfully inserted.
158 Return false if versioning is not supported. */
160 static bool
161 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
163 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
165 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
166 return false;
168 if (!runtime_alias_check_p (ddr, loop,
169 optimize_loop_nest_for_speed_p (loop)))
170 return false;
172 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
173 return true;
176 /* Record that loop LOOP_VINFO needs to check that VALUE is nonzero. */
178 static void
179 vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value)
181 vec<tree> checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo);
182 for (unsigned int i = 0; i < checks.length(); ++i)
183 if (checks[i] == value)
184 return;
186 if (dump_enabled_p ())
188 dump_printf_loc (MSG_NOTE, vect_location, "need run-time check that ");
189 dump_generic_expr (MSG_NOTE, TDF_SLIM, value);
190 dump_printf (MSG_NOTE, " is nonzero\n");
192 LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value);
195 /* Return true if we know that the order of vectorized STMTINFO_A and
196 vectorized STMTINFO_B will be the same as the order of STMTINFO_A and
197 STMTINFO_B. At least one of the statements is a write. */
199 static bool
200 vect_preserves_scalar_order_p (stmt_vec_info stmtinfo_a,
201 stmt_vec_info stmtinfo_b)
203 /* Single statements are always kept in their original order. */
204 if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
205 && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
206 return true;
208 /* STMT_A and STMT_B belong to overlapping groups. All loads in a
209 group are emitted at the position of the first scalar load and all
210 stores in a group are emitted at the position of the last scalar store.
211 Thus writes will happen no earlier than their current position
212 (but could happen later) while reads will happen no later than their
213 current position (but could happen earlier). Reordering is therefore
214 only possible if the first access is a write. */
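/* For example (hypothetical scalar sequence): in

     .. = a[i];   # load, member of a load group
     a[i] = ..;   # store, member of a store group

   the group load is emitted at the first scalar load and the group
   store is sunk to the last scalar store, so the read still happens
   before the write.  If the write came first instead, sinking it past
   the read could swap the two accesses.  */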
215 if (is_pattern_stmt_p (stmtinfo_a))
216 stmtinfo_a = STMT_VINFO_RELATED_STMT (stmtinfo_a);
217 if (is_pattern_stmt_p (stmtinfo_b))
218 stmtinfo_b = STMT_VINFO_RELATED_STMT (stmtinfo_b);
219 stmt_vec_info earlier_stmt_info = get_earlier_stmt (stmtinfo_a, stmtinfo_b);
220 return !DR_IS_WRITE (STMT_VINFO_DATA_REF (earlier_stmt_info));
223 /* A subroutine of vect_analyze_data_ref_dependence. Handle
224 DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
225 distances. These distances are conservatively correct but they don't
226 reflect a guaranteed dependence.
228 Return true if this function does all the work necessary to avoid
229 an alias or false if the caller should use the dependence distances
230 to limit the vectorization factor in the usual way. LOOP_DEPTH is
231 the depth of the loop described by LOOP_VINFO and the other arguments
232 are as for vect_analyze_data_ref_dependence. */
234 static bool
235 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
236 loop_vec_info loop_vinfo,
237 int loop_depth, unsigned int *max_vf)
239 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
240 lambda_vector dist_v;
241 unsigned int i;
242 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
244 int dist = dist_v[loop_depth];
245 if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
247 /* If the user asserted safelen >= DIST consecutive iterations
248 can be executed concurrently, assume independence.
250 ??? An alternative would be to add the alias check even
251 in this case, and vectorize the fallback loop with the
252 maximum VF set to safelen. However, if the user has
253 explicitly given a length, it's less likely that that
254 would be a win. */
255 if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
257 if ((unsigned int) loop->safelen < *max_vf)
258 *max_vf = loop->safelen;
259 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
260 continue;
263 /* For dependence distances of 2 or more, we have the option
264 of limiting VF or checking for an alias at runtime.
265 Prefer to check at runtime if we can, to avoid limiting
266 the VF unnecessarily when the bases are in fact independent.
268 Note that the alias checks will be removed if the VF ends up
269 being small enough. */
270 return (!STMT_VINFO_GATHER_SCATTER_P
271 (vinfo_for_stmt (DR_STMT (DDR_A (ddr))))
272 && !STMT_VINFO_GATHER_SCATTER_P
273 (vinfo_for_stmt (DR_STMT (DDR_B (ddr))))
274 && vect_mark_for_runtime_alias_test (ddr, loop_vinfo));
277 return true;
281 /* Function vect_analyze_data_ref_dependence.
283 Return TRUE if there (might) exist a dependence between a memory-reference
284 DRA and a memory-reference DRB. When versioning for alias can check the
285 dependence at run-time instead, return FALSE. Adjust *MAX_VF according to
286 the data dependence. */
288 static bool
289 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
290 loop_vec_info loop_vinfo,
291 unsigned int *max_vf)
293 unsigned int i;
294 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
295 struct data_reference *dra = DDR_A (ddr);
296 struct data_reference *drb = DDR_B (ddr);
297 stmt_vec_info stmtinfo_a = vect_dr_stmt (dra);
298 stmt_vec_info stmtinfo_b = vect_dr_stmt (drb);
299 lambda_vector dist_v;
300 unsigned int loop_depth;
302 /* In loop analysis all data references should be vectorizable. */
303 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
304 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
305 gcc_unreachable ();
307 /* Independent data accesses. */
308 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
309 return false;
311 if (dra == drb
312 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
313 return false;
315 /* We do not have to consider dependences between accesses that belong
316 to the same group, unless the stride could be smaller than the
317 group size. */
318 if (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
319 && (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
320 == DR_GROUP_FIRST_ELEMENT (stmtinfo_b))
321 && !STMT_VINFO_STRIDED_P (stmtinfo_a))
322 return false;
324 /* Even if we have an anti-dependence then, as the vectorized loop covers at
325 least two scalar iterations, there is always also a true dependence.
326 As the vectorizer does not re-order loads and stores we can ignore
327 the anti-dependence if TBAA can disambiguate both DRs similar to the
328 case with known negative distance anti-dependences (positive
329 distance anti-dependences would violate TBAA constraints). */
330 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
331 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
332 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
333 get_alias_set (DR_REF (drb))))
334 return false;
336 /* Unknown data dependence. */
337 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
339 /* If user asserted safelen consecutive iterations can be
340 executed concurrently, assume independence. */
341 if (loop->safelen >= 2)
343 if ((unsigned int) loop->safelen < *max_vf)
344 *max_vf = loop->safelen;
345 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
346 return false;
349 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
350 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
352 if (dump_enabled_p ())
354 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
355 "versioning for alias not supported for: "
356 "can't determine dependence between ");
357 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
358 DR_REF (dra));
359 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
360 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
361 DR_REF (drb));
362 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
364 return true;
367 if (dump_enabled_p ())
369 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
370 "versioning for alias required: "
371 "can't determine dependence between ");
372 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
373 DR_REF (dra));
374 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
375 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
376 DR_REF (drb));
377 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
380 /* Add to list of ddrs that need to be tested at run-time. */
381 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
384 /* Known data dependence. */
385 if (DDR_NUM_DIST_VECTS (ddr) == 0)
387 /* If user asserted safelen consecutive iterations can be
388 executed concurrently, assume independence. */
389 if (loop->safelen >= 2)
391 if ((unsigned int) loop->safelen < *max_vf)
392 *max_vf = loop->safelen;
393 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
394 return false;
397 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
398 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
400 if (dump_enabled_p ())
402 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
403 "versioning for alias not supported for: "
404 "bad dist vector for ");
405 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
406 DR_REF (dra));
407 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
408 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
409 DR_REF (drb));
410 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
412 return true;
415 if (dump_enabled_p ())
417 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
418 "versioning for alias required: "
419 "bad dist vector for ");
420 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
421 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
422 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
423 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
425 /* Add to list of ddrs that need to be tested at run-time. */
426 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
429 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
431 if (DDR_COULD_BE_INDEPENDENT_P (ddr)
432 && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
433 loop_depth, max_vf))
434 return false;
436 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
438 int dist = dist_v[loop_depth];
440 if (dump_enabled_p ())
441 dump_printf_loc (MSG_NOTE, vect_location,
442 "dependence distance = %d.\n", dist);
444 if (dist == 0)
446 if (dump_enabled_p ())
448 dump_printf_loc (MSG_NOTE, vect_location,
449 "dependence distance == 0 between ");
450 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
451 dump_printf (MSG_NOTE, " and ");
452 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
453 dump_printf (MSG_NOTE, "\n");
456 /* When we perform grouped accesses and perform implicit CSE
457 by detecting equal accesses and doing disambiguation with
458 runtime alias tests like for
459 .. = a[i];
460 .. = a[i+1];
461 a[i] = ..;
462 a[i+1] = ..;
463 *p = ..;
464 .. = a[i];
465 .. = a[i+1];
466 where we will end up loading { a[i], a[i+1] } once, make
467 sure that inserting group loads before the first load and
468 stores after the last store will do the right thing.
469 Similar for groups like
470 a[i] = ...;
471 ... = a[i];
472 a[i+1] = ...;
473 where loads from the group interleave with the store. */
474 if (!vect_preserves_scalar_order_p (stmtinfo_a, stmtinfo_b))
476 if (dump_enabled_p ())
477 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
478 "READ_WRITE dependence in interleaving.\n");
479 return true;
482 if (loop->safelen < 2)
484 tree indicator = dr_zero_step_indicator (dra);
485 if (!indicator || integer_zerop (indicator))
487 if (dump_enabled_p ())
488 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
489 "access also has a zero step\n");
490 return true;
492 else if (TREE_CODE (indicator) != INTEGER_CST)
493 vect_check_nonzero_value (loop_vinfo, indicator);
495 continue;
498 if (dist > 0 && DDR_REVERSED_P (ddr))
500 /* If DDR_REVERSED_P the order of the data-refs in DDR was
501 reversed (to make distance vector positive), and the actual
502 distance is negative. */
503 if (dump_enabled_p ())
504 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
505 "dependence distance negative.\n");
506 /* Record a negative dependence distance to later limit the
507 amount of stmt copying / unrolling we can perform.
508 Only need to handle read-after-write dependence. */
509 if (DR_IS_READ (drb)
510 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
511 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
512 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
513 continue;
516 unsigned int abs_dist = abs (dist);
517 if (abs_dist >= 2 && abs_dist < *max_vf)
519 /* The dependence distance requires reduction of the maximal
520 vectorization factor. */
521 *max_vf = abs (dist);
522 if (dump_enabled_p ())
523 dump_printf_loc (MSG_NOTE, vect_location,
524 "adjusting maximal vectorization factor to %i\n",
525 *max_vf);
528 if (abs_dist >= *max_vf)
530 /* Dependence distance does not create dependence, as far as
531 vectorization is concerned, in this case. */
532 if (dump_enabled_p ())
533 dump_printf_loc (MSG_NOTE, vect_location,
534 "dependence distance >= VF.\n");
535 continue;
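/* To illustrate the two distance cases above (hypothetical example):
   if the only dependence in the loop is a[i + 3] = a[i] + 1, the
   distance is 3, so a *MAX_VF of 8 is reduced to 3, whereas a *MAX_VF
   of 2 is already smaller than the distance and the dependence can be
   ignored for vectorization.  */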
538 if (dump_enabled_p ())
540 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
541 "not vectorized, possible dependence "
542 "between data-refs ");
543 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
544 dump_printf (MSG_NOTE, " and ");
545 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
546 dump_printf (MSG_NOTE, "\n");
549 return true;
552 return false;
555 /* Function vect_analyze_data_ref_dependences.
557 Examine all the data references in the loop, and make sure there do not
558 exist any data dependences between them. Set *MAX_VF according to
559 the maximum vectorization factor the data dependences allow. */
561 bool
562 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
563 unsigned int *max_vf)
565 unsigned int i;
566 struct data_dependence_relation *ddr;
568 DUMP_VECT_SCOPE ("vect_analyze_data_ref_dependences");
570 if (!LOOP_VINFO_DDRS (loop_vinfo).exists ())
572 LOOP_VINFO_DDRS (loop_vinfo)
573 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
574 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
575 /* We need read-read dependences to compute
576 STMT_VINFO_SAME_ALIGN_REFS. */
577 bool res = compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
578 &LOOP_VINFO_DDRS (loop_vinfo),
579 LOOP_VINFO_LOOP_NEST (loop_vinfo),
580 true);
581 gcc_assert (res);
584 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
586 /* For epilogues we either have no aliases or alias versioning
587 was applied to the original loop. Therefore we may just get max_vf
588 using the VF of the original loop. */
589 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
590 *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
591 else
592 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
593 if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
594 return false;
596 return true;
600 /* Function vect_slp_analyze_data_ref_dependence.
602 Return TRUE if there (might) exist a dependence between a memory-reference
603 DRA and a memory-reference DRB. Unlike the loop variant above, there is
604 no *MAX_VF to adjust and no run-time alias check to fall back on, so any
605 dependence that cannot be ruled out prevents vectorizing the SLP instance. */
607 static bool
608 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
610 struct data_reference *dra = DDR_A (ddr);
611 struct data_reference *drb = DDR_B (ddr);
613 /* We need to check dependences of statements marked as unvectorizable
614 as well; they can still prohibit vectorization.
616 /* Independent data accesses. */
617 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
618 return false;
620 if (dra == drb)
621 return false;
623 /* Read-read is OK. */
624 if (DR_IS_READ (dra) && DR_IS_READ (drb))
625 return false;
627 /* If dra and drb are part of the same interleaving chain consider
628 them independent. */
629 if (STMT_VINFO_GROUPED_ACCESS (vect_dr_stmt (dra))
630 && (DR_GROUP_FIRST_ELEMENT (vect_dr_stmt (dra))
631 == DR_GROUP_FIRST_ELEMENT (vect_dr_stmt (drb))))
632 return false;
634 /* Unknown data dependence. */
635 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
637 if (dump_enabled_p ())
639 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
640 "can't determine dependence between ");
641 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
642 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
643 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
644 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
647 else if (dump_enabled_p ())
649 dump_printf_loc (MSG_NOTE, vect_location,
650 "determined dependence between ");
651 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
652 dump_printf (MSG_NOTE, " and ");
653 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
654 dump_printf (MSG_NOTE, "\n");
657 return true;
661 /* Analyze dependences involved in the transform of SLP NODE. STORES
662 contain the vector of scalar stores of this instance if we are
663 disambiguating the loads. */
665 static bool
666 vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
667 vec<stmt_vec_info> stores,
668 stmt_vec_info last_store_info)
670 /* This walks over all stmts involved in the SLP load/store done
671 in NODE verifying we can sink them up to the last stmt in the
672 group. */
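/* E.g. (hypothetical scalar code) for an SLP store group

     a[0] = x;
     tem = *p;
     a[1] = y;

   the store a[0] = x has to be sunk past tem = *p down to the last
   store of the group, so any possible aliasing between *p and a[]
   must be ruled out here.  */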
673 stmt_vec_info last_access_info = vect_find_last_scalar_stmt_in_slp (node);
674 vec_info *vinfo = last_access_info->vinfo;
675 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
677 stmt_vec_info access_info = SLP_TREE_SCALAR_STMTS (node)[k];
678 if (access_info == last_access_info)
679 continue;
680 data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
681 ao_ref ref;
682 bool ref_initialized_p = false;
683 for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
684 gsi_stmt (gsi) != last_access_info->stmt; gsi_next (&gsi))
686 gimple *stmt = gsi_stmt (gsi);
687 if (! gimple_vuse (stmt)
688 || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
689 continue;
691 /* If we couldn't record a (single) data reference for this
692 stmt we have to resort to the alias oracle. */
693 stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
694 data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
695 if (!dr_b)
697 /* We are moving a store or sinking a load - this means
698 we cannot use TBAA for disambiguation. */
699 if (!ref_initialized_p)
700 ao_ref_init (&ref, DR_REF (dr_a));
701 if (stmt_may_clobber_ref_p_1 (stmt, &ref, false)
702 || ref_maybe_used_by_stmt_p (stmt, &ref, false))
703 return false;
704 continue;
707 bool dependent = false;
708 /* If we run into a store of this same instance (we've just
709 marked those) then delay dependence checking until we run
710 into the last store because this is where it will have
711 been sunk to (and we verify if we can do that as well). */
712 if (gimple_visited_p (stmt))
714 if (stmt_info != last_store_info)
715 continue;
716 unsigned i;
717 stmt_vec_info store_info;
718 FOR_EACH_VEC_ELT (stores, i, store_info)
720 data_reference *store_dr = STMT_VINFO_DATA_REF (store_info);
721 ddr_p ddr = initialize_data_dependence_relation
722 (dr_a, store_dr, vNULL);
723 dependent = vect_slp_analyze_data_ref_dependence (ddr);
724 free_dependence_relation (ddr);
725 if (dependent)
726 break;
729 else
731 ddr_p ddr = initialize_data_dependence_relation (dr_a,
732 dr_b, vNULL);
733 dependent = vect_slp_analyze_data_ref_dependence (ddr);
734 free_dependence_relation (ddr);
736 if (dependent)
737 return false;
740 return true;
744 /* Function vect_slp_analyze_instance_dependence.
746 Examine all the data references in the SLP instance, and make sure there
747 do not exist any data dependences between them that would prevent the
748 instance from being vectorized. */
750 bool
751 vect_slp_analyze_instance_dependence (slp_instance instance)
753 DUMP_VECT_SCOPE ("vect_slp_analyze_instance_dependence");
755 /* The stores of this instance are at the root of the SLP tree. */
756 slp_tree store = SLP_INSTANCE_TREE (instance);
757 if (! STMT_VINFO_DATA_REF (SLP_TREE_SCALAR_STMTS (store)[0]))
758 store = NULL;
760 /* Verify we can sink stores to the vectorized stmt insert location. */
761 stmt_vec_info last_store_info = NULL;
762 if (store)
764 if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
765 return false;
767 /* Mark stores in this instance and remember the last one. */
768 last_store_info = vect_find_last_scalar_stmt_in_slp (store);
769 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
770 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, true);
773 bool res = true;
775 /* Verify we can sink loads to the vectorized stmt insert location,
776 special-casing stores of this instance. */
777 slp_tree load;
778 unsigned int i;
779 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
780 if (! vect_slp_analyze_node_dependences (instance, load,
781 store
782 ? SLP_TREE_SCALAR_STMTS (store)
783 : vNULL, last_store_info))
785 res = false;
786 break;
789 /* Unset the visited flag. */
790 if (store)
791 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
792 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, false);
794 return res;
797 /* Record the base alignment guarantee given by DRB, which occurs
798 in STMT_INFO. */
800 static void
801 vect_record_base_alignment (stmt_vec_info stmt_info,
802 innermost_loop_behavior *drb)
804 vec_info *vinfo = stmt_info->vinfo;
805 bool existed;
806 innermost_loop_behavior *&entry
807 = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
808 if (!existed || entry->base_alignment < drb->base_alignment)
810 entry = drb;
811 if (dump_enabled_p ())
813 dump_printf_loc (MSG_NOTE, vect_location,
814 "recording new base alignment for ");
815 dump_generic_expr (MSG_NOTE, TDF_SLIM, drb->base_address);
816 dump_printf (MSG_NOTE, "\n");
817 dump_printf_loc (MSG_NOTE, vect_location,
818 " alignment: %d\n", drb->base_alignment);
819 dump_printf_loc (MSG_NOTE, vect_location,
820 " misalignment: %d\n", drb->base_misalignment);
821 dump_printf_loc (MSG_NOTE, vect_location,
822 " based on: ");
823 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt_info->stmt, 0);
828 /* If the region we're going to vectorize is reached, all unconditional
829 data references occur at least once. We can therefore pool the base
830 alignment guarantees from each unconditional reference. Do this by
831 going through all the data references in VINFO and checking whether
832 the containing statement makes the reference unconditionally. If so,
833 record the alignment of the base address in VINFO so that it can be
834 used for all other references with the same base. */
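/* For instance (illustrative only), in

     for (i = 0; i < n; i++)
       {
         x[i] = 0;         # unconditional: base alignment of X recorded
         if (c[i])
           x[i] += y[i];   # conditional: no guarantee recorded for it
       }

   the alignment derived from the unconditional store is recorded once
   and can then be reused by every other reference sharing X's base
   address.  */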
836 void
837 vect_record_base_alignments (vec_info *vinfo)
839 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
840 struct loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
841 data_reference *dr;
842 unsigned int i;
843 FOR_EACH_VEC_ELT (vinfo->shared->datarefs, i, dr)
845 stmt_vec_info stmt_info = vect_dr_stmt (dr);
846 if (!DR_IS_CONDITIONAL_IN_STMT (dr)
847 && STMT_VINFO_VECTORIZABLE (stmt_info)
848 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
850 vect_record_base_alignment (stmt_info, &DR_INNERMOST (dr));
852 /* If DR is nested in the loop that is being vectorized, we can also
853 record the alignment of the base wrt the outer loop. */
854 if (loop && nested_in_vect_loop_p (loop, stmt_info))
855 vect_record_base_alignment
856 (stmt_info, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
861 /* Return the target alignment for the vectorized form of DR. */
863 static unsigned int
864 vect_calculate_target_alignment (struct data_reference *dr)
866 stmt_vec_info stmt_info = vect_dr_stmt (dr);
867 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
868 return targetm.vectorize.preferred_vector_alignment (vectype);
871 /* Function vect_compute_data_ref_alignment
873 Compute the misalignment of the data reference DR.
875 Output:
876 1. DR_MISALIGNMENT (DR) is defined.
878 FOR NOW: No analysis is actually performed. Misalignment is calculated
879 only for trivial cases. TODO. */
881 static void
882 vect_compute_data_ref_alignment (struct data_reference *dr)
884 stmt_vec_info stmt_info = vect_dr_stmt (dr);
885 vec_base_alignments *base_alignments = &stmt_info->vinfo->base_alignments;
886 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
887 struct loop *loop = NULL;
888 tree ref = DR_REF (dr);
889 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
891 if (dump_enabled_p ())
892 dump_printf_loc (MSG_NOTE, vect_location,
893 "vect_compute_data_ref_alignment:\n");
895 if (loop_vinfo)
896 loop = LOOP_VINFO_LOOP (loop_vinfo);
898 /* Initialize misalignment to unknown. */
899 SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
901 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
902 return;
904 innermost_loop_behavior *drb = vect_dr_behavior (dr);
905 bool step_preserves_misalignment_p;
907 unsigned HOST_WIDE_INT vector_alignment
908 = vect_calculate_target_alignment (dr) / BITS_PER_UNIT;
909 DR_TARGET_ALIGNMENT (dr) = vector_alignment;
911 /* No step for BB vectorization. */
912 if (!loop)
914 gcc_assert (integer_zerop (drb->step));
915 step_preserves_misalignment_p = true;
918 /* In case the dataref is in an inner-loop of the loop that is being
919 vectorized (LOOP), we use the base and misalignment information
920 relative to the outer-loop (LOOP). This is ok only if the misalignment
921 stays the same throughout the execution of the inner-loop, which is why
922 we have to check that the stride of the dataref in the inner-loop is a
923 multiple of the vector alignment. */
924 else if (nested_in_vect_loop_p (loop, stmt_info))
926 step_preserves_misalignment_p
927 = (DR_STEP_ALIGNMENT (dr) % vector_alignment) == 0;
929 if (dump_enabled_p ())
931 if (step_preserves_misalignment_p)
932 dump_printf_loc (MSG_NOTE, vect_location,
933 "inner step divides the vector alignment.\n");
934 else
935 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
936 "inner step doesn't divide the vector"
937 " alignment.\n");
941 /* Similarly we can only use base and misalignment information relative to
942 an innermost loop if the misalignment stays the same throughout the
943 execution of the loop. As above, this is the case if the stride of the
944 dataref times the vectorization factor is a multiple of the alignment. */
945 else
947 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
948 step_preserves_misalignment_p
949 = multiple_p (DR_STEP_ALIGNMENT (dr) * vf, vector_alignment);
951 if (!step_preserves_misalignment_p && dump_enabled_p ())
952 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
953 "step doesn't divide the vector alignment.\n");
956 unsigned int base_alignment = drb->base_alignment;
957 unsigned int base_misalignment = drb->base_misalignment;
959 /* Calculate the maximum of the pooled base address alignment and the
960 alignment that we can compute for DR itself. */
961 innermost_loop_behavior **entry = base_alignments->get (drb->base_address);
962 if (entry && base_alignment < (*entry)->base_alignment)
964 base_alignment = (*entry)->base_alignment;
965 base_misalignment = (*entry)->base_misalignment;
968 if (drb->offset_alignment < vector_alignment
969 || !step_preserves_misalignment_p
970 /* We need to know whether the step wrt the vectorized loop is
971 negative when computing the starting misalignment below. */
972 || TREE_CODE (drb->step) != INTEGER_CST)
974 if (dump_enabled_p ())
976 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
977 "Unknown alignment for access: ");
978 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
979 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
981 return;
984 if (base_alignment < vector_alignment)
986 unsigned int max_alignment;
987 tree base = get_base_for_alignment (drb->base_address, &max_alignment);
988 if (max_alignment < vector_alignment
989 || !vect_can_force_dr_alignment_p (base,
990 vector_alignment * BITS_PER_UNIT))
992 if (dump_enabled_p ())
994 dump_printf_loc (MSG_NOTE, vect_location,
995 "can't force alignment of ref: ");
996 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
997 dump_printf (MSG_NOTE, "\n");
999 return;
1002 /* Force the alignment of the decl.
1003 NOTE: This is the only change to the code we make during
1004 the analysis phase, before deciding to vectorize the loop. */
1005 if (dump_enabled_p ())
1007 dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
1008 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
1009 dump_printf (MSG_NOTE, "\n");
1012 DR_VECT_AUX (dr)->base_decl = base;
1013 DR_VECT_AUX (dr)->base_misaligned = true;
1014 base_misalignment = 0;
1016 poly_int64 misalignment
1017 = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();
1019 /* If this is a backward running DR then the first access in the larger
1020 vectype actually is N-1 elements before the address in the DR.
1021 Adjust misalign accordingly. */
1022 if (tree_int_cst_sgn (drb->step) < 0)
1023 /* PLUS because STEP is negative. */
1024 misalignment += ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1025 * TREE_INT_CST_LOW (drb->step));
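/* A worked example of this adjustment (hypothetical numbers): for a
   V4SI access with a step of -4 bytes, the first vector access also
   covers the three elements before the address recorded in the DR,
   so (4 - 1) * -4 = -12 is added to the misalignment computed above.  */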
1027 unsigned int const_misalignment;
1028 if (!known_misalignment (misalignment, vector_alignment,
1029 &const_misalignment))
1031 if (dump_enabled_p ())
1033 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1034 "Non-constant misalignment for access: ");
1035 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
1036 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1038 return;
1041 SET_DR_MISALIGNMENT (dr, const_misalignment);
1043 if (dump_enabled_p ())
1045 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1046 "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
1047 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
1048 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1051 return;
1054 /* Function vect_update_misalignment_for_peel.
1055 Sets DR's misalignment
1056 - to 0 if it has the same alignment as DR_PEEL,
1057 - to the misalignment computed using NPEEL if DR's misalignment is known,
1058 - to -1 (unknown) otherwise.
1060 DR - the data reference whose misalignment is to be adjusted.
1061 DR_PEEL - the data reference whose misalignment is being made
1062 zero in the vector loop by the peel.
1063 NPEEL - the number of iterations in the peel loop if the misalignment
1064 of DR_PEEL is known at compile time. */
1066 static void
1067 vect_update_misalignment_for_peel (struct data_reference *dr,
1068 struct data_reference *dr_peel, int npeel)
1070 unsigned int i;
1071 vec<dr_p> same_aligned_drs;
1072 struct data_reference *current_dr;
1073 int dr_size = vect_get_scalar_dr_size (dr);
1074 int dr_peel_size = vect_get_scalar_dr_size (dr_peel);
1075 stmt_vec_info stmt_info = vect_dr_stmt (dr);
1076 stmt_vec_info peel_stmt_info = vect_dr_stmt (dr_peel);
1078 /* For interleaved data accesses the step in the loop must be multiplied by
1079 the size of the interleaving group. */
1080 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1081 dr_size *= DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info));
1082 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
1083 dr_peel_size *= DR_GROUP_SIZE (peel_stmt_info);
1085 /* It can be assumed that the data refs with the same alignment as dr_peel
1086 are aligned in the vector loop. */
1087 same_aligned_drs = STMT_VINFO_SAME_ALIGN_REFS (vect_dr_stmt (dr_peel));
1088 FOR_EACH_VEC_ELT (same_aligned_drs, i, current_dr)
1090 if (current_dr != dr)
1091 continue;
1092 gcc_assert (!known_alignment_for_access_p (dr)
1093 || !known_alignment_for_access_p (dr_peel)
1094 || (DR_MISALIGNMENT (dr) / dr_size
1095 == DR_MISALIGNMENT (dr_peel) / dr_peel_size));
1096 SET_DR_MISALIGNMENT (dr, 0);
1097 return;
1100 if (known_alignment_for_access_p (dr)
1101 && known_alignment_for_access_p (dr_peel))
1103 bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
1104 int misal = DR_MISALIGNMENT (dr);
1105 misal += negative ? -npeel * dr_size : npeel * dr_size;
1106 misal &= DR_TARGET_ALIGNMENT (dr) - 1;
1107 SET_DR_MISALIGNMENT (dr, misal);
1108 return;
1111 if (dump_enabled_p ())
1112 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1113 "to unknown (-1).\n");
1114 SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
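/* A worked example of the known-alignment case above (hypothetical
   numbers): with a target alignment of 16 bytes, an element size of 4,
   a current misalignment of 8 and NPEEL == 2 with a positive step, the
   new misalignment is (8 + 2 * 4) & 15 == 0, i.e. DR becomes aligned
   after the peel.  */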
1118 /* Function verify_data_ref_alignment
1120 Return TRUE if DR can be handled with respect to alignment. */
1122 static bool
1123 verify_data_ref_alignment (data_reference_p dr)
1125 enum dr_alignment_support supportable_dr_alignment
1126 = vect_supportable_dr_alignment (dr, false);
1127 if (!supportable_dr_alignment)
1129 if (dump_enabled_p ())
1131 if (DR_IS_READ (dr))
1132 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1133 "not vectorized: unsupported unaligned load.");
1134 else
1135 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1136 "not vectorized: unsupported unaligned "
1137 "store.");
1139 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
1140 DR_REF (dr));
1141 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1143 return false;
1146 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
1147 dump_printf_loc (MSG_NOTE, vect_location,
1148 "Vectorizing an unaligned access.\n");
1150 return true;
1153 /* Function vect_verify_datarefs_alignment
1155 Return TRUE if all data references in the loop can be
1156 handled with respect to alignment. */
1158 bool
1159 vect_verify_datarefs_alignment (loop_vec_info vinfo)
1161 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
1162 struct data_reference *dr;
1163 unsigned int i;
1165 FOR_EACH_VEC_ELT (datarefs, i, dr)
1167 stmt_vec_info stmt_info = vect_dr_stmt (dr);
1169 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1170 continue;
1172 /* For interleaving, only the alignment of the first access matters. */
1173 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1174 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1175 continue;
1177 /* Strided accesses perform only component accesses, alignment is
1178 irrelevant for them. */
1179 if (STMT_VINFO_STRIDED_P (stmt_info)
1180 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1181 continue;
1183 if (! verify_data_ref_alignment (dr))
1184 return false;
1187 return true;
1190 /* Given a memory reference EXP, return whether its alignment is less
1191 than its size. */
1193 static bool
1194 not_size_aligned (tree exp)
1196 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1197 return true;
1199 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1200 > get_object_alignment (exp));
1203 /* Function vector_alignment_reachable_p
1205 Return true if vector alignment for DR is reachable by peeling
1206 a few loop iterations. Return false otherwise. */
1208 static bool
1209 vector_alignment_reachable_p (struct data_reference *dr)
1211 stmt_vec_info stmt_info = vect_dr_stmt (dr);
1212 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1214 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1216 /* For interleaved accesses we peel only if the number of iterations in
1217 the prolog loop (VF - misalignment) is a multiple of the
1218 number of interleaved accesses. */
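/* E.g. (hypothetical numbers) with 8 elements per vector, a
   misalignment of 2 elements and a group size of 2, the prolog
   would run 8 - 2 = 6 scalar iterations, which is a multiple of
   the group size, so peeling remains possible.  */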
1219 int elem_size, mis_in_elements;
1221 /* FORNOW: handle only known alignment. */
1222 if (!known_alignment_for_access_p (dr))
1223 return false;
1225 poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1226 poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
1227 elem_size = vector_element_size (vector_size, nelements);
1228 mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
1230 if (!multiple_p (nelements - mis_in_elements, DR_GROUP_SIZE (stmt_info)))
1231 return false;
1234 /* If misalignment is known at compile time then allow peeling
1235 only if natural alignment is reachable through peeling. */
1236 if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
1238 HOST_WIDE_INT elmsize =
1239 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1240 if (dump_enabled_p ())
1242 dump_printf_loc (MSG_NOTE, vect_location,
1243 "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1244 dump_printf (MSG_NOTE,
1245 ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
1247 if (DR_MISALIGNMENT (dr) % elmsize)
1249 if (dump_enabled_p ())
1250 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1251 "data size does not divide the misalignment.\n");
1252 return false;
1256 if (!known_alignment_for_access_p (dr))
1258 tree type = TREE_TYPE (DR_REF (dr));
1259 bool is_packed = not_size_aligned (DR_REF (dr));
1260 if (dump_enabled_p ())
1261 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1262 "Unknown misalignment, %snaturally aligned\n",
1263 is_packed ? "not " : "");
1264 return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1267 return true;
1271 /* Calculate the cost of the memory access represented by DR. */
1273 static void
1274 vect_get_data_access_cost (struct data_reference *dr,
1275 unsigned int *inside_cost,
1276 unsigned int *outside_cost,
1277 stmt_vector_for_cost *body_cost_vec,
1278 stmt_vector_for_cost *prologue_cost_vec)
1280 stmt_vec_info stmt_info = vect_dr_stmt (dr);
1281 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1282 int ncopies;
1284 if (PURE_SLP_STMT (stmt_info))
1285 ncopies = 1;
1286 else
1287 ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1289 if (DR_IS_READ (dr))
1290 vect_get_load_cost (stmt_info, ncopies, true, inside_cost, outside_cost,
1291 prologue_cost_vec, body_cost_vec, false);
1292 else
1293 vect_get_store_cost (stmt_info, ncopies, inside_cost, body_cost_vec);
1295 if (dump_enabled_p ())
1296 dump_printf_loc (MSG_NOTE, vect_location,
1297 "vect_get_data_access_cost: inside_cost = %d, "
1298 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1302 typedef struct _vect_peel_info
1304 struct data_reference *dr;
1305 int npeel;
1306 unsigned int count;
1307 } *vect_peel_info;
1309 typedef struct _vect_peel_extended_info
1311 struct _vect_peel_info peel_info;
1312 unsigned int inside_cost;
1313 unsigned int outside_cost;
1314 } *vect_peel_extended_info;
1317 /* Peeling hashtable helpers. */
1319 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1321 static inline hashval_t hash (const _vect_peel_info *);
1322 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1325 inline hashval_t
1326 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1328 return (hashval_t) peel_info->npeel;
1331 inline bool
1332 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1334 return (a->npeel == b->npeel);
1338 /* Insert DR into peeling hash table with NPEEL as key. */
1340 static void
1341 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1342 loop_vec_info loop_vinfo, struct data_reference *dr,
1343 int npeel)
1345 struct _vect_peel_info elem, *slot;
1346 _vect_peel_info **new_slot;
1347 bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1349 elem.npeel = npeel;
1350 slot = peeling_htab->find (&elem);
1351 if (slot)
1352 slot->count++;
1353 else
1355 slot = XNEW (struct _vect_peel_info);
1356 slot->npeel = npeel;
1357 slot->dr = dr;
1358 slot->count = 1;
1359 new_slot = peeling_htab->find_slot (slot, INSERT);
1360 *new_slot = slot;
1363 if (!supportable_dr_alignment
1364 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1365 slot->count += VECT_MAX_COST;
1369 /* Traverse the peeling hash table to find the peeling option that aligns
1370 the maximum number of data accesses. */
1372 static int
1373 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1374 _vect_peel_extended_info *max)
1376 vect_peel_info elem = *slot;
1378 if (elem->count > max->peel_info.count
1379 || (elem->count == max->peel_info.count
1380 && max->peel_info.npeel > elem->npeel))
1382 max->peel_info.npeel = elem->npeel;
1383 max->peel_info.count = elem->count;
1384 max->peel_info.dr = elem->dr;
1387 return 1;
1390 /* Get the costs of peeling NPEEL iterations checking data access costs
1391 for all data refs. If UNKNOWN_MISALIGNMENT is true, we assume DR0's
1392 misalignment will be zero after peeling. */
1394 static void
1395 vect_get_peeling_costs_all_drs (vec<data_reference_p> datarefs,
1396 struct data_reference *dr0,
1397 unsigned int *inside_cost,
1398 unsigned int *outside_cost,
1399 stmt_vector_for_cost *body_cost_vec,
1400 stmt_vector_for_cost *prologue_cost_vec,
1401 unsigned int npeel,
1402 bool unknown_misalignment)
1404 unsigned i;
1405 data_reference *dr;
1407 FOR_EACH_VEC_ELT (datarefs, i, dr)
1409 stmt_vec_info stmt_info = vect_dr_stmt (dr);
1410 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1411 continue;
1413 /* For interleaving, only the alignment of the first access
1414 matters. */
1415 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1416 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1417 continue;
1419 /* Strided accesses perform only component accesses, alignment is
1420 irrelevant for them. */
1421 if (STMT_VINFO_STRIDED_P (stmt_info)
1422 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1423 continue;
1425 int save_misalignment;
1426 save_misalignment = DR_MISALIGNMENT (dr);
1427 if (npeel == 0)
1429 else if (unknown_misalignment && dr == dr0)
1430 SET_DR_MISALIGNMENT (dr, 0);
1431 else
1432 vect_update_misalignment_for_peel (dr, dr0, npeel);
1433 vect_get_data_access_cost (dr, inside_cost, outside_cost,
1434 body_cost_vec, prologue_cost_vec);
1435 SET_DR_MISALIGNMENT (dr, save_misalignment);
1439 /* Traverse the peeling hash table and calculate the cost for each peeling
1440 option. Find the one with the lowest cost. */
1442 static int
1443 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1444 _vect_peel_extended_info *min)
1446 vect_peel_info elem = *slot;
1447 int dummy;
1448 unsigned int inside_cost = 0, outside_cost = 0;
1449 stmt_vec_info stmt_info = vect_dr_stmt (elem->dr);
1450 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1451 stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1452 epilogue_cost_vec;
1454 prologue_cost_vec.create (2);
1455 body_cost_vec.create (2);
1456 epilogue_cost_vec.create (2);
1458 vect_get_peeling_costs_all_drs (LOOP_VINFO_DATAREFS (loop_vinfo),
1459 elem->dr, &inside_cost, &outside_cost,
1460 &body_cost_vec, &prologue_cost_vec,
1461 elem->npeel, false);
1463 body_cost_vec.release ();
1465 outside_cost += vect_get_known_peeling_cost
1466 (loop_vinfo, elem->npeel, &dummy,
1467 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1468 &prologue_cost_vec, &epilogue_cost_vec);
1470 /* Prologue and epilogue costs are added to the target model later.
1471 These costs depend only on the scalar iteration cost, the
1472 number of peeling iterations finally chosen, and the number of
1473 misaligned statements. So discard the information found here. */
1474 prologue_cost_vec.release ();
1475 epilogue_cost_vec.release ();
1477 if (inside_cost < min->inside_cost
1478 || (inside_cost == min->inside_cost
1479 && outside_cost < min->outside_cost))
1481 min->inside_cost = inside_cost;
1482 min->outside_cost = outside_cost;
1483 min->peel_info.dr = elem->dr;
1484 min->peel_info.npeel = elem->npeel;
1485 min->peel_info.count = elem->count;
1488 return 1;
1492 /* Choose best peeling option by traversing peeling hash table and either
1493 choosing an option with the lowest cost (if cost model is enabled) or the
1494 option that aligns as many accesses as possible. */
1496 static struct _vect_peel_extended_info
1497 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1498 loop_vec_info loop_vinfo)
1500 struct _vect_peel_extended_info res;
1502 res.peel_info.dr = NULL;
1504 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1506 res.inside_cost = INT_MAX;
1507 res.outside_cost = INT_MAX;
1508 peeling_htab->traverse <_vect_peel_extended_info *,
1509 vect_peeling_hash_get_lowest_cost> (&res);
1511 else
1513 res.peel_info.count = 0;
1514 peeling_htab->traverse <_vect_peel_extended_info *,
1515 vect_peeling_hash_get_most_frequent> (&res);
1516 res.inside_cost = 0;
1517 res.outside_cost = 0;
1520 return res;
1523 /* Return true if the new peeling NPEEL is supported. */
1525 static bool
1526 vect_peeling_supportable (loop_vec_info loop_vinfo, struct data_reference *dr0,
1527 unsigned npeel)
1529 unsigned i;
1530 struct data_reference *dr = NULL;
1531 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1532 enum dr_alignment_support supportable_dr_alignment;
1534 /* Ensure that all data refs can be vectorized after the peel. */
1535 FOR_EACH_VEC_ELT (datarefs, i, dr)
1537 int save_misalignment;
1539 if (dr == dr0)
1540 continue;
1542 stmt_vec_info stmt_info = vect_dr_stmt (dr);
1543 /* For interleaving, only the alignment of the first access
1544 matters. */
1545 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1546 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1547 continue;
1549 /* Strided accesses perform only component accesses, alignment is
1550 irrelevant for them. */
1551 if (STMT_VINFO_STRIDED_P (stmt_info)
1552 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1553 continue;
1555 save_misalignment = DR_MISALIGNMENT (dr);
1556 vect_update_misalignment_for_peel (dr, dr0, npeel);
1557 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1558 SET_DR_MISALIGNMENT (dr, save_misalignment);
1560 if (!supportable_dr_alignment)
1561 return false;
1564 return true;
1567 /* Function vect_enhance_data_refs_alignment
1569 This pass will use loop versioning and loop peeling in order to enhance
1570 the alignment of data references in the loop.
1572 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1573 original loop is to be vectorized. Any other loops that are created by
1574 the transformations performed in this pass are not supposed to be
1575 vectorized. This restriction will be relaxed.
1577 This pass will require a cost model to guide it whether to apply peeling
1578 or versioning or a combination of the two. For example, the scheme that
1579 Intel uses when given a loop with several memory accesses is as follows:
1580 choose one memory access ('p') whose alignment you want to force by doing
1581 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1582 other accesses are not necessarily aligned, or (2) use loop versioning to
1583 generate one loop in which all accesses are aligned, and another loop in
1584 which only 'p' is necessarily aligned.
1586 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1587 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1588 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1590 Devising a cost model is the most critical aspect of this work. It will
1591 guide us on which access to peel for, whether to use loop versioning, how
1592 many versions to create, etc. The cost model will probably consist of
1593 generic considerations as well as target specific considerations (on
1594 powerpc for example, misaligned stores are more painful than misaligned
1595 loads).
1597 Here are the general steps involved in alignment enhancements:
1599 -- original loop, before alignment analysis:
1600 for (i=0; i<N; i++){
1601 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1602 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1605 -- After vect_compute_data_refs_alignment:
1606 for (i=0; i<N; i++){
1607 x = q[i]; # DR_MISALIGNMENT(q) = 3
1608 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1611 -- Possibility 1: we do loop versioning:
1612 if (p is aligned) {
1613 for (i=0; i<N; i++){ # loop 1A
1614 x = q[i]; # DR_MISALIGNMENT(q) = 3
1615 p[i] = y; # DR_MISALIGNMENT(p) = 0
1618 else {
1619 for (i=0; i<N; i++){ # loop 1B
1620 x = q[i]; # DR_MISALIGNMENT(q) = 3
1621 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1625 -- Possibility 2: we do loop peeling:
1626 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1627 x = q[i];
1628 p[i] = y;
1630 for (i = 3; i < N; i++){ # loop 2A
1631 x = q[i]; # DR_MISALIGNMENT(q) = 0
1632 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1635 -- Possibility 3: combination of loop peeling and versioning:
1636 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1637 x = q[i];
1638 p[i] = y;
1640 if (p is aligned) {
1641 for (i = 3; i<N; i++){ # loop 3A
1642 x = q[i]; # DR_MISALIGNMENT(q) = 0
1643 p[i] = y; # DR_MISALIGNMENT(p) = 0
1646 else {
1647 for (i = 3; i<N; i++){ # loop 3B
1648 x = q[i]; # DR_MISALIGNMENT(q) = 0
1649 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1653 These loops are later passed to loop_transform to be vectorized. The
1654 vectorizer will use the alignment information to guide the transformation
1655 (whether to generate regular loads/stores, or with special handling for
1656 misalignment). */
1658 bool
1659 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1661 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1662 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1663 enum dr_alignment_support supportable_dr_alignment;
1664 struct data_reference *dr0 = NULL, *first_store = NULL;
1665 struct data_reference *dr;
1666 unsigned int i, j;
1667 bool do_peeling = false;
1668 bool do_versioning = false;
1669 bool stat;
1670 unsigned int npeel = 0;
1671 bool one_misalignment_known = false;
1672 bool one_misalignment_unknown = false;
1673 bool one_dr_unsupportable = false;
1674 struct data_reference *unsupportable_dr = NULL;
1675 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1676 unsigned possible_npeel_number = 1;
1677 tree vectype;
1678 unsigned int mis, same_align_drs_max = 0;
1679 hash_table<peel_info_hasher> peeling_htab (1);
1681 DUMP_VECT_SCOPE ("vect_enhance_data_refs_alignment");
1683 /* Reset data so we can safely be called multiple times. */
1684 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1685 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1687 /* While cost model enhancements are expected in the future, the high level
1688 view of the code at this time is as follows:
1690 A) If there is a misaligned access then see if peeling to align
1691 this access can make all data references satisfy
1692 vect_supportable_dr_alignment. If so, update data structures
1693 as needed and return true.
1695 B) If peeling wasn't possible and there is a data reference with an
1696 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1697 then see if loop versioning checks can be used to make all data
1698 references satisfy vect_supportable_dr_alignment. If so, update
1699 data structures as needed and return true.
1701 C) If neither peeling nor versioning were successful then return false if
1702 any data reference does not satisfy vect_supportable_dr_alignment.
1704 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1706 Note, Possibility 3 above (which is peeling and versioning together) is not
1707 being done at this time. */
1709 /* (1) Peeling to force alignment. */
1711 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1712 Considerations:
1713 + How many accesses will become aligned due to the peeling
1714 - How many accesses will become unaligned due to the peeling,
1715 and the cost of misaligned accesses.
1716 - The cost of peeling (the extra runtime checks, the increase
1717 in code size). */
1719 FOR_EACH_VEC_ELT (datarefs, i, dr)
1721 stmt_vec_info stmt_info = vect_dr_stmt (dr);
1723 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1724 continue;
1726 /* For interleaving, only the alignment of the first access
1727 matters. */
1728 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1729 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1730 continue;
1732 /* For scatter-gather or invariant accesses there is nothing
1733 to enhance. */
1734 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1735 || integer_zerop (DR_STEP (dr)))
1736 continue;
1738 /* Strided accesses perform only component accesses, alignment is
1739 irrelevant for them. */
1740 if (STMT_VINFO_STRIDED_P (stmt_info)
1741 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1742 continue;
1744 supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1745 do_peeling = vector_alignment_reachable_p (dr);
1746 if (do_peeling)
1748 if (known_alignment_for_access_p (dr))
1750 unsigned int npeel_tmp = 0;
1751 bool negative = tree_int_cst_compare (DR_STEP (dr),
1752 size_zero_node) < 0;
1754 vectype = STMT_VINFO_VECTYPE (stmt_info);
1755 unsigned int target_align = DR_TARGET_ALIGNMENT (dr);
1756 unsigned int dr_size = vect_get_scalar_dr_size (dr);
1757 mis = (negative ? DR_MISALIGNMENT (dr) : -DR_MISALIGNMENT (dr));
1758 if (DR_MISALIGNMENT (dr) != 0)
1759 npeel_tmp = (mis & (target_align - 1)) / dr_size;
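/* Illustrative sketch (not part of the original source): assume a
   hypothetical 4-byte access with DR_TARGET_ALIGNMENT of 16 bytes, a
   positive step and DR_MISALIGNMENT of 4.  Then mis wraps to -4,
   (-4 & 15) == 12, and npeel_tmp == 12 / 4 == 3: peeling three scalar
   iterations advances the address by 12 bytes and reaches the next
   16-byte boundary.  */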
1761 /* For multiple types, it is possible that the bigger type access
1762 will have more than one peeling option. E.g., a loop with two
1763 types: one of size (vector size / 4), and the other one of
1764 size (vector size / 8). Vectorization factor will be 8. If both
1765 accesses are misaligned by 3, the first one needs one scalar
1766 iteration to be aligned, and the second one needs 5. But the
1767 first one will be aligned also by peeling 5 scalar
1768 iterations, and in that case both accesses will be aligned.
1769 Hence, except for the immediate peeling amount, we also want
1770 to try to add full vector size, while we don't exceed
1771 vectorization factor.
1772 We do this automatically for cost model, since we calculate
1773 cost for every peeling option. */
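/* Worked illustration of the comment above (hypothetical numbers, not from
   the original source): with a 16-byte vector, a vectorization factor of 8
   and a four-element vector type, the 4-byte access misaligned by 12 bytes
   gets npeel_tmp == 1 and possible_npeel_number == 2, so candidates
   1 and 1 + 16/4 == 5 are recorded; the 2-byte access misaligned by 6 bytes
   gets npeel_tmp == 5 with possible_npeel_number == 1 and records only 5.
   The cost model can then discover that peeling 5 iterations aligns both
   accesses.  */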
1774 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1776 poly_uint64 nscalars = (STMT_SLP_TYPE (stmt_info)
1777 ? vf * DR_GROUP_SIZE (stmt_info) : vf);
1778 possible_npeel_number
1779 = vect_get_num_vectors (nscalars, vectype);
1781 /* NPEEL_TMP is 0 when there is no misalignment, but also
1782 allow peeling NELEMENTS. */
1783 if (DR_MISALIGNMENT (dr) == 0)
1784 possible_npeel_number++;
1787 /* Save info about DR in the hash table. Also include peeling
1788 amounts according to the explanation above. */
1789 for (j = 0; j < possible_npeel_number; j++)
1791 vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1792 dr, npeel_tmp);
1793 npeel_tmp += target_align / dr_size;
1796 one_misalignment_known = true;
1798 else
1800 /* If we don't know any misalignment values, we prefer
1801 peeling for the data-ref that has the maximum number of data-refs
1802 with the same alignment, unless the target prefers to align
1803 stores over loads. */
1804 unsigned same_align_drs
1805 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1806 if (!dr0
1807 || same_align_drs_max < same_align_drs)
1809 same_align_drs_max = same_align_drs;
1810 dr0 = dr;
1812 /* For data-refs with the same number of related
1813 accesses prefer the one where the misalign
1814 computation will be invariant in the outermost loop. */
1815 else if (same_align_drs_max == same_align_drs)
1817 struct loop *ivloop0, *ivloop;
1818 ivloop0 = outermost_invariant_loop_for_expr
1819 (loop, DR_BASE_ADDRESS (dr0));
1820 ivloop = outermost_invariant_loop_for_expr
1821 (loop, DR_BASE_ADDRESS (dr));
1822 if ((ivloop && !ivloop0)
1823 || (ivloop && ivloop0
1824 && flow_loop_nested_p (ivloop, ivloop0)))
1825 dr0 = dr;
1828 one_misalignment_unknown = true;
1830 /* Check for data refs with unsupportable alignment that
1831 can be peeled. */
1832 if (!supportable_dr_alignment)
1834 one_dr_unsupportable = true;
1835 unsupportable_dr = dr;
1838 if (!first_store && DR_IS_WRITE (dr))
1839 first_store = dr;
1842 else
1844 if (!aligned_access_p (dr))
1846 if (dump_enabled_p ())
1847 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1848 "vector alignment may not be reachable\n");
1849 break;
1854 /* Check if we can possibly peel the loop. */
1855 if (!vect_can_advance_ivs_p (loop_vinfo)
1856 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1857 || loop->inner)
1858 do_peeling = false;
1860 struct _vect_peel_extended_info peel_for_known_alignment;
1861 struct _vect_peel_extended_info peel_for_unknown_alignment;
1862 struct _vect_peel_extended_info best_peel;
1864 peel_for_unknown_alignment.inside_cost = INT_MAX;
1865 peel_for_unknown_alignment.outside_cost = INT_MAX;
1866 peel_for_unknown_alignment.peel_info.count = 0;
1868 if (do_peeling
1869 && one_misalignment_unknown)
1871 /* Check whether the target prefers stores over loads, i.e., whether
1872 misaligned stores are more expensive than misaligned loads (taking
1873 drs with same alignment into account). */
1874 unsigned int load_inside_cost = 0;
1875 unsigned int load_outside_cost = 0;
1876 unsigned int store_inside_cost = 0;
1877 unsigned int store_outside_cost = 0;
1878 unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;
1880 stmt_vector_for_cost dummy;
1881 dummy.create (2);
1882 vect_get_peeling_costs_all_drs (datarefs, dr0,
1883 &load_inside_cost,
1884 &load_outside_cost,
1885 &dummy, &dummy, estimated_npeels, true);
1886 dummy.release ();
1888 if (first_store)
1890 dummy.create (2);
1891 vect_get_peeling_costs_all_drs (datarefs, first_store,
1892 &store_inside_cost,
1893 &store_outside_cost,
1894 &dummy, &dummy,
1895 estimated_npeels, true);
1896 dummy.release ();
1898 else
1900 store_inside_cost = INT_MAX;
1901 store_outside_cost = INT_MAX;
1904 if (load_inside_cost > store_inside_cost
1905 || (load_inside_cost == store_inside_cost
1906 && load_outside_cost > store_outside_cost))
1908 dr0 = first_store;
1909 peel_for_unknown_alignment.inside_cost = store_inside_cost;
1910 peel_for_unknown_alignment.outside_cost = store_outside_cost;
1912 else
1914 peel_for_unknown_alignment.inside_cost = load_inside_cost;
1915 peel_for_unknown_alignment.outside_cost = load_outside_cost;
1918 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1919 prologue_cost_vec.create (2);
1920 epilogue_cost_vec.create (2);
1922 int dummy2;
1923 peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
1924 (loop_vinfo, estimated_npeels, &dummy2,
1925 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1926 &prologue_cost_vec, &epilogue_cost_vec);
1928 prologue_cost_vec.release ();
1929 epilogue_cost_vec.release ();
1931 peel_for_unknown_alignment.peel_info.count = 1
1932 + STMT_VINFO_SAME_ALIGN_REFS (vect_dr_stmt (dr0)).length ();
1935 peel_for_unknown_alignment.peel_info.npeel = 0;
1936 peel_for_unknown_alignment.peel_info.dr = dr0;
1938 best_peel = peel_for_unknown_alignment;
1940 peel_for_known_alignment.inside_cost = INT_MAX;
1941 peel_for_known_alignment.outside_cost = INT_MAX;
1942 peel_for_known_alignment.peel_info.count = 0;
1943 peel_for_known_alignment.peel_info.dr = NULL;
1945 if (do_peeling && one_misalignment_known)
1947 /* Peeling is possible, but no data access strictly requires alignment in
1948 order to be supported. So we try to choose the best possible peeling from
1949 the hash table. */
1950 peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
1951 (&peeling_htab, loop_vinfo);
1954 /* Compare costs of peeling for known and unknown alignment. */
1955 if (peel_for_known_alignment.peel_info.dr != NULL
1956 && peel_for_unknown_alignment.inside_cost
1957 >= peel_for_known_alignment.inside_cost)
1959 best_peel = peel_for_known_alignment;
1961 /* If the best peeling for known alignment has NPEEL == 0, perform no
1962 peeling at all except if there is an unsupportable dr that we can
1963 align. */
1964 if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
1965 do_peeling = false;
1968 /* If there is an unsupportable data ref, prefer this over all choices so far
1969 since we'd have to discard a chosen peeling except when it accidentally
1970 aligned the unsupportable data ref. */
1971 if (one_dr_unsupportable)
1972 dr0 = unsupportable_dr;
1973 else if (do_peeling)
1975 /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
1976 TODO: Use nopeel_outside_cost or get rid of it? */
1977 unsigned nopeel_inside_cost = 0;
1978 unsigned nopeel_outside_cost = 0;
1980 stmt_vector_for_cost dummy;
1981 dummy.create (2);
1982 vect_get_peeling_costs_all_drs (datarefs, NULL, &nopeel_inside_cost,
1983 &nopeel_outside_cost, &dummy, &dummy,
1984 0, false);
1985 dummy.release ();
1987 /* Add epilogue costs. As we do not peel for alignment here, no prologue
1988 costs will be recorded. */
1989 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1990 prologue_cost_vec.create (2);
1991 epilogue_cost_vec.create (2);
1993 int dummy2;
1994 nopeel_outside_cost += vect_get_known_peeling_cost
1995 (loop_vinfo, 0, &dummy2,
1996 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1997 &prologue_cost_vec, &epilogue_cost_vec);
1999 prologue_cost_vec.release ();
2000 epilogue_cost_vec.release ();
2002 npeel = best_peel.peel_info.npeel;
2003 dr0 = best_peel.peel_info.dr;
2005 /* If not peeling is no more expensive than the best peeling we
2006 have found so far, don't perform any peeling. */
2007 if (nopeel_inside_cost <= best_peel.inside_cost)
2008 do_peeling = false;
2011 if (do_peeling)
2013 stmt_vec_info stmt_info = vect_dr_stmt (dr0);
2014 vectype = STMT_VINFO_VECTYPE (stmt_info);
2016 if (known_alignment_for_access_p (dr0))
2018 bool negative = tree_int_cst_compare (DR_STEP (dr0),
2019 size_zero_node) < 0;
2020 if (!npeel)
2022 /* Since it's known at compile time, compute the number of
2023 iterations in the peeled loop (the peeling factor) for use in
2024 updating DR_MISALIGNMENT values. The peeling factor is the
2025 vectorization factor minus the misalignment as an element
2026 count. */
2027 mis = negative ? DR_MISALIGNMENT (dr0) : -DR_MISALIGNMENT (dr0);
2028 unsigned int target_align = DR_TARGET_ALIGNMENT (dr0);
2029 npeel = ((mis & (target_align - 1))
2030 / vect_get_scalar_dr_size (dr0));
2033 /* For interleaved data access every iteration accesses all the
2034 members of the group, therefore we divide the number of iterations
2035 by the group size. */
2036 stmt_info = vect_dr_stmt (dr0);
2037 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2038 npeel /= DR_GROUP_SIZE (stmt_info);
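/* Illustration (hypothetical numbers, not from the original source): if DR0
   has DR_MISALIGNMENT 8, a 4-byte element, a positive step and
   DR_TARGET_ALIGNMENT 16, then mis wraps to -8, (-8 & 15) == 8 and
   npeel == 8 / 4 == 2.  If DR0 heads an interleaving group of size 2, each
   scalar iteration covers both members, so npeel is halved to 1.  */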
2040 if (dump_enabled_p ())
2041 dump_printf_loc (MSG_NOTE, vect_location,
2042 "Try peeling by %d\n", npeel);
2045 /* Ensure that all datarefs can be vectorized after the peel. */
2046 if (!vect_peeling_supportable (loop_vinfo, dr0, npeel))
2047 do_peeling = false;
2049 /* Check if all datarefs are supportable and log. */
2050 if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
2052 stat = vect_verify_datarefs_alignment (loop_vinfo);
2053 if (!stat)
2054 do_peeling = false;
2055 else
2056 return stat;
2059 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
2060 if (do_peeling)
2062 unsigned max_allowed_peel
2063 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
2064 if (max_allowed_peel != (unsigned)-1)
2066 unsigned max_peel = npeel;
2067 if (max_peel == 0)
2069 unsigned int target_align = DR_TARGET_ALIGNMENT (dr0);
2070 max_peel = target_align / vect_get_scalar_dr_size (dr0) - 1;
2072 if (max_peel > max_allowed_peel)
2074 do_peeling = false;
2075 if (dump_enabled_p ())
2076 dump_printf_loc (MSG_NOTE, vect_location,
2077 "Disable peeling, max peels reached: %d\n", max_peel);
2082 /* Cost model #2 - if peeling may result in a remaining loop not
2083 iterating enough to be vectorized then do not peel. Since this
2084 is a cost heuristic rather than a correctness decision, use the
2085 most likely runtime value for variable vectorization factors. */
2086 if (do_peeling
2087 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2089 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2090 unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
2091 if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
2092 < assumed_vf + max_peel)
2093 do_peeling = false;
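/* For example (illustrative only): with an assumed vectorization factor of 4
   and npeel == 0, max_peel == 3, so a known trip count below 4 + 3 == 7
   iterations disables peeling here, while 7 or more keeps it.  */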
2096 if (do_peeling)
2098 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2099 If the misalignment of DR_i is identical to that of dr0 then set
2100 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
2101 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2102 by the peeling factor times the element size of DR_i (MOD the
2103 vectorization factor times the size). Otherwise, the
2104 misalignment of DR_i must be set to unknown. */
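/* Hypothetical illustration of the rule above (not from the original
   source), assuming a vectorization factor of 4 and 4-byte elements: with
   npeel == 2, a DR_i with the same misalignment as DR0 becomes 0, a DR_i
   with known misalignment 4 becomes (4 + 2 * 4) mod (4 * 4) == 12, and a
   DR_i whose misalignment is unknown stays unknown.  */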
2105 FOR_EACH_VEC_ELT (datarefs, i, dr)
2106 if (dr != dr0)
2108 /* Strided accesses perform only component accesses, alignment
2109 is irrelevant for them. */
2110 stmt_info = vect_dr_stmt (dr);
2111 if (STMT_VINFO_STRIDED_P (stmt_info)
2112 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2113 continue;
2115 vect_update_misalignment_for_peel (dr, dr0, npeel);
2118 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
2119 if (npeel)
2120 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2121 else
2122 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2123 = DR_MISALIGNMENT (dr0);
2124 SET_DR_MISALIGNMENT (dr0, 0);
2125 if (dump_enabled_p ())
2127 dump_printf_loc (MSG_NOTE, vect_location,
2128 "Alignment of access forced using peeling.\n");
2129 dump_printf_loc (MSG_NOTE, vect_location,
2130 "Peeling for alignment will be applied.\n");
2133 /* The inside-loop cost will be accounted for in vectorizable_load
2134 and vectorizable_store correctly with adjusted alignments.
2135 Drop the body_cst_vec on the floor here. */
2136 stat = vect_verify_datarefs_alignment (loop_vinfo);
2137 gcc_assert (stat);
2138 return stat;
2142 /* (2) Versioning to force alignment. */
2144 /* Try versioning if:
2145 1) optimize loop for speed
2146 2) there is at least one unsupported misaligned data ref with an unknown
2147 misalignment, and
2148 3) all misaligned data refs with a known misalignment are supported, and
2149 4) the number of runtime alignment checks is within reason. */
2151 do_versioning =
2152 optimize_loop_nest_for_speed_p (loop)
2153 && (!loop->inner); /* FORNOW */
2155 if (do_versioning)
2157 FOR_EACH_VEC_ELT (datarefs, i, dr)
2159 stmt_vec_info stmt_info = vect_dr_stmt (dr);
2161 /* For interleaving, only the alignment of the first access
2162 matters. */
2163 if (aligned_access_p (dr)
2164 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2165 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info))
2166 continue;
2168 if (STMT_VINFO_STRIDED_P (stmt_info))
2170 /* Strided loads perform only component accesses, alignment is
2171 irrelevant for them. */
2172 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
2173 continue;
2174 do_versioning = false;
2175 break;
2178 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
2180 if (!supportable_dr_alignment)
2182 int mask;
2183 tree vectype;
2185 if (known_alignment_for_access_p (dr)
2186 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2187 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
2189 do_versioning = false;
2190 break;
2193 stmt_info = vect_dr_stmt (dr);
2194 vectype = STMT_VINFO_VECTYPE (stmt_info);
2195 gcc_assert (vectype);
2197 /* At present we don't support versioning for alignment
2198 with variable VF, since there's no guarantee that the
2199 VF is a power of two. We could relax this if we added
2200 a way of enforcing a power-of-two size. */
2201 unsigned HOST_WIDE_INT size;
2202 if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
2204 do_versioning = false;
2205 break;
2208 /* The rightmost bits of an aligned address must be zeros.
2209 Construct the mask needed for this test. For example,
2210 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2211 mask must be 15 = 0xf. */
2212 mask = size - 1;
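/* Illustration (a sketch, not the code emitted here): for V4SI,
   mask == 0xf, and the runtime guard built later for the versioned loop is
   roughly

     if (((addr_1 | addr_2 | ...) & 0xf) == 0)
       ... run the vectorized loop ...
     else
       ... run the original scalar loop ...

   over the addresses of the statements recorded in
   LOOP_VINFO_MAY_MISALIGN_STMTS below.  */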
2214 /* FORNOW: use the same mask to test all potentially unaligned
2215 references in the loop. The vectorizer currently supports
2216 a single vector size, see the reference to
2217 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
2218 vectorization factor is computed. */
2219 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
2220 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
2221 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2222 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (stmt_info);
2226 /* Versioning requires at least one misaligned data reference. */
2227 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2228 do_versioning = false;
2229 else if (!do_versioning)
2230 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2233 if (do_versioning)
2235 vec<stmt_vec_info> may_misalign_stmts
2236 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2237 stmt_vec_info stmt_info;
2239 /* It can now be assumed that the data references in the statements
2240 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2241 of the loop being vectorized. */
2242 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
2244 dr = STMT_VINFO_DATA_REF (stmt_info);
2245 SET_DR_MISALIGNMENT (dr, 0);
2246 if (dump_enabled_p ())
2247 dump_printf_loc (MSG_NOTE, vect_location,
2248 "Alignment of access forced using versioning.\n");
2251 if (dump_enabled_p ())
2252 dump_printf_loc (MSG_NOTE, vect_location,
2253 "Versioning for alignment will be applied.\n");
2255 /* Peeling and versioning can't be done together at this time. */
2256 gcc_assert (! (do_peeling && do_versioning));
2258 stat = vect_verify_datarefs_alignment (loop_vinfo);
2259 gcc_assert (stat);
2260 return stat;
2263 /* This point is reached if neither peeling nor versioning is being done. */
2264 gcc_assert (! (do_peeling || do_versioning));
2266 stat = vect_verify_datarefs_alignment (loop_vinfo);
2267 return stat;
2271 /* Function vect_find_same_alignment_drs.
2273 Update group and alignment relations according to the chosen
2274 vectorization factor. */
2276 static void
2277 vect_find_same_alignment_drs (struct data_dependence_relation *ddr)
2279 struct data_reference *dra = DDR_A (ddr);
2280 struct data_reference *drb = DDR_B (ddr);
2281 stmt_vec_info stmtinfo_a = vect_dr_stmt (dra);
2282 stmt_vec_info stmtinfo_b = vect_dr_stmt (drb);
2284 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
2285 return;
2287 if (dra == drb)
2288 return;
2290 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
2291 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
2292 return;
2294 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)
2295 || !operand_equal_p (DR_OFFSET (dra), DR_OFFSET (drb), 0)
2296 || !operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2297 return;
2299 /* Two references with distance zero have the same alignment. */
2300 poly_offset_int diff = (wi::to_poly_offset (DR_INIT (dra))
2301 - wi::to_poly_offset (DR_INIT (drb)));
2302 if (maybe_ne (diff, 0))
2304 /* Get the wider of the two alignments. */
2305 unsigned int align_a = (vect_calculate_target_alignment (dra)
2306 / BITS_PER_UNIT);
2307 unsigned int align_b = (vect_calculate_target_alignment (drb)
2308 / BITS_PER_UNIT);
2309 unsigned int max_align = MAX (align_a, align_b);
2311 /* Require the gap to be a multiple of the larger vector alignment. */
2312 if (!multiple_p (diff, max_align))
2313 return;
2316 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2317 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
2318 if (dump_enabled_p ())
2320 dump_printf_loc (MSG_NOTE, vect_location,
2321 "accesses have the same alignment: ");
2322 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2323 dump_printf (MSG_NOTE, " and ");
2324 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2325 dump_printf (MSG_NOTE, "\n");
2330 /* Function vect_analyze_data_refs_alignment
2332 Analyze the alignment of the data-references in the loop.
2333 Return FALSE if a data reference is found that cannot be vectorized. */
2335 bool
2336 vect_analyze_data_refs_alignment (loop_vec_info vinfo)
2338 DUMP_VECT_SCOPE ("vect_analyze_data_refs_alignment");
2340 /* Mark groups of data references with same alignment using
2341 data dependence information. */
2342 vec<ddr_p> ddrs = vinfo->shared->ddrs;
2343 struct data_dependence_relation *ddr;
2344 unsigned int i;
2346 FOR_EACH_VEC_ELT (ddrs, i, ddr)
2347 vect_find_same_alignment_drs (ddr);
2349 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
2350 struct data_reference *dr;
2352 vect_record_base_alignments (vinfo);
2353 FOR_EACH_VEC_ELT (datarefs, i, dr)
2355 stmt_vec_info stmt_info = vect_dr_stmt (dr);
2356 if (STMT_VINFO_VECTORIZABLE (stmt_info))
2357 vect_compute_data_ref_alignment (dr);
2360 return true;
2364 /* Analyze alignment of DRs of stmts in NODE. */
2366 static bool
2367 vect_slp_analyze_and_verify_node_alignment (slp_tree node)
2369 /* We vectorize from the first scalar stmt in the node unless
2370 the node is permuted in which case we start from the first
2371 element in the group. */
2372 stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2373 data_reference_p first_dr = STMT_VINFO_DATA_REF (first_stmt_info);
2374 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2375 first_stmt_info = DR_GROUP_FIRST_ELEMENT (first_stmt_info);
2377 data_reference_p dr = STMT_VINFO_DATA_REF (first_stmt_info);
2378 vect_compute_data_ref_alignment (dr);
2379 /* For creating the data-ref pointer we need alignment of the
2380 first element anyway. */
2381 if (dr != first_dr)
2382 vect_compute_data_ref_alignment (first_dr);
2383 if (! verify_data_ref_alignment (dr))
2385 if (dump_enabled_p ())
2386 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2387 "not vectorized: bad data alignment in basic "
2388 "block.\n");
2389 return false;
2392 return true;
2395 /* Function vect_slp_analyze_and_verify_instance_alignment
2397 Analyze the alignment of the data-references in the SLP instance.
2398 Return FALSE if a data reference is found that cannot be vectorized. */
2400 bool
2401 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
2403 DUMP_VECT_SCOPE ("vect_slp_analyze_and_verify_instance_alignment");
2405 slp_tree node;
2406 unsigned i;
2407 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2408 if (! vect_slp_analyze_and_verify_node_alignment (node))
2409 return false;
2411 node = SLP_INSTANCE_TREE (instance);
2412 if (STMT_VINFO_DATA_REF (SLP_TREE_SCALAR_STMTS (node)[0])
2413 && ! vect_slp_analyze_and_verify_node_alignment
2414 (SLP_INSTANCE_TREE (instance)))
2415 return false;
2417 return true;
2421 /* Analyze groups of accesses: check that DR belongs to a group of
2422 accesses of legal size, step, etc. Detect gaps, single element
2423 interleaving, and other special cases. Set grouped access info.
2424 Collect groups of strided stores for further use in SLP analysis.
2425 Worker for vect_analyze_group_access. */
2427 static bool
2428 vect_analyze_group_access_1 (struct data_reference *dr)
2430 tree step = DR_STEP (dr);
2431 tree scalar_type = TREE_TYPE (DR_REF (dr));
2432 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2433 stmt_vec_info stmt_info = vect_dr_stmt (dr);
2434 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2435 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2436 HOST_WIDE_INT dr_step = -1;
2437 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2438 bool slp_impossible = false;
2440 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2441 size of the interleaving group (including gaps). */
2442 if (tree_fits_shwi_p (step))
2444 dr_step = tree_to_shwi (step);
2445 /* Check that STEP is a multiple of type size. Otherwise there is
2446 a non-element-sized gap at the end of the group which we
2447 cannot represent in DR_GROUP_GAP or DR_GROUP_SIZE.
2448 ??? As we can handle non-constant step fine here we should
2449 simply remove uses of DR_GROUP_GAP between the last and first
2450 element and instead rely on DR_STEP. DR_GROUP_SIZE then would
2451 simply not include that gap. */
2452 if ((dr_step % type_size) != 0)
2454 if (dump_enabled_p ())
2456 dump_printf_loc (MSG_NOTE, vect_location,
2457 "Step ");
2458 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2459 dump_printf (MSG_NOTE,
2460 " is not a multiple of the element size for ");
2461 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2462 dump_printf (MSG_NOTE, "\n");
2464 return false;
2466 groupsize = absu_hwi (dr_step) / type_size;
2468 else
2469 groupsize = 0;
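/* Example (illustrative only): for a load "x = a[4*i]" of 4-byte ints,
   DR_STEP is 16 and type_size is 4, so groupsize is 4.  If no other data-ref
   shares the group, the access is treated below as single element
   interleaving with DR_GROUP_GAP == 3.  */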
2471 /* A non-consecutive access is possible only if it is part of an interleaving group. */
2472 if (!DR_GROUP_FIRST_ELEMENT (stmt_info))
2474 /* Check whether this DR is part of an interleaving group of which it is
2475 the single element accessed in the loop. */
2477 /* Gaps are supported only for loads. STEP must be a multiple of the type
2478 size. */
2479 if (DR_IS_READ (dr)
2480 && (dr_step % type_size) == 0
2481 && groupsize > 0)
2483 DR_GROUP_FIRST_ELEMENT (stmt_info) = stmt_info;
2484 DR_GROUP_SIZE (stmt_info) = groupsize;
2485 DR_GROUP_GAP (stmt_info) = groupsize - 1;
2486 if (dump_enabled_p ())
2488 dump_printf_loc (MSG_NOTE, vect_location,
2489 "Detected single element interleaving ");
2490 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2491 dump_printf (MSG_NOTE, " step ");
2492 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2493 dump_printf (MSG_NOTE, "\n");
2496 return true;
2499 if (dump_enabled_p ())
2501 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2502 "not consecutive access ");
2503 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
2504 stmt_info->stmt, 0);
2507 if (bb_vinfo)
2509 /* Mark the statement as unvectorizable. */
2510 STMT_VINFO_VECTORIZABLE (vect_dr_stmt (dr)) = false;
2511 return true;
2514 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2515 STMT_VINFO_STRIDED_P (stmt_info) = true;
2516 return true;
2519 if (DR_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info)
2521 /* First stmt in the interleaving chain. Check the chain. */
2522 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2523 struct data_reference *data_ref = dr;
2524 unsigned int count = 1;
2525 tree prev_init = DR_INIT (data_ref);
2526 stmt_vec_info prev = stmt_info;
2527 HOST_WIDE_INT diff, gaps = 0;
2529 /* By construction, all group members have INTEGER_CST DR_INITs. */
2530 while (next)
2532 /* Skip identical data-refs. In case two or more stmts share a
2533 data-ref (supported only for loads), we vectorize only the first
2534 stmt, and the rest get their vectorized loads from the first
2535 one. */
2536 if (!tree_int_cst_compare (DR_INIT (data_ref),
2537 DR_INIT (STMT_VINFO_DATA_REF (next))))
2539 if (DR_IS_WRITE (data_ref))
2541 if (dump_enabled_p ())
2542 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2543 "Two store stmts share the same dr.\n");
2544 return false;
2547 if (dump_enabled_p ())
2548 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2549 "Two or more load stmts share the same dr.\n");
2551 /* For load use the same data-ref load. */
2552 DR_GROUP_SAME_DR_STMT (next) = prev;
2554 prev = next;
2555 next = DR_GROUP_NEXT_ELEMENT (next);
2556 continue;
2559 prev = next;
2560 data_ref = STMT_VINFO_DATA_REF (next);
2562 /* All group members have the same STEP by construction. */
2563 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2565 /* Check that the distance between two accesses is equal to the type
2566 size. Otherwise, we have gaps. */
2567 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2568 - TREE_INT_CST_LOW (prev_init)) / type_size;
2569 if (diff != 1)
2571 /* FORNOW: SLP of accesses with gaps is not supported. */
2572 slp_impossible = true;
2573 if (DR_IS_WRITE (data_ref))
2575 if (dump_enabled_p ())
2576 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2577 "interleaved store with gaps\n");
2578 return false;
2581 gaps += diff - 1;
2584 last_accessed_element += diff;
2586 /* Store the gap from the previous member of the group. If there is no
2587 gap in the access, DR_GROUP_GAP is always 1. */
2588 DR_GROUP_GAP (next) = diff;
2590 prev_init = DR_INIT (data_ref);
2591 next = DR_GROUP_NEXT_ELEMENT (next);
2592 /* Count the number of data-refs in the chain. */
2593 count++;
2596 if (groupsize == 0)
2597 groupsize = count + gaps;
2599 /* This could be UINT_MAX but as we are generating code in a very
2600 inefficient way we have to cap earlier. See PR78699 for example. */
2601 if (groupsize > 4096)
2603 if (dump_enabled_p ())
2604 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2605 "group is too large\n");
2606 return false;
2609 /* Check that the size of the interleaving is equal to count for stores,
2610 i.e., that there are no gaps. */
2611 if (groupsize != count
2612 && !DR_IS_READ (dr))
2614 if (dump_enabled_p ())
2615 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2616 "interleaved store with gaps\n");
2617 return false;
2620 /* If there is a gap after the last load in the group it is the
2621 difference between the groupsize and the last accessed
2622 element.
2623 When there is no gap, this difference should be 0. */
2624 DR_GROUP_GAP (stmt_info) = groupsize - last_accessed_element;
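/* Worked example (hypothetical, not from the original source): a group of
   4-byte loads at byte offsets 0, 4 and 12 with step 16 gives groupsize 4,
   DR_GROUP_GAP of 1 for the second member, 2 for the third (one element
   missing), last_accessed_element == 4 and therefore a trailing gap of
   4 - 4 == 0 stored on the first member.  */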
2626 DR_GROUP_SIZE (stmt_info) = groupsize;
2627 if (dump_enabled_p ())
2629 dump_printf_loc (MSG_NOTE, vect_location,
2630 "Detected interleaving ");
2631 if (DR_IS_READ (dr))
2632 dump_printf (MSG_NOTE, "load ");
2633 else
2634 dump_printf (MSG_NOTE, "store ");
2635 dump_printf (MSG_NOTE, "of size %u starting with ",
2636 (unsigned)groupsize);
2637 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt_info->stmt, 0);
2638 if (DR_GROUP_GAP (stmt_info) != 0)
2639 dump_printf_loc (MSG_NOTE, vect_location,
2640 "There is a gap of %u elements after the group\n",
2641 DR_GROUP_GAP (stmt_info));
2644 /* SLP: create an SLP data structure for every interleaving group of
2645 stores for further analysis in vect_analyze_slp. */
2646 if (DR_IS_WRITE (dr) && !slp_impossible)
2648 if (loop_vinfo)
2649 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt_info);
2650 if (bb_vinfo)
2651 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
2655 return true;
2658 /* Analyze groups of accesses: check that DR belongs to a group of
2659 accesses of legal size, step, etc. Detect gaps, single element
2660 interleaving, and other special cases. Set grouped access info.
2661 Collect groups of strided stores for further use in SLP analysis. */
2663 static bool
2664 vect_analyze_group_access (struct data_reference *dr)
2666 if (!vect_analyze_group_access_1 (dr))
2668 /* Dissolve the group if present. */
2669 stmt_vec_info stmt_info = DR_GROUP_FIRST_ELEMENT (vect_dr_stmt (dr));
2670 while (stmt_info)
2672 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2673 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2674 DR_GROUP_NEXT_ELEMENT (stmt_info) = NULL;
2675 stmt_info = next;
2677 return false;
2679 return true;
2682 /* Analyze the access pattern of the data-reference DR.
2683 In case of non-consecutive accesses call vect_analyze_group_access() to
2684 analyze groups of accesses. */
2686 static bool
2687 vect_analyze_data_ref_access (struct data_reference *dr)
2689 tree step = DR_STEP (dr);
2690 tree scalar_type = TREE_TYPE (DR_REF (dr));
2691 stmt_vec_info stmt_info = vect_dr_stmt (dr);
2692 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2693 struct loop *loop = NULL;
2695 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
2696 return true;
2698 if (loop_vinfo)
2699 loop = LOOP_VINFO_LOOP (loop_vinfo);
2701 if (loop_vinfo && !step)
2703 if (dump_enabled_p ())
2704 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2705 "bad data-ref access in loop\n");
2706 return false;
2709 /* Allow loads with zero step in inner-loop vectorization. */
2710 if (loop_vinfo && integer_zerop (step))
2712 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2713 if (!nested_in_vect_loop_p (loop, stmt_info))
2714 return DR_IS_READ (dr);
2715 /* Allow references with zero step for outer loops marked
2716 with pragma omp simd only - it guarantees absence of
2717 loop-carried dependencies between inner loop iterations. */
2718 if (loop->safelen < 2)
2720 if (dump_enabled_p ())
2721 dump_printf_loc (MSG_NOTE, vect_location,
2722 "zero step in inner loop of nest\n");
2723 return false;
2727 if (loop && nested_in_vect_loop_p (loop, stmt_info))
2729 /* Interleaved accesses are not yet supported within outer-loop
2730 vectorization for references in the inner-loop. */
2731 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2733 /* For the rest of the analysis we use the outer-loop step. */
2734 step = STMT_VINFO_DR_STEP (stmt_info);
2735 if (integer_zerop (step))
2737 if (dump_enabled_p ())
2738 dump_printf_loc (MSG_NOTE, vect_location,
2739 "zero step in outer loop.\n");
2740 return DR_IS_READ (dr);
2744 /* Consecutive? */
2745 if (TREE_CODE (step) == INTEGER_CST)
2747 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2748 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2749 || (dr_step < 0
2750 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2752 /* Mark that it is not interleaving. */
2753 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2754 return true;
2758 if (loop && nested_in_vect_loop_p (loop, stmt_info))
2760 if (dump_enabled_p ())
2761 dump_printf_loc (MSG_NOTE, vect_location,
2762 "grouped access in outer loop.\n");
2763 return false;
2767 /* Assume this is a DR handled by non-constant strided load case. */
2768 if (TREE_CODE (step) != INTEGER_CST)
2769 return (STMT_VINFO_STRIDED_P (stmt_info)
2770 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2771 || vect_analyze_group_access (dr)));
2773 /* Not a consecutive access - check whether it's part of an interleaving group. */
2774 return vect_analyze_group_access (dr);
2777 /* Compare two data-references DRA and DRB so that they can be sorted into
2778 chunks suitable for grouping. */
2780 static int
2781 dr_group_sort_cmp (const void *dra_, const void *drb_)
2783 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2784 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2785 int cmp;
2787 /* Stabilize sort. */
2788 if (dra == drb)
2789 return 0;
2791 /* DRs in different loops never belong to the same group. */
2792 loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father;
2793 loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father;
2794 if (loopa != loopb)
2795 return loopa->num < loopb->num ? -1 : 1;
2797 /* Ordering of DRs according to base. */
2798 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2799 DR_BASE_ADDRESS (drb));
2800 if (cmp != 0)
2801 return cmp;
2803 /* And according to DR_OFFSET. */
2804 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2805 if (cmp != 0)
2806 return cmp;
2808 /* Put reads before writes. */
2809 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2810 return DR_IS_READ (dra) ? -1 : 1;
2812 /* Then sort after access size. */
2813 cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2814 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2815 if (cmp != 0)
2816 return cmp;
2818 /* And after step. */
2819 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2820 if (cmp != 0)
2821 return cmp;
2823 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
2824 cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
2825 if (cmp == 0)
2826 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2827 return cmp;
2830 /* If OP is the result of a conversion, return the unconverted value,
2831 otherwise return null. */
2833 static tree
2834 strip_conversion (tree op)
2836 if (TREE_CODE (op) != SSA_NAME)
2837 return NULL_TREE;
2838 gimple *stmt = SSA_NAME_DEF_STMT (op);
2839 if (!is_gimple_assign (stmt)
2840 || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt)))
2841 return NULL_TREE;
2842 return gimple_assign_rhs1 (stmt);
2845 /* Return true if vectorizable_* routines can handle statements STMT1_INFO
2846 and STMT2_INFO being in a single group. */
2848 static bool
2849 can_group_stmts_p (stmt_vec_info stmt1_info, stmt_vec_info stmt2_info)
2851 if (gimple_assign_single_p (stmt1_info->stmt))
2852 return gimple_assign_single_p (stmt2_info->stmt);
2854 gcall *call1 = dyn_cast <gcall *> (stmt1_info->stmt);
2855 if (call1 && gimple_call_internal_p (call1))
2857 /* Check for two masked loads or two masked stores. */
2858 gcall *call2 = dyn_cast <gcall *> (stmt2_info->stmt);
2859 if (!call2 || !gimple_call_internal_p (call2))
2860 return false;
2861 internal_fn ifn = gimple_call_internal_fn (call1);
2862 if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE)
2863 return false;
2864 if (ifn != gimple_call_internal_fn (call2))
2865 return false;
2867 /* Check that the masks are the same. Cope with casts of masks,
2868 like those created by build_mask_conversion. */
2869 tree mask1 = gimple_call_arg (call1, 2);
2870 tree mask2 = gimple_call_arg (call2, 2);
2871 if (!operand_equal_p (mask1, mask2, 0))
2873 mask1 = strip_conversion (mask1);
2874 if (!mask1)
2875 return false;
2876 mask2 = strip_conversion (mask2);
2877 if (!mask2)
2878 return false;
2879 if (!operand_equal_p (mask1, mask2, 0))
2880 return false;
2882 return true;
2885 return false;
2888 /* Function vect_analyze_data_ref_accesses.
2890 Analyze the access pattern of all the data references in the loop.
2892 FORNOW: the only access pattern that is considered vectorizable is a
2893 simple step 1 (consecutive) access.
2895 FORNOW: handle only arrays and pointer accesses. */
2897 bool
2898 vect_analyze_data_ref_accesses (vec_info *vinfo)
2900 unsigned int i;
2901 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
2902 struct data_reference *dr;
2904 DUMP_VECT_SCOPE ("vect_analyze_data_ref_accesses");
2906 if (datarefs.is_empty ())
2907 return true;
2909 /* Sort the array of datarefs to make building the interleaving chains
2910 linear. Don't modify the original vector's order; it is needed for
2911 determining what dependencies are reversed. */
2912 vec<data_reference_p> datarefs_copy = datarefs.copy ();
2913 datarefs_copy.qsort (dr_group_sort_cmp);
2915 /* Build the interleaving chains. */
2916 for (i = 0; i < datarefs_copy.length () - 1;)
2918 data_reference_p dra = datarefs_copy[i];
2919 stmt_vec_info stmtinfo_a = vect_dr_stmt (dra);
2920 stmt_vec_info lastinfo = NULL;
2921 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
2922 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a))
2924 ++i;
2925 continue;
2927 for (i = i + 1; i < datarefs_copy.length (); ++i)
2929 data_reference_p drb = datarefs_copy[i];
2930 stmt_vec_info stmtinfo_b = vect_dr_stmt (drb);
2931 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b)
2932 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
2933 break;
2935 /* ??? Imperfect sorting (non-compatible types, non-modulo
2936 accesses, same accesses) can lead to a group being artificially
2937 split here as we don't just skip over those. If it really
2938 matters we can push those to a worklist and re-iterate
2939 over them. Then we can just skip ahead to the next DR here. */
2941 /* DRs in a different loop should not be put into the same
2942 interleaving group. */
2943 if (gimple_bb (DR_STMT (dra))->loop_father
2944 != gimple_bb (DR_STMT (drb))->loop_father)
2945 break;
2947 /* Check that the data-refs have the same first location (except init)
2948 and that they are both either stores or loads (not a load and a store,
2949 and not mixing masked and unmasked accesses). */
2950 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2951 || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2952 DR_BASE_ADDRESS (drb)) != 0
2953 || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
2954 || !can_group_stmts_p (stmtinfo_a, stmtinfo_b))
2955 break;
2957 /* Check that the data-refs have the same constant size. */
2958 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2959 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2960 if (!tree_fits_uhwi_p (sza)
2961 || !tree_fits_uhwi_p (szb)
2962 || !tree_int_cst_equal (sza, szb))
2963 break;
2965 /* Check that the data-refs have the same step. */
2966 if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
2967 break;
2969 /* Check the types are compatible.
2970 ??? We don't distinguish this during sorting. */
2971 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2972 TREE_TYPE (DR_REF (drb))))
2973 break;
2975 /* Check that the DR_INITs are compile-time constants. */
2976 if (TREE_CODE (DR_INIT (dra)) != INTEGER_CST
2977 || TREE_CODE (DR_INIT (drb)) != INTEGER_CST)
2978 break;
2980 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
2981 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2982 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2983 HOST_WIDE_INT init_prev
2984 = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]));
2985 gcc_assert (init_a <= init_b
2986 && init_a <= init_prev
2987 && init_prev <= init_b);
2989 /* Do not place the same access in the interleaving chain twice. */
2990 if (init_b == init_prev)
2992 gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]))
2993 < gimple_uid (DR_STMT (drb)));
2994 /* ??? For now we simply "drop" the later reference which is
2995 otherwise the same rather than finishing off this group.
2996 In the end we'd want to re-process duplicates forming
2997 multiple groups from the refs, likely by just collecting
2998 all candidates (including duplicates and split points
2999 below) in a vector and then process them together. */
3000 continue;
3003 /* If init_b == init_a + the size of the type * k, we have an
3004 interleaving, and DRA is accessed before DRB. */
3005 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
3006 if (type_size_a == 0
3007 || (init_b - init_a) % type_size_a != 0)
3008 break;
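/* For example (illustrative only): with 4-byte elements and init_a == 0,
   init_b == 8 is 2 elements further on and may join the group, whereas
   init_b == 6 is not a multiple of the element size and the chain is split
   here.  */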
3010 /* If we have a store, the accesses are adjacent. This splits
3011 groups into chunks we support (we don't support vectorization
3012 of stores with gaps). */
3013 if (!DR_IS_READ (dra) && init_b - init_prev != type_size_a)
3014 break;
3016 /* Requiring that the step (when constant and nonzero) is greater than
3017 the difference between the data-refs' inits splits groups into
3018 suitable sizes. */
3019 if (tree_fits_shwi_p (DR_STEP (dra)))
3021 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
3022 if (step != 0 && step <= (init_b - init_a))
3023 break;
3026 if (dump_enabled_p ())
3028 dump_printf_loc (MSG_NOTE, vect_location,
3029 "Detected interleaving ");
3030 if (DR_IS_READ (dra))
3031 dump_printf (MSG_NOTE, "load ");
3032 else
3033 dump_printf (MSG_NOTE, "store ");
3034 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
3035 dump_printf (MSG_NOTE, " and ");
3036 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
3037 dump_printf (MSG_NOTE, "\n");
3040 /* Link the found element into the group list. */
3041 if (!DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3043 DR_GROUP_FIRST_ELEMENT (stmtinfo_a) = stmtinfo_a;
3044 lastinfo = stmtinfo_a;
3046 DR_GROUP_FIRST_ELEMENT (stmtinfo_b) = stmtinfo_a;
3047 DR_GROUP_NEXT_ELEMENT (lastinfo) = stmtinfo_b;
3048 lastinfo = stmtinfo_b;
3052 FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
3053 if (STMT_VINFO_VECTORIZABLE (vect_dr_stmt (dr))
3054 && !vect_analyze_data_ref_access (dr))
3056 if (dump_enabled_p ())
3057 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3058 "not vectorized: complicated access pattern.\n");
3060 if (is_a <bb_vec_info> (vinfo))
3062 /* Mark the statement as not vectorizable. */
3063 STMT_VINFO_VECTORIZABLE (vect_dr_stmt (dr)) = false;
3064 continue;
3066 else
3068 datarefs_copy.release ();
3069 return false;
3073 datarefs_copy.release ();
3074 return true;
3077 /* Function vect_vfa_segment_size.
3079 Input:
3080 DR: The data reference.
3081 LENGTH_FACTOR: segment length to consider.
3083 Return a value suitable for the dr_with_seg_len::seg_len field.
3084 This is the "distance travelled" by the pointer from the first
3085 iteration in the segment to the last. Note that it does not include
3086 the size of the access; in effect it only describes the first byte. */
3088 static tree
3089 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
3091 length_factor = size_binop (MINUS_EXPR,
3092 fold_convert (sizetype, length_factor),
3093 size_one_node);
3094 return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr)),
3095 length_factor);
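/* For instance (illustrative only): with DR_STEP of 4 bytes and a length
   factor equal to a vectorization factor of 8, the segment length is
   (8 - 1) * 4 == 28 bytes; the size of the access itself is accounted for
   separately by vect_vfa_access_size.  */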
3098 /* Return a value that, when added to abs (vect_vfa_segment_size (dr)),
3099 gives the worst-case number of bytes covered by the segment. */
3101 static unsigned HOST_WIDE_INT
3102 vect_vfa_access_size (data_reference *dr)
3104 stmt_vec_info stmt_vinfo = vect_dr_stmt (dr);
3105 tree ref_type = TREE_TYPE (DR_REF (dr));
3106 unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type));
3107 unsigned HOST_WIDE_INT access_size = ref_size;
3108 if (DR_GROUP_FIRST_ELEMENT (stmt_vinfo))
3110 gcc_assert (DR_GROUP_FIRST_ELEMENT (stmt_vinfo) == vect_dr_stmt (dr));
3111 access_size *= DR_GROUP_SIZE (stmt_vinfo) - DR_GROUP_GAP (stmt_vinfo);
3113 if (STMT_VINFO_VEC_STMT (stmt_vinfo)
3114 && (vect_supportable_dr_alignment (dr, false)
3115 == dr_explicit_realign_optimized))
3117 /* We might access a full vector's worth. */
3118 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
3119 access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size;
3121 return access_size;
3124 /* Get the minimum alignment for all the scalar accesses that DR describes. */
3126 static unsigned int
3127 vect_vfa_align (const data_reference *dr)
3129 return TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr)));
3132 /* Function vect_compile_time_alias.
3134 Given data references A and B with equal base and offset, see whether
3135 the alias relation can be decided at compilation time. Return 1 if
3136 it can and the references alias, 0 if it can and the references do
3137 not alias, and -1 if we cannot decide at compile time. SEGMENT_LENGTH_A,
3138 SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
3139 of dr_with_seg_len::{seg_len,access_size} for A and B. */
3141 static int
3142 vect_compile_time_alias (struct data_reference *a, struct data_reference *b,
3143 tree segment_length_a, tree segment_length_b,
3144 unsigned HOST_WIDE_INT access_size_a,
3145 unsigned HOST_WIDE_INT access_size_b)
3147 poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a));
3148 poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b));
3149 poly_uint64 const_length_a;
3150 poly_uint64 const_length_b;
3152 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3153 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3154 [a, a+12) */
3155 if (tree_int_cst_compare (DR_STEP (a), size_zero_node) < 0)
3157 const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
3158 offset_a = (offset_a + access_size_a) - const_length_a;
3160 else
3161 const_length_a = tree_to_poly_uint64 (segment_length_a);
3162 if (tree_int_cst_compare (DR_STEP (b), size_zero_node) < 0)
3164 const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
3165 offset_b = (offset_b + access_size_b) - const_length_b;
3167 else
3168 const_length_b = tree_to_poly_uint64 (segment_length_b);
3170 const_length_a += access_size_a;
3171 const_length_b += access_size_b;
3173 if (ranges_known_overlap_p (offset_a, const_length_a,
3174 offset_b, const_length_b))
3175 return 1;
3177 if (!ranges_maybe_overlap_p (offset_a, const_length_a,
3178 offset_b, const_length_b))
3179 return 0;
3181 return -1;
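/* Hypothetical illustration (not from the original source): two data-refs
   with positive steps, segment lengths of 28 bytes and access sizes of 4
   bytes cover 32 bytes each.  With DR_INITs of 0 and 16 the ranges
   [0, 32) and [16, 48) are known to overlap, so 1 is returned; with
   DR_INITs of 0 and 64 the ranges cannot overlap and 0 is returned.  */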
3184 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3185 in DDR is >= VF. */
3187 static bool
3188 dependence_distance_ge_vf (data_dependence_relation *ddr,
3189 unsigned int loop_depth, poly_uint64 vf)
3191 if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3192 || DDR_NUM_DIST_VECTS (ddr) == 0)
3193 return false;
3195 /* If the dependence is exact, we should have limited the VF instead. */
3196 gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3198 unsigned int i;
3199 lambda_vector dist_v;
3200 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3202 HOST_WIDE_INT dist = dist_v[loop_depth];
3203 if (dist != 0
3204 && !(dist > 0 && DDR_REVERSED_P (ddr))
3205 && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
3206 return false;
3209 if (dump_enabled_p ())
3211 dump_printf_loc (MSG_NOTE, vect_location,
3212 "dependence distance between ");
3213 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
3214 dump_printf (MSG_NOTE, " and ");
3215 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
3216 dump_printf (MSG_NOTE, " is >= VF\n");
3219 return true;
3222 /* Dump LOWER_BOUND using flags DUMP_KIND. Dumps are known to be enabled. */
3224 static void
3225 dump_lower_bound (dump_flags_t dump_kind, const vec_lower_bound &lower_bound)
3227 dump_printf (dump_kind, "%s (", lower_bound.unsigned_p ? "unsigned" : "abs");
3228 dump_generic_expr (dump_kind, TDF_SLIM, lower_bound.expr);
3229 dump_printf (dump_kind, ") >= ");
3230 dump_dec (dump_kind, lower_bound.min_value);
3233 /* Record that the vectorized loop requires the vec_lower_bound described
3234 by EXPR, UNSIGNED_P and MIN_VALUE. */
3236 static void
3237 vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p,
3238 poly_uint64 min_value)
3240 vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3241 for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3242 if (operand_equal_p (lower_bounds[i].expr, expr, 0))
3244 unsigned_p &= lower_bounds[i].unsigned_p;
3245 min_value = upper_bound (lower_bounds[i].min_value, min_value);
3246 if (lower_bounds[i].unsigned_p != unsigned_p
3247 || maybe_lt (lower_bounds[i].min_value, min_value))
3249 lower_bounds[i].unsigned_p = unsigned_p;
3250 lower_bounds[i].min_value = min_value;
3251 if (dump_enabled_p ())
3253 dump_printf_loc (MSG_NOTE, vect_location,
3254 "updating run-time check to ");
3255 dump_lower_bound (MSG_NOTE, lower_bounds[i]);
3256 dump_printf (MSG_NOTE, "\n");
3259 return;
3262 vec_lower_bound lower_bound (expr, unsigned_p, min_value);
3263 if (dump_enabled_p ())
3265 dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that ");
3266 dump_lower_bound (MSG_NOTE, lower_bound);
3267 dump_printf (MSG_NOTE, "\n");
3269 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (lower_bound);
3272 /* Return true if it's unlikely that the step of the vectorized form of DR
3273 will span fewer than GAP bytes. */
3275 static bool
3276 vect_small_gap_p (loop_vec_info loop_vinfo, data_reference *dr, poly_int64 gap)
3278 stmt_vec_info stmt_info = vect_dr_stmt (dr);
3279 HOST_WIDE_INT count
3280 = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
3281 if (DR_GROUP_FIRST_ELEMENT (stmt_info))
3282 count *= DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info));
3283 return estimated_poly_value (gap) <= count * vect_get_scalar_dr_size (dr);
3286 /* Return true if we know that there is no alias between DR_A and DR_B
3287 when abs (DR_STEP (DR_A)) >= N for some N. When returning true, set
3288 *LOWER_BOUND_OUT to this N. */
3290 static bool
3291 vectorizable_with_step_bound_p (data_reference *dr_a, data_reference *dr_b,
3292 poly_uint64 *lower_bound_out)
3294 /* Check that there is a constant gap of known sign between DR_A
3295 and DR_B. */
3296 poly_int64 init_a, init_b;
3297 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0)
3298 || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0)
3299 || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)
3300 || !poly_int_tree_p (DR_INIT (dr_a), &init_a)
3301 || !poly_int_tree_p (DR_INIT (dr_b), &init_b)
3302 || !ordered_p (init_a, init_b))
3303 return false;
3305 /* Sort DR_A and DR_B by the address they access. */
3306 if (maybe_lt (init_b, init_a))
3308 std::swap (init_a, init_b);
3309 std::swap (dr_a, dr_b);
3312 /* If the two accesses could be dependent within a scalar iteration,
3313 make sure that we'd retain their order. */
3314 if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_a), init_b)
3315 && !vect_preserves_scalar_order_p (vect_dr_stmt (dr_a),
3316 vect_dr_stmt (dr_b)))
3317 return false;
3319 /* There is no alias if abs (DR_STEP) is greater than or equal to
3320 the bytes spanned by the combination of the two accesses. */
3321 *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_b) - init_a;
3322 return true;
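/* Example (illustrative only): for two 4-byte accesses with equal base,
   offset and step and DR_INITs of 0 and 12, *LOWER_BOUND_OUT becomes
   12 + 4 - 0 == 16, i.e. the accesses cannot alias whenever
   abs (DR_STEP) >= 16, which the caller can turn into a cheap runtime
   check on the step.  */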
3325 /* Function vect_prune_runtime_alias_test_list.
3327 Prune a list of ddrs to be tested at run-time by versioning for alias.
3328 Merge several alias checks into one if possible.
3329 Return FALSE if the resulting list of ddrs is longer than allowed by
3330 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
3332 bool
3333 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3335 typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3336 hash_set <tree_pair_hash> compared_objects;
3338 vec<ddr_p> may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3339 vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3340 = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3341 vec<vec_object_pair> &check_unequal_addrs
3342 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3343 poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3344 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3346 ddr_p ddr;
3347 unsigned int i;
3348 tree length_factor;
3350 DUMP_VECT_SCOPE ("vect_prune_runtime_alias_test_list");
3352 /* Step values are irrelevant for aliasing if the number of vector
3353 iterations is equal to the number of scalar iterations (which can
3354 happen for fully-SLP loops). */
3355 bool ignore_step_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U);
3357 if (!ignore_step_p)
3359 /* Convert the checks for nonzero steps into bound tests. */
3360 tree value;
3361 FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value)
3362 vect_check_lower_bound (loop_vinfo, value, true, 1);
3365 if (may_alias_ddrs.is_empty ())
3366 return true;
3368 comp_alias_ddrs.create (may_alias_ddrs.length ());
3370 unsigned int loop_depth
3371 = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3372 LOOP_VINFO_LOOP_NEST (loop_vinfo));
3374 /* First, we collect all data ref pairs for aliasing checks. */
3375 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3377 int comp_res;
3378 poly_uint64 lower_bound;
3379 struct data_reference *dr_a, *dr_b;
3380 tree segment_length_a, segment_length_b;
3381 unsigned HOST_WIDE_INT access_size_a, access_size_b;
3382 unsigned int align_a, align_b;
3384 /* Ignore the alias if the VF we chose ended up being no greater
3385 than the dependence distance. */
3386 if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3387 continue;
3389 if (DDR_OBJECT_A (ddr))
3391 vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3392 if (!compared_objects.add (new_pair))
3394 if (dump_enabled_p ())
3396 dump_printf_loc (MSG_NOTE, vect_location, "checking that ");
3397 dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.first);
3398 dump_printf (MSG_NOTE, " and ");
3399 dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.second);
3400 dump_printf (MSG_NOTE, " have different addresses\n");
3402 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3404 continue;
3407 dr_a = DDR_A (ddr);
3408 stmt_vec_info stmt_info_a = vect_dr_stmt (DDR_A (ddr));
3410 dr_b = DDR_B (ddr);
3411 stmt_vec_info stmt_info_b = vect_dr_stmt (DDR_B (ddr));
3413 /* Skip the pair if inter-iteration dependencies are irrelevant
3414 and intra-iteration dependencies are guaranteed to be honored. */
3415 if (ignore_step_p
3416 && (vect_preserves_scalar_order_p (stmt_info_a, stmt_info_b)
3417 || vectorizable_with_step_bound_p (dr_a, dr_b, &lower_bound)))
3419 if (dump_enabled_p ())
3421 dump_printf_loc (MSG_NOTE, vect_location,
3422 "no need for alias check between ");
3423 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
3424 dump_printf (MSG_NOTE, " and ");
3425 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
3426 dump_printf (MSG_NOTE, " when VF is 1\n");
3428 continue;
3431 /* See whether we can handle the alias using a bounds check on
3432 the step, and whether that's likely to be the best approach.
3433 (It might not be, for example, if the minimum step is much larger
3434 than the number of bytes handled by one vector iteration.) */
3435 if (!ignore_step_p
3436 && TREE_CODE (DR_STEP (dr_a)) != INTEGER_CST
3437 && vectorizable_with_step_bound_p (dr_a, dr_b, &lower_bound)
3438 && (vect_small_gap_p (loop_vinfo, dr_a, lower_bound)
3439 || vect_small_gap_p (loop_vinfo, dr_b, lower_bound)))
3441 bool unsigned_p = dr_known_forward_stride_p (dr_a);
3442 if (dump_enabled_p ())
3444 dump_printf_loc (MSG_NOTE, vect_location, "no alias between ");
3445 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
3446 dump_printf (MSG_NOTE, " and ");
3447 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
3448 dump_printf (MSG_NOTE, " when the step ");
3449 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_STEP (dr_a));
3450 dump_printf (MSG_NOTE, " is outside ");
3451 if (unsigned_p)
3452 dump_printf (MSG_NOTE, "[0");
3453 else
3455 dump_printf (MSG_NOTE, "(");
3456 dump_dec (MSG_NOTE, poly_int64 (-lower_bound));
3458 dump_printf (MSG_NOTE, ", ");
3459 dump_dec (MSG_NOTE, lower_bound);
3460 dump_printf (MSG_NOTE, ")\n");
3462 vect_check_lower_bound (loop_vinfo, DR_STEP (dr_a), unsigned_p,
3463 lower_bound);
3464 continue;
3467 stmt_vec_info dr_group_first_a = DR_GROUP_FIRST_ELEMENT (stmt_info_a);
3468 if (dr_group_first_a)
3470 stmt_info_a = dr_group_first_a;
3471 dr_a = STMT_VINFO_DATA_REF (stmt_info_a);
3474 stmt_vec_info dr_group_first_b = DR_GROUP_FIRST_ELEMENT (stmt_info_b);
3475 if (dr_group_first_b)
3477 stmt_info_b = dr_group_first_b;
3478 dr_b = STMT_VINFO_DATA_REF (stmt_info_b);
3481 if (ignore_step_p)
3483 segment_length_a = size_zero_node;
3484 segment_length_b = size_zero_node;
3486 else
3488 if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
3489 length_factor = scalar_loop_iters;
3490 else
3491 length_factor = size_int (vect_factor);
3492 segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
3493 segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
3495 access_size_a = vect_vfa_access_size (dr_a);
3496 access_size_b = vect_vfa_access_size (dr_b);
3497 align_a = vect_vfa_align (dr_a);
3498 align_b = vect_vfa_align (dr_b);
3500 comp_res = data_ref_compare_tree (DR_BASE_ADDRESS (dr_a),
3501 DR_BASE_ADDRESS (dr_b));
3502 if (comp_res == 0)
3503 comp_res = data_ref_compare_tree (DR_OFFSET (dr_a),
3504 DR_OFFSET (dr_b));
3506 /* See whether the alias is known at compilation time. */
3507 if (comp_res == 0
3508 && TREE_CODE (DR_STEP (dr_a)) == INTEGER_CST
3509 && TREE_CODE (DR_STEP (dr_b)) == INTEGER_CST
3510 && poly_int_tree_p (segment_length_a)
3511 && poly_int_tree_p (segment_length_b))
3513 int res = vect_compile_time_alias (dr_a, dr_b,
3514 segment_length_a,
3515 segment_length_b,
3516 access_size_a,
3517 access_size_b);
3518 if (res >= 0 && dump_enabled_p ())
3520 dump_printf_loc (MSG_NOTE, vect_location,
3521 "can tell at compile time that ");
3522 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
3523 dump_printf (MSG_NOTE, " and ");
3524 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
3525 if (res == 0)
3526 dump_printf (MSG_NOTE, " do not alias\n");
3527 else
3528 dump_printf (MSG_NOTE, " alias\n");
3531 if (res == 0)
3532 continue;
3534 if (res == 1)
3536 if (dump_enabled_p ())
3537 dump_printf_loc (MSG_NOTE, vect_location,
3538 "not vectorized: compilation time alias.\n");
3539 return false;
3543 dr_with_seg_len_pair_t dr_with_seg_len_pair
3544 (dr_with_seg_len (dr_a, segment_length_a, access_size_a, align_a),
3545 dr_with_seg_len (dr_b, segment_length_b, access_size_b, align_b));
3547 /* Canonicalize pairs by sorting the two DR members. */
3548 if (comp_res > 0)
3549 std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
3551 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3554 prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);
3556 unsigned int count = (comp_alias_ddrs.length ()
3557 + check_unequal_addrs.length ());
3559 dump_printf_loc (MSG_NOTE, vect_location,
3560 "improved number of alias checks from %d to %d\n",
3561 may_alias_ddrs.length (), count);
3562 if ((int) count > PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
3564 if (dump_enabled_p ())
3565 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3566 "number of versioning for alias "
3567 "run-time tests exceeds %d "
3568 "(--param vect-max-version-for-alias-checks)\n",
3569 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
3570 return false;
3573 return true;
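/* Illustrative sketch only (not code from this file): each surviving
   dr_with_seg_len pair (A, B) collected above is later expanded into a
   runtime test of roughly the form

     no_alias = addr_A + segment_length_A <= addr_B
                || addr_B + segment_length_B <= addr_A;

   and the loop is versioned on the conjunction of all such tests, with
   the recorded access sizes and alignments used to tighten the
   comparison.  */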
3576 /* Check whether we can use an internal function for a gather load
3577 or scatter store. READ_P is true for loads and false for stores.
3578 MASKED_P is true if the load or store is conditional. MEMORY_TYPE is
3579 the type of the memory elements being loaded or stored. OFFSET_BITS
3580 is the number of bits in each scalar offset and OFFSET_SIGN is the
3581 sign of the offset. SCALE is the amount by which the offset should
3582 be multiplied *after* it has been converted to address width.
3584 Return true if the function is supported, storing the function
3585 id in *IFN_OUT and the type of a vector element in *ELEMENT_TYPE_OUT. */
3587 bool
3588 vect_gather_scatter_fn_p (bool read_p, bool masked_p, tree vectype,
3589 tree memory_type, unsigned int offset_bits,
3590 signop offset_sign, int scale,
3591 internal_fn *ifn_out, tree *element_type_out)
3593 unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
3594 unsigned int element_bits = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (vectype)));
3595 if (offset_bits > element_bits)
3596 /* Internal functions require the offset to be the same width as
3597 the vector elements. We can extend narrower offsets, but it isn't
3598 safe to truncate wider offsets. */
3599 return false;
3601 if (element_bits != memory_bits)
3602 /* For now the vector elements must be the same width as the
3603 memory elements. */
3604 return false;
3606 /* Work out which function we need. */
3607 internal_fn ifn;
3608 if (read_p)
3609 ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
3610 else
3611 ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
3613 /* Test whether the target supports this combination. */
3614 if (!internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
3615 offset_sign, scale))
3616 return false;
3618 *ifn_out = ifn;
3619 *element_type_out = TREE_TYPE (vectype);
3620 return true;
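/* For illustration (an assumed example, names not taken from this file):
   a gather such as

     for (i = 0; i < n; i++)
       res[i] = data[idx[i]];

   with double DATA elements and 64-bit IDX elements would query
   IFN_GATHER_LOAD with MEMORY_TYPE double, OFFSET_BITS 64 and SCALE 8,
   i.e. each lane loads *(base + offset[i] * 8) after the offset has
   been extended to address width.  */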
3623 /* STMT_INFO is a call to an internal gather load or scatter store function.
3624 Describe the operation in INFO. */
3626 static void
3627 vect_describe_gather_scatter_call (stmt_vec_info stmt_info,
3628 gather_scatter_info *info)
3630 gcall *call = as_a <gcall *> (stmt_info->stmt);
3631 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3632 data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3634 info->ifn = gimple_call_internal_fn (call);
3635 info->decl = NULL_TREE;
3636 info->base = gimple_call_arg (call, 0);
3637 info->offset = gimple_call_arg (call, 1);
3638 info->offset_dt = vect_unknown_def_type;
3639 info->offset_vectype = NULL_TREE;
3640 info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
3641 info->element_type = TREE_TYPE (vectype);
3642 info->memory_type = TREE_TYPE (DR_REF (dr));
3645 /* Return true if a non-affine read or write in STMT_INFO is suitable for a
3646 gather load or scatter store. Describe the operation in *INFO if so. */
3648 bool
3649 vect_check_gather_scatter (stmt_vec_info stmt_info, loop_vec_info loop_vinfo,
3650 gather_scatter_info *info)
3652 HOST_WIDE_INT scale = 1;
3653 poly_int64 pbitpos, pbitsize;
3654 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3655 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3656 tree offtype = NULL_TREE;
3657 tree decl = NULL_TREE, base, off;
3658 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3659 tree memory_type = TREE_TYPE (DR_REF (dr));
3660 machine_mode pmode;
3661 int punsignedp, reversep, pvolatilep = 0;
3662 internal_fn ifn;
3663 tree element_type;
3664 bool masked_p = false;
3666 /* See whether this is already a call to a gather/scatter internal function.
3667 If not, see whether it's a masked load or store. */
3668 gcall *call = dyn_cast <gcall *> (stmt_info->stmt);
3669 if (call && gimple_call_internal_p (call))
3671 ifn = gimple_call_internal_fn (call);
3672 if (internal_gather_scatter_fn_p (ifn))
3674 vect_describe_gather_scatter_call (stmt_info, info);
3675 return true;
3677 masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE);
3680 /* True if we should aim to use internal functions rather than
3681 built-in functions. */
3682 bool use_ifn_p = (DR_IS_READ (dr)
3683 ? supports_vec_gather_load_p ()
3684 : supports_vec_scatter_store_p ());
3686 base = DR_REF (dr);
3687 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3688 see if we can use the def stmt of the address. */
3689 if (masked_p
3690 && TREE_CODE (base) == MEM_REF
3691 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3692 && integer_zerop (TREE_OPERAND (base, 1))
3693 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3695 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3696 if (is_gimple_assign (def_stmt)
3697 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3698 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3701 /* The gather and scatter builtins need address of the form
3702 loop_invariant + vector * {1, 2, 4, 8}
3703 or
3704 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3705 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3706 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3707 multiplications and additions in it. To get a vector, we need
3708 a single SSA_NAME that will be defined in the loop and will
3709 contain everything that is not loop invariant and that can be
3710 vectorized. The following code attempts to find such a preexisting
3711 SSA_NAME OFF and put the loop invariants into a tree BASE
3712 that can be gimplified before the loop. */
3713 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3714 &punsignedp, &reversep, &pvolatilep);
3715 if (reversep)
3716 return false;
3718 poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
3720 if (TREE_CODE (base) == MEM_REF)
3722 if (!integer_zerop (TREE_OPERAND (base, 1)))
3724 if (off == NULL_TREE)
3725 off = wide_int_to_tree (sizetype, mem_ref_offset (base));
3726 else
3727 off = size_binop (PLUS_EXPR, off,
3728 fold_convert (sizetype, TREE_OPERAND (base, 1)));
3730 base = TREE_OPERAND (base, 0);
3732 else
3733 base = build_fold_addr_expr (base);
3735 if (off == NULL_TREE)
3736 off = size_zero_node;
3738 /* If base is not loop invariant, then if off is 0 we start with just
3739 the constant offset in the loop invariant BASE and continue with base
3740 as OFF; otherwise give up.
3741 We could handle that case by gimplifying the addition of base + off
3742 into some SSA_NAME and using that as off, but for now punt. */
3743 if (!expr_invariant_in_loop_p (loop, base))
3745 if (!integer_zerop (off))
3746 return false;
3747 off = base;
3748 base = size_int (pbytepos);
3750 /* Otherwise put base + constant offset into the loop invariant BASE
3751 and continue with OFF. */
3752 else
3754 base = fold_convert (sizetype, base);
3755 base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
3758 /* OFF at this point may be either a SSA_NAME or some tree expression
3759 from get_inner_reference. Try to peel off loop invariants from it
3760 into BASE as long as possible. */
3761 STRIP_NOPS (off);
3762 while (offtype == NULL_TREE)
3764 enum tree_code code;
3765 tree op0, op1, add = NULL_TREE;
3767 if (TREE_CODE (off) == SSA_NAME)
3769 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3771 if (expr_invariant_in_loop_p (loop, off))
3772 return false;
3774 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3775 break;
3777 op0 = gimple_assign_rhs1 (def_stmt);
3778 code = gimple_assign_rhs_code (def_stmt);
3779 op1 = gimple_assign_rhs2 (def_stmt);
3781 else
3783 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3784 return false;
3785 code = TREE_CODE (off);
3786 extract_ops_from_tree (off, &code, &op0, &op1);
3788 switch (code)
3790 case POINTER_PLUS_EXPR:
3791 case PLUS_EXPR:
3792 if (expr_invariant_in_loop_p (loop, op0))
3794 add = op0;
3795 off = op1;
3796 do_add:
3797 add = fold_convert (sizetype, add);
3798 if (scale != 1)
3799 add = size_binop (MULT_EXPR, add, size_int (scale));
3800 base = size_binop (PLUS_EXPR, base, add);
3801 continue;
3803 if (expr_invariant_in_loop_p (loop, op1))
3805 add = op1;
3806 off = op0;
3807 goto do_add;
3809 break;
3810 case MINUS_EXPR:
3811 if (expr_invariant_in_loop_p (loop, op1))
3813 add = fold_convert (sizetype, op1);
3814 add = size_binop (MINUS_EXPR, size_zero_node, add);
3815 off = op0;
3816 goto do_add;
3818 break;
3819 case MULT_EXPR:
3820 if (scale == 1 && tree_fits_shwi_p (op1))
3822 int new_scale = tree_to_shwi (op1);
3823 /* Only treat this as a scaling operation if the target
3824 supports it. */
3825 if (use_ifn_p
3826 && !vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p,
3827 vectype, memory_type, 1,
3828 TYPE_SIGN (TREE_TYPE (op0)),
3829 new_scale, &ifn,
3830 &element_type))
3831 break;
3832 scale = new_scale;
3833 off = op0;
3834 continue;
3836 break;
3837 case SSA_NAME:
3838 off = op0;
3839 continue;
3840 CASE_CONVERT:
3841 if (!POINTER_TYPE_P (TREE_TYPE (op0))
3842 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3843 break;
3844 if (TYPE_PRECISION (TREE_TYPE (op0))
3845 == TYPE_PRECISION (TREE_TYPE (off)))
3847 off = op0;
3848 continue;
3851 /* The internal functions need the offset to be the same width
3852 as the elements of VECTYPE. Don't include operations that
3853 cast the offset from that width to a different width. */
3854 if (use_ifn_p
3855 && (int_size_in_bytes (TREE_TYPE (vectype))
3856 == int_size_in_bytes (TREE_TYPE (off))))
3857 break;
3859 if (TYPE_PRECISION (TREE_TYPE (op0))
3860 < TYPE_PRECISION (TREE_TYPE (off)))
3862 off = op0;
3863 offtype = TREE_TYPE (off);
3864 STRIP_NOPS (off);
3865 continue;
3867 break;
3868 default:
3869 break;
3871 break;
3874 /* If at the end OFF still isn't a SSA_NAME or isn't
3875 defined in the loop, punt. */
3876 if (TREE_CODE (off) != SSA_NAME
3877 || expr_invariant_in_loop_p (loop, off))
3878 return false;
3880 if (offtype == NULL_TREE)
3881 offtype = TREE_TYPE (off);
3883 if (use_ifn_p)
3885 if (!vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p, vectype,
3886 memory_type, TYPE_PRECISION (offtype),
3887 TYPE_SIGN (offtype), scale, &ifn,
3888 &element_type))
3889 return false;
3891 else
3893 if (DR_IS_READ (dr))
3895 if (targetm.vectorize.builtin_gather)
3896 decl = targetm.vectorize.builtin_gather (vectype, offtype, scale);
3898 else
3900 if (targetm.vectorize.builtin_scatter)
3901 decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale);
3904 if (!decl)
3905 return false;
3907 ifn = IFN_LAST;
3908 element_type = TREE_TYPE (vectype);
3911 info->ifn = ifn;
3912 info->decl = decl;
3913 info->base = base;
3914 info->offset = off;
3915 info->offset_dt = vect_unknown_def_type;
3916 info->offset_vectype = NULL_TREE;
3917 info->scale = scale;
3918 info->element_type = element_type;
3919 info->memory_type = memory_type;
3920 return true;
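/* Worked illustration with hypothetical names: for a read X[C + J]
   where X is a global array of doubles, C is loop invariant and J is
   an SSA_NAME defined in the loop, the peeling above typically leaves

     info->base   = (sizetype) &X + C * 8   (loop invariant)
     info->offset = J                       (varies in the loop)
     info->scale  = 8                       (element size from the MULT_EXPR)

   so that each lane accesses base + offset[i] * scale.  */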
3923 /* Find the data references in STMT, analyze them with respect to LOOP and
3924 append them to DATAREFS. Return false if datarefs in this stmt cannot
3925 be handled. */
3927 bool
3928 vect_find_stmt_data_reference (loop_p loop, gimple *stmt,
3929 vec<data_reference_p> *datarefs)
3931 /* We can ignore clobbers for dataref analysis - they are removed during
3932 loop vectorization and BB vectorization checks dependences with a
3933 stmt walk. */
3934 if (gimple_clobber_p (stmt))
3935 return true;
3937 if (gimple_has_volatile_ops (stmt))
3939 if (dump_enabled_p ())
3941 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3942 "not vectorized: volatile type ");
3943 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3945 return false;
3948 if (stmt_can_throw_internal (stmt))
3950 if (dump_enabled_p ())
3952 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3953 "not vectorized: statement can throw an "
3954 "exception ");
3955 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3957 return false;
3960 auto_vec<data_reference_p, 2> refs;
3961 if (!find_data_references_in_stmt (loop, stmt, &refs))
3962 return false;
3964 if (refs.is_empty ())
3965 return true;
3967 if (refs.length () > 1)
3969 if (dump_enabled_p ())
3971 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3972 "not vectorized: more than one data ref "
3973 "in stmt: ");
3974 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3976 return false;
3979 if (gcall *call = dyn_cast <gcall *> (stmt))
3980 if (!gimple_call_internal_p (call)
3981 || (gimple_call_internal_fn (call) != IFN_MASK_LOAD
3982 && gimple_call_internal_fn (call) != IFN_MASK_STORE))
3984 if (dump_enabled_p ())
3986 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3987 "not vectorized: dr in a call ");
3988 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3990 return false;
3993 data_reference_p dr = refs.pop ();
3994 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
3995 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
3997 if (dump_enabled_p ())
3999 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4000 "not vectorized: statement is bitfield "
4001 "access ");
4002 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4004 return false;
4007 if (DR_BASE_ADDRESS (dr)
4008 && TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
4010 if (dump_enabled_p ())
4011 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4012 "not vectorized: base addr of dr is a "
4013 "constant\n");
4014 return false;
4017 /* Check whether this may be a SIMD lane access and adjust the
4018 DR to make it easier for us to handle it. */
4019 if (loop
4020 && loop->simduid
4021 && (!DR_BASE_ADDRESS (dr)
4022 || !DR_OFFSET (dr)
4023 || !DR_INIT (dr)
4024 || !DR_STEP (dr)))
4026 struct data_reference *newdr
4027 = create_data_ref (NULL, loop_containing_stmt (stmt), DR_REF (dr), stmt,
4028 DR_IS_READ (dr), DR_IS_CONDITIONAL_IN_STMT (dr));
4029 if (DR_BASE_ADDRESS (newdr)
4030 && DR_OFFSET (newdr)
4031 && DR_INIT (newdr)
4032 && DR_STEP (newdr)
4033 && integer_zerop (DR_STEP (newdr)))
4035 tree off = DR_OFFSET (newdr);
4036 STRIP_NOPS (off);
4037 if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
4038 && TREE_CODE (off) == MULT_EXPR
4039 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
4041 tree step = TREE_OPERAND (off, 1);
4042 off = TREE_OPERAND (off, 0);
4043 STRIP_NOPS (off);
4044 if (CONVERT_EXPR_P (off)
4045 && (TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off, 0)))
4046 < TYPE_PRECISION (TREE_TYPE (off))))
4047 off = TREE_OPERAND (off, 0);
4048 if (TREE_CODE (off) == SSA_NAME)
4050 gimple *def = SSA_NAME_DEF_STMT (off);
4051 tree reft = TREE_TYPE (DR_REF (newdr));
4052 if (is_gimple_call (def)
4053 && gimple_call_internal_p (def)
4054 && (gimple_call_internal_fn (def) == IFN_GOMP_SIMD_LANE))
4056 tree arg = gimple_call_arg (def, 0);
4057 gcc_assert (TREE_CODE (arg) == SSA_NAME);
4058 arg = SSA_NAME_VAR (arg);
4059 if (arg == loop->simduid
4060 /* For now. */
4061 && tree_int_cst_equal (TYPE_SIZE_UNIT (reft), step))
4063 DR_OFFSET (newdr) = ssize_int (0);
4064 DR_STEP (newdr) = step;
4065 DR_OFFSET_ALIGNMENT (newdr) = BIGGEST_ALIGNMENT;
4066 DR_STEP_ALIGNMENT (newdr)
4067 = highest_pow2_factor (step);
4068 /* Mark as simd-lane access. */
4069 newdr->aux = (void *)-1;
4070 free_data_ref (dr);
4071 datarefs->safe_push (newdr);
4072 return true;
4078 free_data_ref (newdr);
4081 datarefs->safe_push (dr);
4082 return true;
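/* Note on the SIMD lane access case above: an offset of the form
   (conversion of) .GOMP_SIMD_LANE (simduid) * TYPE_SIZE_UNIT (ref)
   typically comes from variables privatized in an OpenMP simd loop,
   which are lowered to per-lane array accesses indexed by the lane
   number.  The extra checks (constant DR_INIT, zero step, multiplier
   equal to the element size) restrict this to the simple case, as the
   "For now." comment indicates.  */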
4085 /* Function vect_analyze_data_refs.
4087 Find all the data references in the loop or basic block.
4089 The general structure of the analysis of data refs in the vectorizer is as
4090 follows:
4091 1- vect_analyze_data_refs(loop/bb): call
4092 compute_data_dependences_for_loop/bb to find and analyze all data-refs
4093 in the loop/bb and their dependences.
4094 2- vect_analyze_dependences(): apply dependence testing using ddrs.
4095 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
4096 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
4100 bool
4101 vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf)
4103 struct loop *loop = NULL;
4104 unsigned int i;
4105 struct data_reference *dr;
4106 tree scalar_type;
4108 DUMP_VECT_SCOPE ("vect_analyze_data_refs");
4110 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4111 loop = LOOP_VINFO_LOOP (loop_vinfo);
4113 /* Go through the data-refs, check that the analysis succeeded. Update
4114 pointer from stmt_vec_info struct to DR and vectype. */
4116 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
4117 FOR_EACH_VEC_ELT (datarefs, i, dr)
4119 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
4120 poly_uint64 vf;
4122 gcc_assert (DR_REF (dr));
4123 stmt_vec_info stmt_info = vect_dr_stmt (dr);
4125 /* Check that analysis of the data-ref succeeded. */
4126 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
4127 || !DR_STEP (dr))
4129 bool maybe_gather
4130 = DR_IS_READ (dr)
4131 && !TREE_THIS_VOLATILE (DR_REF (dr))
4132 && (targetm.vectorize.builtin_gather != NULL
4133 || supports_vec_gather_load_p ());
4134 bool maybe_scatter
4135 = DR_IS_WRITE (dr)
4136 && !TREE_THIS_VOLATILE (DR_REF (dr))
4137 && (targetm.vectorize.builtin_scatter != NULL
4138 || supports_vec_scatter_store_p ());
4140 /* If the target supports vector gather loads or scatter stores,
4141 see whether they can be used. */
4142 if (is_a <loop_vec_info> (vinfo)
4143 && !nested_in_vect_loop_p (loop, stmt_info))
4145 if (maybe_gather || maybe_scatter)
4147 if (maybe_gather)
4148 gatherscatter = GATHER;
4149 else
4150 gatherscatter = SCATTER;
4154 if (gatherscatter == SG_NONE)
4156 if (dump_enabled_p ())
4158 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4159 "not vectorized: data ref analysis "
4160 "failed ");
4161 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
4162 stmt_info->stmt, 0);
4164 if (is_a <bb_vec_info> (vinfo))
4166 /* In BB vectorization the ref can still participate
4167 in dependence analysis; we just can't vectorize it. */
4168 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4169 continue;
4171 return false;
4175 /* See if this was detected as SIMD lane access. */
4176 if (dr->aux == (void *)-1)
4178 if (nested_in_vect_loop_p (loop, stmt_info))
4180 if (dump_enabled_p ())
4182 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4183 "not vectorized: data ref analysis "
4184 "failed ");
4185 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
4186 stmt_info->stmt, 0);
4188 return false;
4190 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
4193 tree base = get_base_address (DR_REF (dr));
4194 if (base && VAR_P (base) && DECL_NONALIASED (base))
4196 if (dump_enabled_p ())
4198 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4199 "not vectorized: base object not addressable "
4200 "for stmt: ");
4201 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
4202 stmt_info->stmt, 0);
4204 if (is_a <bb_vec_info> (vinfo))
4206 /* In BB vectorization the ref can still participate
4207 in dependence analysis; we just can't vectorize it. */
4208 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4209 continue;
4211 return false;
4214 if (is_a <loop_vec_info> (vinfo)
4215 && DR_STEP (dr)
4216 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
4218 if (nested_in_vect_loop_p (loop, stmt_info))
4220 if (dump_enabled_p ())
4222 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4223 "not vectorized: not suitable for strided "
4224 "load ");
4225 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
4226 stmt_info->stmt, 0);
4228 return false;
4230 STMT_VINFO_STRIDED_P (stmt_info) = true;
4233 /* Update DR field in stmt_vec_info struct. */
4235 /* If the dataref is in an inner-loop of the loop that is considered
4236 for vectorization, we also want to analyze the access relative to
4237 the outer-loop (DR contains information only relative to the
4238 inner-most enclosing loop). We do that by building a reference to the
4239 first location accessed by the inner-loop, and analyze it relative to
4240 the outer-loop. */
4241 if (loop && nested_in_vect_loop_p (loop, stmt_info))
4243 /* Build a reference to the first location accessed by the
4244 inner loop: *(BASE + INIT + OFFSET). By construction,
4245 this address must be invariant in the inner loop, so we
4246 can consider it as being used in the outer loop. */
4247 tree base = unshare_expr (DR_BASE_ADDRESS (dr));
4248 tree offset = unshare_expr (DR_OFFSET (dr));
4249 tree init = unshare_expr (DR_INIT (dr));
4250 tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
4251 init, offset);
4252 tree init_addr = fold_build_pointer_plus (base, init_offset);
4253 tree init_ref = build_fold_indirect_ref (init_addr);
4255 if (dump_enabled_p ())
4257 dump_printf_loc (MSG_NOTE, vect_location,
4258 "analyze in outer loop: ");
4259 dump_generic_expr (MSG_NOTE, TDF_SLIM, init_ref);
4260 dump_printf (MSG_NOTE, "\n");
4263 if (!dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
4264 init_ref, loop))
4265 /* dr_analyze_innermost already explained the failure. */
4266 return false;
4268 if (dump_enabled_p ())
4270 dump_printf_loc (MSG_NOTE, vect_location,
4271 "\touter base_address: ");
4272 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4273 STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
4274 dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
4275 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4276 STMT_VINFO_DR_OFFSET (stmt_info));
4277 dump_printf (MSG_NOTE,
4278 "\n\touter constant offset from base address: ");
4279 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4280 STMT_VINFO_DR_INIT (stmt_info));
4281 dump_printf (MSG_NOTE, "\n\touter step: ");
4282 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4283 STMT_VINFO_DR_STEP (stmt_info));
4284 dump_printf (MSG_NOTE, "\n\touter base alignment: %d\n",
4285 STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info));
4286 dump_printf (MSG_NOTE, "\n\touter base misalignment: %d\n",
4287 STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info));
4288 dump_printf (MSG_NOTE, "\n\touter offset alignment: %d\n",
4289 STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info));
4290 dump_printf (MSG_NOTE, "\n\touter step alignment: %d\n",
4291 STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
4295 gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
4296 STMT_VINFO_DATA_REF (stmt_info) = dr;
4298 /* Set vectype for STMT. */
4299 scalar_type = TREE_TYPE (DR_REF (dr));
4300 STMT_VINFO_VECTYPE (stmt_info)
4301 = get_vectype_for_scalar_type (scalar_type);
4302 if (!STMT_VINFO_VECTYPE (stmt_info))
4304 if (dump_enabled_p ())
4306 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4307 "not vectorized: no vectype for stmt: ");
4308 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
4309 stmt_info->stmt, 0);
4310 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
4311 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
4312 scalar_type);
4313 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
4316 if (is_a <bb_vec_info> (vinfo))
4318 /* No vector type is fine, the ref can still participate
4319 in dependence analysis; we just can't vectorize it. */
4320 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4321 continue;
4323 return false;
4325 else
4327 if (dump_enabled_p ())
4329 dump_printf_loc (MSG_NOTE, vect_location,
4330 "got vectype for stmt: ");
4331 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt_info->stmt, 0);
4332 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4333 STMT_VINFO_VECTYPE (stmt_info));
4334 dump_printf (MSG_NOTE, "\n");
4338 /* Adjust the minimal vectorization factor according to the
4339 vector type. */
4340 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
4341 *min_vf = upper_bound (*min_vf, vf);
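/* Illustrative numbers: with 128-bit vectors a char access yields
   vf == 16 and an int access vf == 4, so *MIN_VF becomes 16 here;
   the chosen vectorization factor must be at least the largest
   number of subparts among the vector types involved.  */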
4343 if (gatherscatter != SG_NONE)
4345 gather_scatter_info gs_info;
4346 if (!vect_check_gather_scatter (stmt_info,
4347 as_a <loop_vec_info> (vinfo),
4348 &gs_info)
4349 || !get_vectype_for_scalar_type (TREE_TYPE (gs_info.offset)))
4351 if (dump_enabled_p ())
4353 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4354 (gatherscatter == GATHER) ?
4355 "not vectorized: not suitable for gather "
4356 "load " :
4357 "not vectorized: not suitable for scatter "
4358 "store ");
4359 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
4360 stmt_info->stmt, 0);
4362 return false;
4364 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
4368 /* We used to stop processing and prune the list here. Verify we no
4369 longer need to. */
4370 gcc_assert (i == datarefs.length ());
4372 return true;
4376 /* Function vect_get_new_vect_var.
4378 Returns a name for a new variable. The current naming scheme uses a
4379 prefix chosen by VAR_KIND ("vect", "stmp", "mask" or "vectp") for
4380 vectorizer-generated variables, and prepends that prefix to NAME if
4381 provided. */
4383 tree
4384 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
4386 const char *prefix;
4387 tree new_vect_var;
4389 switch (var_kind)
4391 case vect_simple_var:
4392 prefix = "vect";
4393 break;
4394 case vect_scalar_var:
4395 prefix = "stmp";
4396 break;
4397 case vect_mask_var:
4398 prefix = "mask";
4399 break;
4400 case vect_pointer_var:
4401 prefix = "vectp";
4402 break;
4403 default:
4404 gcc_unreachable ();
4407 if (name)
4409 char* tmp = concat (prefix, "_", name, NULL);
4410 new_vect_var = create_tmp_reg (type, tmp);
4411 free (tmp);
4413 else
4414 new_vect_var = create_tmp_reg (type, prefix);
4416 return new_vect_var;
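/* For example, calling this with VAR_KIND == vect_pointer_var and
   NAME == "foo" (an illustrative name) requests a temporary based on
   "vectp_foo"; with a NULL NAME just the prefix is used.  */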
4419 /* Like vect_get_new_vect_var but return an SSA name. */
4421 tree
4422 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
4424 const char *prefix;
4425 tree new_vect_var;
4427 switch (var_kind)
4429 case vect_simple_var:
4430 prefix = "vect";
4431 break;
4432 case vect_scalar_var:
4433 prefix = "stmp";
4434 break;
4435 case vect_pointer_var:
4436 prefix = "vectp";
4437 break;
4438 default:
4439 gcc_unreachable ();
4442 if (name)
4444 char* tmp = concat (prefix, "_", name, NULL);
4445 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4446 free (tmp);
4448 else
4449 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4451 return new_vect_var;
4454 /* Duplicate ptr info and set alignment/misalignment on NAME from DR. */
4456 static void
4457 vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr)
4459 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr));
4460 int misalign = DR_MISALIGNMENT (dr);
4461 if (misalign == DR_MISALIGNMENT_UNKNOWN)
4462 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4463 else
4464 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name),
4465 DR_TARGET_ALIGNMENT (dr), misalign);
4468 /* Function vect_create_addr_base_for_vector_ref.
4470 Create an expression that computes the address of the first memory location
4471 that will be accessed for a data reference.
4473 Input:
4474 STMT_INFO: The statement containing the data reference.
4475 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4476 OFFSET: Optional. If supplied, it is added to the initial address.
4477 LOOP: Specify relative to which loop nest the address should be computed.
4478 For example, when the dataref is in an inner-loop nested in an
4479 outer-loop that is now being vectorized, LOOP can be either the
4480 outer-loop, or the inner-loop. The first memory location accessed
4481 by the following dataref ('in' points to short):
4483 for (i=0; i<N; i++)
4484 for (j=0; j<M; j++)
4485 s += in[i+j]
4487 is as follows:
4488 if LOOP=i_loop: &in (relative to i_loop)
4489 if LOOP=j_loop: &in+i*2B (relative to j_loop)
4490 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
4491 initial address. Unlike OFFSET, which is number of elements to
4492 be added, BYTE_OFFSET is measured in bytes.
4494 Output:
4495 1. Return an SSA_NAME whose value is the address of the memory location of
4496 the first vector of the data reference.
4497 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4498 these statement(s) which define the returned SSA_NAME.
4500 FORNOW: We are only handling array accesses with step 1. */
4502 tree
4503 vect_create_addr_base_for_vector_ref (stmt_vec_info stmt_info,
4504 gimple_seq *new_stmt_list,
4505 tree offset,
4506 tree byte_offset)
4508 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4509 const char *base_name;
4510 tree addr_base;
4511 tree dest;
4512 gimple_seq seq = NULL;
4513 tree vect_ptr_type;
4514 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
4515 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4516 innermost_loop_behavior *drb = vect_dr_behavior (dr);
4518 tree data_ref_base = unshare_expr (drb->base_address);
4519 tree base_offset = unshare_expr (drb->offset);
4520 tree init = unshare_expr (drb->init);
4522 if (loop_vinfo)
4523 base_name = get_name (data_ref_base);
4524 else
4526 base_offset = ssize_int (0);
4527 init = ssize_int (0);
4528 base_name = get_name (DR_REF (dr));
4531 /* Create base_offset */
4532 base_offset = size_binop (PLUS_EXPR,
4533 fold_convert (sizetype, base_offset),
4534 fold_convert (sizetype, init));
4536 if (offset)
4538 offset = fold_build2 (MULT_EXPR, sizetype,
4539 fold_convert (sizetype, offset), step);
4540 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4541 base_offset, offset);
4543 if (byte_offset)
4545 byte_offset = fold_convert (sizetype, byte_offset);
4546 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4547 base_offset, byte_offset);
4550 /* base + base_offset */
4551 if (loop_vinfo)
4552 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4553 else
4555 addr_base = build1 (ADDR_EXPR,
4556 build_pointer_type (TREE_TYPE (DR_REF (dr))),
4557 unshare_expr (DR_REF (dr)));
4560 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4561 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4562 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4563 gimple_seq_add_seq (new_stmt_list, seq);
4565 if (DR_PTR_INFO (dr)
4566 && TREE_CODE (addr_base) == SSA_NAME
4567 && !SSA_NAME_PTR_INFO (addr_base))
4569 vect_duplicate_ssa_name_ptr_info (addr_base, dr);
4570 if (offset || byte_offset)
4571 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4574 if (dump_enabled_p ())
4576 dump_printf_loc (MSG_NOTE, vect_location, "created ");
4577 dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
4578 dump_printf (MSG_NOTE, "\n");
4581 return addr_base;
4585 /* Function vect_create_data_ref_ptr.
4587 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4588 location accessed in the loop by STMT_INFO, along with the def-use update
4589 chain to appropriately advance the pointer through the loop iterations.
4590 Also set aliasing information for the pointer. This pointer is used by
4591 the callers to this function to create a memory reference expression for
4592 vector load/store access.
4594 Input:
4595 1. STMT_INFO: a stmt that references memory. Expected to be of the form
4596 GIMPLE_ASSIGN <name, data-ref> or
4597 GIMPLE_ASSIGN <data-ref, name>.
4598 2. AGGR_TYPE: the type of the reference, which should be either a vector
4599 or an array.
4600 3. AT_LOOP: the loop where the vector memref is to be created.
4601 4. OFFSET (optional): an offset to be added to the initial address accessed
4602 by the data-ref in STMT_INFO.
4603 5. BSI: location where the new stmts are to be placed if there is no loop
4604 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4605 pointing to the initial address.
4606 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4607 to the initial address accessed by the data-ref in STMT_INFO. This is
4608 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4609 in bytes.
4610 8. IV_STEP (optional, defaults to NULL): the amount that should be added
4611 to the IV during each iteration of the loop. NULL says to move
4612 by one copy of AGGR_TYPE up or down, depending on the step of the
4613 data reference.
4615 Output:
4616 1. Declare a new ptr to vector_type, and have it point to the base of the
4617 data reference (the initial address accessed by the data reference).
4618 For example, for vector of type V8HI, the following code is generated:
4620 v8hi *ap;
4621 ap = (v8hi *)initial_address;
4623 if OFFSET is not supplied:
4624 initial_address = &a[init];
4625 if OFFSET is supplied:
4626 initial_address = &a[init + OFFSET];
4627 if BYTE_OFFSET is supplied:
4628 initial_address = &a[init] + BYTE_OFFSET;
4630 Return the initial_address in INITIAL_ADDRESS.
4632 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4633 update the pointer in each iteration of the loop.
4635 Return the increment stmt that updates the pointer in PTR_INCR.
4637 3. Set INV_P to true if the access pattern of the data reference in the
4638 vectorized loop is invariant. Set it to false otherwise.
4640 4. Return the pointer. */
4642 tree
4643 vect_create_data_ref_ptr (stmt_vec_info stmt_info, tree aggr_type,
4644 struct loop *at_loop, tree offset,
4645 tree *initial_address, gimple_stmt_iterator *gsi,
4646 gimple **ptr_incr, bool only_init, bool *inv_p,
4647 tree byte_offset, tree iv_step)
4649 const char *base_name;
4650 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4651 struct loop *loop = NULL;
4652 bool nested_in_vect_loop = false;
4653 struct loop *containing_loop = NULL;
4654 tree aggr_ptr_type;
4655 tree aggr_ptr;
4656 tree new_temp;
4657 gimple_seq new_stmt_list = NULL;
4658 edge pe = NULL;
4659 basic_block new_bb;
4660 tree aggr_ptr_init;
4661 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4662 tree aptr;
4663 gimple_stmt_iterator incr_gsi;
4664 bool insert_after;
4665 tree indx_before_incr, indx_after_incr;
4666 gimple *incr;
4667 tree step;
4668 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4670 gcc_assert (iv_step != NULL_TREE
4671 || TREE_CODE (aggr_type) == ARRAY_TYPE
4672 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4674 if (loop_vinfo)
4676 loop = LOOP_VINFO_LOOP (loop_vinfo);
4677 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
4678 containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
4679 pe = loop_preheader_edge (loop);
4681 else
4683 gcc_assert (bb_vinfo);
4684 only_init = true;
4685 *ptr_incr = NULL;
4688 /* Check the step (evolution) of the load in LOOP, and record
4689 whether it's invariant. */
4690 step = vect_dr_behavior (dr)->step;
4691 if (integer_zerop (step))
4692 *inv_p = true;
4693 else
4694 *inv_p = false;
4696 /* Create an expression for the first address accessed by this load
4697 in LOOP. */
4698 base_name = get_name (DR_BASE_ADDRESS (dr));
4700 if (dump_enabled_p ())
4702 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4703 dump_printf_loc (MSG_NOTE, vect_location,
4704 "create %s-pointer variable to type: ",
4705 get_tree_code_name (TREE_CODE (aggr_type)));
4706 dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4707 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4708 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4709 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4710 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4711 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4712 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4713 else
4714 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4715 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4716 dump_printf (MSG_NOTE, "\n");
4719 /* (1) Create the new aggregate-pointer variable.
4720 Vector and array types inherit the alias set of their component
4721 type by default so we need to use a ref-all pointer if the data
4722 reference does not conflict with the created aggregated data
4723 reference because it is not addressable. */
4724 bool need_ref_all = false;
4725 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4726 get_alias_set (DR_REF (dr))))
4727 need_ref_all = true;
4728 /* Likewise for any of the data references in the stmt group. */
4729 else if (DR_GROUP_SIZE (stmt_info) > 1)
4731 stmt_vec_info sinfo = DR_GROUP_FIRST_ELEMENT (stmt_info);
4734 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4735 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4736 get_alias_set (DR_REF (sdr))))
4738 need_ref_all = true;
4739 break;
4741 sinfo = DR_GROUP_NEXT_ELEMENT (sinfo);
4743 while (sinfo);
4745 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4746 need_ref_all);
4747 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4750 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4751 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4752 def-use update cycles for the pointer: one relative to the outer-loop
4753 (LOOP), which is what steps (3) and (4) below do. The other is relative
4754 to the inner-loop (which is the inner-most loop containing the dataref),
4755 and this is done by step (5) below.
4757 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4758 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4759 redundant. Steps (3),(4) create the following:
4761 vp0 = &base_addr;
4762 LOOP: vp1 = phi(vp0,vp2)
4765 vp2 = vp1 + step
4766 goto LOOP
4768 If there is an inner-loop nested in loop, then step (5) will also be
4769 applied, and an additional update in the inner-loop will be created:
4771 vp0 = &base_addr;
4772 LOOP: vp1 = phi(vp0,vp2)
4774 inner: vp3 = phi(vp1,vp4)
4775 vp4 = vp3 + inner_step
4776 if () goto inner
4778 vp2 = vp1 + step
4779 if () goto LOOP */
4781 /* (2) Calculate the initial address of the aggregate-pointer, and set
4782 the aggregate-pointer to point to it before the loop. */
4784 /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader. */
4786 new_temp = vect_create_addr_base_for_vector_ref (stmt_info, &new_stmt_list,
4787 offset, byte_offset);
4788 if (new_stmt_list)
4790 if (pe)
4792 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4793 gcc_assert (!new_bb);
4795 else
4796 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4799 *initial_address = new_temp;
4800 aggr_ptr_init = new_temp;
4802 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4803 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4804 inner-loop nested in LOOP (during outer-loop vectorization). */
4806 /* No update in loop is required. */
4807 if (only_init && (!loop_vinfo || at_loop == loop))
4808 aptr = aggr_ptr_init;
4809 else
4811 if (iv_step == NULL_TREE)
4813 /* The step of the aggregate pointer is the type size. */
4814 iv_step = TYPE_SIZE_UNIT (aggr_type);
4815 /* One exception to the above is when the scalar step of the load in
4816 LOOP is zero. In this case the step here is also zero. */
4817 if (*inv_p)
4818 iv_step = size_zero_node;
4819 else if (tree_int_cst_sgn (step) == -1)
4820 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4823 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4825 create_iv (aggr_ptr_init,
4826 fold_convert (aggr_ptr_type, iv_step),
4827 aggr_ptr, loop, &incr_gsi, insert_after,
4828 &indx_before_incr, &indx_after_incr);
4829 incr = gsi_stmt (incr_gsi);
4830 loop_vinfo->add_stmt (incr);
4832 /* Copy the points-to information if it exists. */
4833 if (DR_PTR_INFO (dr))
4835 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr);
4836 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr);
4838 if (ptr_incr)
4839 *ptr_incr = incr;
4841 aptr = indx_before_incr;
4844 if (!nested_in_vect_loop || only_init)
4845 return aptr;
4848 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4849 nested in LOOP, if it exists. */
4851 gcc_assert (nested_in_vect_loop);
4852 if (!only_init)
4854 standard_iv_increment_position (containing_loop, &incr_gsi,
4855 &insert_after);
4856 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4857 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4858 &indx_after_incr);
4859 incr = gsi_stmt (incr_gsi);
4860 loop_vinfo->add_stmt (incr);
4862 /* Copy the points-to information if it exists. */
4863 if (DR_PTR_INFO (dr))
4865 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr);
4866 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr);
4868 if (ptr_incr)
4869 *ptr_incr = incr;
4871 return indx_before_incr;
4873 else
4874 gcc_unreachable ();
4878 /* Function bump_vector_ptr
4880 Increment a pointer (to a vector type) by vector-size. If requested,
4881 i.e. if PTR-INCR is given, then also connect the new increment stmt
4882 to the existing def-use update-chain of the pointer, by modifying
4883 the PTR_INCR as illustrated below:
4885 The pointer def-use update-chain before this function:
4886 DATAREF_PTR = phi (p_0, p_2)
4887 ....
4888 PTR_INCR: p_2 = DATAREF_PTR + step
4890 The pointer def-use update-chain after this function:
4891 DATAREF_PTR = phi (p_0, p_2)
4892 ....
4893 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4894 ....
4895 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4897 Input:
4898 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4899 in the loop.
4900 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4901 the loop. The increment amount across iterations is expected
4902 to be vector_size.
4903 BSI - location where the new update stmt is to be placed.
4904 STMT_INFO - the original scalar memory-access stmt that is being vectorized.
4905 BUMP - optional. The offset by which to bump the pointer. If not given,
4906 the offset is assumed to be vector_size.
4908 Output: Return NEW_DATAREF_PTR as illustrated above.
4912 tree
4913 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4914 stmt_vec_info stmt_info, tree bump)
4916 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4917 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4918 tree update = TYPE_SIZE_UNIT (vectype);
4919 gassign *incr_stmt;
4920 ssa_op_iter iter;
4921 use_operand_p use_p;
4922 tree new_dataref_ptr;
4924 if (bump)
4925 update = bump;
4927 if (TREE_CODE (dataref_ptr) == SSA_NAME)
4928 new_dataref_ptr = copy_ssa_name (dataref_ptr);
4929 else
4930 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4931 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4932 dataref_ptr, update);
4933 vect_finish_stmt_generation (stmt_info, incr_stmt, gsi);
4935 /* Copy the points-to information if it exists. */
4936 if (DR_PTR_INFO (dr))
4938 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4939 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4942 if (!ptr_incr)
4943 return new_dataref_ptr;
4945 /* Update the vector-pointer's cross-iteration increment. */
4946 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4948 tree use = USE_FROM_PTR (use_p);
4950 if (use == dataref_ptr)
4951 SET_USE (use_p, new_dataref_ptr);
4952 else
4953 gcc_assert (operand_equal_p (use, update, 0));
4956 return new_dataref_ptr;
4960 /* Copy memory reference info such as base/clique from the SRC reference
4961 to the DEST MEM_REF. */
4963 void
4964 vect_copy_ref_info (tree dest, tree src)
4966 if (TREE_CODE (dest) != MEM_REF)
4967 return;
4969 tree src_base = src;
4970 while (handled_component_p (src_base))
4971 src_base = TREE_OPERAND (src_base, 0);
4972 if (TREE_CODE (src_base) != MEM_REF
4973 && TREE_CODE (src_base) != TARGET_MEM_REF)
4974 return;
4976 MR_DEPENDENCE_CLIQUE (dest) = MR_DEPENDENCE_CLIQUE (src_base);
4977 MR_DEPENDENCE_BASE (dest) = MR_DEPENDENCE_BASE (src_base);
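/* The clique/base pair copied above is how the middle end encodes
   restrict-style disambiguation for memory references, so losing it on
   the vectorized reference would only pessimize alias analysis rather
   than break correctness -- hence the early returns when there is
   nothing suitable to copy from or to.  */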
4981 /* Function vect_create_destination_var.
4983 Create a new temporary of type VECTYPE. */
4985 tree
4986 vect_create_destination_var (tree scalar_dest, tree vectype)
4988 tree vec_dest;
4989 const char *name;
4990 char *new_name;
4991 tree type;
4992 enum vect_var_kind kind;
4994 kind = vectype
4995 ? VECTOR_BOOLEAN_TYPE_P (vectype)
4996 ? vect_mask_var
4997 : vect_simple_var
4998 : vect_scalar_var;
4999 type = vectype ? vectype : TREE_TYPE (scalar_dest);
5001 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
5003 name = get_name (scalar_dest);
5004 if (name)
5005 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
5006 else
5007 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
5008 vec_dest = vect_get_new_vect_var (type, kind, new_name);
5009 free (new_name);
5011 return vec_dest;
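/* For example (illustrative SSA name): for a scalar destination x_7 of
   type int with a V4SI VECTYPE this creates a variable based on
   "vect_x_7"; with a NULL VECTYPE it creates a scalar temporary based
   on "stmp_x_7".  */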
5014 /* Function vect_grouped_store_supported.
5016 Returns TRUE if interleave high and interleave low permutations
5017 are supported, and FALSE otherwise. */
5019 bool
5020 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
5022 machine_mode mode = TYPE_MODE (vectype);
5024 /* vect_permute_store_chain requires the group size to be equal to 3 or
5025 be a power of two. */
5026 if (count != 3 && exact_log2 (count) == -1)
5028 if (dump_enabled_p ())
5029 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5030 "the size of the group of accesses"
5031 " is not a power of 2 or not equal to 3\n");
5032 return false;
5035 /* Check that the permutation is supported. */
5036 if (VECTOR_MODE_P (mode))
5038 unsigned int i;
5039 if (count == 3)
5041 unsigned int j0 = 0, j1 = 0, j2 = 0;
5042 unsigned int i, j;
5044 unsigned int nelt;
5045 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5047 if (dump_enabled_p ())
5048 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5049 "cannot handle groups of 3 stores for"
5050 " variable-length vectors\n");
5051 return false;
5054 vec_perm_builder sel (nelt, nelt, 1);
5055 sel.quick_grow (nelt);
5056 vec_perm_indices indices;
5057 for (j = 0; j < 3; j++)
5059 int nelt0 = ((3 - j) * nelt) % 3;
5060 int nelt1 = ((3 - j) * nelt + 1) % 3;
5061 int nelt2 = ((3 - j) * nelt + 2) % 3;
5062 for (i = 0; i < nelt; i++)
5064 if (3 * i + nelt0 < nelt)
5065 sel[3 * i + nelt0] = j0++;
5066 if (3 * i + nelt1 < nelt)
5067 sel[3 * i + nelt1] = nelt + j1++;
5068 if (3 * i + nelt2 < nelt)
5069 sel[3 * i + nelt2] = 0;
5071 indices.new_vector (sel, 2, nelt);
5072 if (!can_vec_perm_const_p (mode, indices))
5074 if (dump_enabled_p ())
5075 dump_printf (MSG_MISSED_OPTIMIZATION,
5076 "permutation op not supported by target.\n");
5077 return false;
5080 for (i = 0; i < nelt; i++)
5082 if (3 * i + nelt0 < nelt)
5083 sel[3 * i + nelt0] = 3 * i + nelt0;
5084 if (3 * i + nelt1 < nelt)
5085 sel[3 * i + nelt1] = 3 * i + nelt1;
5086 if (3 * i + nelt2 < nelt)
5087 sel[3 * i + nelt2] = nelt + j2++;
5089 indices.new_vector (sel, 2, nelt);
5090 if (!can_vec_perm_const_p (mode, indices))
5092 if (dump_enabled_p ())
5093 dump_printf (MSG_MISSED_OPTIMIZATION,
5094 "permutation op not supported by target.\n");
5095 return false;
5098 return true;
5100 else
5102 /* If length is not equal to 3 then only power of 2 is supported. */
5103 gcc_assert (pow2p_hwi (count));
5104 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5106 /* The encoding has 2 interleaved stepped patterns. */
5107 vec_perm_builder sel (nelt, 2, 3);
5108 sel.quick_grow (6);
5109 for (i = 0; i < 3; i++)
5111 sel[i * 2] = i;
5112 sel[i * 2 + 1] = i + nelt;
5114 vec_perm_indices indices (sel, 2, nelt);
5115 if (can_vec_perm_const_p (mode, indices))
5117 for (i = 0; i < 6; i++)
5118 sel[i] += exact_div (nelt, 2);
5119 indices.new_vector (sel, 2, nelt);
5120 if (can_vec_perm_const_p (mode, indices))
5121 return true;
5126 if (dump_enabled_p ())
5127 dump_printf (MSG_MISSED_OPTIMIZATION,
5128 "permutation op not supported by target.\n");
5129 return false;
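/* Concrete illustration (assuming a 4-element vector mode): the
   power-of-2 path above checks the two interleave permutations
   { 0, 4, 1, 5 } and { 2, 6, 3, 7 }, i.e. the "high" and "low" masks
   that vect_permute_store_chain generates; the group can only be
   stored this way if the target supports both.  */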
5133 /* Return TRUE if vec_{mask_}store_lanes is available for COUNT vectors of
5134 type VECTYPE. MASKED_P says whether the masked form is needed. */
5136 bool
5137 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5138 bool masked_p)
5140 if (masked_p)
5141 return vect_lanes_optab_supported_p ("vec_mask_store_lanes",
5142 vec_mask_store_lanes_optab,
5143 vectype, count);
5144 else
5145 return vect_lanes_optab_supported_p ("vec_store_lanes",
5146 vec_store_lanes_optab,
5147 vectype, count);
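/* As a target-specific point of reference (an example, not required by
   this code): the vec_store_lanes optabs typically map to instructions
   such as AArch64's ST2/ST3/ST4, which store COUNT vectors with the
   interleaving done in hardware instead of via the explicit
   permutations of vect_permute_store_chain.  */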
5151 /* Function vect_permute_store_chain.
5153 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
5154 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
5155 the data correctly for the stores. Return the final references for stores
5156 in RESULT_CHAIN.
5158 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5159 The input is 4 vectors each containing 8 elements. We assign a number to
5160 each element, the input sequence is:
5162 1st vec: 0 1 2 3 4 5 6 7
5163 2nd vec: 8 9 10 11 12 13 14 15
5164 3rd vec: 16 17 18 19 20 21 22 23
5165 4th vec: 24 25 26 27 28 29 30 31
5167 The output sequence should be:
5169 1st vec: 0 8 16 24 1 9 17 25
5170 2nd vec: 2 10 18 26 3 11 19 27
5171 3rd vec: 4 12 20 28 5 13 21 29
5172 4th vec: 6 14 22 30 7 15 23 31
5174 i.e., we interleave the contents of the four vectors in their order.
5176 We use interleave_high/low instructions to create such output. The input of
5177 each interleave_high/low operation is two vectors:
5178 1st vec 2nd vec
5179 0 1 2 3 4 5 6 7
5180 the even elements of the result vector are obtained left-to-right from the
5181 high/low elements of the first vector. The odd elements of the result are
5182 obtained left-to-right from the high/low elements of the second vector.
5183 The output of interleave_high will be: 0 4 1 5
5184 and of interleave_low: 2 6 3 7
5187 The permutation is done in log LENGTH stages. In each stage interleave_high
5188 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5189 where the first argument is taken from the first half of DR_CHAIN and the
5190 second argument from its second half.
5191 In our example,
5193 I1: interleave_high (1st vec, 3rd vec)
5194 I2: interleave_low (1st vec, 3rd vec)
5195 I3: interleave_high (2nd vec, 4th vec)
5196 I4: interleave_low (2nd vec, 4th vec)
5198 The output for the first stage is:
5200 I1: 0 16 1 17 2 18 3 19
5201 I2: 4 20 5 21 6 22 7 23
5202 I3: 8 24 9 25 10 26 11 27
5203 I4: 12 28 13 29 14 30 15 31
5205 The output of the second stage, i.e. the final result is:
5207 I1: 0 8 16 24 1 9 17 25
5208 I2: 2 10 18 26 3 11 19 27
5209 I3: 4 12 20 28 5 13 21 29
5210 I4: 6 14 22 30 7 15 23 31. */
5212 void
5213 vect_permute_store_chain (vec<tree> dr_chain,
5214 unsigned int length,
5215 stmt_vec_info stmt_info,
5216 gimple_stmt_iterator *gsi,
5217 vec<tree> *result_chain)
5219 tree vect1, vect2, high, low;
5220 gimple *perm_stmt;
5221 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5222 tree perm_mask_low, perm_mask_high;
5223 tree data_ref;
5224 tree perm3_mask_low, perm3_mask_high;
5225 unsigned int i, j, n, log_length = exact_log2 (length);
5227 result_chain->quick_grow (length);
5228 memcpy (result_chain->address (), dr_chain.address (),
5229 length * sizeof (tree));
5231 if (length == 3)
5233 /* vect_grouped_store_supported ensures that this is constant. */
5234 unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5235 unsigned int j0 = 0, j1 = 0, j2 = 0;
5237 vec_perm_builder sel (nelt, nelt, 1);
5238 sel.quick_grow (nelt);
5239 vec_perm_indices indices;
5240 for (j = 0; j < 3; j++)
5242 int nelt0 = ((3 - j) * nelt) % 3;
5243 int nelt1 = ((3 - j) * nelt + 1) % 3;
5244 int nelt2 = ((3 - j) * nelt + 2) % 3;
5246 for (i = 0; i < nelt; i++)
5248 if (3 * i + nelt0 < nelt)
5249 sel[3 * i + nelt0] = j0++;
5250 if (3 * i + nelt1 < nelt)
5251 sel[3 * i + nelt1] = nelt + j1++;
5252 if (3 * i + nelt2 < nelt)
5253 sel[3 * i + nelt2] = 0;
5255 indices.new_vector (sel, 2, nelt);
5256 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5258 for (i = 0; i < nelt; i++)
5260 if (3 * i + nelt0 < nelt)
5261 sel[3 * i + nelt0] = 3 * i + nelt0;
5262 if (3 * i + nelt1 < nelt)
5263 sel[3 * i + nelt1] = 3 * i + nelt1;
5264 if (3 * i + nelt2 < nelt)
5265 sel[3 * i + nelt2] = nelt + j2++;
5267 indices.new_vector (sel, 2, nelt);
5268 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5270 vect1 = dr_chain[0];
5271 vect2 = dr_chain[1];
5273 /* Create interleaving stmt:
5274 low = VEC_PERM_EXPR <vect1, vect2,
5275 {j, nelt, *, j + 1, nelt + j + 1, *,
5276 j + 2, nelt + j + 2, *, ...}> */
5277 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5278 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5279 vect2, perm3_mask_low);
5280 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5282 vect1 = data_ref;
5283 vect2 = dr_chain[2];
5284 /* Create interleaving stmt:
5285 low = VEC_PERM_EXPR <vect1, vect2,
5286 {0, 1, nelt + j, 3, 4, nelt + j + 1,
5287 6, 7, nelt + j + 2, ...}> */
5288 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5289 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5290 vect2, perm3_mask_high);
5291 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5292 (*result_chain)[j] = data_ref;
5295 else
5297 /* If length is not equal to 3 then only power of 2 is supported. */
5298 gcc_assert (pow2p_hwi (length));
5300 /* The encoding has 2 interleaved stepped patterns. */
5301 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5302 vec_perm_builder sel (nelt, 2, 3);
5303 sel.quick_grow (6);
5304 for (i = 0; i < 3; i++)
5306 sel[i * 2] = i;
5307 sel[i * 2 + 1] = i + nelt;
5309 vec_perm_indices indices (sel, 2, nelt);
5310 perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5312 for (i = 0; i < 6; i++)
5313 sel[i] += exact_div (nelt, 2);
5314 indices.new_vector (sel, 2, nelt);
5315 perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5317 for (i = 0, n = log_length; i < n; i++)
5319 for (j = 0; j < length/2; j++)
5321 vect1 = dr_chain[j];
5322 vect2 = dr_chain[j+length/2];
5324 /* Create interleaving stmt:
5325 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
5326 ...}> */
5327 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
5328 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
5329 vect2, perm_mask_high);
5330 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5331 (*result_chain)[2*j] = high;
5333 /* Create interleaving stmt:
5334 low = VEC_PERM_EXPR <vect1, vect2,
5335 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
5336 ...}> */
5337 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
5338 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
5339 vect2, perm_mask_low);
5340 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5341 (*result_chain)[2*j+1] = low;
5343 memcpy (dr_chain.address (), result_chain->address (),
5344 length * sizeof (tree));
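
/* Editorial illustration (not part of GCC): a standalone sketch of the
   interleave-high/low permutation whose masks the power-of-2 branch above
   builds, using the {0, nelt, 1, nelt + 1, ...} and
   {nelt/2, nelt*3/2, nelt/2 + 1, ...} index patterns.  The toy vec_perm
   helper and all names below are hypothetical; compile it separately.  */

#include <stdio.h>

#define NELT 8

/* Model VEC_PERM_EXPR <v1, v2, sel>: element I of the result is element
   SEL[I] of the 2*NELT-element concatenation of V1 and V2.  */
static void
vec_perm (const int *v1, const int *v2, const unsigned *sel, int *out)
{
  for (unsigned i = 0; i < NELT; i++)
    out[i] = sel[i] < NELT ? v1[sel[i]] : v2[sel[i] - NELT];
}

int
main (void)
{
  int vect1[NELT] = { 0, 1, 2, 3, 4, 5, 6, 7 };
  int vect2[NELT] = { 8, 9, 10, 11, 12, 13, 14, 15 };
  unsigned sel_high[NELT], sel_low[NELT];
  int high[NELT], low[NELT];

  for (unsigned i = 0; i < NELT / 2; i++)
    {
      sel_high[2 * i] = i;                       /* 0, NELT, 1, NELT+1, ...  */
      sel_high[2 * i + 1] = i + NELT;
      sel_low[2 * i] = i + NELT / 2;             /* NELT/2, NELT*3/2, ...    */
      sel_low[2 * i + 1] = i + NELT / 2 + NELT;
    }

  vec_perm (vect1, vect2, sel_high, high);       /* 0 8 1 9 2 10 3 11   */
  vec_perm (vect1, vect2, sel_low, low);         /* 4 12 5 13 6 14 7 15 */

  for (unsigned i = 0; i < NELT; i++)
    printf ("%d %d\n", high[i], low[i]);
  return 0;
}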
5349 /* Function vect_setup_realignment
5351 This function is called when vectorizing an unaligned load using
5352 the dr_explicit_realign[_optimized] scheme.
5353 This function generates the following code at the loop prolog:
5355 p = initial_addr;
5356 x msq_init = *(floor(p)); # prolog load
5357 realignment_token = call target_builtin;
5358 loop:
5359 x msq = phi (msq_init, ---)
5361 The stmts marked with x are generated only for the case of
5362 dr_explicit_realign_optimized.
5364 The code above sets up a new (vector) pointer, pointing to the first
5365 location accessed by STMT_INFO, and a "floor-aligned" load using that
5366 pointer. It also generates code to compute the "realignment-token"
5367 (if the relevant target hook was defined), and creates a phi-node at the
5368 loop-header bb whose arguments are the result of the prolog-load (created
5369 by this function) and the result of a load that takes place in the loop
5370 (to be created by the caller to this function).
5372 For the case of dr_explicit_realign_optimized:
5373 The caller to this function uses the phi-result (msq) to create the
5374 realignment code inside the loop, and sets up the missing phi argument,
5375 as follows:
5376 loop:
5377 msq = phi (msq_init, lsq)
5378 lsq = *(floor(p')); # load in loop
5379 result = realign_load (msq, lsq, realignment_token);
5381 For the case of dr_explicit_realign:
5382 loop:
5383 msq = *(floor(p)); # load in loop
5384 p' = p + (VS-1);
5385 lsq = *(floor(p')); # load in loop
5386 result = realign_load (msq, lsq, realignment_token);
5388 Input:
5389 STMT_INFO - (scalar) load stmt to be vectorized. This load accesses
5390 a memory location that may be unaligned.
5391    GSI - place where new code is to be inserted.
5392 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5393 is used.
5395 Output:
5396 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5397 target hook, if defined.
5398 Return value - the result of the loop-header phi node. */
5400 tree
5401 vect_setup_realignment (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
5402 tree *realignment_token,
5403 enum dr_alignment_support alignment_support_scheme,
5404 tree init_addr,
5405 struct loop **at_loop)
5407 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5408 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5409 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
5410 struct loop *loop = NULL;
5411 edge pe = NULL;
5412 tree scalar_dest = gimple_assign_lhs (stmt_info->stmt);
5413 tree vec_dest;
5414 gimple *inc;
5415 tree ptr;
5416 tree data_ref;
5417 basic_block new_bb;
5418 tree msq_init = NULL_TREE;
5419 tree new_temp;
5420 gphi *phi_stmt;
5421 tree msq = NULL_TREE;
5422 gimple_seq stmts = NULL;
5423 bool inv_p;
5424 bool compute_in_loop = false;
5425 bool nested_in_vect_loop = false;
5426 struct loop *containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
5427 struct loop *loop_for_initial_load = NULL;
5429 if (loop_vinfo)
5431 loop = LOOP_VINFO_LOOP (loop_vinfo);
5432 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
5435 gcc_assert (alignment_support_scheme == dr_explicit_realign
5436 || alignment_support_scheme == dr_explicit_realign_optimized);
5438 /* We need to generate three things:
5439 1. the misalignment computation
5440 2. the extra vector load (for the optimized realignment scheme).
5441 3. the phi node for the two vectors from which the realignment is
5442 done (for the optimized realignment scheme). */
5444 /* 1. Determine where to generate the misalignment computation.
5446 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5447 calculation will be generated by this function, outside the loop (in the
5448    preheader).  Otherwise, INIT_ADDR has already been computed for us by the
5449 caller, inside the loop.
5451 Background: If the misalignment remains fixed throughout the iterations of
5452 the loop, then both realignment schemes are applicable, and also the
5453 misalignment computation can be done outside LOOP. This is because we are
5454 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5455 are a multiple of VS (the Vector Size), and therefore the misalignment in
5456 different vectorized LOOP iterations is always the same.
5457 The problem arises only if the memory access is in an inner-loop nested
5458 inside LOOP, which is now being vectorized using outer-loop vectorization.
5459 This is the only case when the misalignment of the memory access may not
5460 remain fixed throughout the iterations of the inner-loop (as explained in
5461 detail in vect_supportable_dr_alignment). In this case, not only is the
5462 optimized realignment scheme not applicable, but also the misalignment
5463 computation (and generation of the realignment token that is passed to
5464    REALIGN_LOAD) has to be done inside the loop.
5466 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5467 or not, which in turn determines if the misalignment is computed inside
5468 the inner-loop, or outside LOOP. */
5470 if (init_addr != NULL_TREE || !loop_vinfo)
5472 compute_in_loop = true;
5473 gcc_assert (alignment_support_scheme == dr_explicit_realign);
5477 /* 2. Determine where to generate the extra vector load.
5479 For the optimized realignment scheme, instead of generating two vector
5480 loads in each iteration, we generate a single extra vector load in the
5481 preheader of the loop, and in each iteration reuse the result of the
5482 vector load from the previous iteration. In case the memory access is in
5483 an inner-loop nested inside LOOP, which is now being vectorized using
5484 outer-loop vectorization, we need to determine whether this initial vector
5485 load should be generated at the preheader of the inner-loop, or can be
5486 generated at the preheader of LOOP. If the memory access has no evolution
5487 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5488 to be generated inside LOOP (in the preheader of the inner-loop). */
5490 if (nested_in_vect_loop)
5492 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5493 bool invariant_in_outerloop =
5494 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5495 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5497 else
5498 loop_for_initial_load = loop;
5499 if (at_loop)
5500 *at_loop = loop_for_initial_load;
5502 if (loop_for_initial_load)
5503 pe = loop_preheader_edge (loop_for_initial_load);
5505 /* 3. For the case of the optimized realignment, create the first vector
5506 load at the loop preheader. */
5508 if (alignment_support_scheme == dr_explicit_realign_optimized)
5510 /* Create msq_init = *(floor(p1)) in the loop preheader */
5511 gassign *new_stmt;
5513 gcc_assert (!compute_in_loop);
5514 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5515 ptr = vect_create_data_ref_ptr (stmt_info, vectype,
5516 loop_for_initial_load, NULL_TREE,
5517 &init_addr, NULL, &inc, true, &inv_p);
5518 if (TREE_CODE (ptr) == SSA_NAME)
5519 new_temp = copy_ssa_name (ptr);
5520 else
5521 new_temp = make_ssa_name (TREE_TYPE (ptr));
5522 unsigned int align = DR_TARGET_ALIGNMENT (dr);
5523 new_stmt = gimple_build_assign
5524 (new_temp, BIT_AND_EXPR, ptr,
5525 build_int_cst (TREE_TYPE (ptr), -(HOST_WIDE_INT) align));
5526 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5527 gcc_assert (!new_bb);
5528 data_ref
5529 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5530 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5531 vect_copy_ref_info (data_ref, DR_REF (dr));
5532 new_stmt = gimple_build_assign (vec_dest, data_ref);
5533 new_temp = make_ssa_name (vec_dest, new_stmt);
5534 gimple_assign_set_lhs (new_stmt, new_temp);
5535 if (pe)
5537 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5538 gcc_assert (!new_bb);
5540 else
5541 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5543 msq_init = gimple_assign_lhs (new_stmt);
5546 /* 4. Create realignment token using a target builtin, if available.
5547 It is done either inside the containing loop, or before LOOP (as
5548 determined above). */
5550 if (targetm.vectorize.builtin_mask_for_load)
5552 gcall *new_stmt;
5553 tree builtin_decl;
5555       /* Compute INIT_ADDR - the initial address accessed by this memref.  */
5556 if (!init_addr)
5558 /* Generate the INIT_ADDR computation outside LOOP. */
5559 init_addr = vect_create_addr_base_for_vector_ref (stmt_info, &stmts,
5560 NULL_TREE);
5561 if (loop)
5563 pe = loop_preheader_edge (loop);
5564 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5565 gcc_assert (!new_bb);
5567 else
5568 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5571 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5572 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5573 vec_dest =
5574 vect_create_destination_var (scalar_dest,
5575 gimple_call_return_type (new_stmt));
5576 new_temp = make_ssa_name (vec_dest, new_stmt);
5577 gimple_call_set_lhs (new_stmt, new_temp);
5579 if (compute_in_loop)
5580 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5581 else
5583 /* Generate the misalignment computation outside LOOP. */
5584 pe = loop_preheader_edge (loop);
5585 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5586 gcc_assert (!new_bb);
5589 *realignment_token = gimple_call_lhs (new_stmt);
5591 /* The result of the CALL_EXPR to this builtin is determined from
5592 the value of the parameter and no global variables are touched
5593 which makes the builtin a "const" function. Requiring the
5594 builtin to have the "const" attribute makes it unnecessary
5595 to call mark_call_clobbered. */
5596 gcc_assert (TREE_READONLY (builtin_decl));
5599 if (alignment_support_scheme == dr_explicit_realign)
5600 return msq;
5602 gcc_assert (!compute_in_loop);
5603 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5606 /* 5. Create msq = phi <msq_init, lsq> in loop */
5608 pe = loop_preheader_edge (containing_loop);
5609 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5610 msq = make_ssa_name (vec_dest);
5611 phi_stmt = create_phi_node (msq, containing_loop->header);
5612 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5614 return msq;
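
/* Editorial illustration (not part of GCC): a scalar model of the
   dr_explicit_realign scheme that vect_setup_realignment prepares above.
   "Aligned" here means an index rounded down to a multiple of VS, and the
   realignment token is modelled as the element misalignment of the access.
   All names and the helpers are hypothetical; compile it separately.  */

#include <stdio.h>

#define VS 4   /* elements per vector */

/* Load VS consecutive elements starting at an aligned index.  */
static void
aligned_load (const int *base, unsigned aligned_idx, int *vec)
{
  for (unsigned i = 0; i < VS; i++)
    vec[i] = base[aligned_idx + i];
}

/* REALIGN_LOAD <msq, lsq, token>: pick VS elements starting at offset
   TOKEN within the concatenation of MSQ and LSQ.  */
static void
realign_load (const int *msq, const int *lsq, unsigned token, int *out)
{
  for (unsigned i = 0; i < VS; i++)
    out[i] = token + i < VS ? msq[token + i] : lsq[token + i - VS];
}

int
main (void)
{
  int a[16];
  for (int i = 0; i < 16; i++)
    a[i] = i;

  unsigned p = 5;                        /* unaligned start index       */
  unsigned token = p % VS;               /* misalignment, the "token"   */
  int msq[VS], lsq[VS], result[VS];

  aligned_load (a, p - token, msq);      /* *(floor (p))                */
  aligned_load (a, p - token + VS, lsq); /* next aligned chunk          */
  realign_load (msq, lsq, token, result);

  for (unsigned i = 0; i < VS; i++)
    printf ("%d ", result[i]);           /* prints: 5 6 7 8             */
  printf ("\n");
  return 0;
}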
5618 /* Function vect_grouped_load_supported.
5620 COUNT is the size of the load group (the number of statements plus the
5621 number of gaps). SINGLE_ELEMENT_P is true if there is actually
5622 only one statement, with a gap of COUNT - 1.
5624 Returns true if a suitable permute exists. */
5626 bool
5627 vect_grouped_load_supported (tree vectype, bool single_element_p,
5628 unsigned HOST_WIDE_INT count)
5630 machine_mode mode = TYPE_MODE (vectype);
5632 /* If this is single-element interleaving with an element distance
5633      that leaves unused vector loads around, punt - we at least create
5634 very sub-optimal code in that case (and blow up memory,
5635 see PR65518). */
5636 if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype)))
5638 if (dump_enabled_p ())
5639 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5640 "single-element interleaving not supported "
5641			 "for non-adjacent vector loads\n");
5642 return false;
5645 /* vect_permute_load_chain requires the group size to be equal to 3 or
5646 be a power of two. */
5647 if (count != 3 && exact_log2 (count) == -1)
5649 if (dump_enabled_p ())
5650 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5651 "the size of the group of accesses"
5652 " is not a power of 2 or not equal to 3\n");
5653 return false;
5656 /* Check that the permutation is supported. */
5657 if (VECTOR_MODE_P (mode))
5659 unsigned int i, j;
5660 if (count == 3)
5662 unsigned int nelt;
5663 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5665 if (dump_enabled_p ())
5666 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5667 "cannot handle groups of 3 loads for"
5668 " variable-length vectors\n");
5669 return false;
5672 vec_perm_builder sel (nelt, nelt, 1);
5673 sel.quick_grow (nelt);
5674 vec_perm_indices indices;
5675 unsigned int k;
5676 for (k = 0; k < 3; k++)
5678 for (i = 0; i < nelt; i++)
5679 if (3 * i + k < 2 * nelt)
5680 sel[i] = 3 * i + k;
5681 else
5682 sel[i] = 0;
5683 indices.new_vector (sel, 2, nelt);
5684 if (!can_vec_perm_const_p (mode, indices))
5686 if (dump_enabled_p ())
5687 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5688 "shuffle of 3 loads is not supported by"
5689 " target\n");
5690 return false;
5692 for (i = 0, j = 0; i < nelt; i++)
5693 if (3 * i + k < 2 * nelt)
5694 sel[i] = i;
5695 else
5696 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5697 indices.new_vector (sel, 2, nelt);
5698 if (!can_vec_perm_const_p (mode, indices))
5700 if (dump_enabled_p ())
5701 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5702 "shuffle of 3 loads is not supported by"
5703 " target\n");
5704 return false;
5707 return true;
5709 else
5711       /* If length is not equal to 3 then only a power of 2 is supported.  */
5712 gcc_assert (pow2p_hwi (count));
5713 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5715 /* The encoding has a single stepped pattern. */
5716 vec_perm_builder sel (nelt, 1, 3);
5717 sel.quick_grow (3);
5718 for (i = 0; i < 3; i++)
5719 sel[i] = i * 2;
5720 vec_perm_indices indices (sel, 2, nelt);
5721 if (can_vec_perm_const_p (mode, indices))
5723 for (i = 0; i < 3; i++)
5724 sel[i] = i * 2 + 1;
5725 indices.new_vector (sel, 2, nelt);
5726 if (can_vec_perm_const_p (mode, indices))
5727 return true;
5732 if (dump_enabled_p ())
5733 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5734 "extract even/odd not supported by target\n");
5735 return false;
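
/* Editorial illustration (not part of GCC): the two permute masks that
   vect_grouped_load_supported checks for a group of 3 loads, instantiated
   for NELT == 8 and K == 0, together with the strided result they produce
   when applied the same way as in vect_permute_load_chain.  The toy
   vec_perm helper and all names are hypothetical; compile it separately.  */

#include <stdio.h>

#define NELT 8

static void
vec_perm (const int *v1, const int *v2, const unsigned *sel, int *out)
{
  for (unsigned i = 0; i < NELT; i++)
    out[i] = sel[i] < NELT ? v1[sel[i]] : v2[sel[i] - NELT];
}

int
main (void)
{
  int v0[NELT], v1[NELT], v2[NELT];
  for (unsigned i = 0; i < NELT; i++)
    {
      v0[i] = i;
      v1[i] = NELT + i;
      v2[i] = 2 * NELT + i;
    }

  const unsigned k = 0;
  unsigned sel_low[NELT], sel_high[NELT];
  unsigned j = 0;
  for (unsigned i = 0; i < NELT; i++)
    {
      /* First mask: pick elements k, 3 + k, 6 + k, ... from the first two
	 vectors, leaving the tail to be filled from the third vector.  */
      sel_low[i] = 3 * i + k < 2 * NELT ? 3 * i + k : 0;
      /* Second mask: keep what the first permute produced and fill the
	 tail from the third vector.  */
      sel_high[i] = 3 * i + k < 2 * NELT
		    ? i : NELT + ((NELT + k) % 3) + 3 * j++;
    }

  int tmp[NELT], res[NELT];
  vec_perm (v0, v1, sel_low, tmp);    /* {0, 3, 6, 9, 12, 15, *, *}      */
  vec_perm (tmp, v2, sel_high, res);  /* {0, 3, 6, 9, 12, 15, 18, 21}    */

  for (unsigned i = 0; i < NELT; i++)
    printf ("%d ", res[i]);
  printf ("\n");
  return 0;
}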
5738 /* Return TRUE if vec_{masked_}load_lanes is available for COUNT vectors of
5739 type VECTYPE. MASKED_P says whether the masked form is needed. */
5741 bool
5742 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5743 bool masked_p)
5745 if (masked_p)
5746 return vect_lanes_optab_supported_p ("vec_mask_load_lanes",
5747 vec_mask_load_lanes_optab,
5748 vectype, count);
5749 else
5750 return vect_lanes_optab_supported_p ("vec_load_lanes",
5751 vec_load_lanes_optab,
5752 vectype, count);
5755 /* Function vect_permute_load_chain.
5757 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5758 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5759 the input data correctly. Return the final references for loads in
5760 RESULT_CHAIN.
5762 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5763 The input is 4 vectors each containing 8 elements. We assign a number to each
5764 element, the input sequence is:
5766 1st vec: 0 1 2 3 4 5 6 7
5767 2nd vec: 8 9 10 11 12 13 14 15
5768 3rd vec: 16 17 18 19 20 21 22 23
5769 4th vec: 24 25 26 27 28 29 30 31
5771 The output sequence should be:
5773 1st vec: 0 4 8 12 16 20 24 28
5774 2nd vec: 1 5 9 13 17 21 25 29
5775 3rd vec: 2 6 10 14 18 22 26 30
5776 4th vec: 3 7 11 15 19 23 27 31
5778 i.e., the first output vector should contain the first elements of each
5779 interleaving group, etc.
5781 We use extract_even/odd instructions to create such output. The input of
5782 each extract_even/odd operation is two vectors
5783 1st vec 2nd vec
5784 0 1 2 3 4 5 6 7
5786 and the output is the vector of extracted even/odd elements. The output of
5787 extract_even will be: 0 2 4 6
5788 and of extract_odd: 1 3 5 7
5791 The permutation is done in log LENGTH stages. In each stage extract_even
5792 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5793 their order. In our example,
5795 E1: extract_even (1st vec, 2nd vec)
5796 E2: extract_odd (1st vec, 2nd vec)
5797 E3: extract_even (3rd vec, 4th vec)
5798 E4: extract_odd (3rd vec, 4th vec)
5800 The output for the first stage will be:
5802 E1: 0 2 4 6 8 10 12 14
5803 E2: 1 3 5 7 9 11 13 15
5804 E3: 16 18 20 22 24 26 28 30
5805 E4: 17 19 21 23 25 27 29 31
5807 In order to proceed and create the correct sequence for the next stage (or
5808 for the correct output, if the second stage is the last one, as in our
5809 example), we first put the output of extract_even operation and then the
5810 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5811 The input for the second stage is:
5813 1st vec (E1): 0 2 4 6 8 10 12 14
5814 2nd vec (E3): 16 18 20 22 24 26 28 30
5815 3rd vec (E2): 1 3 5 7 9 11 13 15
5816 4th vec (E4): 17 19 21 23 25 27 29 31
5818 The output of the second stage:
5820 E1: 0 4 8 12 16 20 24 28
5821 E2: 2 6 10 14 18 22 26 30
5822 E3: 1 5 9 13 17 21 25 29
5823 E4: 3 7 11 15 19 23 27 31
5825 And RESULT_CHAIN after reordering:
5827 1st vec (E1): 0 4 8 12 16 20 24 28
5828 2nd vec (E3): 1 5 9 13 17 21 25 29
5829 3rd vec (E2): 2 6 10 14 18 22 26 30
5830 4th vec (E4): 3 7 11 15 19 23 27 31. */
5832 static void
5833 vect_permute_load_chain (vec<tree> dr_chain,
5834 unsigned int length,
5835 stmt_vec_info stmt_info,
5836 gimple_stmt_iterator *gsi,
5837 vec<tree> *result_chain)
5839 tree data_ref, first_vect, second_vect;
5840 tree perm_mask_even, perm_mask_odd;
5841 tree perm3_mask_low, perm3_mask_high;
5842 gimple *perm_stmt;
5843 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5844 unsigned int i, j, log_length = exact_log2 (length);
5846 result_chain->quick_grow (length);
5847 memcpy (result_chain->address (), dr_chain.address (),
5848 length * sizeof (tree));
5850 if (length == 3)
5852 /* vect_grouped_load_supported ensures that this is constant. */
5853 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5854 unsigned int k;
5856 vec_perm_builder sel (nelt, nelt, 1);
5857 sel.quick_grow (nelt);
5858 vec_perm_indices indices;
5859 for (k = 0; k < 3; k++)
5861 for (i = 0; i < nelt; i++)
5862 if (3 * i + k < 2 * nelt)
5863 sel[i] = 3 * i + k;
5864 else
5865 sel[i] = 0;
5866 indices.new_vector (sel, 2, nelt);
5867 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5869 for (i = 0, j = 0; i < nelt; i++)
5870 if (3 * i + k < 2 * nelt)
5871 sel[i] = i;
5872 else
5873 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5874 indices.new_vector (sel, 2, nelt);
5875 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5877 first_vect = dr_chain[0];
5878 second_vect = dr_chain[1];
5880 /* Create interleaving stmt (low part of):
5881	 low = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5882 ...}> */
5883 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5884 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5885 second_vect, perm3_mask_low);
5886 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5888 /* Create interleaving stmt (high part of):
5889	 high = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5890 ...}> */
5891 first_vect = data_ref;
5892 second_vect = dr_chain[2];
5893 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5894 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5895 second_vect, perm3_mask_high);
5896 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5897 (*result_chain)[k] = data_ref;
5900 else
5902       /* If length is not equal to 3 then only a power of 2 is supported.  */
5903 gcc_assert (pow2p_hwi (length));
5905 /* The encoding has a single stepped pattern. */
5906 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5907 vec_perm_builder sel (nelt, 1, 3);
5908 sel.quick_grow (3);
5909 for (i = 0; i < 3; ++i)
5910 sel[i] = i * 2;
5911 vec_perm_indices indices (sel, 2, nelt);
5912 perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);
5914 for (i = 0; i < 3; ++i)
5915 sel[i] = i * 2 + 1;
5916 indices.new_vector (sel, 2, nelt);
5917 perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
5919 for (i = 0; i < log_length; i++)
5921 for (j = 0; j < length; j += 2)
5923 first_vect = dr_chain[j];
5924 second_vect = dr_chain[j+1];
5926 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5927 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5928 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5929 first_vect, second_vect,
5930 perm_mask_even);
5931 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5932 (*result_chain)[j/2] = data_ref;
5934 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5935 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5936 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5937 first_vect, second_vect,
5938 perm_mask_odd);
5939 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5940 (*result_chain)[j/2+length/2] = data_ref;
5942 memcpy (dr_chain.address (), result_chain->address (),
5943 length * sizeof (tree));
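
/* Editorial illustration (not part of GCC): the two-stage extract-even/odd
   permutation described in the comment above vect_permute_load_chain, run
   on the 4-vector, 8-element example from that comment.  The toy vec_perm
   helper and the array-of-arrays "chain" are hypothetical; compile it
   separately.  */

#include <stdio.h>
#include <string.h>

#define NELT 8
#define LENGTH 4

static void
vec_perm (const int *v1, const int *v2, const unsigned *sel, int *out)
{
  for (unsigned i = 0; i < NELT; i++)
    out[i] = sel[i] < NELT ? v1[sel[i]] : v2[sel[i] - NELT];
}

int
main (void)
{
  int dr_chain[LENGTH][NELT], result_chain[LENGTH][NELT];
  for (unsigned v = 0; v < LENGTH; v++)
    for (unsigned i = 0; i < NELT; i++)
      dr_chain[v][i] = v * NELT + i;     /* elements 0..31 as in the comment */

  unsigned sel_even[NELT], sel_odd[NELT];
  for (unsigned i = 0; i < NELT; i++)
    {
      sel_even[i] = i * 2;               /* 0 2 4 ... 14 */
      sel_odd[i] = i * 2 + 1;            /* 1 3 5 ... 15 */
    }

  /* log2 (LENGTH) == 2 stages; even results go to the first half of the
     chain, odd results to the second half, then the chain is recycled.  */
  for (unsigned stage = 0; stage < 2; stage++)
    {
      for (unsigned j = 0; j < LENGTH; j += 2)
	{
	  vec_perm (dr_chain[j], dr_chain[j + 1], sel_even,
		    result_chain[j / 2]);
	  vec_perm (dr_chain[j], dr_chain[j + 1], sel_odd,
		    result_chain[j / 2 + LENGTH / 2]);
	}
      memcpy (dr_chain, result_chain, sizeof (dr_chain));
    }

  for (unsigned v = 0; v < LENGTH; v++)
    {
      for (unsigned i = 0; i < NELT; i++)
	printf ("%d ", result_chain[v][i]);
      printf ("\n");     /* 0 4 8 ... 28 / 1 5 ... 29 / 2 6 ... 30 / 3 7 ... 31 */
    }
  return 0;
}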
5948 /* Function vect_shift_permute_load_chain.
5950 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
5951    a sequence of stmts to reorder the input data accordingly.
5952 Return the final references for loads in RESULT_CHAIN.
5953    Return true if successful, false otherwise.
5955 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5956 The input is 3 vectors each containing 8 elements. We assign a
5957 number to each element, the input sequence is:
5959 1st vec: 0 1 2 3 4 5 6 7
5960 2nd vec: 8 9 10 11 12 13 14 15
5961 3rd vec: 16 17 18 19 20 21 22 23
5963 The output sequence should be:
5965 1st vec: 0 3 6 9 12 15 18 21
5966 2nd vec: 1 4 7 10 13 16 19 22
5967 3rd vec: 2 5 8 11 14 17 20 23
5969 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
5971 First we shuffle all 3 vectors to get correct elements order:
5973 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
5974 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
5975 3rd vec: (16 19 22) (17 20 23) (18 21)
5977    Next we unite and shift the vectors 3 times:
5979 1st step:
5980 shift right by 6 the concatenation of:
5981 "1st vec" and "2nd vec"
5982 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
5983 "2nd vec" and "3rd vec"
5984 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
5985 "3rd vec" and "1st vec"
5986 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
5987 | New vectors |
5989 So that now new vectors are:
5991 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
5992 2nd vec: (10 13) (16 19 22) (17 20 23)
5993 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
5995 2nd step:
5996 shift right by 5 the concatenation of:
5997 "1st vec" and "3rd vec"
5998 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
5999 "2nd vec" and "1st vec"
6000 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
6001 "3rd vec" and "2nd vec"
6002 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
6003 | New vectors |
6005 So that now new vectors are:
6007 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
6008 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
6009 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
6011 3rd step:
6012 shift right by 5 the concatenation of:
6013 "1st vec" and "1st vec"
6014 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
6015 shift right by 3 the concatenation of:
6016 "2nd vec" and "2nd vec"
6017 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
6018 | New vectors |
6020 So that now all vectors are READY:
6021 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
6022 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
6023 3rd vec: ( 1 4 7) (10 13) (16 19 22)
6025    This algorithm is faster than the one in vect_permute_load_chain if:
6026    1. "shift of a concatenation" is faster than general permutation.
6027       This is usually so.
6028    2. The TARGET machine can't execute vector instructions in parallel.
6029       This is because each step of the algorithm depends on the previous one.
6030 The algorithm in vect_permute_load_chain is much more parallel.
6032 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
6035 static bool
6036 vect_shift_permute_load_chain (vec<tree> dr_chain,
6037 unsigned int length,
6038 stmt_vec_info stmt_info,
6039 gimple_stmt_iterator *gsi,
6040 vec<tree> *result_chain)
6042 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
6043 tree perm2_mask1, perm2_mask2, perm3_mask;
6044 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
6045 gimple *perm_stmt;
6047 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6048 unsigned int i;
6049 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6051 unsigned HOST_WIDE_INT nelt, vf;
6052 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt)
6053 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
6054 /* Not supported for variable-length vectors. */
6055 return false;
6057 vec_perm_builder sel (nelt, nelt, 1);
6058 sel.quick_grow (nelt);
6060 result_chain->quick_grow (length);
6061 memcpy (result_chain->address (), dr_chain.address (),
6062 length * sizeof (tree));
6064 if (pow2p_hwi (length) && vf > 4)
6066 unsigned int j, log_length = exact_log2 (length);
6067 for (i = 0; i < nelt / 2; ++i)
6068 sel[i] = i * 2;
6069 for (i = 0; i < nelt / 2; ++i)
6070 sel[nelt / 2 + i] = i * 2 + 1;
6071 vec_perm_indices indices (sel, 2, nelt);
6072 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6074 if (dump_enabled_p ())
6075 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6076 "shuffle of 2 fields structure is not \
6077 supported by target\n");
6078 return false;
6080 perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);
6082 for (i = 0; i < nelt / 2; ++i)
6083 sel[i] = i * 2 + 1;
6084 for (i = 0; i < nelt / 2; ++i)
6085 sel[nelt / 2 + i] = i * 2;
6086 indices.new_vector (sel, 2, nelt);
6087 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6089 if (dump_enabled_p ())
6090 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6091 "shuffle of 2 fields structure is not \
6092 supported by target\n");
6093 return false;
6095 perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);
6097 /* Generating permutation constant to shift all elements.
6098 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
6099 for (i = 0; i < nelt; i++)
6100 sel[i] = nelt / 2 + i;
6101 indices.new_vector (sel, 2, nelt);
6102 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6104 if (dump_enabled_p ())
6105 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6106 "shift permutation is not supported by target\n");
6107 return false;
6109 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6111 /* Generating permutation constant to select vector from 2.
6112 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
6113 for (i = 0; i < nelt / 2; i++)
6114 sel[i] = i;
6115 for (i = nelt / 2; i < nelt; i++)
6116 sel[i] = nelt + i;
6117 indices.new_vector (sel, 2, nelt);
6118 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6120 if (dump_enabled_p ())
6121 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6122 "select is not supported by target\n");
6123 return false;
6125 select_mask = vect_gen_perm_mask_checked (vectype, indices);
6127 for (i = 0; i < log_length; i++)
6129 for (j = 0; j < length; j += 2)
6131 first_vect = dr_chain[j];
6132 second_vect = dr_chain[j + 1];
6134 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6135 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6136 first_vect, first_vect,
6137 perm2_mask1);
6138 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6139 vect[0] = data_ref;
6141 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6142 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6143 second_vect, second_vect,
6144 perm2_mask2);
6145 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6146 vect[1] = data_ref;
6148 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
6149 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6150 vect[0], vect[1], shift1_mask);
6151 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6152 (*result_chain)[j/2 + length/2] = data_ref;
6154 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
6155 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6156 vect[0], vect[1], select_mask);
6157 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6158 (*result_chain)[j/2] = data_ref;
6160 memcpy (dr_chain.address (), result_chain->address (),
6161 length * sizeof (tree));
6163 return true;
6165 if (length == 3 && vf > 2)
6167 unsigned int k = 0, l = 0;
6169       /* Generating permutation constant to get all elements in right order.
6170 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
6171 for (i = 0; i < nelt; i++)
6173 if (3 * k + (l % 3) >= nelt)
6175 k = 0;
6176 l += (3 - (nelt % 3));
6178 sel[i] = 3 * k + (l % 3);
6179 k++;
6181 vec_perm_indices indices (sel, 2, nelt);
6182 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6184 if (dump_enabled_p ())
6185 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6186 "shuffle of 3 fields structure is not \
6187 supported by target\n");
6188 return false;
6190 perm3_mask = vect_gen_perm_mask_checked (vectype, indices);
6192 /* Generating permutation constant to shift all elements.
6193 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
6194 for (i = 0; i < nelt; i++)
6195 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
6196 indices.new_vector (sel, 2, nelt);
6197 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6199 if (dump_enabled_p ())
6200 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6201 "shift permutation is not supported by target\n");
6202 return false;
6204 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6206 /* Generating permutation constant to shift all elements.
6207 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6208 for (i = 0; i < nelt; i++)
6209 sel[i] = 2 * (nelt / 3) + 1 + i;
6210 indices.new_vector (sel, 2, nelt);
6211 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6213 if (dump_enabled_p ())
6214 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6215 "shift permutation is not supported by target\n");
6216 return false;
6218 shift2_mask = vect_gen_perm_mask_checked (vectype, indices);
6220 /* Generating permutation constant to shift all elements.
6221 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
6222 for (i = 0; i < nelt; i++)
6223 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
6224 indices.new_vector (sel, 2, nelt);
6225 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6227 if (dump_enabled_p ())
6228 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6229 "shift permutation is not supported by target\n");
6230 return false;
6232 shift3_mask = vect_gen_perm_mask_checked (vectype, indices);
6234 /* Generating permutation constant to shift all elements.
6235 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6236 for (i = 0; i < nelt; i++)
6237 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
6238 indices.new_vector (sel, 2, nelt);
6239 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6241 if (dump_enabled_p ())
6242 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6243 "shift permutation is not supported by target\n");
6244 return false;
6246 shift4_mask = vect_gen_perm_mask_checked (vectype, indices);
6248 for (k = 0; k < 3; k++)
6250 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
6251 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6252 dr_chain[k], dr_chain[k],
6253 perm3_mask);
6254 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6255 vect[k] = data_ref;
6258 for (k = 0; k < 3; k++)
6260 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
6261 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6262 vect[k % 3], vect[(k + 1) % 3],
6263 shift1_mask);
6264 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6265 vect_shift[k] = data_ref;
6268 for (k = 0; k < 3; k++)
6270 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
6271 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6272 vect_shift[(4 - k) % 3],
6273 vect_shift[(3 - k) % 3],
6274 shift2_mask);
6275 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6276 vect[k] = data_ref;
6279 (*result_chain)[3 - (nelt % 3)] = vect[2];
6281 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
6282 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
6283 vect[0], shift3_mask);
6284 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6285 (*result_chain)[nelt % 3] = data_ref;
6287 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
6288 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
6289 vect[1], shift4_mask);
6290 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6291 (*result_chain)[0] = data_ref;
6292 return true;
6294 return false;
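
/* Editorial illustration (not part of GCC): the 2-field branch of
   vect_shift_permute_load_chain for NELT == 8 and a chain of two vectors.
   Each vector is first shuffled so its even elements land in the low half
   and its odd elements in the high half; a shift and a select of the two
   shuffled vectors then yield the even and odd streams.  The toy vec_perm
   helper and all names are hypothetical; compile it separately.  */

#include <stdio.h>

#define NELT 8

static void
vec_perm (const int *v1, const int *v2, const unsigned *sel, int *out)
{
  for (unsigned i = 0; i < NELT; i++)
    out[i] = sel[i] < NELT ? v1[sel[i]] : v2[sel[i] - NELT];
}

int
main (void)
{
  int first[NELT]  = { 0, 1, 2, 3, 4, 5, 6, 7 };
  int second[NELT] = { 8, 9, 10, 11, 12, 13, 14, 15 };

  unsigned perm2_mask1[NELT], perm2_mask2[NELT];
  unsigned shift1_mask[NELT], select_mask[NELT];
  for (unsigned i = 0; i < NELT / 2; i++)
    {
      perm2_mask1[i] = i * 2;                       /* {0 2 4 6 1 3 5 7} */
      perm2_mask1[NELT / 2 + i] = i * 2 + 1;
      perm2_mask2[i] = i * 2 + 1;                   /* {1 3 5 7 0 2 4 6} */
      perm2_mask2[NELT / 2 + i] = i * 2;
    }
  for (unsigned i = 0; i < NELT; i++)
    {
      shift1_mask[i] = NELT / 2 + i;                /* {4 5 6 7 8 9 10 11}   */
      select_mask[i] = i < NELT / 2 ? i : NELT + i; /* {0 1 2 3 12 13 14 15} */
    }

  int vect0[NELT], vect1[NELT], evens[NELT], odds[NELT];
  vec_perm (first, first, perm2_mask1, vect0);      /* {0 2 4 6 1 3 5 7}       */
  vec_perm (second, second, perm2_mask2, vect1);    /* {9 11 13 15 8 10 12 14} */
  vec_perm (vect0, vect1, shift1_mask, odds);       /* {1 3 5 7 9 11 13 15}    */
  vec_perm (vect0, vect1, select_mask, evens);      /* {0 2 4 6 8 10 12 14}    */

  for (unsigned i = 0; i < NELT; i++)
    printf ("%d %d\n", evens[i], odds[i]);
  return 0;
}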
6297 /* Function vect_transform_grouped_load.
6299 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
6300    to perform their permutation and ascribe the resulting vectorized statements to
6301 the scalar statements.
6304 void
6305 vect_transform_grouped_load (stmt_vec_info stmt_info, vec<tree> dr_chain,
6306 int size, gimple_stmt_iterator *gsi)
6308 machine_mode mode;
6309 vec<tree> result_chain = vNULL;
6311 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
6312      RESULT_CHAIN is the output of vect_permute_load_chain; it contains permuted
6313      vectors that are ready for vector computation.  */
6314 result_chain.create (size);
6316   /* If the reassociation width for the vector type is 2 or greater, the target
6317      machine can execute 2 or more vector instructions in parallel.  Otherwise
6318      try to get the chain for the load group using vect_shift_permute_load_chain.  */
6319 mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info));
6320 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
6321 || pow2p_hwi (size)
6322 || !vect_shift_permute_load_chain (dr_chain, size, stmt_info,
6323 gsi, &result_chain))
6324 vect_permute_load_chain (dr_chain, size, stmt_info, gsi, &result_chain);
6325 vect_record_grouped_load_vectors (stmt_info, result_chain);
6326 result_chain.release ();
6329 /* RESULT_CHAIN contains the output of a group of grouped loads that were
6330 generated as part of the vectorization of STMT_INFO. Assign the statement
6331 for each vector to the associated scalar statement. */
6333 void
6334 vect_record_grouped_load_vectors (stmt_vec_info stmt_info,
6335 vec<tree> result_chain)
6337 vec_info *vinfo = stmt_info->vinfo;
6338 stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
6339 unsigned int i, gap_count;
6340 tree tmp_data_ref;
6342 /* Put a permuted data-ref in the VECTORIZED_STMT field.
6343      Since we scan the chain starting from its first node, their order
6344      corresponds to the order of data-refs in RESULT_CHAIN.  */
6345 stmt_vec_info next_stmt_info = first_stmt_info;
6346 gap_count = 1;
6347 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
6349 if (!next_stmt_info)
6350 break;
6352       /* Skip the gaps.  Loads created for the gaps will be removed by the dead
6353 code elimination pass later. No need to check for the first stmt in
6354 the group, since it always exists.
6355 DR_GROUP_GAP is the number of steps in elements from the previous
6356 access (if there is no gap DR_GROUP_GAP is 1). We skip loads that
6357 correspond to the gaps. */
6358 if (next_stmt_info != first_stmt_info
6359 && gap_count < DR_GROUP_GAP (next_stmt_info))
6361 gap_count++;
6362 continue;
6365 while (next_stmt_info)
6367 stmt_vec_info new_stmt_info = vinfo->lookup_def (tmp_data_ref);
6368 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
6369 copies, and we put the new vector statement in the first available
6370 RELATED_STMT. */
6371 if (!STMT_VINFO_VEC_STMT (next_stmt_info))
6372 STMT_VINFO_VEC_STMT (next_stmt_info) = new_stmt_info;
6373 else
6375 if (!DR_GROUP_SAME_DR_STMT (next_stmt_info))
6377 stmt_vec_info prev_stmt_info
6378 = STMT_VINFO_VEC_STMT (next_stmt_info);
6379 stmt_vec_info rel_stmt_info
6380 = STMT_VINFO_RELATED_STMT (prev_stmt_info);
6381 while (rel_stmt_info)
6383 prev_stmt_info = rel_stmt_info;
6384 rel_stmt_info = STMT_VINFO_RELATED_STMT (rel_stmt_info);
6387 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info;
6391 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
6392 gap_count = 1;
6393 /* If NEXT_STMT_INFO accesses the same DR as the previous statement,
6394 put the same TMP_DATA_REF as its vectorized statement; otherwise
6395 get the next data-ref from RESULT_CHAIN. */
6396 if (!next_stmt_info || !DR_GROUP_SAME_DR_STMT (next_stmt_info))
6397 break;
6402 /* Function vect_can_force_dr_alignment_p.
6404    Returns whether the alignment of a DECL can be forced to be aligned
6405    on an ALIGNMENT-bit boundary.  */
6407 bool
6408 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
6410 if (!VAR_P (decl))
6411 return false;
6413 if (decl_in_symtab_p (decl)
6414 && !symtab_node::get (decl)->can_increase_alignment_p ())
6415 return false;
6417 if (TREE_STATIC (decl))
6418 return (alignment <= MAX_OFILE_ALIGNMENT);
6419 else
6420 return (alignment <= MAX_STACK_ALIGNMENT);
6424 /* Return whether the data reference DR is supported with respect to its
6425 alignment.
6426 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
6427    if it is aligned, i.e., check if it is possible to vectorize it with different
6428 alignment. */
6430 enum dr_alignment_support
6431 vect_supportable_dr_alignment (struct data_reference *dr,
6432 bool check_aligned_accesses)
6434 stmt_vec_info stmt_info = vect_dr_stmt (dr);
6435 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6436 machine_mode mode = TYPE_MODE (vectype);
6437 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6438 struct loop *vect_loop = NULL;
6439 bool nested_in_vect_loop = false;
6441 if (aligned_access_p (dr) && !check_aligned_accesses)
6442 return dr_aligned;
6444 /* For now assume all conditional loads/stores support unaligned
6445 access without any special code. */
6446 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
6447 if (gimple_call_internal_p (stmt)
6448 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
6449 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
6450 return dr_unaligned_supported;
6452 if (loop_vinfo)
6454 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
6455 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt_info);
6458 /* Possibly unaligned access. */
6460 /* We can choose between using the implicit realignment scheme (generating
6461 a misaligned_move stmt) and the explicit realignment scheme (generating
6462 aligned loads with a REALIGN_LOAD). There are two variants to the
6463 explicit realignment scheme: optimized, and unoptimized.
6464 We can optimize the realignment only if the step between consecutive
6465 vector loads is equal to the vector size. Since the vector memory
6466 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
6467 is guaranteed that the misalignment amount remains the same throughout the
6468 execution of the vectorized loop. Therefore, we can create the
6469 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
6470 at the loop preheader.
6472 However, in the case of outer-loop vectorization, when vectorizing a
6473 memory access in the inner-loop nested within the LOOP that is now being
6474 vectorized, while it is guaranteed that the misalignment of the
6475 vectorized memory access will remain the same in different outer-loop
6476      vectorized memory access will remain the same in different outer-loop
6477 the execution of the inner-loop. This is because the inner-loop advances
6478 with the original scalar step (and not in steps of VS). If the inner-loop
6479 step happens to be a multiple of VS, then the misalignment remains fixed
6480 and we can use the optimized realignment scheme. For example:
6482 for (i=0; i<N; i++)
6483 for (j=0; j<M; j++)
6484 s += a[i+j];
6486 When vectorizing the i-loop in the above example, the step between
6487 consecutive vector loads is 1, and so the misalignment does not remain
6488 fixed across the execution of the inner-loop, and the realignment cannot
6489 be optimized (as illustrated in the following pseudo vectorized loop):
6491 for (i=0; i<N; i+=4)
6492 for (j=0; j<M; j++){
6493 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
6494 // when j is {0,1,2,3,4,5,6,7,...} respectively.
6495 // (assuming that we start from an aligned address).
6498 We therefore have to use the unoptimized realignment scheme:
6500 for (i=0; i<N; i+=4)
6501 for (j=k; j<M; j+=4)
6502 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
6503 // that the misalignment of the initial address is
6504 // 0).
6506 The loop can then be vectorized as follows:
6508 for (k=0; k<4; k++){
6509 rt = get_realignment_token (&vp[k]);
6510 for (i=0; i<N; i+=4){
6511 v1 = vp[i+k];
6512 for (j=k; j<M; j+=4){
6513 v2 = vp[i+j+VS-1];
6514 va = REALIGN_LOAD <v1,v2,rt>;
6515 vs += va;
6516 v1 = v2;
6519 } */
6521 if (DR_IS_READ (dr))
6523 bool is_packed = false;
6524 tree type = (TREE_TYPE (DR_REF (dr)));
6526 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
6527 && (!targetm.vectorize.builtin_mask_for_load
6528 || targetm.vectorize.builtin_mask_for_load ()))
6530 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6532 /* If we are doing SLP then the accesses need not have the
6533 same alignment, instead it depends on the SLP group size. */
6534 if (loop_vinfo
6535 && STMT_SLP_TYPE (stmt_info)
6536 && !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6537 * (DR_GROUP_SIZE
6538 (DR_GROUP_FIRST_ELEMENT (stmt_info))),
6539 TYPE_VECTOR_SUBPARTS (vectype)))
6541 else if (!loop_vinfo
6542 || (nested_in_vect_loop
6543 && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
6544 GET_MODE_SIZE (TYPE_MODE (vectype)))))
6545 return dr_explicit_realign;
6546 else
6547 return dr_explicit_realign_optimized;
6549 if (!known_alignment_for_access_p (dr))
6550 is_packed = not_size_aligned (DR_REF (dr));
6552 if (targetm.vectorize.support_vector_misalignment
6553 (mode, type, DR_MISALIGNMENT (dr), is_packed))
6554 /* Can't software pipeline the loads, but can at least do them. */
6555 return dr_unaligned_supported;
6557 else
6559 bool is_packed = false;
6560 tree type = (TREE_TYPE (DR_REF (dr)));
6562 if (!known_alignment_for_access_p (dr))
6563 is_packed = not_size_aligned (DR_REF (dr));
6565 if (targetm.vectorize.support_vector_misalignment
6566 (mode, type, DR_MISALIGNMENT (dr), is_packed))
6567 return dr_unaligned_supported;
6570 /* Unsupported. */
6571 return dr_unaligned_unsupported;
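
/* Editorial illustration (not part of GCC): why the optimized realignment
   scheme does not apply when the inner-loop step is not a multiple of the
   vector size, as discussed in the comment in vect_supportable_dr_alignment.
   With a 4-element vector and the access a[i+j], the misalignment seen in
   successive inner-loop iterations cycles through 0, 1, 2, 3 instead of
   staying fixed.  Names are hypothetical; compile it separately.  */

#include <stdio.h>

#define VS 4   /* elements per vector */

int
main (void)
{
  /* Assume a[] itself is vector-aligned, so the element misalignment of
     &a[i+j] is simply (i + j) % VS.  */
  for (int i = 0; i < 8; i += VS)        /* vectorized outer loop       */
    {
      printf ("i = %d:", i);
      for (int j = 0; j < 8; j++)        /* scalar inner loop, step 1   */
	printf (" %d", (i + j) % VS);    /* misalignment of &a[i+j]     */
      printf ("\n");                     /* 0 1 2 3 0 1 2 3 each time   */
    }
  return 0;
}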