2018-06-25 Richard Biener <rguenther@suse.de>
[official-gcc.git] / gcc / tree-vect-data-refs.c
blob19ff78226b00769775db2fb2d733e3045cc34671
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2018 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "memmodel.h"
32 #include "tm_p.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "cgraph.h"
36 #include "dumpfile.h"
37 #include "alias.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "tree-eh.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop.h"
47 #include "cfgloop.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
50 #include "expr.h"
51 #include "builtins.h"
52 #include "params.h"
53 #include "tree-cfg.h"
54 #include "tree-hash-traits.h"
55 #include "vec-perm-indices.h"
56 #include "internal-fn.h"
58 /* Return true if load- or store-lanes optab OPTAB is implemented for
59 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
61 static bool
62 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
63 tree vectype, unsigned HOST_WIDE_INT count)
65 machine_mode mode, array_mode;
66 bool limit_p;
68 mode = TYPE_MODE (vectype);
69 if (!targetm.array_mode (mode, count).exists (&array_mode))
71 poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
72 limit_p = !targetm.array_mode_supported_p (mode, count);
73 if (!int_mode_for_size (bits, limit_p).exists (&array_mode))
75 if (dump_enabled_p ())
76 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
77 "no array mode for %s["
78 HOST_WIDE_INT_PRINT_DEC "]\n",
79 GET_MODE_NAME (mode), count);
80 return false;
84 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
86 if (dump_enabled_p ())
87 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
88 "cannot use %s<%s><%s>\n", name,
89 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
90 return false;
93 if (dump_enabled_p ())
94 dump_printf_loc (MSG_NOTE, vect_location,
95 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
96 GET_MODE_NAME (mode));
98 return true;
102 /* Return the smallest scalar part of STMT.
103 This is used to determine the vectype of the stmt. We generally set the
104 vectype according to the type of the result (lhs). For stmts whose
105 result-type is different than the type of the arguments (e.g., demotion,
106 promotion), vectype will be reset appropriately (later). Note that we have
107 to visit the smallest datatype in this function, because that determines the
108 VF. If the smallest datatype in the loop is present only as the rhs of a
109 promotion operation - we'd miss it.
110 Such a case, where a variable of this datatype does not appear in the lhs
111 anywhere in the loop, can only occur if it's an invariant: e.g.:
112 'int_x = (int) short_inv', which we'd expect to have been optimized away by
113 invariant motion. However, we cannot rely on invariant motion to always
114 take invariants out of the loop, and so in the case of promotion we also
115 have to check the rhs.
116 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
117 types. */
119 tree
120 vect_get_smallest_scalar_type (gimple *stmt, HOST_WIDE_INT *lhs_size_unit,
121 HOST_WIDE_INT *rhs_size_unit)
123 tree scalar_type = gimple_expr_type (stmt);
124 HOST_WIDE_INT lhs, rhs;
126 /* During the analysis phase, this function is called on arbitrary
127 statements that might not have scalar results. */
128 if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
129 return scalar_type;
131 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
133 if (is_gimple_assign (stmt)
134 && (gimple_assign_cast_p (stmt)
135 || gimple_assign_rhs_code (stmt) == DOT_PROD_EXPR
136 || gimple_assign_rhs_code (stmt) == WIDEN_SUM_EXPR
137 || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
138 || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
139 || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
141 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
143 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
144 if (rhs < lhs)
145 scalar_type = rhs_type;
148 *lhs_size_unit = lhs;
149 *rhs_size_unit = rhs;
150 return scalar_type;
154 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
155 tested at run-time. Return TRUE if DDR was successfully inserted.
156 Return false if versioning is not supported. */
158 static bool
159 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
161 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
163 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
164 return false;
166 if (!runtime_alias_check_p (ddr, loop,
167 optimize_loop_nest_for_speed_p (loop)))
168 return false;
170 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
171 return true;
174 /* Record that loop LOOP_VINFO needs to check that VALUE is nonzero. */
176 static void
177 vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value)
179 vec<tree> checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo);
180 for (unsigned int i = 0; i < checks.length(); ++i)
181 if (checks[i] == value)
182 return;
184 if (dump_enabled_p ())
186 dump_printf_loc (MSG_NOTE, vect_location, "need run-time check that ");
187 dump_generic_expr (MSG_NOTE, TDF_SLIM, value);
188 dump_printf (MSG_NOTE, " is nonzero\n");
190 LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value);
193 /* Return true if we know that the order of vectorized STMT_A and
194 vectorized STMT_B will be the same as the order of STMT_A and STMT_B.
195 At least one of the statements is a write. */
197 static bool
198 vect_preserves_scalar_order_p (gimple *stmt_a, gimple *stmt_b)
200 stmt_vec_info stmtinfo_a = vinfo_for_stmt (stmt_a);
201 stmt_vec_info stmtinfo_b = vinfo_for_stmt (stmt_b);
203 /* Single statements are always kept in their original order. */
204 if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
205 && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
206 return true;
208 /* STMT_A and STMT_B belong to overlapping groups. All loads in a
209 group are emitted at the position of the first scalar load and all
210 stores in a group are emitted at the position of the last scalar store.
211 Thus writes will happen no earlier than their current position
212 (but could happen later) while reads will happen no later than their
213 current position (but could happen earlier). Reordering is therefore
214 only possible if the first access is a write. */
215 if (is_pattern_stmt_p (stmtinfo_a))
216 stmt_a = STMT_VINFO_RELATED_STMT (stmtinfo_a);
217 if (is_pattern_stmt_p (stmtinfo_b))
218 stmt_b = STMT_VINFO_RELATED_STMT (stmtinfo_b);
219 gimple *earlier_stmt = get_earlier_stmt (stmt_a, stmt_b);
220 return !DR_IS_WRITE (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt)));
223 /* A subroutine of vect_analyze_data_ref_dependence. Handle
224 DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
225 distances. These distances are conservatively correct but they don't
226 reflect a guaranteed dependence.
228 Return true if this function does all the work necessary to avoid
229 an alias or false if the caller should use the dependence distances
230 to limit the vectorization factor in the usual way. LOOP_DEPTH is
231 the depth of the loop described by LOOP_VINFO and the other arguments
232 are as for vect_analyze_data_ref_dependence. */
234 static bool
235 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
236 loop_vec_info loop_vinfo,
237 int loop_depth, unsigned int *max_vf)
239 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
240 lambda_vector dist_v;
241 unsigned int i;
242 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
244 int dist = dist_v[loop_depth];
245 if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
247 /* If the user asserted safelen >= DIST consecutive iterations
248 can be executed concurrently, assume independence.
250 ??? An alternative would be to add the alias check even
251 in this case, and vectorize the fallback loop with the
252 maximum VF set to safelen. However, if the user has
253 explicitly given a length, it's less likely that that
254 would be a win. */
255 if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
257 if ((unsigned int) loop->safelen < *max_vf)
258 *max_vf = loop->safelen;
259 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
260 continue;
263 /* For dependence distances of 2 or more, we have the option
264 of limiting VF or checking for an alias at runtime.
265 Prefer to check at runtime if we can, to avoid limiting
266 the VF unnecessarily when the bases are in fact independent.
268 Note that the alias checks will be removed if the VF ends up
269 being small enough. */
270 return (!STMT_VINFO_GATHER_SCATTER_P
271 (vinfo_for_stmt (DR_STMT (DDR_A (ddr))))
272 && !STMT_VINFO_GATHER_SCATTER_P
273 (vinfo_for_stmt (DR_STMT (DDR_B (ddr))))
274 && vect_mark_for_runtime_alias_test (ddr, loop_vinfo));
277 return true;
281 /* Function vect_analyze_data_ref_dependence.
283 Return TRUE if there (might) exist a dependence between a memory-reference
284 DRA and a memory-reference DRB. When versioning for alias may check a
285 dependence at run-time, return FALSE. Adjust *MAX_VF according to
286 the data dependence. */
288 static bool
289 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
290 loop_vec_info loop_vinfo,
291 unsigned int *max_vf)
293 unsigned int i;
294 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
295 struct data_reference *dra = DDR_A (ddr);
296 struct data_reference *drb = DDR_B (ddr);
297 stmt_vec_info stmtinfo_a = vinfo_for_stmt (vect_dr_stmt (dra));
298 stmt_vec_info stmtinfo_b = vinfo_for_stmt (vect_dr_stmt (drb));
299 lambda_vector dist_v;
300 unsigned int loop_depth;
302 /* In loop analysis all data references should be vectorizable. */
303 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
304 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
305 gcc_unreachable ();
307 /* Independent data accesses. */
308 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
309 return false;
311 if (dra == drb
312 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
313 return false;
315 /* We do not have to consider dependences between accesses that belong
316 to the same group, unless the stride could be smaller than the
317 group size. */
318 if (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
319 && (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
320 == DR_GROUP_FIRST_ELEMENT (stmtinfo_b))
321 && !STMT_VINFO_STRIDED_P (stmtinfo_a))
322 return false;
324 /* Even if we have an anti-dependence then, as the vectorized loop covers at
325 least two scalar iterations, there is always also a true dependence.
326 As the vectorizer does not re-order loads and stores we can ignore
327 the anti-dependence if TBAA can disambiguate both DRs similar to the
328 case with known negative distance anti-dependences (positive
329 distance anti-dependences would violate TBAA constraints). */
330 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
331 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
332 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
333 get_alias_set (DR_REF (drb))))
334 return false;
336 /* Unknown data dependence. */
337 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
339 /* If user asserted safelen consecutive iterations can be
340 executed concurrently, assume independence. */
341 if (loop->safelen >= 2)
343 if ((unsigned int) loop->safelen < *max_vf)
344 *max_vf = loop->safelen;
345 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
346 return false;
349 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
350 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
352 if (dump_enabled_p ())
354 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
355 "versioning for alias not supported for: "
356 "can't determine dependence between ");
357 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
358 DR_REF (dra));
359 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
360 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
361 DR_REF (drb));
362 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
364 return true;
367 if (dump_enabled_p ())
369 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
370 "versioning for alias required: "
371 "can't determine dependence between ");
372 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
373 DR_REF (dra));
374 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
375 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
376 DR_REF (drb));
377 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
380 /* Add to list of ddrs that need to be tested at run-time. */
381 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
384 /* Known data dependence. */
385 if (DDR_NUM_DIST_VECTS (ddr) == 0)
387 /* If user asserted safelen consecutive iterations can be
388 executed concurrently, assume independence. */
389 if (loop->safelen >= 2)
391 if ((unsigned int) loop->safelen < *max_vf)
392 *max_vf = loop->safelen;
393 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
394 return false;
397 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
398 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
400 if (dump_enabled_p ())
402 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
403 "versioning for alias not supported for: "
404 "bad dist vector for ");
405 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
406 DR_REF (dra));
407 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
408 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
409 DR_REF (drb));
410 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
412 return true;
415 if (dump_enabled_p ())
417 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
418 "versioning for alias required: "
419 "bad dist vector for ");
420 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
421 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
422 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
423 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
425 /* Add to list of ddrs that need to be tested at run-time. */
426 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
429 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
431 if (DDR_COULD_BE_INDEPENDENT_P (ddr)
432 && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
433 loop_depth, max_vf))
434 return false;
436 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
438 int dist = dist_v[loop_depth];
440 if (dump_enabled_p ())
441 dump_printf_loc (MSG_NOTE, vect_location,
442 "dependence distance = %d.\n", dist);
444 if (dist == 0)
446 if (dump_enabled_p ())
448 dump_printf_loc (MSG_NOTE, vect_location,
449 "dependence distance == 0 between ");
450 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
451 dump_printf (MSG_NOTE, " and ");
452 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
453 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
456 /* When we perform grouped accesses and perform implicit CSE
457 by detecting equal accesses and doing disambiguation with
458 runtime alias tests like for
459 .. = a[i];
460 .. = a[i+1];
461 a[i] = ..;
462 a[i+1] = ..;
463 *p = ..;
464 .. = a[i];
465 .. = a[i+1];
466 where we will end up loading { a[i], a[i+1] } once, make
467 sure that inserting group loads before the first load and
468 stores after the last store will do the right thing.
469 Similar for groups like
470 a[i] = ...;
471 ... = a[i];
472 a[i+1] = ...;
473 where loads from the group interleave with the store. */
474 if (!vect_preserves_scalar_order_p (vect_dr_stmt(dra),
475 vect_dr_stmt (drb)))
477 if (dump_enabled_p ())
478 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
479 "READ_WRITE dependence in interleaving.\n");
480 return true;
483 if (loop->safelen < 2)
485 tree indicator = dr_zero_step_indicator (dra);
486 if (!indicator || integer_zerop (indicator))
488 if (dump_enabled_p ())
489 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
490 "access also has a zero step\n");
491 return true;
493 else if (TREE_CODE (indicator) != INTEGER_CST)
494 vect_check_nonzero_value (loop_vinfo, indicator);
496 continue;
499 if (dist > 0 && DDR_REVERSED_P (ddr))
501 /* If DDR_REVERSED_P the order of the data-refs in DDR was
502 reversed (to make distance vector positive), and the actual
503 distance is negative. */
504 if (dump_enabled_p ())
505 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
506 "dependence distance negative.\n");
507 /* Record a negative dependence distance to later limit the
508 amount of stmt copying / unrolling we can perform.
509 Only need to handle read-after-write dependence. */
510 if (DR_IS_READ (drb)
511 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
512 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
513 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
514 continue;
517 unsigned int abs_dist = abs (dist);
518 if (abs_dist >= 2 && abs_dist < *max_vf)
520 /* The dependence distance requires reduction of the maximal
521 vectorization factor. */
522 *max_vf = abs (dist);
523 if (dump_enabled_p ())
524 dump_printf_loc (MSG_NOTE, vect_location,
525 "adjusting maximal vectorization factor to %i\n",
526 *max_vf);
529 if (abs_dist >= *max_vf)
531 /* Dependence distance does not create dependence, as far as
532 vectorization is concerned, in this case. */
533 if (dump_enabled_p ())
534 dump_printf_loc (MSG_NOTE, vect_location,
535 "dependence distance >= VF.\n");
536 continue;
539 if (dump_enabled_p ())
541 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
542 "not vectorized, possible dependence "
543 "between data-refs ");
544 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
545 dump_printf (MSG_NOTE, " and ");
546 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
547 dump_printf (MSG_NOTE, "\n");
550 return true;
553 return false;
556 /* Function vect_analyze_data_ref_dependences.
558 Examine all the data references in the loop, and make sure there do not
559 exist any data dependences between them. Set *MAX_VF according to
560 the maximum vectorization factor the data dependences allow. */
562 bool
563 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
564 unsigned int *max_vf)
566 unsigned int i;
567 struct data_dependence_relation *ddr;
569 DUMP_VECT_SCOPE ("vect_analyze_data_ref_dependences");
571 LOOP_VINFO_DDRS (loop_vinfo)
572 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
573 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
574 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
575 /* We need read-read dependences to compute STMT_VINFO_SAME_ALIGN_REFS. */
576 if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
577 &LOOP_VINFO_DDRS (loop_vinfo),
578 LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
579 return false;
581 /* For epilogues we either have no aliases or alias versioning
582 was applied to original loop. Therefore we may just get max_vf
583 using VF of original loop. */
584 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
585 *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
586 else
587 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
588 if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
589 return false;
591 return true;
595 /* Function vect_slp_analyze_data_ref_dependence.
597 Return TRUE if there (might) exist a dependence between a memory-reference
598 DRA and a memory-reference DRB. When versioning for alias may check a
599 dependence at run-time, return FALSE. Adjust *MAX_VF according to
600 the data dependence. */
602 static bool
603 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
605 struct data_reference *dra = DDR_A (ddr);
606 struct data_reference *drb = DDR_B (ddr);
608 /* We need to check dependences of statements marked as unvectorizable
609 as well, they still can prohibit vectorization. */
611 /* Independent data accesses. */
612 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
613 return false;
615 if (dra == drb)
616 return false;
618 /* Read-read is OK. */
619 if (DR_IS_READ (dra) && DR_IS_READ (drb))
620 return false;
622 /* If dra and drb are part of the same interleaving chain consider
623 them independent. */
624 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (vect_dr_stmt (dra)))
625 && (DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (vect_dr_stmt (dra)))
626 == DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (vect_dr_stmt (drb)))))
627 return false;
629 /* Unknown data dependence. */
630 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
632 if (dump_enabled_p ())
634 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
635 "can't determine dependence between ");
636 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
637 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
638 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
639 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
642 else if (dump_enabled_p ())
644 dump_printf_loc (MSG_NOTE, vect_location,
645 "determined dependence between ");
646 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
647 dump_printf (MSG_NOTE, " and ");
648 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
649 dump_printf (MSG_NOTE, "\n");
652 return true;
656 /* Analyze dependences involved in the transform of SLP NODE. STORES
657 contain the vector of scalar stores of this instance if we are
658 disambiguating the loads. */
660 static bool
661 vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
662 vec<gimple *> stores, gimple *last_store)
664 /* This walks over all stmts involved in the SLP load/store done
665 in NODE verifying we can sink them up to the last stmt in the
666 group. */
667 gimple *last_access = vect_find_last_scalar_stmt_in_slp (node);
668 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
670 gimple *access = SLP_TREE_SCALAR_STMTS (node)[k];
671 if (access == last_access)
672 continue;
673 data_reference *dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (access));
674 ao_ref ref;
675 bool ref_initialized_p = false;
676 for (gimple_stmt_iterator gsi = gsi_for_stmt (access);
677 gsi_stmt (gsi) != last_access; gsi_next (&gsi))
679 gimple *stmt = gsi_stmt (gsi);
680 if (! gimple_vuse (stmt)
681 || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
682 continue;
684 /* If we couldn't record a (single) data reference for this
685 stmt we have to resort to the alias oracle. */
686 data_reference *dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
687 if (!dr_b)
689 /* We are moving a store or sinking a load - this means
690 we cannot use TBAA for disambiguation. */
691 if (!ref_initialized_p)
692 ao_ref_init (&ref, DR_REF (dr_a));
693 if (stmt_may_clobber_ref_p_1 (stmt, &ref, false)
694 || ref_maybe_used_by_stmt_p (stmt, &ref, false))
695 return false;
696 continue;
699 bool dependent = false;
700 /* If we run into a store of this same instance (we've just
701 marked those) then delay dependence checking until we run
702 into the last store because this is where it will have
703 been sunk to (and we verify if we can do that as well). */
704 if (gimple_visited_p (stmt))
706 if (stmt != last_store)
707 continue;
708 unsigned i;
709 gimple *store;
710 FOR_EACH_VEC_ELT (stores, i, store)
712 data_reference *store_dr
713 = STMT_VINFO_DATA_REF (vinfo_for_stmt (store));
714 ddr_p ddr = initialize_data_dependence_relation
715 (dr_a, store_dr, vNULL);
716 dependent = vect_slp_analyze_data_ref_dependence (ddr);
717 free_dependence_relation (ddr);
718 if (dependent)
719 break;
722 else
724 ddr_p ddr = initialize_data_dependence_relation (dr_a,
725 dr_b, vNULL);
726 dependent = vect_slp_analyze_data_ref_dependence (ddr);
727 free_dependence_relation (ddr);
729 if (dependent)
730 return false;
733 return true;
737 /* Function vect_analyze_data_ref_dependences.
739 Examine all the data references in the basic-block, and make sure there
740 do not exist any data dependences between them. Set *MAX_VF according to
741 the maximum vectorization factor the data dependences allow. */
743 bool
744 vect_slp_analyze_instance_dependence (slp_instance instance)
746 DUMP_VECT_SCOPE ("vect_slp_analyze_instance_dependence");
748 /* The stores of this instance are at the root of the SLP tree. */
749 slp_tree store = SLP_INSTANCE_TREE (instance);
750 if (! STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (store)[0])))
751 store = NULL;
753 /* Verify we can sink stores to the vectorized stmt insert location. */
754 gimple *last_store = NULL;
755 if (store)
757 if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
758 return false;
760 /* Mark stores in this instance and remember the last one. */
761 last_store = vect_find_last_scalar_stmt_in_slp (store);
762 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
763 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], true);
766 bool res = true;
768 /* Verify we can sink loads to the vectorized stmt insert location,
769 special-casing stores of this instance. */
770 slp_tree load;
771 unsigned int i;
772 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
773 if (! vect_slp_analyze_node_dependences (instance, load,
774 store
775 ? SLP_TREE_SCALAR_STMTS (store)
776 : vNULL, last_store))
778 res = false;
779 break;
782 /* Unset the visited flag. */
783 if (store)
784 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
785 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], false);
787 return res;
790 /* Record in VINFO the base alignment guarantee given by DRB. STMT is
791 the statement that contains DRB, which is useful for recording in the
792 dump file. */
794 static void
795 vect_record_base_alignment (vec_info *vinfo, gimple *stmt,
796 innermost_loop_behavior *drb)
798 bool existed;
799 innermost_loop_behavior *&entry
800 = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
801 if (!existed || entry->base_alignment < drb->base_alignment)
803 entry = drb;
804 if (dump_enabled_p ())
806 dump_printf_loc (MSG_NOTE, vect_location,
807 "recording new base alignment for ");
808 dump_generic_expr (MSG_NOTE, TDF_SLIM, drb->base_address);
809 dump_printf (MSG_NOTE, "\n");
810 dump_printf_loc (MSG_NOTE, vect_location,
811 " alignment: %d\n", drb->base_alignment);
812 dump_printf_loc (MSG_NOTE, vect_location,
813 " misalignment: %d\n", drb->base_misalignment);
814 dump_printf_loc (MSG_NOTE, vect_location,
815 " based on: ");
816 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
821 /* If the region we're going to vectorize is reached, all unconditional
822 data references occur at least once. We can therefore pool the base
823 alignment guarantees from each unconditional reference. Do this by
824 going through all the data references in VINFO and checking whether
825 the containing statement makes the reference unconditionally. If so,
826 record the alignment of the base address in VINFO so that it can be
827 used for all other references with the same base. */
829 void
830 vect_record_base_alignments (vec_info *vinfo)
832 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
833 struct loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
834 data_reference *dr;
835 unsigned int i;
836 FOR_EACH_VEC_ELT (vinfo->datarefs, i, dr)
838 gimple *stmt = vect_dr_stmt (dr);
839 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
840 if (!DR_IS_CONDITIONAL_IN_STMT (dr)
841 && STMT_VINFO_VECTORIZABLE (stmt_info)
842 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
844 vect_record_base_alignment (vinfo, stmt, &DR_INNERMOST (dr));
846 /* If DR is nested in the loop that is being vectorized, we can also
847 record the alignment of the base wrt the outer loop. */
848 if (loop && nested_in_vect_loop_p (loop, stmt))
849 vect_record_base_alignment
850 (vinfo, stmt, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
855 /* Return the target alignment for the vectorized form of DR. */
857 static unsigned int
858 vect_calculate_target_alignment (struct data_reference *dr)
860 gimple *stmt = vect_dr_stmt (dr);
861 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
862 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
863 return targetm.vectorize.preferred_vector_alignment (vectype);
866 /* Function vect_compute_data_ref_alignment
868 Compute the misalignment of the data reference DR.
870 Output:
871 1. DR_MISALIGNMENT (DR) is defined.
873 FOR NOW: No analysis is actually performed. Misalignment is calculated
874 only for trivial cases. TODO. */
876 static void
877 vect_compute_data_ref_alignment (struct data_reference *dr)
879 gimple *stmt = vect_dr_stmt (dr);
880 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
881 vec_base_alignments *base_alignments = &stmt_info->vinfo->base_alignments;
882 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
883 struct loop *loop = NULL;
884 tree ref = DR_REF (dr);
885 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
887 if (dump_enabled_p ())
888 dump_printf_loc (MSG_NOTE, vect_location,
889 "vect_compute_data_ref_alignment:\n");
891 if (loop_vinfo)
892 loop = LOOP_VINFO_LOOP (loop_vinfo);
894 /* Initialize misalignment to unknown. */
895 SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
897 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
898 return;
900 innermost_loop_behavior *drb = vect_dr_behavior (dr);
901 bool step_preserves_misalignment_p;
903 unsigned HOST_WIDE_INT vector_alignment
904 = vect_calculate_target_alignment (dr) / BITS_PER_UNIT;
905 DR_TARGET_ALIGNMENT (dr) = vector_alignment;
907 /* No step for BB vectorization. */
908 if (!loop)
910 gcc_assert (integer_zerop (drb->step));
911 step_preserves_misalignment_p = true;
914 /* In case the dataref is in an inner-loop of the loop that is being
915 vectorized (LOOP), we use the base and misalignment information
916 relative to the outer-loop (LOOP). This is ok only if the misalignment
917 stays the same throughout the execution of the inner-loop, which is why
918 we have to check that the stride of the dataref in the inner-loop evenly
919 divides by the vector alignment. */
920 else if (nested_in_vect_loop_p (loop, stmt))
922 step_preserves_misalignment_p
923 = (DR_STEP_ALIGNMENT (dr) % vector_alignment) == 0;
925 if (dump_enabled_p ())
927 if (step_preserves_misalignment_p)
928 dump_printf_loc (MSG_NOTE, vect_location,
929 "inner step divides the vector alignment.\n");
930 else
931 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
932 "inner step doesn't divide the vector"
933 " alignment.\n");
937 /* Similarly we can only use base and misalignment information relative to
938 an innermost loop if the misalignment stays the same throughout the
939 execution of the loop. As above, this is the case if the stride of
940 the dataref evenly divides by the alignment. */
941 else
943 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
944 step_preserves_misalignment_p
945 = multiple_p (DR_STEP_ALIGNMENT (dr) * vf, vector_alignment);
947 if (!step_preserves_misalignment_p && dump_enabled_p ())
948 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
949 "step doesn't divide the vector alignment.\n");
952 unsigned int base_alignment = drb->base_alignment;
953 unsigned int base_misalignment = drb->base_misalignment;
955 /* Calculate the maximum of the pooled base address alignment and the
956 alignment that we can compute for DR itself. */
957 innermost_loop_behavior **entry = base_alignments->get (drb->base_address);
958 if (entry && base_alignment < (*entry)->base_alignment)
960 base_alignment = (*entry)->base_alignment;
961 base_misalignment = (*entry)->base_misalignment;
964 if (drb->offset_alignment < vector_alignment
965 || !step_preserves_misalignment_p
966 /* We need to know whether the step wrt the vectorized loop is
967 negative when computing the starting misalignment below. */
968 || TREE_CODE (drb->step) != INTEGER_CST)
970 if (dump_enabled_p ())
972 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
973 "Unknown alignment for access: ");
974 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
975 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
977 return;
980 if (base_alignment < vector_alignment)
982 unsigned int max_alignment;
983 tree base = get_base_for_alignment (drb->base_address, &max_alignment);
984 if (max_alignment < vector_alignment
985 || !vect_can_force_dr_alignment_p (base,
986 vector_alignment * BITS_PER_UNIT))
988 if (dump_enabled_p ())
990 dump_printf_loc (MSG_NOTE, vect_location,
991 "can't force alignment of ref: ");
992 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
993 dump_printf (MSG_NOTE, "\n");
995 return;
998 /* Force the alignment of the decl.
999 NOTE: This is the only change to the code we make during
1000 the analysis phase, before deciding to vectorize the loop. */
1001 if (dump_enabled_p ())
1003 dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
1004 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
1005 dump_printf (MSG_NOTE, "\n");
1008 DR_VECT_AUX (dr)->base_decl = base;
1009 DR_VECT_AUX (dr)->base_misaligned = true;
1010 base_misalignment = 0;
1012 poly_int64 misalignment
1013 = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();
1015 /* If this is a backward running DR then first access in the larger
1016 vectype actually is N-1 elements before the address in the DR.
1017 Adjust misalign accordingly. */
1018 if (tree_int_cst_sgn (drb->step) < 0)
1019 /* PLUS because STEP is negative. */
1020 misalignment += ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1021 * TREE_INT_CST_LOW (drb->step));
1023 unsigned int const_misalignment;
1024 if (!known_misalignment (misalignment, vector_alignment,
1025 &const_misalignment))
1027 if (dump_enabled_p ())
1029 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1030 "Non-constant misalignment for access: ");
1031 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
1032 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1034 return;
1037 SET_DR_MISALIGNMENT (dr, const_misalignment);
1039 if (dump_enabled_p ())
1041 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1042 "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
1043 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
1044 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1047 return;
1050 /* Function vect_update_misalignment_for_peel.
1051 Sets DR's misalignment
1052 - to 0 if it has the same alignment as DR_PEEL,
1053 - to the misalignment computed using NPEEL if DR's salignment is known,
1054 - to -1 (unknown) otherwise.
1056 DR - the data reference whose misalignment is to be adjusted.
1057 DR_PEEL - the data reference whose misalignment is being made
1058 zero in the vector loop by the peel.
1059 NPEEL - the number of iterations in the peel loop if the misalignment
1060 of DR_PEEL is known at compile time. */
1062 static void
1063 vect_update_misalignment_for_peel (struct data_reference *dr,
1064 struct data_reference *dr_peel, int npeel)
1066 unsigned int i;
1067 vec<dr_p> same_aligned_drs;
1068 struct data_reference *current_dr;
1069 int dr_size = vect_get_scalar_dr_size (dr);
1070 int dr_peel_size = vect_get_scalar_dr_size (dr_peel);
1071 stmt_vec_info stmt_info = vinfo_for_stmt (vect_dr_stmt (dr));
1072 stmt_vec_info peel_stmt_info = vinfo_for_stmt (vect_dr_stmt (dr_peel));
1074 /* For interleaved data accesses the step in the loop must be multiplied by
1075 the size of the interleaving group. */
1076 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1077 dr_size *= DR_GROUP_SIZE (vinfo_for_stmt (DR_GROUP_FIRST_ELEMENT (stmt_info)));
1078 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
1079 dr_peel_size *= DR_GROUP_SIZE (peel_stmt_info);
1081 /* It can be assumed that the data refs with the same alignment as dr_peel
1082 are aligned in the vector loop. */
1083 same_aligned_drs
1084 = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (vect_dr_stmt (dr_peel)));
1085 FOR_EACH_VEC_ELT (same_aligned_drs, i, current_dr)
1087 if (current_dr != dr)
1088 continue;
1089 gcc_assert (!known_alignment_for_access_p (dr)
1090 || !known_alignment_for_access_p (dr_peel)
1091 || (DR_MISALIGNMENT (dr) / dr_size
1092 == DR_MISALIGNMENT (dr_peel) / dr_peel_size));
1093 SET_DR_MISALIGNMENT (dr, 0);
1094 return;
1097 if (known_alignment_for_access_p (dr)
1098 && known_alignment_for_access_p (dr_peel))
1100 bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
1101 int misal = DR_MISALIGNMENT (dr);
1102 misal += negative ? -npeel * dr_size : npeel * dr_size;
1103 misal &= DR_TARGET_ALIGNMENT (dr) - 1;
1104 SET_DR_MISALIGNMENT (dr, misal);
1105 return;
1108 if (dump_enabled_p ())
1109 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1110 "to unknown (-1).\n");
1111 SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
1115 /* Function verify_data_ref_alignment
1117 Return TRUE if DR can be handled with respect to alignment. */
1119 static bool
1120 verify_data_ref_alignment (data_reference_p dr)
1122 enum dr_alignment_support supportable_dr_alignment
1123 = vect_supportable_dr_alignment (dr, false);
1124 if (!supportable_dr_alignment)
1126 if (dump_enabled_p ())
1128 if (DR_IS_READ (dr))
1129 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1130 "not vectorized: unsupported unaligned load.");
1131 else
1132 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1133 "not vectorized: unsupported unaligned "
1134 "store.");
1136 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
1137 DR_REF (dr));
1138 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1140 return false;
1143 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
1144 dump_printf_loc (MSG_NOTE, vect_location,
1145 "Vectorizing an unaligned access.\n");
1147 return true;
1150 /* Function vect_verify_datarefs_alignment
1152 Return TRUE if all data references in the loop can be
1153 handled with respect to alignment. */
1155 bool
1156 vect_verify_datarefs_alignment (loop_vec_info vinfo)
1158 vec<data_reference_p> datarefs = vinfo->datarefs;
1159 struct data_reference *dr;
1160 unsigned int i;
1162 FOR_EACH_VEC_ELT (datarefs, i, dr)
1164 gimple *stmt = vect_dr_stmt (dr);
1165 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1167 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1168 continue;
1170 /* For interleaving, only the alignment of the first access matters. */
1171 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1172 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1173 continue;
1175 /* Strided accesses perform only component accesses, alignment is
1176 irrelevant for them. */
1177 if (STMT_VINFO_STRIDED_P (stmt_info)
1178 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1179 continue;
1181 if (! verify_data_ref_alignment (dr))
1182 return false;
1185 return true;
1188 /* Given an memory reference EXP return whether its alignment is less
1189 than its size. */
1191 static bool
1192 not_size_aligned (tree exp)
1194 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1195 return true;
1197 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1198 > get_object_alignment (exp));
1201 /* Function vector_alignment_reachable_p
1203 Return true if vector alignment for DR is reachable by peeling
1204 a few loop iterations. Return false otherwise. */
1206 static bool
1207 vector_alignment_reachable_p (struct data_reference *dr)
1209 gimple *stmt = vect_dr_stmt (dr);
1210 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1211 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1213 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1215 /* For interleaved access we peel only if number of iterations in
1216 the prolog loop ({VF - misalignment}), is a multiple of the
1217 number of the interleaved accesses. */
1218 int elem_size, mis_in_elements;
1220 /* FORNOW: handle only known alignment. */
1221 if (!known_alignment_for_access_p (dr))
1222 return false;
1224 poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1225 poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
1226 elem_size = vector_element_size (vector_size, nelements);
1227 mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
1229 if (!multiple_p (nelements - mis_in_elements, DR_GROUP_SIZE (stmt_info)))
1230 return false;
1233 /* If misalignment is known at the compile time then allow peeling
1234 only if natural alignment is reachable through peeling. */
1235 if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
1237 HOST_WIDE_INT elmsize =
1238 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1239 if (dump_enabled_p ())
1241 dump_printf_loc (MSG_NOTE, vect_location,
1242 "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1243 dump_printf (MSG_NOTE,
1244 ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
1246 if (DR_MISALIGNMENT (dr) % elmsize)
1248 if (dump_enabled_p ())
1249 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1250 "data size does not divide the misalignment.\n");
1251 return false;
1255 if (!known_alignment_for_access_p (dr))
1257 tree type = TREE_TYPE (DR_REF (dr));
1258 bool is_packed = not_size_aligned (DR_REF (dr));
1259 if (dump_enabled_p ())
1260 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1261 "Unknown misalignment, %snaturally aligned\n",
1262 is_packed ? "not " : "");
1263 return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1266 return true;
1270 /* Calculate the cost of the memory access represented by DR. */
1272 static void
1273 vect_get_data_access_cost (struct data_reference *dr,
1274 unsigned int *inside_cost,
1275 unsigned int *outside_cost,
1276 stmt_vector_for_cost *body_cost_vec,
1277 stmt_vector_for_cost *prologue_cost_vec)
1279 gimple *stmt = vect_dr_stmt (dr);
1280 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1281 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1282 int ncopies;
1284 if (PURE_SLP_STMT (stmt_info))
1285 ncopies = 1;
1286 else
1287 ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1289 if (DR_IS_READ (dr))
1290 vect_get_load_cost (stmt_info, ncopies, true, inside_cost, outside_cost,
1291 prologue_cost_vec, body_cost_vec, false);
1292 else
1293 vect_get_store_cost (stmt_info, ncopies, inside_cost, body_cost_vec);
1295 if (dump_enabled_p ())
1296 dump_printf_loc (MSG_NOTE, vect_location,
1297 "vect_get_data_access_cost: inside_cost = %d, "
1298 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1302 typedef struct _vect_peel_info
1304 struct data_reference *dr;
1305 int npeel;
1306 unsigned int count;
1307 } *vect_peel_info;
1309 typedef struct _vect_peel_extended_info
1311 struct _vect_peel_info peel_info;
1312 unsigned int inside_cost;
1313 unsigned int outside_cost;
1314 } *vect_peel_extended_info;
1317 /* Peeling hashtable helpers. */
1319 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1321 static inline hashval_t hash (const _vect_peel_info *);
1322 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1325 inline hashval_t
1326 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1328 return (hashval_t) peel_info->npeel;
1331 inline bool
1332 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1334 return (a->npeel == b->npeel);
1338 /* Insert DR into peeling hash table with NPEEL as key. */
1340 static void
1341 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1342 loop_vec_info loop_vinfo, struct data_reference *dr,
1343 int npeel)
1345 struct _vect_peel_info elem, *slot;
1346 _vect_peel_info **new_slot;
1347 bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1349 elem.npeel = npeel;
1350 slot = peeling_htab->find (&elem);
1351 if (slot)
1352 slot->count++;
1353 else
1355 slot = XNEW (struct _vect_peel_info);
1356 slot->npeel = npeel;
1357 slot->dr = dr;
1358 slot->count = 1;
1359 new_slot = peeling_htab->find_slot (slot, INSERT);
1360 *new_slot = slot;
1363 if (!supportable_dr_alignment
1364 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1365 slot->count += VECT_MAX_COST;
1369 /* Traverse peeling hash table to find peeling option that aligns maximum
1370 number of data accesses. */
1373 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1374 _vect_peel_extended_info *max)
1376 vect_peel_info elem = *slot;
1378 if (elem->count > max->peel_info.count
1379 || (elem->count == max->peel_info.count
1380 && max->peel_info.npeel > elem->npeel))
1382 max->peel_info.npeel = elem->npeel;
1383 max->peel_info.count = elem->count;
1384 max->peel_info.dr = elem->dr;
1387 return 1;
1390 /* Get the costs of peeling NPEEL iterations checking data access costs
1391 for all data refs. If UNKNOWN_MISALIGNMENT is true, we assume DR0's
1392 misalignment will be zero after peeling. */
1394 static void
1395 vect_get_peeling_costs_all_drs (vec<data_reference_p> datarefs,
1396 struct data_reference *dr0,
1397 unsigned int *inside_cost,
1398 unsigned int *outside_cost,
1399 stmt_vector_for_cost *body_cost_vec,
1400 stmt_vector_for_cost *prologue_cost_vec,
1401 unsigned int npeel,
1402 bool unknown_misalignment)
1404 unsigned i;
1405 data_reference *dr;
1407 FOR_EACH_VEC_ELT (datarefs, i, dr)
1409 gimple *stmt = vect_dr_stmt (dr);
1410 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1411 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1412 continue;
1414 /* For interleaving, only the alignment of the first access
1415 matters. */
1416 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1417 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1418 continue;
1420 /* Strided accesses perform only component accesses, alignment is
1421 irrelevant for them. */
1422 if (STMT_VINFO_STRIDED_P (stmt_info)
1423 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1424 continue;
1426 int save_misalignment;
1427 save_misalignment = DR_MISALIGNMENT (dr);
1428 if (npeel == 0)
1430 else if (unknown_misalignment && dr == dr0)
1431 SET_DR_MISALIGNMENT (dr, 0);
1432 else
1433 vect_update_misalignment_for_peel (dr, dr0, npeel);
1434 vect_get_data_access_cost (dr, inside_cost, outside_cost,
1435 body_cost_vec, prologue_cost_vec);
1436 SET_DR_MISALIGNMENT (dr, save_misalignment);
1440 /* Traverse peeling hash table and calculate cost for each peeling option.
1441 Find the one with the lowest cost. */
1444 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1445 _vect_peel_extended_info *min)
1447 vect_peel_info elem = *slot;
1448 int dummy;
1449 unsigned int inside_cost = 0, outside_cost = 0;
1450 gimple *stmt = vect_dr_stmt (elem->dr);
1451 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1452 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1453 stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1454 epilogue_cost_vec;
1456 prologue_cost_vec.create (2);
1457 body_cost_vec.create (2);
1458 epilogue_cost_vec.create (2);
1460 vect_get_peeling_costs_all_drs (LOOP_VINFO_DATAREFS (loop_vinfo),
1461 elem->dr, &inside_cost, &outside_cost,
1462 &body_cost_vec, &prologue_cost_vec,
1463 elem->npeel, false);
1465 body_cost_vec.release ();
1467 outside_cost += vect_get_known_peeling_cost
1468 (loop_vinfo, elem->npeel, &dummy,
1469 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1470 &prologue_cost_vec, &epilogue_cost_vec);
1472 /* Prologue and epilogue costs are added to the target model later.
1473 These costs depend only on the scalar iteration cost, the
1474 number of peeling iterations finally chosen, and the number of
1475 misaligned statements. So discard the information found here. */
1476 prologue_cost_vec.release ();
1477 epilogue_cost_vec.release ();
1479 if (inside_cost < min->inside_cost
1480 || (inside_cost == min->inside_cost
1481 && outside_cost < min->outside_cost))
1483 min->inside_cost = inside_cost;
1484 min->outside_cost = outside_cost;
1485 min->peel_info.dr = elem->dr;
1486 min->peel_info.npeel = elem->npeel;
1487 min->peel_info.count = elem->count;
1490 return 1;
1494 /* Choose best peeling option by traversing peeling hash table and either
1495 choosing an option with the lowest cost (if cost model is enabled) or the
1496 option that aligns as many accesses as possible. */
1498 static struct _vect_peel_extended_info
1499 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1500 loop_vec_info loop_vinfo)
1502 struct _vect_peel_extended_info res;
1504 res.peel_info.dr = NULL;
1506 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1508 res.inside_cost = INT_MAX;
1509 res.outside_cost = INT_MAX;
1510 peeling_htab->traverse <_vect_peel_extended_info *,
1511 vect_peeling_hash_get_lowest_cost> (&res);
1513 else
1515 res.peel_info.count = 0;
1516 peeling_htab->traverse <_vect_peel_extended_info *,
1517 vect_peeling_hash_get_most_frequent> (&res);
1518 res.inside_cost = 0;
1519 res.outside_cost = 0;
1522 return res;
1525 /* Return true if the new peeling NPEEL is supported. */
1527 static bool
1528 vect_peeling_supportable (loop_vec_info loop_vinfo, struct data_reference *dr0,
1529 unsigned npeel)
1531 unsigned i;
1532 struct data_reference *dr = NULL;
1533 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1534 gimple *stmt;
1535 stmt_vec_info stmt_info;
1536 enum dr_alignment_support supportable_dr_alignment;
1538 /* Ensure that all data refs can be vectorized after the peel. */
1539 FOR_EACH_VEC_ELT (datarefs, i, dr)
1541 int save_misalignment;
1543 if (dr == dr0)
1544 continue;
1546 stmt = vect_dr_stmt (dr);
1547 stmt_info = vinfo_for_stmt (stmt);
1548 /* For interleaving, only the alignment of the first access
1549 matters. */
1550 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1551 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1552 continue;
1554 /* Strided accesses perform only component accesses, alignment is
1555 irrelevant for them. */
1556 if (STMT_VINFO_STRIDED_P (stmt_info)
1557 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1558 continue;
1560 save_misalignment = DR_MISALIGNMENT (dr);
1561 vect_update_misalignment_for_peel (dr, dr0, npeel);
1562 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1563 SET_DR_MISALIGNMENT (dr, save_misalignment);
1565 if (!supportable_dr_alignment)
1566 return false;
1569 return true;
1572 /* Function vect_enhance_data_refs_alignment
1574 This pass will use loop versioning and loop peeling in order to enhance
1575 the alignment of data references in the loop.
1577 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1578 original loop is to be vectorized. Any other loops that are created by
1579 the transformations performed in this pass - are not supposed to be
1580 vectorized. This restriction will be relaxed.
1582 This pass will require a cost model to guide it whether to apply peeling
1583 or versioning or a combination of the two. For example, the scheme that
1584 intel uses when given a loop with several memory accesses, is as follows:
1585 choose one memory access ('p') which alignment you want to force by doing
1586 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1587 other accesses are not necessarily aligned, or (2) use loop versioning to
1588 generate one loop in which all accesses are aligned, and another loop in
1589 which only 'p' is necessarily aligned.
1591 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1592 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Ximmin Tian, International
1593 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1595 Devising a cost model is the most critical aspect of this work. It will
1596 guide us on which access to peel for, whether to use loop versioning, how
1597 many versions to create, etc. The cost model will probably consist of
1598 generic considerations as well as target specific considerations (on
1599 powerpc for example, misaligned stores are more painful than misaligned
1600 loads).
1602 Here are the general steps involved in alignment enhancements:
1604 -- original loop, before alignment analysis:
1605 for (i=0; i<N; i++){
1606 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1607 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1610 -- After vect_compute_data_refs_alignment:
1611 for (i=0; i<N; i++){
1612 x = q[i]; # DR_MISALIGNMENT(q) = 3
1613 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1616 -- Possibility 1: we do loop versioning:
1617 if (p is aligned) {
1618 for (i=0; i<N; i++){ # loop 1A
1619 x = q[i]; # DR_MISALIGNMENT(q) = 3
1620 p[i] = y; # DR_MISALIGNMENT(p) = 0
1623 else {
1624 for (i=0; i<N; i++){ # loop 1B
1625 x = q[i]; # DR_MISALIGNMENT(q) = 3
1626 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1630 -- Possibility 2: we do loop peeling:
1631 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1632 x = q[i];
1633 p[i] = y;
1635 for (i = 3; i < N; i++){ # loop 2A
1636 x = q[i]; # DR_MISALIGNMENT(q) = 0
1637 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1640 -- Possibility 3: combination of loop peeling and versioning:
1641 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1642 x = q[i];
1643 p[i] = y;
1645 if (p is aligned) {
1646 for (i = 3; i<N; i++){ # loop 3A
1647 x = q[i]; # DR_MISALIGNMENT(q) = 0
1648 p[i] = y; # DR_MISALIGNMENT(p) = 0
1651 else {
1652 for (i = 3; i<N; i++){ # loop 3B
1653 x = q[i]; # DR_MISALIGNMENT(q) = 0
1654 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1658 These loops are later passed to loop_transform to be vectorized. The
1659 vectorizer will use the alignment information to guide the transformation
1660 (whether to generate regular loads/stores, or with special handling for
1661 misalignment). */
1663 bool
1664 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1666 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1667 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1668 enum dr_alignment_support supportable_dr_alignment;
1669 struct data_reference *dr0 = NULL, *first_store = NULL;
1670 struct data_reference *dr;
1671 unsigned int i, j;
1672 bool do_peeling = false;
1673 bool do_versioning = false;
1674 bool stat;
1675 gimple *stmt;
1676 stmt_vec_info stmt_info;
1677 unsigned int npeel = 0;
1678 bool one_misalignment_known = false;
1679 bool one_misalignment_unknown = false;
1680 bool one_dr_unsupportable = false;
1681 struct data_reference *unsupportable_dr = NULL;
1682 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1683 unsigned possible_npeel_number = 1;
1684 tree vectype;
1685 unsigned int mis, same_align_drs_max = 0;
1686 hash_table<peel_info_hasher> peeling_htab (1);
1688 DUMP_VECT_SCOPE ("vect_enhance_data_refs_alignment");
1690 /* Reset data so we can safely be called multiple times. */
1691 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1692 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1694 /* While cost model enhancements are expected in the future, the high level
1695 view of the code at this time is as follows:
1697 A) If there is a misaligned access then see if peeling to align
1698 this access can make all data references satisfy
1699 vect_supportable_dr_alignment. If so, update data structures
1700 as needed and return true.
1702 B) If peeling wasn't possible and there is a data reference with an
1703 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1704 then see if loop versioning checks can be used to make all data
1705 references satisfy vect_supportable_dr_alignment. If so, update
1706 data structures as needed and return true.
1708 C) If neither peeling nor versioning were successful then return false if
1709 any data reference does not satisfy vect_supportable_dr_alignment.
1711 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1713 Note, Possibility 3 above (which is peeling and versioning together) is not
1714 being done at this time. */
1716 /* (1) Peeling to force alignment. */
1718 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1719 Considerations:
1720 + How many accesses will become aligned due to the peeling
1721 - How many accesses will become unaligned due to the peeling,
1722 and the cost of misaligned accesses.
1723 - The cost of peeling (the extra runtime checks, the increase
1724 in code size). */
1726 FOR_EACH_VEC_ELT (datarefs, i, dr)
1728 stmt = vect_dr_stmt (dr);
1729 stmt_info = vinfo_for_stmt (stmt);
1731 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1732 continue;
1734 /* For interleaving, only the alignment of the first access
1735 matters. */
1736 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1737 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1738 continue;
1740 /* For scatter-gather or invariant accesses there is nothing
1741 to enhance. */
1742 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1743 || integer_zerop (DR_STEP (dr)))
1744 continue;
1746 /* Strided accesses perform only component accesses, alignment is
1747 irrelevant for them. */
1748 if (STMT_VINFO_STRIDED_P (stmt_info)
1749 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1750 continue;
1752 supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1753 do_peeling = vector_alignment_reachable_p (dr);
1754 if (do_peeling)
1756 if (known_alignment_for_access_p (dr))
1758 unsigned int npeel_tmp = 0;
1759 bool negative = tree_int_cst_compare (DR_STEP (dr),
1760 size_zero_node) < 0;
1762 vectype = STMT_VINFO_VECTYPE (stmt_info);
1763 unsigned int target_align = DR_TARGET_ALIGNMENT (dr);
1764 unsigned int dr_size = vect_get_scalar_dr_size (dr);
1765 mis = (negative ? DR_MISALIGNMENT (dr) : -DR_MISALIGNMENT (dr));
1766 if (DR_MISALIGNMENT (dr) != 0)
1767 npeel_tmp = (mis & (target_align - 1)) / dr_size;
1769 /* For multiple types, it is possible that the bigger type access
1770 will have more than one peeling option. E.g., a loop with two
1771 types: one of size (vector size / 4), and the other one of
1772 size (vector size / 8). Vectorization factor will 8. If both
1773 accesses are misaligned by 3, the first one needs one scalar
1774 iteration to be aligned, and the second one needs 5. But the
1775 first one will be aligned also by peeling 5 scalar
1776 iterations, and in that case both accesses will be aligned.
1777 Hence, except for the immediate peeling amount, we also want
1778 to try to add full vector size, while we don't exceed
1779 vectorization factor.
1780 We do this automatically for cost model, since we calculate
1781 cost for every peeling option. */
1782 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1784 poly_uint64 nscalars = (STMT_SLP_TYPE (stmt_info)
1785 ? vf * DR_GROUP_SIZE (stmt_info) : vf);
1786 possible_npeel_number
1787 = vect_get_num_vectors (nscalars, vectype);
1789 /* NPEEL_TMP is 0 when there is no misalignment, but also
1790 allow peeling NELEMENTS. */
1791 if (DR_MISALIGNMENT (dr) == 0)
1792 possible_npeel_number++;
1795 /* Save info about DR in the hash table. Also include peeling
1796 amounts according to the explanation above. */
1797 for (j = 0; j < possible_npeel_number; j++)
1799 vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1800 dr, npeel_tmp);
1801 npeel_tmp += target_align / dr_size;
1804 one_misalignment_known = true;
1806 else
1808 /* If we don't know any misalignment values, we prefer
1809 peeling for data-ref that has the maximum number of data-refs
1810 with the same alignment, unless the target prefers to align
1811 stores over load. */
1812 unsigned same_align_drs
1813 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1814 if (!dr0
1815 || same_align_drs_max < same_align_drs)
1817 same_align_drs_max = same_align_drs;
1818 dr0 = dr;
1820 /* For data-refs with the same number of related
1821 accesses prefer the one where the misalign
1822 computation will be invariant in the outermost loop. */
1823 else if (same_align_drs_max == same_align_drs)
1825 struct loop *ivloop0, *ivloop;
1826 ivloop0 = outermost_invariant_loop_for_expr
1827 (loop, DR_BASE_ADDRESS (dr0));
1828 ivloop = outermost_invariant_loop_for_expr
1829 (loop, DR_BASE_ADDRESS (dr));
1830 if ((ivloop && !ivloop0)
1831 || (ivloop && ivloop0
1832 && flow_loop_nested_p (ivloop, ivloop0)))
1833 dr0 = dr;
1836 one_misalignment_unknown = true;
1838 /* Check for data refs with unsupportable alignment that
1839 can be peeled. */
1840 if (!supportable_dr_alignment)
1842 one_dr_unsupportable = true;
1843 unsupportable_dr = dr;
1846 if (!first_store && DR_IS_WRITE (dr))
1847 first_store = dr;
1850 else
1852 if (!aligned_access_p (dr))
1854 if (dump_enabled_p ())
1855 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1856 "vector alignment may not be reachable\n");
1857 break;
1862 /* Check if we can possibly peel the loop. */
1863 if (!vect_can_advance_ivs_p (loop_vinfo)
1864 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1865 || loop->inner)
1866 do_peeling = false;
1868 struct _vect_peel_extended_info peel_for_known_alignment;
1869 struct _vect_peel_extended_info peel_for_unknown_alignment;
1870 struct _vect_peel_extended_info best_peel;
1872 peel_for_unknown_alignment.inside_cost = INT_MAX;
1873 peel_for_unknown_alignment.outside_cost = INT_MAX;
1874 peel_for_unknown_alignment.peel_info.count = 0;
1876 if (do_peeling
1877 && one_misalignment_unknown)
1879 /* Check if the target requires to prefer stores over loads, i.e., if
1880 misaligned stores are more expensive than misaligned loads (taking
1881 drs with same alignment into account). */
1882 unsigned int load_inside_cost = 0;
1883 unsigned int load_outside_cost = 0;
1884 unsigned int store_inside_cost = 0;
1885 unsigned int store_outside_cost = 0;
1886 unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;
1888 stmt_vector_for_cost dummy;
1889 dummy.create (2);
1890 vect_get_peeling_costs_all_drs (datarefs, dr0,
1891 &load_inside_cost,
1892 &load_outside_cost,
1893 &dummy, &dummy, estimated_npeels, true);
1894 dummy.release ();
1896 if (first_store)
1898 dummy.create (2);
1899 vect_get_peeling_costs_all_drs (datarefs, first_store,
1900 &store_inside_cost,
1901 &store_outside_cost,
1902 &dummy, &dummy,
1903 estimated_npeels, true);
1904 dummy.release ();
1906 else
1908 store_inside_cost = INT_MAX;
1909 store_outside_cost = INT_MAX;
1912 if (load_inside_cost > store_inside_cost
1913 || (load_inside_cost == store_inside_cost
1914 && load_outside_cost > store_outside_cost))
1916 dr0 = first_store;
1917 peel_for_unknown_alignment.inside_cost = store_inside_cost;
1918 peel_for_unknown_alignment.outside_cost = store_outside_cost;
1920 else
1922 peel_for_unknown_alignment.inside_cost = load_inside_cost;
1923 peel_for_unknown_alignment.outside_cost = load_outside_cost;
1926 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1927 prologue_cost_vec.create (2);
1928 epilogue_cost_vec.create (2);
1930 int dummy2;
1931 peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
1932 (loop_vinfo, estimated_npeels, &dummy2,
1933 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1934 &prologue_cost_vec, &epilogue_cost_vec);
1936 prologue_cost_vec.release ();
1937 epilogue_cost_vec.release ();
1939 peel_for_unknown_alignment.peel_info.count = 1
1940 + STMT_VINFO_SAME_ALIGN_REFS
1941 (vinfo_for_stmt (vect_dr_stmt (dr0))).length ();
1944 peel_for_unknown_alignment.peel_info.npeel = 0;
1945 peel_for_unknown_alignment.peel_info.dr = dr0;
1947 best_peel = peel_for_unknown_alignment;
1949 peel_for_known_alignment.inside_cost = INT_MAX;
1950 peel_for_known_alignment.outside_cost = INT_MAX;
1951 peel_for_known_alignment.peel_info.count = 0;
1952 peel_for_known_alignment.peel_info.dr = NULL;
1954 if (do_peeling && one_misalignment_known)
1956 /* Peeling is possible, but there is no data access that is not supported
1957 unless aligned. So we try to choose the best possible peeling from
1958 the hash table. */
1959 peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
1960 (&peeling_htab, loop_vinfo);
1963 /* Compare costs of peeling for known and unknown alignment. */
1964 if (peel_for_known_alignment.peel_info.dr != NULL
1965 && peel_for_unknown_alignment.inside_cost
1966 >= peel_for_known_alignment.inside_cost)
1968 best_peel = peel_for_known_alignment;
1970 /* If the best peeling for known alignment has NPEEL == 0, perform no
1971 peeling at all except if there is an unsupportable dr that we can
1972 align. */
1973 if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
1974 do_peeling = false;
1977 /* If there is an unsupportable data ref, prefer this over all choices so far
1978 since we'd have to discard a chosen peeling except when it accidentally
1979 aligned the unsupportable data ref. */
1980 if (one_dr_unsupportable)
1981 dr0 = unsupportable_dr;
1982 else if (do_peeling)
1984 /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
1985 TODO: Use nopeel_outside_cost or get rid of it? */
1986 unsigned nopeel_inside_cost = 0;
1987 unsigned nopeel_outside_cost = 0;
1989 stmt_vector_for_cost dummy;
1990 dummy.create (2);
1991 vect_get_peeling_costs_all_drs (datarefs, NULL, &nopeel_inside_cost,
1992 &nopeel_outside_cost, &dummy, &dummy,
1993 0, false);
1994 dummy.release ();
1996 /* Add epilogue costs. As we do not peel for alignment here, no prologue
1997 costs will be recorded. */
1998 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1999 prologue_cost_vec.create (2);
2000 epilogue_cost_vec.create (2);
2002 int dummy2;
2003 nopeel_outside_cost += vect_get_known_peeling_cost
2004 (loop_vinfo, 0, &dummy2,
2005 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2006 &prologue_cost_vec, &epilogue_cost_vec);
2008 prologue_cost_vec.release ();
2009 epilogue_cost_vec.release ();
2011 npeel = best_peel.peel_info.npeel;
2012 dr0 = best_peel.peel_info.dr;
2014 /* If no peeling is not more expensive than the best peeling we
2015 have so far, don't perform any peeling. */
2016 if (nopeel_inside_cost <= best_peel.inside_cost)
2017 do_peeling = false;
2020 if (do_peeling)
2022 stmt = vect_dr_stmt (dr0);
2023 stmt_info = vinfo_for_stmt (stmt);
2024 vectype = STMT_VINFO_VECTYPE (stmt_info);
2026 if (known_alignment_for_access_p (dr0))
2028 bool negative = tree_int_cst_compare (DR_STEP (dr0),
2029 size_zero_node) < 0;
2030 if (!npeel)
2032 /* Since it's known at compile time, compute the number of
2033 iterations in the peeled loop (the peeling factor) for use in
2034 updating DR_MISALIGNMENT values. The peeling factor is the
2035 vectorization factor minus the misalignment as an element
2036 count. */
2037 mis = negative ? DR_MISALIGNMENT (dr0) : -DR_MISALIGNMENT (dr0);
2038 unsigned int target_align = DR_TARGET_ALIGNMENT (dr0);
2039 npeel = ((mis & (target_align - 1))
2040 / vect_get_scalar_dr_size (dr0));
2043 /* For interleaved data access every iteration accesses all the
2044 members of the group, therefore we divide the number of iterations
2045 by the group size. */
2046 stmt_info = vinfo_for_stmt (vect_dr_stmt (dr0));
2047 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2048 npeel /= DR_GROUP_SIZE (stmt_info);
2050 if (dump_enabled_p ())
2051 dump_printf_loc (MSG_NOTE, vect_location,
2052 "Try peeling by %d\n", npeel);
2055 /* Ensure that all datarefs can be vectorized after the peel. */
2056 if (!vect_peeling_supportable (loop_vinfo, dr0, npeel))
2057 do_peeling = false;
2059 /* Check if all datarefs are supportable and log. */
2060 if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
2062 stat = vect_verify_datarefs_alignment (loop_vinfo);
2063 if (!stat)
2064 do_peeling = false;
2065 else
2066 return stat;
2069 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
2070 if (do_peeling)
2072 unsigned max_allowed_peel
2073 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
2074 if (max_allowed_peel != (unsigned)-1)
2076 unsigned max_peel = npeel;
2077 if (max_peel == 0)
2079 unsigned int target_align = DR_TARGET_ALIGNMENT (dr0);
2080 max_peel = target_align / vect_get_scalar_dr_size (dr0) - 1;
2082 if (max_peel > max_allowed_peel)
2084 do_peeling = false;
2085 if (dump_enabled_p ())
2086 dump_printf_loc (MSG_NOTE, vect_location,
2087 "Disable peeling, max peels reached: %d\n", max_peel);
2092 /* Cost model #2 - if peeling may result in a remaining loop not
2093 iterating enough to be vectorized then do not peel. Since this
2094 is a cost heuristic rather than a correctness decision, use the
2095 most likely runtime value for variable vectorization factors. */
2096 if (do_peeling
2097 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2099 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2100 unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
2101 if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
2102 < assumed_vf + max_peel)
2103 do_peeling = false;
2106 if (do_peeling)
2108 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2109 If the misalignment of DR_i is identical to that of dr0 then set
2110 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
2111 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2112 by the peeling factor times the element size of DR_i (MOD the
2113 vectorization factor times the size). Otherwise, the
2114 misalignment of DR_i must be set to unknown. */
2115 FOR_EACH_VEC_ELT (datarefs, i, dr)
2116 if (dr != dr0)
2118 /* Strided accesses perform only component accesses, alignment
2119 is irrelevant for them. */
2120 stmt_info = vinfo_for_stmt (vect_dr_stmt (dr));
2121 if (STMT_VINFO_STRIDED_P (stmt_info)
2122 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2123 continue;
2125 vect_update_misalignment_for_peel (dr, dr0, npeel);
2128 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
2129 if (npeel)
2130 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2131 else
2132 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2133 = DR_MISALIGNMENT (dr0);
2134 SET_DR_MISALIGNMENT (dr0, 0);
2135 if (dump_enabled_p ())
2137 dump_printf_loc (MSG_NOTE, vect_location,
2138 "Alignment of access forced using peeling.\n");
2139 dump_printf_loc (MSG_NOTE, vect_location,
2140 "Peeling for alignment will be applied.\n");
2143 /* The inside-loop cost will be accounted for in vectorizable_load
2144 and vectorizable_store correctly with adjusted alignments.
2145 Drop the body_cst_vec on the floor here. */
2146 stat = vect_verify_datarefs_alignment (loop_vinfo);
2147 gcc_assert (stat);
2148 return stat;
2152 /* (2) Versioning to force alignment. */
2154 /* Try versioning if:
2155 1) optimize loop for speed
2156 2) there is at least one unsupported misaligned data ref with an unknown
2157 misalignment, and
2158 3) all misaligned data refs with a known misalignment are supported, and
2159 4) the number of runtime alignment checks is within reason. */
2161 do_versioning =
2162 optimize_loop_nest_for_speed_p (loop)
2163 && (!loop->inner); /* FORNOW */
2165 if (do_versioning)
2167 FOR_EACH_VEC_ELT (datarefs, i, dr)
2169 stmt = vect_dr_stmt (dr);
2170 stmt_info = vinfo_for_stmt (stmt);
2172 /* For interleaving, only the alignment of the first access
2173 matters. */
2174 if (aligned_access_p (dr)
2175 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2176 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt))
2177 continue;
2179 if (STMT_VINFO_STRIDED_P (stmt_info))
2181 /* Strided loads perform only component accesses, alignment is
2182 irrelevant for them. */
2183 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
2184 continue;
2185 do_versioning = false;
2186 break;
2189 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
2191 if (!supportable_dr_alignment)
2193 gimple *stmt;
2194 int mask;
2195 tree vectype;
2197 if (known_alignment_for_access_p (dr)
2198 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2199 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
2201 do_versioning = false;
2202 break;
2205 stmt = vect_dr_stmt (dr);
2206 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
2207 gcc_assert (vectype);
2209 /* At present we don't support versioning for alignment
2210 with variable VF, since there's no guarantee that the
2211 VF is a power of two. We could relax this if we added
2212 a way of enforcing a power-of-two size. */
2213 unsigned HOST_WIDE_INT size;
2214 if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
2216 do_versioning = false;
2217 break;
2220 /* The rightmost bits of an aligned address must be zeros.
2221 Construct the mask needed for this test. For example,
2222 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2223 mask must be 15 = 0xf. */
2224 mask = size - 1;
2226 /* FORNOW: use the same mask to test all potentially unaligned
2227 references in the loop. The vectorizer currently supports
2228 a single vector size, see the reference to
2229 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
2230 vectorization factor is computed. */
2231 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
2232 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
2233 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2234 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
2235 vect_dr_stmt (dr));
2239 /* Versioning requires at least one misaligned data reference. */
2240 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2241 do_versioning = false;
2242 else if (!do_versioning)
2243 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2246 if (do_versioning)
2248 vec<gimple *> may_misalign_stmts
2249 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2250 gimple *stmt;
2252 /* It can now be assumed that the data references in the statements
2253 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2254 of the loop being vectorized. */
2255 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
2257 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2258 dr = STMT_VINFO_DATA_REF (stmt_info);
2259 SET_DR_MISALIGNMENT (dr, 0);
2260 if (dump_enabled_p ())
2261 dump_printf_loc (MSG_NOTE, vect_location,
2262 "Alignment of access forced using versioning.\n");
2265 if (dump_enabled_p ())
2266 dump_printf_loc (MSG_NOTE, vect_location,
2267 "Versioning for alignment will be applied.\n");
2269 /* Peeling and versioning can't be done together at this time. */
2270 gcc_assert (! (do_peeling && do_versioning));
2272 stat = vect_verify_datarefs_alignment (loop_vinfo);
2273 gcc_assert (stat);
2274 return stat;
2277 /* This point is reached if neither peeling nor versioning is being done. */
2278 gcc_assert (! (do_peeling || do_versioning));
2280 stat = vect_verify_datarefs_alignment (loop_vinfo);
2281 return stat;
2285 /* Function vect_find_same_alignment_drs.
2287 Update group and alignment relations according to the chosen
2288 vectorization factor. */
2290 static void
2291 vect_find_same_alignment_drs (struct data_dependence_relation *ddr)
2293 struct data_reference *dra = DDR_A (ddr);
2294 struct data_reference *drb = DDR_B (ddr);
2295 stmt_vec_info stmtinfo_a = vinfo_for_stmt (vect_dr_stmt (dra));
2296 stmt_vec_info stmtinfo_b = vinfo_for_stmt (vect_dr_stmt (drb));
2298 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
2299 return;
2301 if (dra == drb)
2302 return;
2304 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
2305 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
2306 return;
2308 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)
2309 || !operand_equal_p (DR_OFFSET (dra), DR_OFFSET (drb), 0)
2310 || !operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2311 return;
2313 /* Two references with distance zero have the same alignment. */
2314 poly_offset_int diff = (wi::to_poly_offset (DR_INIT (dra))
2315 - wi::to_poly_offset (DR_INIT (drb)));
2316 if (maybe_ne (diff, 0))
2318 /* Get the wider of the two alignments. */
2319 unsigned int align_a = (vect_calculate_target_alignment (dra)
2320 / BITS_PER_UNIT);
2321 unsigned int align_b = (vect_calculate_target_alignment (drb)
2322 / BITS_PER_UNIT);
2323 unsigned int max_align = MAX (align_a, align_b);
2325 /* Require the gap to be a multiple of the larger vector alignment. */
2326 if (!multiple_p (diff, max_align))
2327 return;
2330 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2331 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
2332 if (dump_enabled_p ())
2334 dump_printf_loc (MSG_NOTE, vect_location,
2335 "accesses have the same alignment: ");
2336 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2337 dump_printf (MSG_NOTE, " and ");
2338 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2339 dump_printf (MSG_NOTE, "\n");
2344 /* Function vect_analyze_data_refs_alignment
2346 Analyze the alignment of the data-references in the loop.
2347 Return FALSE if a data reference is found that cannot be vectorized. */
2349 bool
2350 vect_analyze_data_refs_alignment (loop_vec_info vinfo)
2352 DUMP_VECT_SCOPE ("vect_analyze_data_refs_alignment");
2354 /* Mark groups of data references with same alignment using
2355 data dependence information. */
2356 vec<ddr_p> ddrs = vinfo->ddrs;
2357 struct data_dependence_relation *ddr;
2358 unsigned int i;
2360 FOR_EACH_VEC_ELT (ddrs, i, ddr)
2361 vect_find_same_alignment_drs (ddr);
2363 vec<data_reference_p> datarefs = vinfo->datarefs;
2364 struct data_reference *dr;
2366 vect_record_base_alignments (vinfo);
2367 FOR_EACH_VEC_ELT (datarefs, i, dr)
2369 stmt_vec_info stmt_info = vinfo_for_stmt (vect_dr_stmt (dr));
2370 if (STMT_VINFO_VECTORIZABLE (stmt_info))
2371 vect_compute_data_ref_alignment (dr);
2374 return true;
2378 /* Analyze alignment of DRs of stmts in NODE. */
2380 static bool
2381 vect_slp_analyze_and_verify_node_alignment (slp_tree node)
2383 /* We vectorize from the first scalar stmt in the node unless
2384 the node is permuted in which case we start from the first
2385 element in the group. */
2386 gimple *first_stmt = SLP_TREE_SCALAR_STMTS (node)[0];
2387 data_reference_p first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2388 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2389 first_stmt = DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_stmt));
2391 data_reference_p dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2392 vect_compute_data_ref_alignment (dr);
2393 /* For creating the data-ref pointer we need alignment of the
2394 first element anyway. */
2395 if (dr != first_dr)
2396 vect_compute_data_ref_alignment (first_dr);
2397 if (! verify_data_ref_alignment (dr))
2399 if (dump_enabled_p ())
2400 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2401 "not vectorized: bad data alignment in basic "
2402 "block.\n");
2403 return false;
2406 return true;
2409 /* Function vect_slp_analyze_instance_alignment
2411 Analyze the alignment of the data-references in the SLP instance.
2412 Return FALSE if a data reference is found that cannot be vectorized. */
2414 bool
2415 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
2417 DUMP_VECT_SCOPE ("vect_slp_analyze_and_verify_instance_alignment");
2419 slp_tree node;
2420 unsigned i;
2421 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2422 if (! vect_slp_analyze_and_verify_node_alignment (node))
2423 return false;
2425 node = SLP_INSTANCE_TREE (instance);
2426 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]))
2427 && ! vect_slp_analyze_and_verify_node_alignment
2428 (SLP_INSTANCE_TREE (instance)))
2429 return false;
2431 return true;
2435 /* Analyze groups of accesses: check that DR belongs to a group of
2436 accesses of legal size, step, etc. Detect gaps, single element
2437 interleaving, and other special cases. Set grouped access info.
2438 Collect groups of strided stores for further use in SLP analysis.
2439 Worker for vect_analyze_group_access. */
2441 static bool
2442 vect_analyze_group_access_1 (struct data_reference *dr)
2444 tree step = DR_STEP (dr);
2445 tree scalar_type = TREE_TYPE (DR_REF (dr));
2446 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2447 gimple *stmt = vect_dr_stmt (dr);
2448 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2449 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2450 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2451 HOST_WIDE_INT dr_step = -1;
2452 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2453 bool slp_impossible = false;
2455 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2456 size of the interleaving group (including gaps). */
2457 if (tree_fits_shwi_p (step))
2459 dr_step = tree_to_shwi (step);
2460 /* Check that STEP is a multiple of type size. Otherwise there is
2461 a non-element-sized gap at the end of the group which we
2462 cannot represent in DR_GROUP_GAP or DR_GROUP_SIZE.
2463 ??? As we can handle non-constant step fine here we should
2464 simply remove uses of DR_GROUP_GAP between the last and first
2465 element and instead rely on DR_STEP. DR_GROUP_SIZE then would
2466 simply not include that gap. */
2467 if ((dr_step % type_size) != 0)
2469 if (dump_enabled_p ())
2471 dump_printf_loc (MSG_NOTE, vect_location,
2472 "Step ");
2473 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2474 dump_printf (MSG_NOTE,
2475 " is not a multiple of the element size for ");
2476 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2477 dump_printf (MSG_NOTE, "\n");
2479 return false;
2481 groupsize = absu_hwi (dr_step) / type_size;
2483 else
2484 groupsize = 0;
2486 /* Not consecutive access is possible only if it is a part of interleaving. */
2487 if (!DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2489 /* Check if it this DR is a part of interleaving, and is a single
2490 element of the group that is accessed in the loop. */
2492 /* Gaps are supported only for loads. STEP must be a multiple of the type
2493 size. */
2494 if (DR_IS_READ (dr)
2495 && (dr_step % type_size) == 0
2496 && groupsize > 0)
2498 DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2499 DR_GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2500 DR_GROUP_GAP (stmt_info) = groupsize - 1;
2501 if (dump_enabled_p ())
2503 dump_printf_loc (MSG_NOTE, vect_location,
2504 "Detected single element interleaving ");
2505 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2506 dump_printf (MSG_NOTE, " step ");
2507 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2508 dump_printf (MSG_NOTE, "\n");
2511 return true;
2514 if (dump_enabled_p ())
2516 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2517 "not consecutive access ");
2518 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2521 if (bb_vinfo)
2523 /* Mark the statement as unvectorizable. */
2524 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (vect_dr_stmt (dr))) = false;
2525 return true;
2528 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2529 STMT_VINFO_STRIDED_P (stmt_info) = true;
2530 return true;
2533 if (DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2535 /* First stmt in the interleaving chain. Check the chain. */
2536 gimple *next = DR_GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2537 struct data_reference *data_ref = dr;
2538 unsigned int count = 1;
2539 tree prev_init = DR_INIT (data_ref);
2540 gimple *prev = stmt;
2541 HOST_WIDE_INT diff, gaps = 0;
2543 /* By construction, all group members have INTEGER_CST DR_INITs. */
2544 while (next)
2546 /* Skip same data-refs. In case that two or more stmts share
2547 data-ref (supported only for loads), we vectorize only the first
2548 stmt, and the rest get their vectorized loads from the first
2549 one. */
2550 if (!tree_int_cst_compare (DR_INIT (data_ref),
2551 DR_INIT (STMT_VINFO_DATA_REF (
2552 vinfo_for_stmt (next)))))
2554 if (DR_IS_WRITE (data_ref))
2556 if (dump_enabled_p ())
2557 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2558 "Two store stmts share the same dr.\n");
2559 return false;
2562 if (dump_enabled_p ())
2563 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2564 "Two or more load stmts share the same dr.\n");
2566 /* For load use the same data-ref load. */
2567 DR_GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2569 prev = next;
2570 next = DR_GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2571 continue;
2574 prev = next;
2575 data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2577 /* All group members have the same STEP by construction. */
2578 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2580 /* Check that the distance between two accesses is equal to the type
2581 size. Otherwise, we have gaps. */
2582 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2583 - TREE_INT_CST_LOW (prev_init)) / type_size;
2584 if (diff != 1)
2586 /* FORNOW: SLP of accesses with gaps is not supported. */
2587 slp_impossible = true;
2588 if (DR_IS_WRITE (data_ref))
2590 if (dump_enabled_p ())
2591 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2592 "interleaved store with gaps\n");
2593 return false;
2596 gaps += diff - 1;
2599 last_accessed_element += diff;
2601 /* Store the gap from the previous member of the group. If there is no
2602 gap in the access, DR_GROUP_GAP is always 1. */
2603 DR_GROUP_GAP (vinfo_for_stmt (next)) = diff;
2605 prev_init = DR_INIT (data_ref);
2606 next = DR_GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2607 /* Count the number of data-refs in the chain. */
2608 count++;
2611 if (groupsize == 0)
2612 groupsize = count + gaps;
2614 /* This could be UINT_MAX but as we are generating code in a very
2615 inefficient way we have to cap earlier. See PR78699 for example. */
2616 if (groupsize > 4096)
2618 if (dump_enabled_p ())
2619 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2620 "group is too large\n");
2621 return false;
2624 /* Check that the size of the interleaving is equal to count for stores,
2625 i.e., that there are no gaps. */
2626 if (groupsize != count
2627 && !DR_IS_READ (dr))
2629 if (dump_enabled_p ())
2630 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2631 "interleaved store with gaps\n");
2632 return false;
2635 /* If there is a gap after the last load in the group it is the
2636 difference between the groupsize and the last accessed
2637 element.
2638 When there is no gap, this difference should be 0. */
2639 DR_GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - last_accessed_element;
2641 DR_GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2642 if (dump_enabled_p ())
2644 dump_printf_loc (MSG_NOTE, vect_location,
2645 "Detected interleaving ");
2646 if (DR_IS_READ (dr))
2647 dump_printf (MSG_NOTE, "load ");
2648 else
2649 dump_printf (MSG_NOTE, "store ");
2650 dump_printf (MSG_NOTE, "of size %u starting with ",
2651 (unsigned)groupsize);
2652 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
2653 if (DR_GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
2654 dump_printf_loc (MSG_NOTE, vect_location,
2655 "There is a gap of %u elements after the group\n",
2656 DR_GROUP_GAP (vinfo_for_stmt (stmt)));
2659 /* SLP: create an SLP data structure for every interleaving group of
2660 stores for further analysis in vect_analyse_slp. */
2661 if (DR_IS_WRITE (dr) && !slp_impossible)
2663 if (loop_vinfo)
2664 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2665 if (bb_vinfo)
2666 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2670 return true;
2673 /* Analyze groups of accesses: check that DR belongs to a group of
2674 accesses of legal size, step, etc. Detect gaps, single element
2675 interleaving, and other special cases. Set grouped access info.
2676 Collect groups of strided stores for further use in SLP analysis. */
2678 static bool
2679 vect_analyze_group_access (struct data_reference *dr)
2681 if (!vect_analyze_group_access_1 (dr))
2683 /* Dissolve the group if present. */
2684 gimple *next;
2685 gimple *stmt = DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (vect_dr_stmt (dr)));
2686 while (stmt)
2688 stmt_vec_info vinfo = vinfo_for_stmt (stmt);
2689 next = DR_GROUP_NEXT_ELEMENT (vinfo);
2690 DR_GROUP_FIRST_ELEMENT (vinfo) = NULL;
2691 DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2692 stmt = next;
2694 return false;
2696 return true;
2699 /* Analyze the access pattern of the data-reference DR.
2700 In case of non-consecutive accesses call vect_analyze_group_access() to
2701 analyze groups of accesses. */
2703 static bool
2704 vect_analyze_data_ref_access (struct data_reference *dr)
2706 tree step = DR_STEP (dr);
2707 tree scalar_type = TREE_TYPE (DR_REF (dr));
2708 gimple *stmt = vect_dr_stmt (dr);
2709 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2710 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2711 struct loop *loop = NULL;
2713 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
2714 return true;
2716 if (loop_vinfo)
2717 loop = LOOP_VINFO_LOOP (loop_vinfo);
2719 if (loop_vinfo && !step)
2721 if (dump_enabled_p ())
2722 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2723 "bad data-ref access in loop\n");
2724 return false;
2727 /* Allow loads with zero step in inner-loop vectorization. */
2728 if (loop_vinfo && integer_zerop (step))
2730 DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2731 if (!nested_in_vect_loop_p (loop, stmt))
2732 return DR_IS_READ (dr);
2733 /* Allow references with zero step for outer loops marked
2734 with pragma omp simd only - it guarantees absence of
2735 loop-carried dependencies between inner loop iterations. */
2736 if (loop->safelen < 2)
2738 if (dump_enabled_p ())
2739 dump_printf_loc (MSG_NOTE, vect_location,
2740 "zero step in inner loop of nest\n");
2741 return false;
2745 if (loop && nested_in_vect_loop_p (loop, stmt))
2747 /* Interleaved accesses are not yet supported within outer-loop
2748 vectorization for references in the inner-loop. */
2749 DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2751 /* For the rest of the analysis we use the outer-loop step. */
2752 step = STMT_VINFO_DR_STEP (stmt_info);
2753 if (integer_zerop (step))
2755 if (dump_enabled_p ())
2756 dump_printf_loc (MSG_NOTE, vect_location,
2757 "zero step in outer loop.\n");
2758 return DR_IS_READ (dr);
2762 /* Consecutive? */
2763 if (TREE_CODE (step) == INTEGER_CST)
2765 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2766 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2767 || (dr_step < 0
2768 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2770 /* Mark that it is not interleaving. */
2771 DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2772 return true;
2776 if (loop && nested_in_vect_loop_p (loop, stmt))
2778 if (dump_enabled_p ())
2779 dump_printf_loc (MSG_NOTE, vect_location,
2780 "grouped access in outer loop.\n");
2781 return false;
2785 /* Assume this is a DR handled by non-constant strided load case. */
2786 if (TREE_CODE (step) != INTEGER_CST)
2787 return (STMT_VINFO_STRIDED_P (stmt_info)
2788 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2789 || vect_analyze_group_access (dr)));
2791 /* Not consecutive access - check if it's a part of interleaving group. */
2792 return vect_analyze_group_access (dr);
2795 /* Compare two data-references DRA and DRB to group them into chunks
2796 suitable for grouping. */
2798 static int
2799 dr_group_sort_cmp (const void *dra_, const void *drb_)
2801 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2802 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2803 int cmp;
2805 /* Stabilize sort. */
2806 if (dra == drb)
2807 return 0;
2809 /* DRs in different loops never belong to the same group. */
2810 loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father;
2811 loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father;
2812 if (loopa != loopb)
2813 return loopa->num < loopb->num ? -1 : 1;
2815 /* Ordering of DRs according to base. */
2816 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2817 DR_BASE_ADDRESS (drb));
2818 if (cmp != 0)
2819 return cmp;
2821 /* And according to DR_OFFSET. */
2822 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2823 if (cmp != 0)
2824 return cmp;
2826 /* Put reads before writes. */
2827 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2828 return DR_IS_READ (dra) ? -1 : 1;
2830 /* Then sort after access size. */
2831 cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2832 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2833 if (cmp != 0)
2834 return cmp;
2836 /* And after step. */
2837 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2838 if (cmp != 0)
2839 return cmp;
2841 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
2842 cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
2843 if (cmp == 0)
2844 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2845 return cmp;
2848 /* If OP is the result of a conversion, return the unconverted value,
2849 otherwise return null. */
2851 static tree
2852 strip_conversion (tree op)
2854 if (TREE_CODE (op) != SSA_NAME)
2855 return NULL_TREE;
2856 gimple *stmt = SSA_NAME_DEF_STMT (op);
2857 if (!is_gimple_assign (stmt)
2858 || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt)))
2859 return NULL_TREE;
2860 return gimple_assign_rhs1 (stmt);
2863 /* Return true if vectorizable_* routines can handle statements STMT1
2864 and STMT2 being in a single group. */
2866 static bool
2867 can_group_stmts_p (gimple *stmt1, gimple *stmt2)
2869 if (gimple_assign_single_p (stmt1))
2870 return gimple_assign_single_p (stmt2);
2872 if (is_gimple_call (stmt1) && gimple_call_internal_p (stmt1))
2874 /* Check for two masked loads or two masked stores. */
2875 if (!is_gimple_call (stmt2) || !gimple_call_internal_p (stmt2))
2876 return false;
2877 internal_fn ifn = gimple_call_internal_fn (stmt1);
2878 if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE)
2879 return false;
2880 if (ifn != gimple_call_internal_fn (stmt2))
2881 return false;
2883 /* Check that the masks are the same. Cope with casts of masks,
2884 like those created by build_mask_conversion. */
2885 tree mask1 = gimple_call_arg (stmt1, 2);
2886 tree mask2 = gimple_call_arg (stmt2, 2);
2887 if (!operand_equal_p (mask1, mask2, 0))
2889 mask1 = strip_conversion (mask1);
2890 if (!mask1)
2891 return false;
2892 mask2 = strip_conversion (mask2);
2893 if (!mask2)
2894 return false;
2895 if (!operand_equal_p (mask1, mask2, 0))
2896 return false;
2898 return true;
2901 return false;
2904 /* Function vect_analyze_data_ref_accesses.
2906 Analyze the access pattern of all the data references in the loop.
2908 FORNOW: the only access pattern that is considered vectorizable is a
2909 simple step 1 (consecutive) access.
2911 FORNOW: handle only arrays and pointer accesses. */
2913 bool
2914 vect_analyze_data_ref_accesses (vec_info *vinfo)
2916 unsigned int i;
2917 vec<data_reference_p> datarefs = vinfo->datarefs;
2918 struct data_reference *dr;
2920 DUMP_VECT_SCOPE ("vect_analyze_data_ref_accesses");
2922 if (datarefs.is_empty ())
2923 return true;
2925 /* Sort the array of datarefs to make building the interleaving chains
2926 linear. Don't modify the original vector's order, it is needed for
2927 determining what dependencies are reversed. */
2928 vec<data_reference_p> datarefs_copy = datarefs.copy ();
2929 datarefs_copy.qsort (dr_group_sort_cmp);
2931 /* Build the interleaving chains. */
2932 for (i = 0; i < datarefs_copy.length () - 1;)
2934 data_reference_p dra = datarefs_copy[i];
2935 stmt_vec_info stmtinfo_a = vinfo_for_stmt (vect_dr_stmt (dra));
2936 stmt_vec_info lastinfo = NULL;
2937 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
2938 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a))
2940 ++i;
2941 continue;
2943 for (i = i + 1; i < datarefs_copy.length (); ++i)
2945 data_reference_p drb = datarefs_copy[i];
2946 stmt_vec_info stmtinfo_b = vinfo_for_stmt (vect_dr_stmt (drb));
2947 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b)
2948 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
2949 break;
2951 /* ??? Imperfect sorting (non-compatible types, non-modulo
2952 accesses, same accesses) can lead to a group to be artificially
2953 split here as we don't just skip over those. If it really
2954 matters we can push those to a worklist and re-iterate
2955 over them. The we can just skip ahead to the next DR here. */
2957 /* DRs in a different loop should not be put into the same
2958 interleaving group. */
2959 if (gimple_bb (DR_STMT (dra))->loop_father
2960 != gimple_bb (DR_STMT (drb))->loop_father)
2961 break;
2963 /* Check that the data-refs have same first location (except init)
2964 and they are both either store or load (not load and store,
2965 not masked loads or stores). */
2966 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2967 || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2968 DR_BASE_ADDRESS (drb)) != 0
2969 || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
2970 || !can_group_stmts_p (vect_dr_stmt (dra), vect_dr_stmt (drb)))
2971 break;
2973 /* Check that the data-refs have the same constant size. */
2974 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2975 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2976 if (!tree_fits_uhwi_p (sza)
2977 || !tree_fits_uhwi_p (szb)
2978 || !tree_int_cst_equal (sza, szb))
2979 break;
2981 /* Check that the data-refs have the same step. */
2982 if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
2983 break;
2985 /* Check the types are compatible.
2986 ??? We don't distinguish this during sorting. */
2987 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2988 TREE_TYPE (DR_REF (drb))))
2989 break;
2991 /* Check that the DR_INITs are compile-time constants. */
2992 if (TREE_CODE (DR_INIT (dra)) != INTEGER_CST
2993 || TREE_CODE (DR_INIT (drb)) != INTEGER_CST)
2994 break;
2996 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
2997 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2998 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2999 HOST_WIDE_INT init_prev
3000 = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]));
3001 gcc_assert (init_a <= init_b
3002 && init_a <= init_prev
3003 && init_prev <= init_b);
3005 /* Do not place the same access in the interleaving chain twice. */
3006 if (init_b == init_prev)
3008 gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]))
3009 < gimple_uid (DR_STMT (drb)));
3010 /* ??? For now we simply "drop" the later reference which is
3011 otherwise the same rather than finishing off this group.
3012 In the end we'd want to re-process duplicates forming
3013 multiple groups from the refs, likely by just collecting
3014 all candidates (including duplicates and split points
3015 below) in a vector and then process them together. */
3016 continue;
3019 /* If init_b == init_a + the size of the type * k, we have an
3020 interleaving, and DRA is accessed before DRB. */
3021 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
3022 if (type_size_a == 0
3023 || (init_b - init_a) % type_size_a != 0)
3024 break;
3026 /* If we have a store, the accesses are adjacent. This splits
3027 groups into chunks we support (we don't support vectorization
3028 of stores with gaps). */
3029 if (!DR_IS_READ (dra) && init_b - init_prev != type_size_a)
3030 break;
3032 /* If the step (if not zero or non-constant) is greater than the
3033 difference between data-refs' inits this splits groups into
3034 suitable sizes. */
3035 if (tree_fits_shwi_p (DR_STEP (dra)))
3037 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
3038 if (step != 0 && step <= (init_b - init_a))
3039 break;
3042 if (dump_enabled_p ())
3044 dump_printf_loc (MSG_NOTE, vect_location,
3045 "Detected interleaving ");
3046 if (DR_IS_READ (dra))
3047 dump_printf (MSG_NOTE, "load ");
3048 else
3049 dump_printf (MSG_NOTE, "store ");
3050 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
3051 dump_printf (MSG_NOTE, " and ");
3052 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
3053 dump_printf (MSG_NOTE, "\n");
3056 /* Link the found element into the group list. */
3057 if (!DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3059 DR_GROUP_FIRST_ELEMENT (stmtinfo_a) = vect_dr_stmt (dra);
3060 lastinfo = stmtinfo_a;
3062 DR_GROUP_FIRST_ELEMENT (stmtinfo_b) = vect_dr_stmt (dra);
3063 DR_GROUP_NEXT_ELEMENT (lastinfo) = vect_dr_stmt (drb);
3064 lastinfo = stmtinfo_b;
3068 FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
3069 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (vect_dr_stmt (dr)))
3070 && !vect_analyze_data_ref_access (dr))
3072 if (dump_enabled_p ())
3073 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3074 "not vectorized: complicated access pattern.\n");
3076 if (is_a <bb_vec_info> (vinfo))
3078 /* Mark the statement as not vectorizable. */
3079 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (vect_dr_stmt (dr))) = false;
3080 continue;
3082 else
3084 datarefs_copy.release ();
3085 return false;
3089 datarefs_copy.release ();
3090 return true;
3093 /* Function vect_vfa_segment_size.
3095 Input:
3096 DR: The data reference.
3097 LENGTH_FACTOR: segment length to consider.
3099 Return a value suitable for the dr_with_seg_len::seg_len field.
3100 This is the "distance travelled" by the pointer from the first
3101 iteration in the segment to the last. Note that it does not include
3102 the size of the access; in effect it only describes the first byte. */
3104 static tree
3105 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
3107 length_factor = size_binop (MINUS_EXPR,
3108 fold_convert (sizetype, length_factor),
3109 size_one_node);
3110 return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr)),
3111 length_factor);
3114 /* Return a value that, when added to abs (vect_vfa_segment_size (dr)),
3115 gives the worst-case number of bytes covered by the segment. */
3117 static unsigned HOST_WIDE_INT
3118 vect_vfa_access_size (data_reference *dr)
3120 stmt_vec_info stmt_vinfo = vinfo_for_stmt (vect_dr_stmt (dr));
3121 tree ref_type = TREE_TYPE (DR_REF (dr));
3122 unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type));
3123 unsigned HOST_WIDE_INT access_size = ref_size;
3124 if (DR_GROUP_FIRST_ELEMENT (stmt_vinfo))
3126 gcc_assert (DR_GROUP_FIRST_ELEMENT (stmt_vinfo) == vect_dr_stmt (dr));
3127 access_size *= DR_GROUP_SIZE (stmt_vinfo) - DR_GROUP_GAP (stmt_vinfo);
3129 if (STMT_VINFO_VEC_STMT (stmt_vinfo)
3130 && (vect_supportable_dr_alignment (dr, false)
3131 == dr_explicit_realign_optimized))
3133 /* We might access a full vector's worth. */
3134 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
3135 access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size;
3137 return access_size;
3140 /* Get the minimum alignment for all the scalar accesses that DR describes. */
3142 static unsigned int
3143 vect_vfa_align (const data_reference *dr)
3145 return TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr)));
3148 /* Function vect_no_alias_p.
3150 Given data references A and B with equal base and offset, see whether
3151 the alias relation can be decided at compilation time. Return 1 if
3152 it can and the references alias, 0 if it can and the references do
3153 not alias, and -1 if we cannot decide at compile time. SEGMENT_LENGTH_A,
3154 SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
3155 of dr_with_seg_len::{seg_len,access_size} for A and B. */
3157 static int
3158 vect_compile_time_alias (struct data_reference *a, struct data_reference *b,
3159 tree segment_length_a, tree segment_length_b,
3160 unsigned HOST_WIDE_INT access_size_a,
3161 unsigned HOST_WIDE_INT access_size_b)
3163 poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a));
3164 poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b));
3165 poly_uint64 const_length_a;
3166 poly_uint64 const_length_b;
3168 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3169 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3170 [a, a+12) */
3171 if (tree_int_cst_compare (DR_STEP (a), size_zero_node) < 0)
3173 const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
3174 offset_a = (offset_a + access_size_a) - const_length_a;
3176 else
3177 const_length_a = tree_to_poly_uint64 (segment_length_a);
3178 if (tree_int_cst_compare (DR_STEP (b), size_zero_node) < 0)
3180 const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
3181 offset_b = (offset_b + access_size_b) - const_length_b;
3183 else
3184 const_length_b = tree_to_poly_uint64 (segment_length_b);
3186 const_length_a += access_size_a;
3187 const_length_b += access_size_b;
3189 if (ranges_known_overlap_p (offset_a, const_length_a,
3190 offset_b, const_length_b))
3191 return 1;
3193 if (!ranges_maybe_overlap_p (offset_a, const_length_a,
3194 offset_b, const_length_b))
3195 return 0;
3197 return -1;
3200 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3201 in DDR is >= VF. */
3203 static bool
3204 dependence_distance_ge_vf (data_dependence_relation *ddr,
3205 unsigned int loop_depth, poly_uint64 vf)
3207 if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3208 || DDR_NUM_DIST_VECTS (ddr) == 0)
3209 return false;
3211 /* If the dependence is exact, we should have limited the VF instead. */
3212 gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3214 unsigned int i;
3215 lambda_vector dist_v;
3216 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3218 HOST_WIDE_INT dist = dist_v[loop_depth];
3219 if (dist != 0
3220 && !(dist > 0 && DDR_REVERSED_P (ddr))
3221 && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
3222 return false;
3225 if (dump_enabled_p ())
3227 dump_printf_loc (MSG_NOTE, vect_location,
3228 "dependence distance between ");
3229 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
3230 dump_printf (MSG_NOTE, " and ");
3231 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
3232 dump_printf (MSG_NOTE, " is >= VF\n");
3235 return true;
3238 /* Dump LOWER_BOUND using flags DUMP_KIND. Dumps are known to be enabled. */
3240 static void
3241 dump_lower_bound (dump_flags_t dump_kind, const vec_lower_bound &lower_bound)
3243 dump_printf (dump_kind, "%s (", lower_bound.unsigned_p ? "unsigned" : "abs");
3244 dump_generic_expr (dump_kind, TDF_SLIM, lower_bound.expr);
3245 dump_printf (dump_kind, ") >= ");
3246 dump_dec (dump_kind, lower_bound.min_value);
3249 /* Record that the vectorized loop requires the vec_lower_bound described
3250 by EXPR, UNSIGNED_P and MIN_VALUE. */
3252 static void
3253 vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p,
3254 poly_uint64 min_value)
3256 vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3257 for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3258 if (operand_equal_p (lower_bounds[i].expr, expr, 0))
3260 unsigned_p &= lower_bounds[i].unsigned_p;
3261 min_value = upper_bound (lower_bounds[i].min_value, min_value);
3262 if (lower_bounds[i].unsigned_p != unsigned_p
3263 || maybe_lt (lower_bounds[i].min_value, min_value))
3265 lower_bounds[i].unsigned_p = unsigned_p;
3266 lower_bounds[i].min_value = min_value;
3267 if (dump_enabled_p ())
3269 dump_printf_loc (MSG_NOTE, vect_location,
3270 "updating run-time check to ");
3271 dump_lower_bound (MSG_NOTE, lower_bounds[i]);
3272 dump_printf (MSG_NOTE, "\n");
3275 return;
3278 vec_lower_bound lower_bound (expr, unsigned_p, min_value);
3279 if (dump_enabled_p ())
3281 dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that ");
3282 dump_lower_bound (MSG_NOTE, lower_bound);
3283 dump_printf (MSG_NOTE, "\n");
3285 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (lower_bound);
3288 /* Return true if it's unlikely that the step of the vectorized form of DR
3289 will span fewer than GAP bytes. */
3291 static bool
3292 vect_small_gap_p (loop_vec_info loop_vinfo, data_reference *dr, poly_int64 gap)
3294 stmt_vec_info stmt_info = vinfo_for_stmt (vect_dr_stmt (dr));
3295 HOST_WIDE_INT count
3296 = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
3297 if (DR_GROUP_FIRST_ELEMENT (stmt_info))
3298 count *= DR_GROUP_SIZE (vinfo_for_stmt (DR_GROUP_FIRST_ELEMENT (stmt_info)));
3299 return estimated_poly_value (gap) <= count * vect_get_scalar_dr_size (dr);
3302 /* Return true if we know that there is no alias between DR_A and DR_B
3303 when abs (DR_STEP (DR_A)) >= N for some N. When returning true, set
3304 *LOWER_BOUND_OUT to this N. */
3306 static bool
3307 vectorizable_with_step_bound_p (data_reference *dr_a, data_reference *dr_b,
3308 poly_uint64 *lower_bound_out)
3310 /* Check that there is a constant gap of known sign between DR_A
3311 and DR_B. */
3312 poly_int64 init_a, init_b;
3313 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0)
3314 || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0)
3315 || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)
3316 || !poly_int_tree_p (DR_INIT (dr_a), &init_a)
3317 || !poly_int_tree_p (DR_INIT (dr_b), &init_b)
3318 || !ordered_p (init_a, init_b))
3319 return false;
3321 /* Sort DR_A and DR_B by the address they access. */
3322 if (maybe_lt (init_b, init_a))
3324 std::swap (init_a, init_b);
3325 std::swap (dr_a, dr_b);
3328 /* If the two accesses could be dependent within a scalar iteration,
3329 make sure that we'd retain their order. */
3330 if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_a), init_b)
3331 && !vect_preserves_scalar_order_p (vect_dr_stmt (dr_a),
3332 vect_dr_stmt (dr_b)))
3333 return false;
3335 /* There is no alias if abs (DR_STEP) is greater than or equal to
3336 the bytes spanned by the combination of the two accesses. */
3337 *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_b) - init_a;
3338 return true;
3341 /* Function vect_prune_runtime_alias_test_list.
3343 Prune a list of ddrs to be tested at run-time by versioning for alias.
3344 Merge several alias checks into one if possible.
3345 Return FALSE if resulting list of ddrs is longer then allowed by
3346 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
3348 bool
3349 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3351 typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3352 hash_set <tree_pair_hash> compared_objects;
3354 vec<ddr_p> may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3355 vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3356 = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3357 vec<vec_object_pair> &check_unequal_addrs
3358 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3359 poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3360 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3362 ddr_p ddr;
3363 unsigned int i;
3364 tree length_factor;
3366 DUMP_VECT_SCOPE ("vect_prune_runtime_alias_test_list");
3368 /* Step values are irrelevant for aliasing if the number of vector
3369 iterations is equal to the number of scalar iterations (which can
3370 happen for fully-SLP loops). */
3371 bool ignore_step_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U);
3373 if (!ignore_step_p)
3375 /* Convert the checks for nonzero steps into bound tests. */
3376 tree value;
3377 FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value)
3378 vect_check_lower_bound (loop_vinfo, value, true, 1);
3381 if (may_alias_ddrs.is_empty ())
3382 return true;
3384 comp_alias_ddrs.create (may_alias_ddrs.length ());
3386 unsigned int loop_depth
3387 = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3388 LOOP_VINFO_LOOP_NEST (loop_vinfo));
3390 /* First, we collect all data ref pairs for aliasing checks. */
3391 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3393 int comp_res;
3394 poly_uint64 lower_bound;
3395 struct data_reference *dr_a, *dr_b;
3396 gimple *dr_group_first_a, *dr_group_first_b;
3397 tree segment_length_a, segment_length_b;
3398 unsigned HOST_WIDE_INT access_size_a, access_size_b;
3399 unsigned int align_a, align_b;
3400 gimple *stmt_a, *stmt_b;
3402 /* Ignore the alias if the VF we chose ended up being no greater
3403 than the dependence distance. */
3404 if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3405 continue;
3407 if (DDR_OBJECT_A (ddr))
3409 vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3410 if (!compared_objects.add (new_pair))
3412 if (dump_enabled_p ())
3414 dump_printf_loc (MSG_NOTE, vect_location, "checking that ");
3415 dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.first);
3416 dump_printf (MSG_NOTE, " and ");
3417 dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.second);
3418 dump_printf (MSG_NOTE, " have different addresses\n");
3420 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3422 continue;
3425 dr_a = DDR_A (ddr);
3426 stmt_a = vect_dr_stmt (DDR_A (ddr));
3428 dr_b = DDR_B (ddr);
3429 stmt_b = vect_dr_stmt (DDR_B (ddr));
3431 /* Skip the pair if inter-iteration dependencies are irrelevant
3432 and intra-iteration dependencies are guaranteed to be honored. */
3433 if (ignore_step_p
3434 && (vect_preserves_scalar_order_p (stmt_a, stmt_b)
3435 || vectorizable_with_step_bound_p (dr_a, dr_b, &lower_bound)))
3437 if (dump_enabled_p ())
3439 dump_printf_loc (MSG_NOTE, vect_location,
3440 "no need for alias check between ");
3441 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
3442 dump_printf (MSG_NOTE, " and ");
3443 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
3444 dump_printf (MSG_NOTE, " when VF is 1\n");
3446 continue;
3449 /* See whether we can handle the alias using a bounds check on
3450 the step, and whether that's likely to be the best approach.
3451 (It might not be, for example, if the minimum step is much larger
3452 than the number of bytes handled by one vector iteration.) */
3453 if (!ignore_step_p
3454 && TREE_CODE (DR_STEP (dr_a)) != INTEGER_CST
3455 && vectorizable_with_step_bound_p (dr_a, dr_b, &lower_bound)
3456 && (vect_small_gap_p (loop_vinfo, dr_a, lower_bound)
3457 || vect_small_gap_p (loop_vinfo, dr_b, lower_bound)))
3459 bool unsigned_p = dr_known_forward_stride_p (dr_a);
3460 if (dump_enabled_p ())
3462 dump_printf_loc (MSG_NOTE, vect_location, "no alias between ");
3463 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
3464 dump_printf (MSG_NOTE, " and ");
3465 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
3466 dump_printf (MSG_NOTE, " when the step ");
3467 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_STEP (dr_a));
3468 dump_printf (MSG_NOTE, " is outside ");
3469 if (unsigned_p)
3470 dump_printf (MSG_NOTE, "[0");
3471 else
3473 dump_printf (MSG_NOTE, "(");
3474 dump_dec (MSG_NOTE, poly_int64 (-lower_bound));
3476 dump_printf (MSG_NOTE, ", ");
3477 dump_dec (MSG_NOTE, lower_bound);
3478 dump_printf (MSG_NOTE, ")\n");
3480 vect_check_lower_bound (loop_vinfo, DR_STEP (dr_a), unsigned_p,
3481 lower_bound);
3482 continue;
3485 dr_group_first_a = DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
3486 if (dr_group_first_a)
3488 stmt_a = dr_group_first_a;
3489 dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
3492 dr_group_first_b = DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
3493 if (dr_group_first_b)
3495 stmt_b = dr_group_first_b;
3496 dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
3499 if (ignore_step_p)
3501 segment_length_a = size_zero_node;
3502 segment_length_b = size_zero_node;
3504 else
3506 if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
3507 length_factor = scalar_loop_iters;
3508 else
3509 length_factor = size_int (vect_factor);
3510 segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
3511 segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
3513 access_size_a = vect_vfa_access_size (dr_a);
3514 access_size_b = vect_vfa_access_size (dr_b);
3515 align_a = vect_vfa_align (dr_a);
3516 align_b = vect_vfa_align (dr_b);
3518 comp_res = data_ref_compare_tree (DR_BASE_ADDRESS (dr_a),
3519 DR_BASE_ADDRESS (dr_b));
3520 if (comp_res == 0)
3521 comp_res = data_ref_compare_tree (DR_OFFSET (dr_a),
3522 DR_OFFSET (dr_b));
3524 /* See whether the alias is known at compilation time. */
3525 if (comp_res == 0
3526 && TREE_CODE (DR_STEP (dr_a)) == INTEGER_CST
3527 && TREE_CODE (DR_STEP (dr_b)) == INTEGER_CST
3528 && poly_int_tree_p (segment_length_a)
3529 && poly_int_tree_p (segment_length_b))
3531 int res = vect_compile_time_alias (dr_a, dr_b,
3532 segment_length_a,
3533 segment_length_b,
3534 access_size_a,
3535 access_size_b);
3536 if (res >= 0 && dump_enabled_p ())
3538 dump_printf_loc (MSG_NOTE, vect_location,
3539 "can tell at compile time that ");
3540 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
3541 dump_printf (MSG_NOTE, " and ");
3542 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
3543 if (res == 0)
3544 dump_printf (MSG_NOTE, " do not alias\n");
3545 else
3546 dump_printf (MSG_NOTE, " alias\n");
3549 if (res == 0)
3550 continue;
3552 if (res == 1)
3554 if (dump_enabled_p ())
3555 dump_printf_loc (MSG_NOTE, vect_location,
3556 "not vectorized: compilation time alias.\n");
3557 return false;
3561 dr_with_seg_len_pair_t dr_with_seg_len_pair
3562 (dr_with_seg_len (dr_a, segment_length_a, access_size_a, align_a),
3563 dr_with_seg_len (dr_b, segment_length_b, access_size_b, align_b));
3565 /* Canonicalize pairs by sorting the two DR members. */
3566 if (comp_res > 0)
3567 std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
3569 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3572 prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);
3574 unsigned int count = (comp_alias_ddrs.length ()
3575 + check_unequal_addrs.length ());
3577 dump_printf_loc (MSG_NOTE, vect_location,
3578 "improved number of alias checks from %d to %d\n",
3579 may_alias_ddrs.length (), count);
3580 if ((int) count > PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
3582 if (dump_enabled_p ())
3583 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3584 "number of versioning for alias "
3585 "run-time tests exceeds %d "
3586 "(--param vect-max-version-for-alias-checks)\n",
3587 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
3588 return false;
3591 return true;
3594 /* Check whether we can use an internal function for a gather load
3595 or scatter store. READ_P is true for loads and false for stores.
3596 MASKED_P is true if the load or store is conditional. MEMORY_TYPE is
3597 the type of the memory elements being loaded or stored. OFFSET_BITS
3598 is the number of bits in each scalar offset and OFFSET_SIGN is the
3599 sign of the offset. SCALE is the amount by which the offset should
3600 be multiplied *after* it has been converted to address width.
3602 Return true if the function is supported, storing the function
3603 id in *IFN_OUT and the type of a vector element in *ELEMENT_TYPE_OUT. */
3605 bool
3606 vect_gather_scatter_fn_p (bool read_p, bool masked_p, tree vectype,
3607 tree memory_type, unsigned int offset_bits,
3608 signop offset_sign, int scale,
3609 internal_fn *ifn_out, tree *element_type_out)
3611 unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
3612 unsigned int element_bits = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (vectype)));
3613 if (offset_bits > element_bits)
3614 /* Internal functions require the offset to be the same width as
3615 the vector elements. We can extend narrower offsets, but it isn't
3616 safe to truncate wider offsets. */
3617 return false;
3619 if (element_bits != memory_bits)
3620 /* For now the vector elements must be the same width as the
3621 memory elements. */
3622 return false;
3624 /* Work out which function we need. */
3625 internal_fn ifn;
3626 if (read_p)
3627 ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
3628 else
3629 ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
3631 /* Test whether the target supports this combination. */
3632 if (!internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
3633 offset_sign, scale))
3634 return false;
3636 *ifn_out = ifn;
3637 *element_type_out = TREE_TYPE (vectype);
3638 return true;
3641 /* CALL is a call to an internal gather load or scatter store function.
3642 Describe the operation in INFO. */
3644 static void
3645 vect_describe_gather_scatter_call (gcall *call, gather_scatter_info *info)
3647 stmt_vec_info stmt_info = vinfo_for_stmt (call);
3648 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3649 data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3651 info->ifn = gimple_call_internal_fn (call);
3652 info->decl = NULL_TREE;
3653 info->base = gimple_call_arg (call, 0);
3654 info->offset = gimple_call_arg (call, 1);
3655 info->offset_dt = vect_unknown_def_type;
3656 info->offset_vectype = NULL_TREE;
3657 info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
3658 info->element_type = TREE_TYPE (vectype);
3659 info->memory_type = TREE_TYPE (DR_REF (dr));
3662 /* Return true if a non-affine read or write in STMT is suitable for a
3663 gather load or scatter store. Describe the operation in *INFO if so. */
3665 bool
3666 vect_check_gather_scatter (gimple *stmt, loop_vec_info loop_vinfo,
3667 gather_scatter_info *info)
3669 HOST_WIDE_INT scale = 1;
3670 poly_int64 pbitpos, pbitsize;
3671 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3672 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3673 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3674 tree offtype = NULL_TREE;
3675 tree decl = NULL_TREE, base, off;
3676 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3677 tree memory_type = TREE_TYPE (DR_REF (dr));
3678 machine_mode pmode;
3679 int punsignedp, reversep, pvolatilep = 0;
3680 internal_fn ifn;
3681 tree element_type;
3682 bool masked_p = false;
3684 /* See whether this is already a call to a gather/scatter internal function.
3685 If not, see whether it's a masked load or store. */
3686 gcall *call = dyn_cast <gcall *> (stmt);
3687 if (call && gimple_call_internal_p (call))
3689 ifn = gimple_call_internal_fn (stmt);
3690 if (internal_gather_scatter_fn_p (ifn))
3692 vect_describe_gather_scatter_call (call, info);
3693 return true;
3695 masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE);
3698 /* True if we should aim to use internal functions rather than
3699 built-in functions. */
3700 bool use_ifn_p = (DR_IS_READ (dr)
3701 ? supports_vec_gather_load_p ()
3702 : supports_vec_scatter_store_p ());
3704 base = DR_REF (dr);
3705 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3706 see if we can use the def stmt of the address. */
3707 if (masked_p
3708 && TREE_CODE (base) == MEM_REF
3709 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3710 && integer_zerop (TREE_OPERAND (base, 1))
3711 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3713 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3714 if (is_gimple_assign (def_stmt)
3715 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3716 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3719 /* The gather and scatter builtins need address of the form
3720 loop_invariant + vector * {1, 2, 4, 8}
3722 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3723 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3724 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3725 multiplications and additions in it. To get a vector, we need
3726 a single SSA_NAME that will be defined in the loop and will
3727 contain everything that is not loop invariant and that can be
3728 vectorized. The following code attempts to find such a preexistng
3729 SSA_NAME OFF and put the loop invariants into a tree BASE
3730 that can be gimplified before the loop. */
3731 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3732 &punsignedp, &reversep, &pvolatilep);
3733 if (reversep)
3734 return false;
3736 poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
3738 if (TREE_CODE (base) == MEM_REF)
3740 if (!integer_zerop (TREE_OPERAND (base, 1)))
3742 if (off == NULL_TREE)
3743 off = wide_int_to_tree (sizetype, mem_ref_offset (base));
3744 else
3745 off = size_binop (PLUS_EXPR, off,
3746 fold_convert (sizetype, TREE_OPERAND (base, 1)));
3748 base = TREE_OPERAND (base, 0);
3750 else
3751 base = build_fold_addr_expr (base);
3753 if (off == NULL_TREE)
3754 off = size_zero_node;
3756 /* If base is not loop invariant, either off is 0, then we start with just
3757 the constant offset in the loop invariant BASE and continue with base
3758 as OFF, otherwise give up.
3759 We could handle that case by gimplifying the addition of base + off
3760 into some SSA_NAME and use that as off, but for now punt. */
3761 if (!expr_invariant_in_loop_p (loop, base))
3763 if (!integer_zerop (off))
3764 return false;
3765 off = base;
3766 base = size_int (pbytepos);
3768 /* Otherwise put base + constant offset into the loop invariant BASE
3769 and continue with OFF. */
3770 else
3772 base = fold_convert (sizetype, base);
3773 base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
3776 /* OFF at this point may be either a SSA_NAME or some tree expression
3777 from get_inner_reference. Try to peel off loop invariants from it
3778 into BASE as long as possible. */
3779 STRIP_NOPS (off);
3780 while (offtype == NULL_TREE)
3782 enum tree_code code;
3783 tree op0, op1, add = NULL_TREE;
3785 if (TREE_CODE (off) == SSA_NAME)
3787 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3789 if (expr_invariant_in_loop_p (loop, off))
3790 return false;
3792 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3793 break;
3795 op0 = gimple_assign_rhs1 (def_stmt);
3796 code = gimple_assign_rhs_code (def_stmt);
3797 op1 = gimple_assign_rhs2 (def_stmt);
3799 else
3801 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3802 return false;
3803 code = TREE_CODE (off);
3804 extract_ops_from_tree (off, &code, &op0, &op1);
3806 switch (code)
3808 case POINTER_PLUS_EXPR:
3809 case PLUS_EXPR:
3810 if (expr_invariant_in_loop_p (loop, op0))
3812 add = op0;
3813 off = op1;
3814 do_add:
3815 add = fold_convert (sizetype, add);
3816 if (scale != 1)
3817 add = size_binop (MULT_EXPR, add, size_int (scale));
3818 base = size_binop (PLUS_EXPR, base, add);
3819 continue;
3821 if (expr_invariant_in_loop_p (loop, op1))
3823 add = op1;
3824 off = op0;
3825 goto do_add;
3827 break;
3828 case MINUS_EXPR:
3829 if (expr_invariant_in_loop_p (loop, op1))
3831 add = fold_convert (sizetype, op1);
3832 add = size_binop (MINUS_EXPR, size_zero_node, add);
3833 off = op0;
3834 goto do_add;
3836 break;
3837 case MULT_EXPR:
3838 if (scale == 1 && tree_fits_shwi_p (op1))
3840 int new_scale = tree_to_shwi (op1);
3841 /* Only treat this as a scaling operation if the target
3842 supports it. */
3843 if (use_ifn_p
3844 && !vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p,
3845 vectype, memory_type, 1,
3846 TYPE_SIGN (TREE_TYPE (op0)),
3847 new_scale, &ifn,
3848 &element_type))
3849 break;
3850 scale = new_scale;
3851 off = op0;
3852 continue;
3854 break;
3855 case SSA_NAME:
3856 off = op0;
3857 continue;
3858 CASE_CONVERT:
3859 if (!POINTER_TYPE_P (TREE_TYPE (op0))
3860 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3861 break;
3862 if (TYPE_PRECISION (TREE_TYPE (op0))
3863 == TYPE_PRECISION (TREE_TYPE (off)))
3865 off = op0;
3866 continue;
3869 /* The internal functions need the offset to be the same width
3870 as the elements of VECTYPE. Don't include operations that
3871 cast the offset from that width to a different width. */
3872 if (use_ifn_p
3873 && (int_size_in_bytes (TREE_TYPE (vectype))
3874 == int_size_in_bytes (TREE_TYPE (off))))
3875 break;
3877 if (TYPE_PRECISION (TREE_TYPE (op0))
3878 < TYPE_PRECISION (TREE_TYPE (off)))
3880 off = op0;
3881 offtype = TREE_TYPE (off);
3882 STRIP_NOPS (off);
3883 continue;
3885 break;
3886 default:
3887 break;
3889 break;
3892 /* If at the end OFF still isn't a SSA_NAME or isn't
3893 defined in the loop, punt. */
3894 if (TREE_CODE (off) != SSA_NAME
3895 || expr_invariant_in_loop_p (loop, off))
3896 return false;
3898 if (offtype == NULL_TREE)
3899 offtype = TREE_TYPE (off);
3901 if (use_ifn_p)
3903 if (!vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p, vectype,
3904 memory_type, TYPE_PRECISION (offtype),
3905 TYPE_SIGN (offtype), scale, &ifn,
3906 &element_type))
3907 return false;
3909 else
3911 if (DR_IS_READ (dr))
3913 if (targetm.vectorize.builtin_gather)
3914 decl = targetm.vectorize.builtin_gather (vectype, offtype, scale);
3916 else
3918 if (targetm.vectorize.builtin_scatter)
3919 decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale);
3922 if (!decl)
3923 return false;
3925 ifn = IFN_LAST;
3926 element_type = TREE_TYPE (vectype);
3929 info->ifn = ifn;
3930 info->decl = decl;
3931 info->base = base;
3932 info->offset = off;
3933 info->offset_dt = vect_unknown_def_type;
3934 info->offset_vectype = NULL_TREE;
3935 info->scale = scale;
3936 info->element_type = element_type;
3937 info->memory_type = memory_type;
3938 return true;
3941 /* Find the data references in STMT, analyze them with respect to LOOP and
3942 append them to DATAREFS. Return false if datarefs in this stmt cannot
3943 be handled. */
3945 bool
3946 vect_find_stmt_data_reference (loop_p loop, gimple *stmt,
3947 vec<data_reference_p> *datarefs)
3949 /* We can ignore clobbers for dataref analysis - they are removed during
3950 loop vectorization and BB vectorization checks dependences with a
3951 stmt walk. */
3952 if (gimple_clobber_p (stmt))
3953 return true;
3955 if (gimple_has_volatile_ops (stmt))
3957 if (dump_enabled_p ())
3959 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3960 "not vectorized: volatile type ");
3961 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3963 return false;
3966 if (stmt_can_throw_internal (stmt))
3968 if (dump_enabled_p ())
3970 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3971 "not vectorized: statement can throw an "
3972 "exception ");
3973 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3975 return false;
3978 auto_vec<data_reference_p, 2> refs;
3979 if (!find_data_references_in_stmt (loop, stmt, &refs))
3980 return false;
3982 if (refs.is_empty ())
3983 return true;
3985 if (refs.length () > 1)
3987 if (dump_enabled_p ())
3989 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3990 "not vectorized: more than one data ref "
3991 "in stmt: ");
3992 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3994 return false;
3997 if (gcall *call = dyn_cast <gcall *> (stmt))
3998 if (!gimple_call_internal_p (call)
3999 || (gimple_call_internal_fn (call) != IFN_MASK_LOAD
4000 && gimple_call_internal_fn (call) != IFN_MASK_STORE))
4002 if (dump_enabled_p ())
4004 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4005 "not vectorized: dr in a call ");
4006 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4008 return false;
4011 data_reference_p dr = refs.pop ();
4012 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
4013 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
4015 if (dump_enabled_p ())
4017 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4018 "not vectorized: statement is bitfield "
4019 "access ");
4020 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4022 return false;
4025 if (DR_BASE_ADDRESS (dr)
4026 && TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
4028 if (dump_enabled_p ())
4029 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4030 "not vectorized: base addr of dr is a "
4031 "constant\n");
4032 return false;
4035 datarefs->safe_push (dr);
4036 return true;
4039 /* Function vect_analyze_data_refs.
4041 Find all the data references in the loop or basic block.
4043 The general structure of the analysis of data refs in the vectorizer is as
4044 follows:
4045 1- vect_analyze_data_refs(loop/bb): call
4046 compute_data_dependences_for_loop/bb to find and analyze all data-refs
4047 in the loop/bb and their dependences.
4048 2- vect_analyze_dependences(): apply dependence testing using ddrs.
4049 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
4050 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
4054 bool
4055 vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf)
4057 struct loop *loop = NULL;
4058 unsigned int i;
4059 struct data_reference *dr;
4060 tree scalar_type;
4062 DUMP_VECT_SCOPE ("vect_analyze_data_refs");
4064 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4065 loop = LOOP_VINFO_LOOP (loop_vinfo);
4067 /* Go through the data-refs, check that the analysis succeeded. Update
4068 pointer from stmt_vec_info struct to DR and vectype. */
4070 vec<data_reference_p> datarefs = vinfo->datarefs;
4071 FOR_EACH_VEC_ELT (datarefs, i, dr)
4073 gimple *stmt;
4074 stmt_vec_info stmt_info;
4075 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
4076 bool simd_lane_access = false;
4077 poly_uint64 vf;
4079 gcc_assert (DR_REF (dr));
4080 stmt = vect_dr_stmt (dr);
4081 stmt_info = vinfo_for_stmt (stmt);
4083 /* Check that analysis of the data-ref succeeded. */
4084 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
4085 || !DR_STEP (dr))
4087 bool maybe_gather
4088 = DR_IS_READ (dr)
4089 && !TREE_THIS_VOLATILE (DR_REF (dr))
4090 && (targetm.vectorize.builtin_gather != NULL
4091 || supports_vec_gather_load_p ());
4092 bool maybe_scatter
4093 = DR_IS_WRITE (dr)
4094 && !TREE_THIS_VOLATILE (DR_REF (dr))
4095 && (targetm.vectorize.builtin_scatter != NULL
4096 || supports_vec_scatter_store_p ());
4097 bool maybe_simd_lane_access
4098 = is_a <loop_vec_info> (vinfo) && loop->simduid;
4100 /* If target supports vector gather loads or scatter stores, or if
4101 this might be a SIMD lane access, see if they can't be used. */
4102 if (is_a <loop_vec_info> (vinfo)
4103 && !nested_in_vect_loop_p (loop, stmt))
4105 if (maybe_simd_lane_access)
4107 struct data_reference *newdr
4108 = create_data_ref (NULL, loop_containing_stmt (stmt),
4109 DR_REF (dr), stmt, !maybe_scatter,
4110 DR_IS_CONDITIONAL_IN_STMT (dr));
4111 gcc_assert (newdr != NULL && DR_REF (newdr));
4112 if (DR_BASE_ADDRESS (newdr)
4113 && DR_OFFSET (newdr)
4114 && DR_INIT (newdr)
4115 && DR_STEP (newdr)
4116 && integer_zerop (DR_STEP (newdr)))
4118 tree off = DR_OFFSET (newdr);
4119 STRIP_NOPS (off);
4120 if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
4121 && TREE_CODE (off) == MULT_EXPR
4122 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
4124 tree step = TREE_OPERAND (off, 1);
4125 off = TREE_OPERAND (off, 0);
4126 STRIP_NOPS (off);
4127 if (CONVERT_EXPR_P (off)
4128 && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
4129 0)))
4130 < TYPE_PRECISION (TREE_TYPE (off)))
4131 off = TREE_OPERAND (off, 0);
4132 if (TREE_CODE (off) == SSA_NAME)
4134 gimple *def = SSA_NAME_DEF_STMT (off);
4135 tree reft = TREE_TYPE (DR_REF (newdr));
4136 if (is_gimple_call (def)
4137 && gimple_call_internal_p (def)
4138 && (gimple_call_internal_fn (def)
4139 == IFN_GOMP_SIMD_LANE))
4141 tree arg = gimple_call_arg (def, 0);
4142 gcc_assert (TREE_CODE (arg) == SSA_NAME);
4143 arg = SSA_NAME_VAR (arg);
4144 if (arg == loop->simduid
4145 /* For now. */
4146 && tree_int_cst_equal
4147 (TYPE_SIZE_UNIT (reft),
4148 step))
4150 DR_OFFSET (newdr) = ssize_int (0);
4151 DR_STEP (newdr) = step;
4152 DR_OFFSET_ALIGNMENT (newdr)
4153 = BIGGEST_ALIGNMENT;
4154 DR_STEP_ALIGNMENT (newdr)
4155 = highest_pow2_factor (step);
4156 dr = newdr;
4157 simd_lane_access = true;
4163 if (!simd_lane_access)
4164 free_data_ref (newdr);
4166 if (!simd_lane_access && (maybe_gather || maybe_scatter))
4168 if (maybe_gather)
4169 gatherscatter = GATHER;
4170 else
4171 gatherscatter = SCATTER;
4175 if (gatherscatter == SG_NONE && !simd_lane_access)
4177 if (dump_enabled_p ())
4179 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4180 "not vectorized: data ref analysis "
4181 "failed ");
4182 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4184 if (is_a <bb_vec_info> (vinfo))
4186 /* In BB vectorization the ref can still participate
4187 in dependence analysis, we just can't vectorize it. */
4188 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4189 continue;
4191 return false;
4195 tree base = get_base_address (DR_REF (dr));
4196 if (base && VAR_P (base) && DECL_NONALIASED (base))
4198 if (dump_enabled_p ())
4200 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4201 "not vectorized: base object not addressable "
4202 "for stmt: ");
4203 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4205 if (is_a <bb_vec_info> (vinfo))
4207 /* In BB vectorization the ref can still participate
4208 in dependence analysis, we just can't vectorize it. */
4209 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4210 continue;
4212 return false;
4215 if (is_a <loop_vec_info> (vinfo)
4216 && DR_STEP (dr)
4217 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
4219 if (nested_in_vect_loop_p (loop, stmt))
4221 if (dump_enabled_p ())
4223 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4224 "not vectorized: not suitable for strided "
4225 "load ");
4226 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4228 return false;
4230 STMT_VINFO_STRIDED_P (stmt_info) = true;
4233 /* Update DR field in stmt_vec_info struct. */
4235 /* If the dataref is in an inner-loop of the loop that is considered for
4236 for vectorization, we also want to analyze the access relative to
4237 the outer-loop (DR contains information only relative to the
4238 inner-most enclosing loop). We do that by building a reference to the
4239 first location accessed by the inner-loop, and analyze it relative to
4240 the outer-loop. */
4241 if (loop && nested_in_vect_loop_p (loop, stmt))
4243 /* Build a reference to the first location accessed by the
4244 inner loop: *(BASE + INIT + OFFSET). By construction,
4245 this address must be invariant in the inner loop, so we
4246 can consider it as being used in the outer loop. */
4247 tree base = unshare_expr (DR_BASE_ADDRESS (dr));
4248 tree offset = unshare_expr (DR_OFFSET (dr));
4249 tree init = unshare_expr (DR_INIT (dr));
4250 tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
4251 init, offset);
4252 tree init_addr = fold_build_pointer_plus (base, init_offset);
4253 tree init_ref = build_fold_indirect_ref (init_addr);
4255 if (dump_enabled_p ())
4257 dump_printf_loc (MSG_NOTE, vect_location,
4258 "analyze in outer loop: ");
4259 dump_generic_expr (MSG_NOTE, TDF_SLIM, init_ref);
4260 dump_printf (MSG_NOTE, "\n");
4263 if (!dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
4264 init_ref, loop))
4265 /* dr_analyze_innermost already explained the failure. */
4266 return false;
4268 if (dump_enabled_p ())
4270 dump_printf_loc (MSG_NOTE, vect_location,
4271 "\touter base_address: ");
4272 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4273 STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
4274 dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
4275 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4276 STMT_VINFO_DR_OFFSET (stmt_info));
4277 dump_printf (MSG_NOTE,
4278 "\n\touter constant offset from base address: ");
4279 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4280 STMT_VINFO_DR_INIT (stmt_info));
4281 dump_printf (MSG_NOTE, "\n\touter step: ");
4282 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4283 STMT_VINFO_DR_STEP (stmt_info));
4284 dump_printf (MSG_NOTE, "\n\touter base alignment: %d\n",
4285 STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info));
4286 dump_printf (MSG_NOTE, "\n\touter base misalignment: %d\n",
4287 STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info));
4288 dump_printf (MSG_NOTE, "\n\touter offset alignment: %d\n",
4289 STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info));
4290 dump_printf (MSG_NOTE, "\n\touter step alignment: %d\n",
4291 STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
4295 gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
4296 STMT_VINFO_DATA_REF (stmt_info) = dr;
4297 if (simd_lane_access)
4299 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
4300 free_data_ref (datarefs[i]);
4301 datarefs[i] = dr;
4304 /* Set vectype for STMT. */
4305 scalar_type = TREE_TYPE (DR_REF (dr));
4306 STMT_VINFO_VECTYPE (stmt_info)
4307 = get_vectype_for_scalar_type (scalar_type);
4308 if (!STMT_VINFO_VECTYPE (stmt_info))
4310 if (dump_enabled_p ())
4312 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4313 "not vectorized: no vectype for stmt: ");
4314 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4315 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
4316 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
4317 scalar_type);
4318 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
4321 if (is_a <bb_vec_info> (vinfo))
4323 /* No vector type is fine, the ref can still participate
4324 in dependence analysis, we just can't vectorize it. */
4325 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4326 continue;
4329 if (simd_lane_access)
4331 STMT_VINFO_DATA_REF (stmt_info) = NULL;
4332 free_data_ref (dr);
4334 return false;
4336 else
4338 if (dump_enabled_p ())
4340 dump_printf_loc (MSG_NOTE, vect_location,
4341 "got vectype for stmt: ");
4342 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
4343 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4344 STMT_VINFO_VECTYPE (stmt_info));
4345 dump_printf (MSG_NOTE, "\n");
4349 /* Adjust the minimal vectorization factor according to the
4350 vector type. */
4351 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
4352 *min_vf = upper_bound (*min_vf, vf);
4354 if (gatherscatter != SG_NONE)
4356 gather_scatter_info gs_info;
4357 if (!vect_check_gather_scatter (stmt, as_a <loop_vec_info> (vinfo),
4358 &gs_info)
4359 || !get_vectype_for_scalar_type (TREE_TYPE (gs_info.offset)))
4361 if (dump_enabled_p ())
4363 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4364 (gatherscatter == GATHER) ?
4365 "not vectorized: not suitable for gather "
4366 "load " :
4367 "not vectorized: not suitable for scatter "
4368 "store ");
4369 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4371 return false;
4373 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
4377 /* We used to stop processing and prune the list here. Verify we no
4378 longer need to. */
4379 gcc_assert (i == datarefs.length ());
4381 return true;
4385 /* Function vect_get_new_vect_var.
4387 Returns a name for a new variable. The current naming scheme appends the
4388 prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
4389 the name of vectorizer generated variables, and appends that to NAME if
4390 provided. */
4392 tree
4393 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
4395 const char *prefix;
4396 tree new_vect_var;
4398 switch (var_kind)
4400 case vect_simple_var:
4401 prefix = "vect";
4402 break;
4403 case vect_scalar_var:
4404 prefix = "stmp";
4405 break;
4406 case vect_mask_var:
4407 prefix = "mask";
4408 break;
4409 case vect_pointer_var:
4410 prefix = "vectp";
4411 break;
4412 default:
4413 gcc_unreachable ();
4416 if (name)
4418 char* tmp = concat (prefix, "_", name, NULL);
4419 new_vect_var = create_tmp_reg (type, tmp);
4420 free (tmp);
4422 else
4423 new_vect_var = create_tmp_reg (type, prefix);
4425 return new_vect_var;
4428 /* Like vect_get_new_vect_var but return an SSA name. */
4430 tree
4431 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
4433 const char *prefix;
4434 tree new_vect_var;
4436 switch (var_kind)
4438 case vect_simple_var:
4439 prefix = "vect";
4440 break;
4441 case vect_scalar_var:
4442 prefix = "stmp";
4443 break;
4444 case vect_pointer_var:
4445 prefix = "vectp";
4446 break;
4447 default:
4448 gcc_unreachable ();
4451 if (name)
4453 char* tmp = concat (prefix, "_", name, NULL);
4454 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4455 free (tmp);
4457 else
4458 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4460 return new_vect_var;
4463 /* Duplicate ptr info and set alignment/misaligment on NAME from DR. */
4465 static void
4466 vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr)
4468 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr));
4469 int misalign = DR_MISALIGNMENT (dr);
4470 if (misalign == DR_MISALIGNMENT_UNKNOWN)
4471 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4472 else
4473 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name),
4474 DR_TARGET_ALIGNMENT (dr), misalign);
4477 /* Function vect_create_addr_base_for_vector_ref.
4479 Create an expression that computes the address of the first memory location
4480 that will be accessed for a data reference.
4482 Input:
4483 STMT: The statement containing the data reference.
4484 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4485 OFFSET: Optional. If supplied, it is be added to the initial address.
4486 LOOP: Specify relative to which loop-nest should the address be computed.
4487 For example, when the dataref is in an inner-loop nested in an
4488 outer-loop that is now being vectorized, LOOP can be either the
4489 outer-loop, or the inner-loop. The first memory location accessed
4490 by the following dataref ('in' points to short):
4492 for (i=0; i<N; i++)
4493 for (j=0; j<M; j++)
4494 s += in[i+j]
4496 is as follows:
4497 if LOOP=i_loop: &in (relative to i_loop)
4498 if LOOP=j_loop: &in+i*2B (relative to j_loop)
4499 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
4500 initial address. Unlike OFFSET, which is number of elements to
4501 be added, BYTE_OFFSET is measured in bytes.
4503 Output:
4504 1. Return an SSA_NAME whose value is the address of the memory location of
4505 the first vector of the data reference.
4506 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4507 these statement(s) which define the returned SSA_NAME.
4509 FORNOW: We are only handling array accesses with step 1. */
4511 tree
4512 vect_create_addr_base_for_vector_ref (gimple *stmt,
4513 gimple_seq *new_stmt_list,
4514 tree offset,
4515 tree byte_offset)
4517 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4518 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4519 const char *base_name;
4520 tree addr_base;
4521 tree dest;
4522 gimple_seq seq = NULL;
4523 tree vect_ptr_type;
4524 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
4525 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4526 innermost_loop_behavior *drb = vect_dr_behavior (dr);
4528 tree data_ref_base = unshare_expr (drb->base_address);
4529 tree base_offset = unshare_expr (drb->offset);
4530 tree init = unshare_expr (drb->init);
4532 if (loop_vinfo)
4533 base_name = get_name (data_ref_base);
4534 else
4536 base_offset = ssize_int (0);
4537 init = ssize_int (0);
4538 base_name = get_name (DR_REF (dr));
4541 /* Create base_offset */
4542 base_offset = size_binop (PLUS_EXPR,
4543 fold_convert (sizetype, base_offset),
4544 fold_convert (sizetype, init));
4546 if (offset)
4548 offset = fold_build2 (MULT_EXPR, sizetype,
4549 fold_convert (sizetype, offset), step);
4550 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4551 base_offset, offset);
4553 if (byte_offset)
4555 byte_offset = fold_convert (sizetype, byte_offset);
4556 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4557 base_offset, byte_offset);
4560 /* base + base_offset */
4561 if (loop_vinfo)
4562 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4563 else
4565 addr_base = build1 (ADDR_EXPR,
4566 build_pointer_type (TREE_TYPE (DR_REF (dr))),
4567 unshare_expr (DR_REF (dr)));
4570 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4571 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4572 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4573 gimple_seq_add_seq (new_stmt_list, seq);
4575 if (DR_PTR_INFO (dr)
4576 && TREE_CODE (addr_base) == SSA_NAME
4577 && !SSA_NAME_PTR_INFO (addr_base))
4579 vect_duplicate_ssa_name_ptr_info (addr_base, dr);
4580 if (offset || byte_offset)
4581 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4584 if (dump_enabled_p ())
4586 dump_printf_loc (MSG_NOTE, vect_location, "created ");
4587 dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
4588 dump_printf (MSG_NOTE, "\n");
4591 return addr_base;
4595 /* Function vect_create_data_ref_ptr.
4597 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4598 location accessed in the loop by STMT, along with the def-use update
4599 chain to appropriately advance the pointer through the loop iterations.
4600 Also set aliasing information for the pointer. This pointer is used by
4601 the callers to this function to create a memory reference expression for
4602 vector load/store access.
4604 Input:
4605 1. STMT: a stmt that references memory. Expected to be of the form
4606 GIMPLE_ASSIGN <name, data-ref> or
4607 GIMPLE_ASSIGN <data-ref, name>.
4608 2. AGGR_TYPE: the type of the reference, which should be either a vector
4609 or an array.
4610 3. AT_LOOP: the loop where the vector memref is to be created.
4611 4. OFFSET (optional): an offset to be added to the initial address accessed
4612 by the data-ref in STMT.
4613 5. BSI: location where the new stmts are to be placed if there is no loop
4614 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4615 pointing to the initial address.
4616 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4617 to the initial address accessed by the data-ref in STMT. This is
4618 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4619 in bytes.
4620 8. IV_STEP (optional, defaults to NULL): the amount that should be added
4621 to the IV during each iteration of the loop. NULL says to move
4622 by one copy of AGGR_TYPE up or down, depending on the step of the
4623 data reference.
4625 Output:
4626 1. Declare a new ptr to vector_type, and have it point to the base of the
4627 data reference (initial addressed accessed by the data reference).
4628 For example, for vector of type V8HI, the following code is generated:
4630 v8hi *ap;
4631 ap = (v8hi *)initial_address;
4633 if OFFSET is not supplied:
4634 initial_address = &a[init];
4635 if OFFSET is supplied:
4636 initial_address = &a[init + OFFSET];
4637 if BYTE_OFFSET is supplied:
4638 initial_address = &a[init] + BYTE_OFFSET;
4640 Return the initial_address in INITIAL_ADDRESS.
4642 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4643 update the pointer in each iteration of the loop.
4645 Return the increment stmt that updates the pointer in PTR_INCR.
4647 3. Set INV_P to true if the access pattern of the data reference in the
4648 vectorized loop is invariant. Set it to false otherwise.
4650 4. Return the pointer. */
4652 tree
4653 vect_create_data_ref_ptr (gimple *stmt, tree aggr_type, struct loop *at_loop,
4654 tree offset, tree *initial_address,
4655 gimple_stmt_iterator *gsi, gimple **ptr_incr,
4656 bool only_init, bool *inv_p, tree byte_offset,
4657 tree iv_step)
4659 const char *base_name;
4660 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4661 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4662 struct loop *loop = NULL;
4663 bool nested_in_vect_loop = false;
4664 struct loop *containing_loop = NULL;
4665 tree aggr_ptr_type;
4666 tree aggr_ptr;
4667 tree new_temp;
4668 gimple_seq new_stmt_list = NULL;
4669 edge pe = NULL;
4670 basic_block new_bb;
4671 tree aggr_ptr_init;
4672 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4673 tree aptr;
4674 gimple_stmt_iterator incr_gsi;
4675 bool insert_after;
4676 tree indx_before_incr, indx_after_incr;
4677 gimple *incr;
4678 tree step;
4679 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4681 gcc_assert (iv_step != NULL_TREE
4682 || TREE_CODE (aggr_type) == ARRAY_TYPE
4683 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4685 if (loop_vinfo)
4687 loop = LOOP_VINFO_LOOP (loop_vinfo);
4688 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4689 containing_loop = (gimple_bb (stmt))->loop_father;
4690 pe = loop_preheader_edge (loop);
4692 else
4694 gcc_assert (bb_vinfo);
4695 only_init = true;
4696 *ptr_incr = NULL;
4699 /* Check the step (evolution) of the load in LOOP, and record
4700 whether it's invariant. */
4701 step = vect_dr_behavior (dr)->step;
4702 if (integer_zerop (step))
4703 *inv_p = true;
4704 else
4705 *inv_p = false;
4707 /* Create an expression for the first address accessed by this load
4708 in LOOP. */
4709 base_name = get_name (DR_BASE_ADDRESS (dr));
4711 if (dump_enabled_p ())
4713 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4714 dump_printf_loc (MSG_NOTE, vect_location,
4715 "create %s-pointer variable to type: ",
4716 get_tree_code_name (TREE_CODE (aggr_type)));
4717 dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4718 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4719 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4720 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4721 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4722 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4723 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4724 else
4725 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4726 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4727 dump_printf (MSG_NOTE, "\n");
4730 /* (1) Create the new aggregate-pointer variable.
4731 Vector and array types inherit the alias set of their component
4732 type by default so we need to use a ref-all pointer if the data
4733 reference does not conflict with the created aggregated data
4734 reference because it is not addressable. */
4735 bool need_ref_all = false;
4736 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4737 get_alias_set (DR_REF (dr))))
4738 need_ref_all = true;
4739 /* Likewise for any of the data references in the stmt group. */
4740 else if (DR_GROUP_SIZE (stmt_info) > 1)
4742 gimple *orig_stmt = DR_GROUP_FIRST_ELEMENT (stmt_info);
4745 stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
4746 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4747 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4748 get_alias_set (DR_REF (sdr))))
4750 need_ref_all = true;
4751 break;
4753 orig_stmt = DR_GROUP_NEXT_ELEMENT (sinfo);
4755 while (orig_stmt);
4757 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4758 need_ref_all);
4759 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4762 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4763 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4764 def-use update cycles for the pointer: one relative to the outer-loop
4765 (LOOP), which is what steps (3) and (4) below do. The other is relative
4766 to the inner-loop (which is the inner-most loop containing the dataref),
4767 and this is done be step (5) below.
4769 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4770 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4771 redundant. Steps (3),(4) create the following:
4773 vp0 = &base_addr;
4774 LOOP: vp1 = phi(vp0,vp2)
4777 vp2 = vp1 + step
4778 goto LOOP
4780 If there is an inner-loop nested in loop, then step (5) will also be
4781 applied, and an additional update in the inner-loop will be created:
4783 vp0 = &base_addr;
4784 LOOP: vp1 = phi(vp0,vp2)
4786 inner: vp3 = phi(vp1,vp4)
4787 vp4 = vp3 + inner_step
4788 if () goto inner
4790 vp2 = vp1 + step
4791 if () goto LOOP */
4793 /* (2) Calculate the initial address of the aggregate-pointer, and set
4794 the aggregate-pointer to point to it before the loop. */
4796 /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader. */
4798 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
4799 offset, byte_offset);
4800 if (new_stmt_list)
4802 if (pe)
4804 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4805 gcc_assert (!new_bb);
4807 else
4808 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4811 *initial_address = new_temp;
4812 aggr_ptr_init = new_temp;
4814 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4815 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4816 inner-loop nested in LOOP (during outer-loop vectorization). */
4818 /* No update in loop is required. */
4819 if (only_init && (!loop_vinfo || at_loop == loop))
4820 aptr = aggr_ptr_init;
4821 else
4823 if (iv_step == NULL_TREE)
4825 /* The step of the aggregate pointer is the type size. */
4826 iv_step = TYPE_SIZE_UNIT (aggr_type);
4827 /* One exception to the above is when the scalar step of the load in
4828 LOOP is zero. In this case the step here is also zero. */
4829 if (*inv_p)
4830 iv_step = size_zero_node;
4831 else if (tree_int_cst_sgn (step) == -1)
4832 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4835 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4837 create_iv (aggr_ptr_init,
4838 fold_convert (aggr_ptr_type, iv_step),
4839 aggr_ptr, loop, &incr_gsi, insert_after,
4840 &indx_before_incr, &indx_after_incr);
4841 incr = gsi_stmt (incr_gsi);
4842 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4844 /* Copy the points-to information if it exists. */
4845 if (DR_PTR_INFO (dr))
4847 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr);
4848 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr);
4850 if (ptr_incr)
4851 *ptr_incr = incr;
4853 aptr = indx_before_incr;
4856 if (!nested_in_vect_loop || only_init)
4857 return aptr;
4860 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4861 nested in LOOP, if exists. */
4863 gcc_assert (nested_in_vect_loop);
4864 if (!only_init)
4866 standard_iv_increment_position (containing_loop, &incr_gsi,
4867 &insert_after);
4868 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4869 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4870 &indx_after_incr);
4871 incr = gsi_stmt (incr_gsi);
4872 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4874 /* Copy the points-to information if it exists. */
4875 if (DR_PTR_INFO (dr))
4877 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr);
4878 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr);
4880 if (ptr_incr)
4881 *ptr_incr = incr;
4883 return indx_before_incr;
4885 else
4886 gcc_unreachable ();
4890 /* Function bump_vector_ptr
4892 Increment a pointer (to a vector type) by vector-size. If requested,
4893 i.e. if PTR-INCR is given, then also connect the new increment stmt
4894 to the existing def-use update-chain of the pointer, by modifying
4895 the PTR_INCR as illustrated below:
4897 The pointer def-use update-chain before this function:
4898 DATAREF_PTR = phi (p_0, p_2)
4899 ....
4900 PTR_INCR: p_2 = DATAREF_PTR + step
4902 The pointer def-use update-chain after this function:
4903 DATAREF_PTR = phi (p_0, p_2)
4904 ....
4905 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4906 ....
4907 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4909 Input:
4910 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4911 in the loop.
4912 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4913 the loop. The increment amount across iterations is expected
4914 to be vector_size.
4915 BSI - location where the new update stmt is to be placed.
4916 STMT - the original scalar memory-access stmt that is being vectorized.
4917 BUMP - optional. The offset by which to bump the pointer. If not given,
4918 the offset is assumed to be vector_size.
4920 Output: Return NEW_DATAREF_PTR as illustrated above.
4924 tree
4925 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4926 gimple *stmt, tree bump)
4928 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4929 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4930 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4931 tree update = TYPE_SIZE_UNIT (vectype);
4932 gassign *incr_stmt;
4933 ssa_op_iter iter;
4934 use_operand_p use_p;
4935 tree new_dataref_ptr;
4937 if (bump)
4938 update = bump;
4940 if (TREE_CODE (dataref_ptr) == SSA_NAME)
4941 new_dataref_ptr = copy_ssa_name (dataref_ptr);
4942 else
4943 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4944 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4945 dataref_ptr, update);
4946 vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4948 /* Copy the points-to information if it exists. */
4949 if (DR_PTR_INFO (dr))
4951 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4952 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4955 if (!ptr_incr)
4956 return new_dataref_ptr;
4958 /* Update the vector-pointer's cross-iteration increment. */
4959 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4961 tree use = USE_FROM_PTR (use_p);
4963 if (use == dataref_ptr)
4964 SET_USE (use_p, new_dataref_ptr);
4965 else
4966 gcc_assert (operand_equal_p (use, update, 0));
4969 return new_dataref_ptr;
4973 /* Copy memory reference info such as base/clique from the SRC reference
4974 to the DEST MEM_REF. */
4976 void
4977 vect_copy_ref_info (tree dest, tree src)
4979 if (TREE_CODE (dest) != MEM_REF)
4980 return;
4982 tree src_base = src;
4983 while (handled_component_p (src_base))
4984 src_base = TREE_OPERAND (src_base, 0);
4985 if (TREE_CODE (src_base) != MEM_REF
4986 && TREE_CODE (src_base) != TARGET_MEM_REF)
4987 return;
4989 MR_DEPENDENCE_CLIQUE (dest) = MR_DEPENDENCE_CLIQUE (src_base);
4990 MR_DEPENDENCE_BASE (dest) = MR_DEPENDENCE_BASE (src_base);
4994 /* Function vect_create_destination_var.
4996 Create a new temporary of type VECTYPE. */
4998 tree
4999 vect_create_destination_var (tree scalar_dest, tree vectype)
5001 tree vec_dest;
5002 const char *name;
5003 char *new_name;
5004 tree type;
5005 enum vect_var_kind kind;
5007 kind = vectype
5008 ? VECTOR_BOOLEAN_TYPE_P (vectype)
5009 ? vect_mask_var
5010 : vect_simple_var
5011 : vect_scalar_var;
5012 type = vectype ? vectype : TREE_TYPE (scalar_dest);
5014 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
5016 name = get_name (scalar_dest);
5017 if (name)
5018 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
5019 else
5020 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
5021 vec_dest = vect_get_new_vect_var (type, kind, new_name);
5022 free (new_name);
5024 return vec_dest;
5027 /* Function vect_grouped_store_supported.
5029 Returns TRUE if interleave high and interleave low permutations
5030 are supported, and FALSE otherwise. */
5032 bool
5033 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
5035 machine_mode mode = TYPE_MODE (vectype);
5037 /* vect_permute_store_chain requires the group size to be equal to 3 or
5038 be a power of two. */
5039 if (count != 3 && exact_log2 (count) == -1)
5041 if (dump_enabled_p ())
5042 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5043 "the size of the group of accesses"
5044 " is not a power of 2 or not eqaul to 3\n");
5045 return false;
5048 /* Check that the permutation is supported. */
5049 if (VECTOR_MODE_P (mode))
5051 unsigned int i;
5052 if (count == 3)
5054 unsigned int j0 = 0, j1 = 0, j2 = 0;
5055 unsigned int i, j;
5057 unsigned int nelt;
5058 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5060 if (dump_enabled_p ())
5061 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5062 "cannot handle groups of 3 stores for"
5063 " variable-length vectors\n");
5064 return false;
5067 vec_perm_builder sel (nelt, nelt, 1);
5068 sel.quick_grow (nelt);
5069 vec_perm_indices indices;
5070 for (j = 0; j < 3; j++)
5072 int nelt0 = ((3 - j) * nelt) % 3;
5073 int nelt1 = ((3 - j) * nelt + 1) % 3;
5074 int nelt2 = ((3 - j) * nelt + 2) % 3;
5075 for (i = 0; i < nelt; i++)
5077 if (3 * i + nelt0 < nelt)
5078 sel[3 * i + nelt0] = j0++;
5079 if (3 * i + nelt1 < nelt)
5080 sel[3 * i + nelt1] = nelt + j1++;
5081 if (3 * i + nelt2 < nelt)
5082 sel[3 * i + nelt2] = 0;
5084 indices.new_vector (sel, 2, nelt);
5085 if (!can_vec_perm_const_p (mode, indices))
5087 if (dump_enabled_p ())
5088 dump_printf (MSG_MISSED_OPTIMIZATION,
5089 "permutation op not supported by target.\n");
5090 return false;
5093 for (i = 0; i < nelt; i++)
5095 if (3 * i + nelt0 < nelt)
5096 sel[3 * i + nelt0] = 3 * i + nelt0;
5097 if (3 * i + nelt1 < nelt)
5098 sel[3 * i + nelt1] = 3 * i + nelt1;
5099 if (3 * i + nelt2 < nelt)
5100 sel[3 * i + nelt2] = nelt + j2++;
5102 indices.new_vector (sel, 2, nelt);
5103 if (!can_vec_perm_const_p (mode, indices))
5105 if (dump_enabled_p ())
5106 dump_printf (MSG_MISSED_OPTIMIZATION,
5107 "permutation op not supported by target.\n");
5108 return false;
5111 return true;
5113 else
5115 /* If length is not equal to 3 then only power of 2 is supported. */
5116 gcc_assert (pow2p_hwi (count));
5117 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5119 /* The encoding has 2 interleaved stepped patterns. */
5120 vec_perm_builder sel (nelt, 2, 3);
5121 sel.quick_grow (6);
5122 for (i = 0; i < 3; i++)
5124 sel[i * 2] = i;
5125 sel[i * 2 + 1] = i + nelt;
5127 vec_perm_indices indices (sel, 2, nelt);
5128 if (can_vec_perm_const_p (mode, indices))
5130 for (i = 0; i < 6; i++)
5131 sel[i] += exact_div (nelt, 2);
5132 indices.new_vector (sel, 2, nelt);
5133 if (can_vec_perm_const_p (mode, indices))
5134 return true;
5139 if (dump_enabled_p ())
5140 dump_printf (MSG_MISSED_OPTIMIZATION,
5141 "permutaion op not supported by target.\n");
5142 return false;
5146 /* Return TRUE if vec_{mask_}store_lanes is available for COUNT vectors of
5147 type VECTYPE. MASKED_P says whether the masked form is needed. */
5149 bool
5150 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5151 bool masked_p)
5153 if (masked_p)
5154 return vect_lanes_optab_supported_p ("vec_mask_store_lanes",
5155 vec_mask_store_lanes_optab,
5156 vectype, count);
5157 else
5158 return vect_lanes_optab_supported_p ("vec_store_lanes",
5159 vec_store_lanes_optab,
5160 vectype, count);
5164 /* Function vect_permute_store_chain.
5166 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
5167 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
5168 the data correctly for the stores. Return the final references for stores
5169 in RESULT_CHAIN.
5171 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5172 The input is 4 vectors each containing 8 elements. We assign a number to
5173 each element, the input sequence is:
5175 1st vec: 0 1 2 3 4 5 6 7
5176 2nd vec: 8 9 10 11 12 13 14 15
5177 3rd vec: 16 17 18 19 20 21 22 23
5178 4th vec: 24 25 26 27 28 29 30 31
5180 The output sequence should be:
5182 1st vec: 0 8 16 24 1 9 17 25
5183 2nd vec: 2 10 18 26 3 11 19 27
5184 3rd vec: 4 12 20 28 5 13 21 30
5185 4th vec: 6 14 22 30 7 15 23 31
5187 i.e., we interleave the contents of the four vectors in their order.
5189 We use interleave_high/low instructions to create such output. The input of
5190 each interleave_high/low operation is two vectors:
5191 1st vec 2nd vec
5192 0 1 2 3 4 5 6 7
5193 the even elements of the result vector are obtained left-to-right from the
5194 high/low elements of the first vector. The odd elements of the result are
5195 obtained left-to-right from the high/low elements of the second vector.
5196 The output of interleave_high will be: 0 4 1 5
5197 and of interleave_low: 2 6 3 7
5200 The permutation is done in log LENGTH stages. In each stage interleave_high
5201 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5202 where the first argument is taken from the first half of DR_CHAIN and the
5203 second argument from it's second half.
5204 In our example,
5206 I1: interleave_high (1st vec, 3rd vec)
5207 I2: interleave_low (1st vec, 3rd vec)
5208 I3: interleave_high (2nd vec, 4th vec)
5209 I4: interleave_low (2nd vec, 4th vec)
5211 The output for the first stage is:
5213 I1: 0 16 1 17 2 18 3 19
5214 I2: 4 20 5 21 6 22 7 23
5215 I3: 8 24 9 25 10 26 11 27
5216 I4: 12 28 13 29 14 30 15 31
5218 The output of the second stage, i.e. the final result is:
5220 I1: 0 8 16 24 1 9 17 25
5221 I2: 2 10 18 26 3 11 19 27
5222 I3: 4 12 20 28 5 13 21 30
5223 I4: 6 14 22 30 7 15 23 31. */
5225 void
5226 vect_permute_store_chain (vec<tree> dr_chain,
5227 unsigned int length,
5228 gimple *stmt,
5229 gimple_stmt_iterator *gsi,
5230 vec<tree> *result_chain)
5232 tree vect1, vect2, high, low;
5233 gimple *perm_stmt;
5234 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5235 tree perm_mask_low, perm_mask_high;
5236 tree data_ref;
5237 tree perm3_mask_low, perm3_mask_high;
5238 unsigned int i, j, n, log_length = exact_log2 (length);
5240 result_chain->quick_grow (length);
5241 memcpy (result_chain->address (), dr_chain.address (),
5242 length * sizeof (tree));
5244 if (length == 3)
5246 /* vect_grouped_store_supported ensures that this is constant. */
5247 unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5248 unsigned int j0 = 0, j1 = 0, j2 = 0;
5250 vec_perm_builder sel (nelt, nelt, 1);
5251 sel.quick_grow (nelt);
5252 vec_perm_indices indices;
5253 for (j = 0; j < 3; j++)
5255 int nelt0 = ((3 - j) * nelt) % 3;
5256 int nelt1 = ((3 - j) * nelt + 1) % 3;
5257 int nelt2 = ((3 - j) * nelt + 2) % 3;
5259 for (i = 0; i < nelt; i++)
5261 if (3 * i + nelt0 < nelt)
5262 sel[3 * i + nelt0] = j0++;
5263 if (3 * i + nelt1 < nelt)
5264 sel[3 * i + nelt1] = nelt + j1++;
5265 if (3 * i + nelt2 < nelt)
5266 sel[3 * i + nelt2] = 0;
5268 indices.new_vector (sel, 2, nelt);
5269 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5271 for (i = 0; i < nelt; i++)
5273 if (3 * i + nelt0 < nelt)
5274 sel[3 * i + nelt0] = 3 * i + nelt0;
5275 if (3 * i + nelt1 < nelt)
5276 sel[3 * i + nelt1] = 3 * i + nelt1;
5277 if (3 * i + nelt2 < nelt)
5278 sel[3 * i + nelt2] = nelt + j2++;
5280 indices.new_vector (sel, 2, nelt);
5281 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5283 vect1 = dr_chain[0];
5284 vect2 = dr_chain[1];
5286 /* Create interleaving stmt:
5287 low = VEC_PERM_EXPR <vect1, vect2,
5288 {j, nelt, *, j + 1, nelt + j + 1, *,
5289 j + 2, nelt + j + 2, *, ...}> */
5290 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5291 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5292 vect2, perm3_mask_low);
5293 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5295 vect1 = data_ref;
5296 vect2 = dr_chain[2];
5297 /* Create interleaving stmt:
5298 low = VEC_PERM_EXPR <vect1, vect2,
5299 {0, 1, nelt + j, 3, 4, nelt + j + 1,
5300 6, 7, nelt + j + 2, ...}> */
5301 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5302 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5303 vect2, perm3_mask_high);
5304 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5305 (*result_chain)[j] = data_ref;
5308 else
5310 /* If length is not equal to 3 then only power of 2 is supported. */
5311 gcc_assert (pow2p_hwi (length));
5313 /* The encoding has 2 interleaved stepped patterns. */
5314 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5315 vec_perm_builder sel (nelt, 2, 3);
5316 sel.quick_grow (6);
5317 for (i = 0; i < 3; i++)
5319 sel[i * 2] = i;
5320 sel[i * 2 + 1] = i + nelt;
5322 vec_perm_indices indices (sel, 2, nelt);
5323 perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5325 for (i = 0; i < 6; i++)
5326 sel[i] += exact_div (nelt, 2);
5327 indices.new_vector (sel, 2, nelt);
5328 perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5330 for (i = 0, n = log_length; i < n; i++)
5332 for (j = 0; j < length/2; j++)
5334 vect1 = dr_chain[j];
5335 vect2 = dr_chain[j+length/2];
5337 /* Create interleaving stmt:
5338 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
5339 ...}> */
5340 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
5341 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
5342 vect2, perm_mask_high);
5343 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5344 (*result_chain)[2*j] = high;
5346 /* Create interleaving stmt:
5347 low = VEC_PERM_EXPR <vect1, vect2,
5348 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
5349 ...}> */
5350 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
5351 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
5352 vect2, perm_mask_low);
5353 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5354 (*result_chain)[2*j+1] = low;
5356 memcpy (dr_chain.address (), result_chain->address (),
5357 length * sizeof (tree));
5362 /* Function vect_setup_realignment
5364 This function is called when vectorizing an unaligned load using
5365 the dr_explicit_realign[_optimized] scheme.
5366 This function generates the following code at the loop prolog:
5368 p = initial_addr;
5369 x msq_init = *(floor(p)); # prolog load
5370 realignment_token = call target_builtin;
5371 loop:
5372 x msq = phi (msq_init, ---)
5374 The stmts marked with x are generated only for the case of
5375 dr_explicit_realign_optimized.
5377 The code above sets up a new (vector) pointer, pointing to the first
5378 location accessed by STMT, and a "floor-aligned" load using that pointer.
5379 It also generates code to compute the "realignment-token" (if the relevant
5380 target hook was defined), and creates a phi-node at the loop-header bb
5381 whose arguments are the result of the prolog-load (created by this
5382 function) and the result of a load that takes place in the loop (to be
5383 created by the caller to this function).
5385 For the case of dr_explicit_realign_optimized:
5386 The caller to this function uses the phi-result (msq) to create the
5387 realignment code inside the loop, and sets up the missing phi argument,
5388 as follows:
5389 loop:
5390 msq = phi (msq_init, lsq)
5391 lsq = *(floor(p')); # load in loop
5392 result = realign_load (msq, lsq, realignment_token);
5394 For the case of dr_explicit_realign:
5395 loop:
5396 msq = *(floor(p)); # load in loop
5397 p' = p + (VS-1);
5398 lsq = *(floor(p')); # load in loop
5399 result = realign_load (msq, lsq, realignment_token);
5401 Input:
5402 STMT - (scalar) load stmt to be vectorized. This load accesses
5403 a memory location that may be unaligned.
5404 BSI - place where new code is to be inserted.
5405 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5406 is used.
5408 Output:
5409 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5410 target hook, if defined.
5411 Return value - the result of the loop-header phi node. */
5413 tree
5414 vect_setup_realignment (gimple *stmt, gimple_stmt_iterator *gsi,
5415 tree *realignment_token,
5416 enum dr_alignment_support alignment_support_scheme,
5417 tree init_addr,
5418 struct loop **at_loop)
5420 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5421 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5422 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5423 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
5424 struct loop *loop = NULL;
5425 edge pe = NULL;
5426 tree scalar_dest = gimple_assign_lhs (stmt);
5427 tree vec_dest;
5428 gimple *inc;
5429 tree ptr;
5430 tree data_ref;
5431 basic_block new_bb;
5432 tree msq_init = NULL_TREE;
5433 tree new_temp;
5434 gphi *phi_stmt;
5435 tree msq = NULL_TREE;
5436 gimple_seq stmts = NULL;
5437 bool inv_p;
5438 bool compute_in_loop = false;
5439 bool nested_in_vect_loop = false;
5440 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
5441 struct loop *loop_for_initial_load = NULL;
5443 if (loop_vinfo)
5445 loop = LOOP_VINFO_LOOP (loop_vinfo);
5446 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
5449 gcc_assert (alignment_support_scheme == dr_explicit_realign
5450 || alignment_support_scheme == dr_explicit_realign_optimized);
5452 /* We need to generate three things:
5453 1. the misalignment computation
5454 2. the extra vector load (for the optimized realignment scheme).
5455 3. the phi node for the two vectors from which the realignment is
5456 done (for the optimized realignment scheme). */
5458 /* 1. Determine where to generate the misalignment computation.
5460 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5461 calculation will be generated by this function, outside the loop (in the
5462 preheader). Otherwise, INIT_ADDR had already been computed for us by the
5463 caller, inside the loop.
5465 Background: If the misalignment remains fixed throughout the iterations of
5466 the loop, then both realignment schemes are applicable, and also the
5467 misalignment computation can be done outside LOOP. This is because we are
5468 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5469 are a multiple of VS (the Vector Size), and therefore the misalignment in
5470 different vectorized LOOP iterations is always the same.
5471 The problem arises only if the memory access is in an inner-loop nested
5472 inside LOOP, which is now being vectorized using outer-loop vectorization.
5473 This is the only case when the misalignment of the memory access may not
5474 remain fixed throughout the iterations of the inner-loop (as explained in
5475 detail in vect_supportable_dr_alignment). In this case, not only is the
5476 optimized realignment scheme not applicable, but also the misalignment
5477 computation (and generation of the realignment token that is passed to
5478 REALIGN_LOAD) have to be done inside the loop.
5480 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5481 or not, which in turn determines if the misalignment is computed inside
5482 the inner-loop, or outside LOOP. */
5484 if (init_addr != NULL_TREE || !loop_vinfo)
5486 compute_in_loop = true;
5487 gcc_assert (alignment_support_scheme == dr_explicit_realign);
5491 /* 2. Determine where to generate the extra vector load.
5493 For the optimized realignment scheme, instead of generating two vector
5494 loads in each iteration, we generate a single extra vector load in the
5495 preheader of the loop, and in each iteration reuse the result of the
5496 vector load from the previous iteration. In case the memory access is in
5497 an inner-loop nested inside LOOP, which is now being vectorized using
5498 outer-loop vectorization, we need to determine whether this initial vector
5499 load should be generated at the preheader of the inner-loop, or can be
5500 generated at the preheader of LOOP. If the memory access has no evolution
5501 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5502 to be generated inside LOOP (in the preheader of the inner-loop). */
5504 if (nested_in_vect_loop)
5506 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5507 bool invariant_in_outerloop =
5508 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5509 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5511 else
5512 loop_for_initial_load = loop;
5513 if (at_loop)
5514 *at_loop = loop_for_initial_load;
5516 if (loop_for_initial_load)
5517 pe = loop_preheader_edge (loop_for_initial_load);
5519 /* 3. For the case of the optimized realignment, create the first vector
5520 load at the loop preheader. */
5522 if (alignment_support_scheme == dr_explicit_realign_optimized)
5524 /* Create msq_init = *(floor(p1)) in the loop preheader */
5525 gassign *new_stmt;
5527 gcc_assert (!compute_in_loop);
5528 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5529 ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
5530 NULL_TREE, &init_addr, NULL, &inc,
5531 true, &inv_p);
5532 if (TREE_CODE (ptr) == SSA_NAME)
5533 new_temp = copy_ssa_name (ptr);
5534 else
5535 new_temp = make_ssa_name (TREE_TYPE (ptr));
5536 unsigned int align = DR_TARGET_ALIGNMENT (dr);
5537 new_stmt = gimple_build_assign
5538 (new_temp, BIT_AND_EXPR, ptr,
5539 build_int_cst (TREE_TYPE (ptr), -(HOST_WIDE_INT) align));
5540 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5541 gcc_assert (!new_bb);
5542 data_ref
5543 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5544 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5545 vect_copy_ref_info (data_ref, DR_REF (dr));
5546 new_stmt = gimple_build_assign (vec_dest, data_ref);
5547 new_temp = make_ssa_name (vec_dest, new_stmt);
5548 gimple_assign_set_lhs (new_stmt, new_temp);
5549 if (pe)
5551 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5552 gcc_assert (!new_bb);
5554 else
5555 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5557 msq_init = gimple_assign_lhs (new_stmt);
5560 /* 4. Create realignment token using a target builtin, if available.
5561 It is done either inside the containing loop, or before LOOP (as
5562 determined above). */
5564 if (targetm.vectorize.builtin_mask_for_load)
5566 gcall *new_stmt;
5567 tree builtin_decl;
5569 /* Compute INIT_ADDR - the initial addressed accessed by this memref. */
5570 if (!init_addr)
5572 /* Generate the INIT_ADDR computation outside LOOP. */
5573 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
5574 NULL_TREE);
5575 if (loop)
5577 pe = loop_preheader_edge (loop);
5578 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5579 gcc_assert (!new_bb);
5581 else
5582 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5585 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5586 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5587 vec_dest =
5588 vect_create_destination_var (scalar_dest,
5589 gimple_call_return_type (new_stmt));
5590 new_temp = make_ssa_name (vec_dest, new_stmt);
5591 gimple_call_set_lhs (new_stmt, new_temp);
5593 if (compute_in_loop)
5594 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5595 else
5597 /* Generate the misalignment computation outside LOOP. */
5598 pe = loop_preheader_edge (loop);
5599 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5600 gcc_assert (!new_bb);
5603 *realignment_token = gimple_call_lhs (new_stmt);
5605 /* The result of the CALL_EXPR to this builtin is determined from
5606 the value of the parameter and no global variables are touched
5607 which makes the builtin a "const" function. Requiring the
5608 builtin to have the "const" attribute makes it unnecessary
5609 to call mark_call_clobbered. */
5610 gcc_assert (TREE_READONLY (builtin_decl));
5613 if (alignment_support_scheme == dr_explicit_realign)
5614 return msq;
5616 gcc_assert (!compute_in_loop);
5617 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5620 /* 5. Create msq = phi <msq_init, lsq> in loop */
5622 pe = loop_preheader_edge (containing_loop);
5623 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5624 msq = make_ssa_name (vec_dest);
5625 phi_stmt = create_phi_node (msq, containing_loop->header);
5626 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5628 return msq;
5632 /* Function vect_grouped_load_supported.
5634 COUNT is the size of the load group (the number of statements plus the
5635 number of gaps). SINGLE_ELEMENT_P is true if there is actually
5636 only one statement, with a gap of COUNT - 1.
5638 Returns true if a suitable permute exists. */
5640 bool
5641 vect_grouped_load_supported (tree vectype, bool single_element_p,
5642 unsigned HOST_WIDE_INT count)
5644 machine_mode mode = TYPE_MODE (vectype);
5646 /* If this is single-element interleaving with an element distance
5647 that leaves unused vector loads around punt - we at least create
5648 very sub-optimal code in that case (and blow up memory,
5649 see PR65518). */
5650 if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype)))
5652 if (dump_enabled_p ())
5653 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5654 "single-element interleaving not supported "
5655 "for not adjacent vector loads\n");
5656 return false;
5659 /* vect_permute_load_chain requires the group size to be equal to 3 or
5660 be a power of two. */
5661 if (count != 3 && exact_log2 (count) == -1)
5663 if (dump_enabled_p ())
5664 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5665 "the size of the group of accesses"
5666 " is not a power of 2 or not equal to 3\n");
5667 return false;
5670 /* Check that the permutation is supported. */
5671 if (VECTOR_MODE_P (mode))
5673 unsigned int i, j;
5674 if (count == 3)
5676 unsigned int nelt;
5677 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5679 if (dump_enabled_p ())
5680 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5681 "cannot handle groups of 3 loads for"
5682 " variable-length vectors\n");
5683 return false;
5686 vec_perm_builder sel (nelt, nelt, 1);
5687 sel.quick_grow (nelt);
5688 vec_perm_indices indices;
5689 unsigned int k;
5690 for (k = 0; k < 3; k++)
5692 for (i = 0; i < nelt; i++)
5693 if (3 * i + k < 2 * nelt)
5694 sel[i] = 3 * i + k;
5695 else
5696 sel[i] = 0;
5697 indices.new_vector (sel, 2, nelt);
5698 if (!can_vec_perm_const_p (mode, indices))
5700 if (dump_enabled_p ())
5701 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5702 "shuffle of 3 loads is not supported by"
5703 " target\n");
5704 return false;
5706 for (i = 0, j = 0; i < nelt; i++)
5707 if (3 * i + k < 2 * nelt)
5708 sel[i] = i;
5709 else
5710 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5711 indices.new_vector (sel, 2, nelt);
5712 if (!can_vec_perm_const_p (mode, indices))
5714 if (dump_enabled_p ())
5715 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5716 "shuffle of 3 loads is not supported by"
5717 " target\n");
5718 return false;
5721 return true;
5723 else
5725 /* If length is not equal to 3 then only power of 2 is supported. */
5726 gcc_assert (pow2p_hwi (count));
5727 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5729 /* The encoding has a single stepped pattern. */
5730 vec_perm_builder sel (nelt, 1, 3);
5731 sel.quick_grow (3);
5732 for (i = 0; i < 3; i++)
5733 sel[i] = i * 2;
5734 vec_perm_indices indices (sel, 2, nelt);
5735 if (can_vec_perm_const_p (mode, indices))
5737 for (i = 0; i < 3; i++)
5738 sel[i] = i * 2 + 1;
5739 indices.new_vector (sel, 2, nelt);
5740 if (can_vec_perm_const_p (mode, indices))
5741 return true;
5746 if (dump_enabled_p ())
5747 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5748 "extract even/odd not supported by target\n");
5749 return false;
5752 /* Return TRUE if vec_{masked_}load_lanes is available for COUNT vectors of
5753 type VECTYPE. MASKED_P says whether the masked form is needed. */
5755 bool
5756 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5757 bool masked_p)
5759 if (masked_p)
5760 return vect_lanes_optab_supported_p ("vec_mask_load_lanes",
5761 vec_mask_load_lanes_optab,
5762 vectype, count);
5763 else
5764 return vect_lanes_optab_supported_p ("vec_load_lanes",
5765 vec_load_lanes_optab,
5766 vectype, count);
5769 /* Function vect_permute_load_chain.
5771 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5772 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5773 the input data correctly. Return the final references for loads in
5774 RESULT_CHAIN.
5776 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5777 The input is 4 vectors each containing 8 elements. We assign a number to each
5778 element, the input sequence is:
5780 1st vec: 0 1 2 3 4 5 6 7
5781 2nd vec: 8 9 10 11 12 13 14 15
5782 3rd vec: 16 17 18 19 20 21 22 23
5783 4th vec: 24 25 26 27 28 29 30 31
5785 The output sequence should be:
5787 1st vec: 0 4 8 12 16 20 24 28
5788 2nd vec: 1 5 9 13 17 21 25 29
5789 3rd vec: 2 6 10 14 18 22 26 30
5790 4th vec: 3 7 11 15 19 23 27 31
5792 i.e., the first output vector should contain the first elements of each
5793 interleaving group, etc.
5795 We use extract_even/odd instructions to create such output. The input of
5796 each extract_even/odd operation is two vectors
5797 1st vec 2nd vec
5798 0 1 2 3 4 5 6 7
5800 and the output is the vector of extracted even/odd elements. The output of
5801 extract_even will be: 0 2 4 6
5802 and of extract_odd: 1 3 5 7
5805 The permutation is done in log LENGTH stages. In each stage extract_even
5806 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5807 their order. In our example,
5809 E1: extract_even (1st vec, 2nd vec)
5810 E2: extract_odd (1st vec, 2nd vec)
5811 E3: extract_even (3rd vec, 4th vec)
5812 E4: extract_odd (3rd vec, 4th vec)
5814 The output for the first stage will be:
5816 E1: 0 2 4 6 8 10 12 14
5817 E2: 1 3 5 7 9 11 13 15
5818 E3: 16 18 20 22 24 26 28 30
5819 E4: 17 19 21 23 25 27 29 31
5821 In order to proceed and create the correct sequence for the next stage (or
5822 for the correct output, if the second stage is the last one, as in our
5823 example), we first put the output of extract_even operation and then the
5824 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5825 The input for the second stage is:
5827 1st vec (E1): 0 2 4 6 8 10 12 14
5828 2nd vec (E3): 16 18 20 22 24 26 28 30
5829 3rd vec (E2): 1 3 5 7 9 11 13 15
5830 4th vec (E4): 17 19 21 23 25 27 29 31
5832 The output of the second stage:
5834 E1: 0 4 8 12 16 20 24 28
5835 E2: 2 6 10 14 18 22 26 30
5836 E3: 1 5 9 13 17 21 25 29
5837 E4: 3 7 11 15 19 23 27 31
5839 And RESULT_CHAIN after reordering:
5841 1st vec (E1): 0 4 8 12 16 20 24 28
5842 2nd vec (E3): 1 5 9 13 17 21 25 29
5843 3rd vec (E2): 2 6 10 14 18 22 26 30
5844 4th vec (E4): 3 7 11 15 19 23 27 31. */
5846 static void
5847 vect_permute_load_chain (vec<tree> dr_chain,
5848 unsigned int length,
5849 gimple *stmt,
5850 gimple_stmt_iterator *gsi,
5851 vec<tree> *result_chain)
5853 tree data_ref, first_vect, second_vect;
5854 tree perm_mask_even, perm_mask_odd;
5855 tree perm3_mask_low, perm3_mask_high;
5856 gimple *perm_stmt;
5857 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5858 unsigned int i, j, log_length = exact_log2 (length);
5860 result_chain->quick_grow (length);
5861 memcpy (result_chain->address (), dr_chain.address (),
5862 length * sizeof (tree));
5864 if (length == 3)
5866 /* vect_grouped_load_supported ensures that this is constant. */
5867 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5868 unsigned int k;
5870 vec_perm_builder sel (nelt, nelt, 1);
5871 sel.quick_grow (nelt);
5872 vec_perm_indices indices;
5873 for (k = 0; k < 3; k++)
5875 for (i = 0; i < nelt; i++)
5876 if (3 * i + k < 2 * nelt)
5877 sel[i] = 3 * i + k;
5878 else
5879 sel[i] = 0;
5880 indices.new_vector (sel, 2, nelt);
5881 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5883 for (i = 0, j = 0; i < nelt; i++)
5884 if (3 * i + k < 2 * nelt)
5885 sel[i] = i;
5886 else
5887 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5888 indices.new_vector (sel, 2, nelt);
5889 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5891 first_vect = dr_chain[0];
5892 second_vect = dr_chain[1];
5894 /* Create interleaving stmt (low part of):
5895 low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5896 ...}> */
5897 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5898 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5899 second_vect, perm3_mask_low);
5900 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5902 /* Create interleaving stmt (high part of):
5903 high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5904 ...}> */
5905 first_vect = data_ref;
5906 second_vect = dr_chain[2];
5907 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5908 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5909 second_vect, perm3_mask_high);
5910 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5911 (*result_chain)[k] = data_ref;
5914 else
5916 /* If length is not equal to 3 then only power of 2 is supported. */
5917 gcc_assert (pow2p_hwi (length));
5919 /* The encoding has a single stepped pattern. */
5920 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5921 vec_perm_builder sel (nelt, 1, 3);
5922 sel.quick_grow (3);
5923 for (i = 0; i < 3; ++i)
5924 sel[i] = i * 2;
5925 vec_perm_indices indices (sel, 2, nelt);
5926 perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);
5928 for (i = 0; i < 3; ++i)
5929 sel[i] = i * 2 + 1;
5930 indices.new_vector (sel, 2, nelt);
5931 perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
5933 for (i = 0; i < log_length; i++)
5935 for (j = 0; j < length; j += 2)
5937 first_vect = dr_chain[j];
5938 second_vect = dr_chain[j+1];
5940 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5941 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5942 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5943 first_vect, second_vect,
5944 perm_mask_even);
5945 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5946 (*result_chain)[j/2] = data_ref;
5948 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5949 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5950 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5951 first_vect, second_vect,
5952 perm_mask_odd);
5953 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5954 (*result_chain)[j/2+length/2] = data_ref;
5956 memcpy (dr_chain.address (), result_chain->address (),
5957 length * sizeof (tree));
5962 /* Function vect_shift_permute_load_chain.
5964 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
5965 sequence of stmts to reorder the input data accordingly.
5966 Return the final references for loads in RESULT_CHAIN.
5967 Return true if successed, false otherwise.
5969 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5970 The input is 3 vectors each containing 8 elements. We assign a
5971 number to each element, the input sequence is:
5973 1st vec: 0 1 2 3 4 5 6 7
5974 2nd vec: 8 9 10 11 12 13 14 15
5975 3rd vec: 16 17 18 19 20 21 22 23
5977 The output sequence should be:
5979 1st vec: 0 3 6 9 12 15 18 21
5980 2nd vec: 1 4 7 10 13 16 19 22
5981 3rd vec: 2 5 8 11 14 17 20 23
5983 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
5985 First we shuffle all 3 vectors to get correct elements order:
5987 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
5988 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
5989 3rd vec: (16 19 22) (17 20 23) (18 21)
5991 Next we unite and shift vector 3 times:
5993 1st step:
5994 shift right by 6 the concatenation of:
5995 "1st vec" and "2nd vec"
5996 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
5997 "2nd vec" and "3rd vec"
5998 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
5999 "3rd vec" and "1st vec"
6000 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
6001 | New vectors |
6003 So that now new vectors are:
6005 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
6006 2nd vec: (10 13) (16 19 22) (17 20 23)
6007 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
6009 2nd step:
6010 shift right by 5 the concatenation of:
6011 "1st vec" and "3rd vec"
6012 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
6013 "2nd vec" and "1st vec"
6014 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
6015 "3rd vec" and "2nd vec"
6016 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
6017 | New vectors |
6019 So that now new vectors are:
6021 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
6022 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
6023 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
6025 3rd step:
6026 shift right by 5 the concatenation of:
6027 "1st vec" and "1st vec"
6028 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
6029 shift right by 3 the concatenation of:
6030 "2nd vec" and "2nd vec"
6031 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
6032 | New vectors |
6034 So that now all vectors are READY:
6035 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
6036 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
6037 3rd vec: ( 1 4 7) (10 13) (16 19 22)
6039 This algorithm is faster than one in vect_permute_load_chain if:
6040 1. "shift of a concatination" is faster than general permutation.
6041 This is usually so.
6042 2. The TARGET machine can't execute vector instructions in parallel.
6043 This is because each step of the algorithm depends on previous.
6044 The algorithm in vect_permute_load_chain is much more parallel.
6046 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
6049 static bool
6050 vect_shift_permute_load_chain (vec<tree> dr_chain,
6051 unsigned int length,
6052 gimple *stmt,
6053 gimple_stmt_iterator *gsi,
6054 vec<tree> *result_chain)
6056 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
6057 tree perm2_mask1, perm2_mask2, perm3_mask;
6058 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
6059 gimple *perm_stmt;
6061 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
6062 unsigned int i;
6063 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6064 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6066 unsigned HOST_WIDE_INT nelt, vf;
6067 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt)
6068 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
6069 /* Not supported for variable-length vectors. */
6070 return false;
6072 vec_perm_builder sel (nelt, nelt, 1);
6073 sel.quick_grow (nelt);
6075 result_chain->quick_grow (length);
6076 memcpy (result_chain->address (), dr_chain.address (),
6077 length * sizeof (tree));
6079 if (pow2p_hwi (length) && vf > 4)
6081 unsigned int j, log_length = exact_log2 (length);
6082 for (i = 0; i < nelt / 2; ++i)
6083 sel[i] = i * 2;
6084 for (i = 0; i < nelt / 2; ++i)
6085 sel[nelt / 2 + i] = i * 2 + 1;
6086 vec_perm_indices indices (sel, 2, nelt);
6087 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6089 if (dump_enabled_p ())
6090 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6091 "shuffle of 2 fields structure is not \
6092 supported by target\n");
6093 return false;
6095 perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);
6097 for (i = 0; i < nelt / 2; ++i)
6098 sel[i] = i * 2 + 1;
6099 for (i = 0; i < nelt / 2; ++i)
6100 sel[nelt / 2 + i] = i * 2;
6101 indices.new_vector (sel, 2, nelt);
6102 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6104 if (dump_enabled_p ())
6105 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6106 "shuffle of 2 fields structure is not \
6107 supported by target\n");
6108 return false;
6110 perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);
6112 /* Generating permutation constant to shift all elements.
6113 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
6114 for (i = 0; i < nelt; i++)
6115 sel[i] = nelt / 2 + i;
6116 indices.new_vector (sel, 2, nelt);
6117 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6119 if (dump_enabled_p ())
6120 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6121 "shift permutation is not supported by target\n");
6122 return false;
6124 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6126 /* Generating permutation constant to select vector from 2.
6127 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
6128 for (i = 0; i < nelt / 2; i++)
6129 sel[i] = i;
6130 for (i = nelt / 2; i < nelt; i++)
6131 sel[i] = nelt + i;
6132 indices.new_vector (sel, 2, nelt);
6133 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6135 if (dump_enabled_p ())
6136 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6137 "select is not supported by target\n");
6138 return false;
6140 select_mask = vect_gen_perm_mask_checked (vectype, indices);
6142 for (i = 0; i < log_length; i++)
6144 for (j = 0; j < length; j += 2)
6146 first_vect = dr_chain[j];
6147 second_vect = dr_chain[j + 1];
6149 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6150 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6151 first_vect, first_vect,
6152 perm2_mask1);
6153 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6154 vect[0] = data_ref;
6156 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6157 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6158 second_vect, second_vect,
6159 perm2_mask2);
6160 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6161 vect[1] = data_ref;
6163 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
6164 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6165 vect[0], vect[1], shift1_mask);
6166 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6167 (*result_chain)[j/2 + length/2] = data_ref;
6169 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
6170 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6171 vect[0], vect[1], select_mask);
6172 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6173 (*result_chain)[j/2] = data_ref;
6175 memcpy (dr_chain.address (), result_chain->address (),
6176 length * sizeof (tree));
6178 return true;
6180 if (length == 3 && vf > 2)
6182 unsigned int k = 0, l = 0;
6184 /* Generating permutation constant to get all elements in rigth order.
6185 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
6186 for (i = 0; i < nelt; i++)
6188 if (3 * k + (l % 3) >= nelt)
6190 k = 0;
6191 l += (3 - (nelt % 3));
6193 sel[i] = 3 * k + (l % 3);
6194 k++;
6196 vec_perm_indices indices (sel, 2, nelt);
6197 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6199 if (dump_enabled_p ())
6200 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6201 "shuffle of 3 fields structure is not \
6202 supported by target\n");
6203 return false;
6205 perm3_mask = vect_gen_perm_mask_checked (vectype, indices);
6207 /* Generating permutation constant to shift all elements.
6208 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
6209 for (i = 0; i < nelt; i++)
6210 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
6211 indices.new_vector (sel, 2, nelt);
6212 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6214 if (dump_enabled_p ())
6215 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6216 "shift permutation is not supported by target\n");
6217 return false;
6219 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6221 /* Generating permutation constant to shift all elements.
6222 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6223 for (i = 0; i < nelt; i++)
6224 sel[i] = 2 * (nelt / 3) + 1 + i;
6225 indices.new_vector (sel, 2, nelt);
6226 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6228 if (dump_enabled_p ())
6229 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6230 "shift permutation is not supported by target\n");
6231 return false;
6233 shift2_mask = vect_gen_perm_mask_checked (vectype, indices);
6235 /* Generating permutation constant to shift all elements.
6236 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
6237 for (i = 0; i < nelt; i++)
6238 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
6239 indices.new_vector (sel, 2, nelt);
6240 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6242 if (dump_enabled_p ())
6243 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6244 "shift permutation is not supported by target\n");
6245 return false;
6247 shift3_mask = vect_gen_perm_mask_checked (vectype, indices);
6249 /* Generating permutation constant to shift all elements.
6250 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6251 for (i = 0; i < nelt; i++)
6252 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
6253 indices.new_vector (sel, 2, nelt);
6254 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6256 if (dump_enabled_p ())
6257 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6258 "shift permutation is not supported by target\n");
6259 return false;
6261 shift4_mask = vect_gen_perm_mask_checked (vectype, indices);
6263 for (k = 0; k < 3; k++)
6265 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
6266 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6267 dr_chain[k], dr_chain[k],
6268 perm3_mask);
6269 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6270 vect[k] = data_ref;
6273 for (k = 0; k < 3; k++)
6275 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
6276 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6277 vect[k % 3], vect[(k + 1) % 3],
6278 shift1_mask);
6279 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6280 vect_shift[k] = data_ref;
6283 for (k = 0; k < 3; k++)
6285 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
6286 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6287 vect_shift[(4 - k) % 3],
6288 vect_shift[(3 - k) % 3],
6289 shift2_mask);
6290 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6291 vect[k] = data_ref;
6294 (*result_chain)[3 - (nelt % 3)] = vect[2];
6296 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
6297 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
6298 vect[0], shift3_mask);
6299 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6300 (*result_chain)[nelt % 3] = data_ref;
6302 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
6303 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
6304 vect[1], shift4_mask);
6305 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6306 (*result_chain)[0] = data_ref;
6307 return true;
6309 return false;
6312 /* Function vect_transform_grouped_load.
6314 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
6315 to perform their permutation and ascribe the result vectorized statements to
6316 the scalar statements.
6319 void
6320 vect_transform_grouped_load (gimple *stmt, vec<tree> dr_chain, int size,
6321 gimple_stmt_iterator *gsi)
6323 machine_mode mode;
6324 vec<tree> result_chain = vNULL;
6326 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
6327 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
6328 vectors, that are ready for vector computation. */
6329 result_chain.create (size);
6331 /* If reassociation width for vector type is 2 or greater target machine can
6332 execute 2 or more vector instructions in parallel. Otherwise try to
6333 get chain for loads group using vect_shift_permute_load_chain. */
6334 mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
6335 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
6336 || pow2p_hwi (size)
6337 || !vect_shift_permute_load_chain (dr_chain, size, stmt,
6338 gsi, &result_chain))
6339 vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
6340 vect_record_grouped_load_vectors (stmt, result_chain);
6341 result_chain.release ();
6344 /* RESULT_CHAIN contains the output of a group of grouped loads that were
6345 generated as part of the vectorization of STMT. Assign the statement
6346 for each vector to the associated scalar statement. */
6348 void
6349 vect_record_grouped_load_vectors (gimple *stmt, vec<tree> result_chain)
6351 gimple *first_stmt = DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
6352 gimple *next_stmt, *new_stmt;
6353 unsigned int i, gap_count;
6354 tree tmp_data_ref;
6356 /* Put a permuted data-ref in the VECTORIZED_STMT field.
6357 Since we scan the chain starting from it's first node, their order
6358 corresponds the order of data-refs in RESULT_CHAIN. */
6359 next_stmt = first_stmt;
6360 gap_count = 1;
6361 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
6363 if (!next_stmt)
6364 break;
6366 /* Skip the gaps. Loads created for the gaps will be removed by dead
6367 code elimination pass later. No need to check for the first stmt in
6368 the group, since it always exists.
6369 DR_GROUP_GAP is the number of steps in elements from the previous
6370 access (if there is no gap DR_GROUP_GAP is 1). We skip loads that
6371 correspond to the gaps. */
6372 if (next_stmt != first_stmt
6373 && gap_count < DR_GROUP_GAP (vinfo_for_stmt (next_stmt)))
6375 gap_count++;
6376 continue;
6379 while (next_stmt)
6381 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
6382 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
6383 copies, and we put the new vector statement in the first available
6384 RELATED_STMT. */
6385 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
6386 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
6387 else
6389 if (!DR_GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
6391 gimple *prev_stmt =
6392 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
6393 gimple *rel_stmt =
6394 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
6395 while (rel_stmt)
6397 prev_stmt = rel_stmt;
6398 rel_stmt =
6399 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
6402 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
6403 new_stmt;
6407 next_stmt = DR_GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
6408 gap_count = 1;
6409 /* If NEXT_STMT accesses the same DR as the previous statement,
6410 put the same TMP_DATA_REF as its vectorized statement; otherwise
6411 get the next data-ref from RESULT_CHAIN. */
6412 if (!next_stmt || !DR_GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
6413 break;
6418 /* Function vect_force_dr_alignment_p.
6420 Returns whether the alignment of a DECL can be forced to be aligned
6421 on ALIGNMENT bit boundary. */
6423 bool
6424 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
6426 if (!VAR_P (decl))
6427 return false;
6429 if (decl_in_symtab_p (decl)
6430 && !symtab_node::get (decl)->can_increase_alignment_p ())
6431 return false;
6433 if (TREE_STATIC (decl))
6434 return (alignment <= MAX_OFILE_ALIGNMENT);
6435 else
6436 return (alignment <= MAX_STACK_ALIGNMENT);
6440 /* Return whether the data reference DR is supported with respect to its
6441 alignment.
6442 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
6443 it is aligned, i.e., check if it is possible to vectorize it with different
6444 alignment. */
6446 enum dr_alignment_support
6447 vect_supportable_dr_alignment (struct data_reference *dr,
6448 bool check_aligned_accesses)
6450 gimple *stmt = vect_dr_stmt (dr);
6451 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6452 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6453 machine_mode mode = TYPE_MODE (vectype);
6454 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6455 struct loop *vect_loop = NULL;
6456 bool nested_in_vect_loop = false;
6458 if (aligned_access_p (dr) && !check_aligned_accesses)
6459 return dr_aligned;
6461 /* For now assume all conditional loads/stores support unaligned
6462 access without any special code. */
6463 if (is_gimple_call (stmt)
6464 && gimple_call_internal_p (stmt)
6465 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
6466 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
6467 return dr_unaligned_supported;
6469 if (loop_vinfo)
6471 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
6472 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
6475 /* Possibly unaligned access. */
6477 /* We can choose between using the implicit realignment scheme (generating
6478 a misaligned_move stmt) and the explicit realignment scheme (generating
6479 aligned loads with a REALIGN_LOAD). There are two variants to the
6480 explicit realignment scheme: optimized, and unoptimized.
6481 We can optimize the realignment only if the step between consecutive
6482 vector loads is equal to the vector size. Since the vector memory
6483 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
6484 is guaranteed that the misalignment amount remains the same throughout the
6485 execution of the vectorized loop. Therefore, we can create the
6486 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
6487 at the loop preheader.
6489 However, in the case of outer-loop vectorization, when vectorizing a
6490 memory access in the inner-loop nested within the LOOP that is now being
6491 vectorized, while it is guaranteed that the misalignment of the
6492 vectorized memory access will remain the same in different outer-loop
6493 iterations, it is *not* guaranteed that is will remain the same throughout
6494 the execution of the inner-loop. This is because the inner-loop advances
6495 with the original scalar step (and not in steps of VS). If the inner-loop
6496 step happens to be a multiple of VS, then the misalignment remains fixed
6497 and we can use the optimized realignment scheme. For example:
6499 for (i=0; i<N; i++)
6500 for (j=0; j<M; j++)
6501 s += a[i+j];
6503 When vectorizing the i-loop in the above example, the step between
6504 consecutive vector loads is 1, and so the misalignment does not remain
6505 fixed across the execution of the inner-loop, and the realignment cannot
6506 be optimized (as illustrated in the following pseudo vectorized loop):
6508 for (i=0; i<N; i+=4)
6509 for (j=0; j<M; j++){
6510 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
6511 // when j is {0,1,2,3,4,5,6,7,...} respectively.
6512 // (assuming that we start from an aligned address).
6515 We therefore have to use the unoptimized realignment scheme:
6517 for (i=0; i<N; i+=4)
6518 for (j=k; j<M; j+=4)
6519 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
6520 // that the misalignment of the initial address is
6521 // 0).
6523 The loop can then be vectorized as follows:
6525 for (k=0; k<4; k++){
6526 rt = get_realignment_token (&vp[k]);
6527 for (i=0; i<N; i+=4){
6528 v1 = vp[i+k];
6529 for (j=k; j<M; j+=4){
6530 v2 = vp[i+j+VS-1];
6531 va = REALIGN_LOAD <v1,v2,rt>;
6532 vs += va;
6533 v1 = v2;
6536 } */
6538 if (DR_IS_READ (dr))
6540 bool is_packed = false;
6541 tree type = (TREE_TYPE (DR_REF (dr)));
6543 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
6544 && (!targetm.vectorize.builtin_mask_for_load
6545 || targetm.vectorize.builtin_mask_for_load ()))
6547 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6549 /* If we are doing SLP then the accesses need not have the
6550 same alignment, instead it depends on the SLP group size. */
6551 if (loop_vinfo
6552 && STMT_SLP_TYPE (stmt_info)
6553 && !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6554 * DR_GROUP_SIZE (vinfo_for_stmt
6555 (DR_GROUP_FIRST_ELEMENT (stmt_info))),
6556 TYPE_VECTOR_SUBPARTS (vectype)))
6558 else if (!loop_vinfo
6559 || (nested_in_vect_loop
6560 && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
6561 GET_MODE_SIZE (TYPE_MODE (vectype)))))
6562 return dr_explicit_realign;
6563 else
6564 return dr_explicit_realign_optimized;
6566 if (!known_alignment_for_access_p (dr))
6567 is_packed = not_size_aligned (DR_REF (dr));
6569 if (targetm.vectorize.support_vector_misalignment
6570 (mode, type, DR_MISALIGNMENT (dr), is_packed))
6571 /* Can't software pipeline the loads, but can at least do them. */
6572 return dr_unaligned_supported;
6574 else
6576 bool is_packed = false;
6577 tree type = (TREE_TYPE (DR_REF (dr)));
6579 if (!known_alignment_for_access_p (dr))
6580 is_packed = not_size_aligned (DR_REF (dr));
6582 if (targetm.vectorize.support_vector_misalignment
6583 (mode, type, DR_MISALIGNMENT (dr), is_packed))
6584 return dr_unaligned_supported;
6587 /* Unsupported. */
6588 return dr_unaligned_unsupported;