[AArch64] SVE tests
[official-gcc.git] / gcc / tree-vect-data-refs.c
blobeb82594259807060c3f20483c3a3171c940b3533
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2018 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "memmodel.h"
32 #include "tm_p.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "cgraph.h"
36 #include "dumpfile.h"
37 #include "alias.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "tree-eh.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop.h"
47 #include "cfgloop.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
50 #include "expr.h"
51 #include "builtins.h"
52 #include "params.h"
53 #include "tree-cfg.h"
54 #include "tree-hash-traits.h"
55 #include "vec-perm-indices.h"
57 /* Return true if load- or store-lanes optab OPTAB is implemented for
58 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
60 static bool
61 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
62 tree vectype, unsigned HOST_WIDE_INT count)
64 machine_mode mode;
65 scalar_int_mode array_mode;
66 bool limit_p;
68 mode = TYPE_MODE (vectype);
69 limit_p = !targetm.array_mode_supported_p (mode, count);
70 if (!int_mode_for_size (count * GET_MODE_BITSIZE (mode),
71 limit_p).exists (&array_mode))
73 if (dump_enabled_p ())
74 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
75 "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC "]\n",
76 GET_MODE_NAME (mode), count);
77 return false;
80 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
82 if (dump_enabled_p ())
83 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
84 "cannot use %s<%s><%s>\n", name,
85 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
86 return false;
89 if (dump_enabled_p ())
90 dump_printf_loc (MSG_NOTE, vect_location,
91 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
92 GET_MODE_NAME (mode));
94 return true;
98 /* Return the smallest scalar part of STMT.
99 This is used to determine the vectype of the stmt. We generally set the
100 vectype according to the type of the result (lhs). For stmts whose
101 result-type is different than the type of the arguments (e.g., demotion,
102 promotion), vectype will be reset appropriately (later). Note that we have
103 to visit the smallest datatype in this function, because that determines the
104 VF. If the smallest datatype in the loop is present only as the rhs of a
105 promotion operation - we'd miss it.
106 Such a case, where a variable of this datatype does not appear in the lhs
107 anywhere in the loop, can only occur if it's an invariant: e.g.:
108 'int_x = (int) short_inv', which we'd expect to have been optimized away by
109 invariant motion. However, we cannot rely on invariant motion to always
110 take invariants out of the loop, and so in the case of promotion we also
111 have to check the rhs.
112 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
113 types. */
115 tree
116 vect_get_smallest_scalar_type (gimple *stmt, HOST_WIDE_INT *lhs_size_unit,
117 HOST_WIDE_INT *rhs_size_unit)
119 tree scalar_type = gimple_expr_type (stmt);
120 HOST_WIDE_INT lhs, rhs;
122 /* During the analysis phase, this function is called on arbitrary
123 statements that might not have scalar results. */
124 if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
125 return scalar_type;
127 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
129 if (is_gimple_assign (stmt)
130 && (gimple_assign_cast_p (stmt)
131 || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
132 || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
133 || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
135 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
137 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
138 if (rhs < lhs)
139 scalar_type = rhs_type;
142 *lhs_size_unit = lhs;
143 *rhs_size_unit = rhs;
144 return scalar_type;
148 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
149 tested at run-time. Return TRUE if DDR was successfully inserted.
150 Return false if versioning is not supported. */
152 static bool
153 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
155 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
157 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
158 return false;
160 if (!runtime_alias_check_p (ddr, loop,
161 optimize_loop_nest_for_speed_p (loop)))
162 return false;
164 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
165 return true;
169 /* A subroutine of vect_analyze_data_ref_dependence. Handle
170 DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
171 distances. These distances are conservatively correct but they don't
172 reflect a guaranteed dependence.
174 Return true if this function does all the work necessary to avoid
175 an alias or false if the caller should use the dependence distances
176 to limit the vectorization factor in the usual way. LOOP_DEPTH is
177 the depth of the loop described by LOOP_VINFO and the other arguments
178 are as for vect_analyze_data_ref_dependence. */
180 static bool
181 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
182 loop_vec_info loop_vinfo,
183 int loop_depth, unsigned int *max_vf)
185 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
186 lambda_vector dist_v;
187 unsigned int i;
188 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
190 int dist = dist_v[loop_depth];
191 if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
193 /* If the user asserted safelen >= DIST consecutive iterations
194 can be executed concurrently, assume independence.
196 ??? An alternative would be to add the alias check even
197 in this case, and vectorize the fallback loop with the
198 maximum VF set to safelen. However, if the user has
199 explicitly given a length, it's less likely that that
200 would be a win. */
201 if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
203 if ((unsigned int) loop->safelen < *max_vf)
204 *max_vf = loop->safelen;
205 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
206 continue;
209 /* For dependence distances of 2 or more, we have the option
210 of limiting VF or checking for an alias at runtime.
211 Prefer to check at runtime if we can, to avoid limiting
212 the VF unnecessarily when the bases are in fact independent.
214 Note that the alias checks will be removed if the VF ends up
215 being small enough. */
216 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
219 return true;
223 /* Function vect_analyze_data_ref_dependence.
225 Return TRUE if there (might) exist a dependence between a memory-reference
226 DRA and a memory-reference DRB. When versioning for alias may check a
227 dependence at run-time, return FALSE. Adjust *MAX_VF according to
228 the data dependence. */
230 static bool
231 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
232 loop_vec_info loop_vinfo,
233 unsigned int *max_vf)
235 unsigned int i;
236 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
237 struct data_reference *dra = DDR_A (ddr);
238 struct data_reference *drb = DDR_B (ddr);
239 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
240 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
241 lambda_vector dist_v;
242 unsigned int loop_depth;
244 /* In loop analysis all data references should be vectorizable. */
245 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
246 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
247 gcc_unreachable ();
249 /* Independent data accesses. */
250 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
251 return false;
253 if (dra == drb
254 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
255 return false;
257 /* We do not have to consider dependences between accesses that belong
258 to the same group. */
259 if (GROUP_FIRST_ELEMENT (stmtinfo_a)
260 && GROUP_FIRST_ELEMENT (stmtinfo_a) == GROUP_FIRST_ELEMENT (stmtinfo_b))
261 return false;
263 /* Even if we have an anti-dependence then, as the vectorized loop covers at
264 least two scalar iterations, there is always also a true dependence.
265 As the vectorizer does not re-order loads and stores we can ignore
266 the anti-dependence if TBAA can disambiguate both DRs similar to the
267 case with known negative distance anti-dependences (positive
268 distance anti-dependences would violate TBAA constraints). */
269 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
270 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
271 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
272 get_alias_set (DR_REF (drb))))
273 return false;
275 /* Unknown data dependence. */
276 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
278 /* If user asserted safelen consecutive iterations can be
279 executed concurrently, assume independence. */
280 if (loop->safelen >= 2)
282 if ((unsigned int) loop->safelen < *max_vf)
283 *max_vf = loop->safelen;
284 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
285 return false;
288 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
289 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
291 if (dump_enabled_p ())
293 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
294 "versioning for alias not supported for: "
295 "can't determine dependence between ");
296 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
297 DR_REF (dra));
298 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
299 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
300 DR_REF (drb));
301 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
303 return true;
306 if (dump_enabled_p ())
308 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
309 "versioning for alias required: "
310 "can't determine dependence between ");
311 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
312 DR_REF (dra));
313 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
314 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
315 DR_REF (drb));
316 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
319 /* Add to list of ddrs that need to be tested at run-time. */
320 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
323 /* Known data dependence. */
324 if (DDR_NUM_DIST_VECTS (ddr) == 0)
326 /* If user asserted safelen consecutive iterations can be
327 executed concurrently, assume independence. */
328 if (loop->safelen >= 2)
330 if ((unsigned int) loop->safelen < *max_vf)
331 *max_vf = loop->safelen;
332 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
333 return false;
336 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
337 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
339 if (dump_enabled_p ())
341 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
342 "versioning for alias not supported for: "
343 "bad dist vector for ");
344 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
345 DR_REF (dra));
346 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
347 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
348 DR_REF (drb));
349 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
351 return true;
354 if (dump_enabled_p ())
356 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
357 "versioning for alias required: "
358 "bad dist vector for ");
359 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
360 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
361 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
362 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
364 /* Add to list of ddrs that need to be tested at run-time. */
365 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
368 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
370 if (DDR_COULD_BE_INDEPENDENT_P (ddr)
371 && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
372 loop_depth, max_vf))
373 return false;
375 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
377 int dist = dist_v[loop_depth];
379 if (dump_enabled_p ())
380 dump_printf_loc (MSG_NOTE, vect_location,
381 "dependence distance = %d.\n", dist);
383 if (dist == 0)
385 if (dump_enabled_p ())
387 dump_printf_loc (MSG_NOTE, vect_location,
388 "dependence distance == 0 between ");
389 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
390 dump_printf (MSG_NOTE, " and ");
391 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
392 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
395 /* When we perform grouped accesses and perform implicit CSE
396 by detecting equal accesses and doing disambiguation with
397 runtime alias tests like for
398 .. = a[i];
399 .. = a[i+1];
400 a[i] = ..;
401 a[i+1] = ..;
402 *p = ..;
403 .. = a[i];
404 .. = a[i+1];
405 where we will end up loading { a[i], a[i+1] } once, make
406 sure that inserting group loads before the first load and
407 stores after the last store will do the right thing.
408 Similar for groups like
409 a[i] = ...;
410 ... = a[i];
411 a[i+1] = ...;
412 where loads from the group interleave with the store. */
413 if (STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
414 || STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
416 gimple *earlier_stmt;
417 earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
418 if (DR_IS_WRITE
419 (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
421 if (dump_enabled_p ())
422 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
423 "READ_WRITE dependence in interleaving."
424 "\n");
425 return true;
429 continue;
432 if (dist > 0 && DDR_REVERSED_P (ddr))
434 /* If DDR_REVERSED_P the order of the data-refs in DDR was
435 reversed (to make distance vector positive), and the actual
436 distance is negative. */
437 if (dump_enabled_p ())
438 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
439 "dependence distance negative.\n");
440 /* Record a negative dependence distance to later limit the
441 amount of stmt copying / unrolling we can perform.
442 Only need to handle read-after-write dependence. */
443 if (DR_IS_READ (drb)
444 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
445 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
446 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
447 continue;
450 unsigned int abs_dist = abs (dist);
451 if (abs_dist >= 2 && abs_dist < *max_vf)
453 /* The dependence distance requires reduction of the maximal
454 vectorization factor. */
455 *max_vf = abs (dist);
456 if (dump_enabled_p ())
457 dump_printf_loc (MSG_NOTE, vect_location,
458 "adjusting maximal vectorization factor to %i\n",
459 *max_vf);
462 if (abs_dist >= *max_vf)
464 /* Dependence distance does not create dependence, as far as
465 vectorization is concerned, in this case. */
466 if (dump_enabled_p ())
467 dump_printf_loc (MSG_NOTE, vect_location,
468 "dependence distance >= VF.\n");
469 continue;
472 if (dump_enabled_p ())
474 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
475 "not vectorized, possible dependence "
476 "between data-refs ");
477 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
478 dump_printf (MSG_NOTE, " and ");
479 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
480 dump_printf (MSG_NOTE, "\n");
483 return true;
486 return false;
489 /* Function vect_analyze_data_ref_dependences.
491 Examine all the data references in the loop, and make sure there do not
492 exist any data dependences between them. Set *MAX_VF according to
493 the maximum vectorization factor the data dependences allow. */
495 bool
496 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
497 unsigned int *max_vf)
499 unsigned int i;
500 struct data_dependence_relation *ddr;
502 if (dump_enabled_p ())
503 dump_printf_loc (MSG_NOTE, vect_location,
504 "=== vect_analyze_data_ref_dependences ===\n");
506 LOOP_VINFO_DDRS (loop_vinfo)
507 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
508 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
509 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
510 /* We need read-read dependences to compute STMT_VINFO_SAME_ALIGN_REFS. */
511 if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
512 &LOOP_VINFO_DDRS (loop_vinfo),
513 LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
514 return false;
516 /* For epilogues we either have no aliases or alias versioning
517 was applied to original loop. Therefore we may just get max_vf
518 using VF of original loop. */
519 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
520 *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
521 else
522 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
523 if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
524 return false;
526 return true;
530 /* Function vect_slp_analyze_data_ref_dependence.
532 Return TRUE if there (might) exist a dependence between a memory-reference
533 DRA and a memory-reference DRB. When versioning for alias may check a
534 dependence at run-time, return FALSE. Adjust *MAX_VF according to
535 the data dependence. */
537 static bool
538 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
540 struct data_reference *dra = DDR_A (ddr);
541 struct data_reference *drb = DDR_B (ddr);
543 /* We need to check dependences of statements marked as unvectorizable
544 as well, they still can prohibit vectorization. */
546 /* Independent data accesses. */
547 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
548 return false;
550 if (dra == drb)
551 return false;
553 /* Read-read is OK. */
554 if (DR_IS_READ (dra) && DR_IS_READ (drb))
555 return false;
557 /* If dra and drb are part of the same interleaving chain consider
558 them independent. */
559 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
560 && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
561 == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
562 return false;
564 /* Unknown data dependence. */
565 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
567 if (dump_enabled_p ())
569 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
570 "can't determine dependence between ");
571 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
572 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
573 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
574 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
577 else if (dump_enabled_p ())
579 dump_printf_loc (MSG_NOTE, vect_location,
580 "determined dependence between ");
581 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
582 dump_printf (MSG_NOTE, " and ");
583 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
584 dump_printf (MSG_NOTE, "\n");
587 return true;
591 /* Analyze dependences involved in the transform of SLP NODE. STORES
592 contain the vector of scalar stores of this instance if we are
593 disambiguating the loads. */
595 static bool
596 vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
597 vec<gimple *> stores, gimple *last_store)
599 /* This walks over all stmts involved in the SLP load/store done
600 in NODE verifying we can sink them up to the last stmt in the
601 group. */
602 gimple *last_access = vect_find_last_scalar_stmt_in_slp (node);
603 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
605 gimple *access = SLP_TREE_SCALAR_STMTS (node)[k];
606 if (access == last_access)
607 continue;
608 data_reference *dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (access));
609 for (gimple_stmt_iterator gsi = gsi_for_stmt (access);
610 gsi_stmt (gsi) != last_access; gsi_next (&gsi))
612 gimple *stmt = gsi_stmt (gsi);
613 if (! gimple_vuse (stmt)
614 || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
615 continue;
617 /* If we couldn't record a (single) data reference for this
618 stmt we have to give up. */
619 /* ??? Here and below if dependence analysis fails we can resort
620 to the alias oracle which can handle more kinds of stmts. */
621 data_reference *dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
622 if (!dr_b)
623 return false;
625 bool dependent = false;
626 /* If we run into a store of this same instance (we've just
627 marked those) then delay dependence checking until we run
628 into the last store because this is where it will have
629 been sunk to (and we verify if we can do that as well). */
630 if (gimple_visited_p (stmt))
632 if (stmt != last_store)
633 continue;
634 unsigned i;
635 gimple *store;
636 FOR_EACH_VEC_ELT (stores, i, store)
638 data_reference *store_dr
639 = STMT_VINFO_DATA_REF (vinfo_for_stmt (store));
640 ddr_p ddr = initialize_data_dependence_relation
641 (dr_a, store_dr, vNULL);
642 dependent = vect_slp_analyze_data_ref_dependence (ddr);
643 free_dependence_relation (ddr);
644 if (dependent)
645 break;
648 else
650 ddr_p ddr = initialize_data_dependence_relation (dr_a,
651 dr_b, vNULL);
652 dependent = vect_slp_analyze_data_ref_dependence (ddr);
653 free_dependence_relation (ddr);
655 if (dependent)
656 return false;
659 return true;
663 /* Function vect_analyze_data_ref_dependences.
665 Examine all the data references in the basic-block, and make sure there
666 do not exist any data dependences between them. Set *MAX_VF according to
667 the maximum vectorization factor the data dependences allow. */
669 bool
670 vect_slp_analyze_instance_dependence (slp_instance instance)
672 if (dump_enabled_p ())
673 dump_printf_loc (MSG_NOTE, vect_location,
674 "=== vect_slp_analyze_instance_dependence ===\n");
676 /* The stores of this instance are at the root of the SLP tree. */
677 slp_tree store = SLP_INSTANCE_TREE (instance);
678 if (! STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (store)[0])))
679 store = NULL;
681 /* Verify we can sink stores to the vectorized stmt insert location. */
682 gimple *last_store = NULL;
683 if (store)
685 if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
686 return false;
688 /* Mark stores in this instance and remember the last one. */
689 last_store = vect_find_last_scalar_stmt_in_slp (store);
690 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
691 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], true);
694 bool res = true;
696 /* Verify we can sink loads to the vectorized stmt insert location,
697 special-casing stores of this instance. */
698 slp_tree load;
699 unsigned int i;
700 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
701 if (! vect_slp_analyze_node_dependences (instance, load,
702 store
703 ? SLP_TREE_SCALAR_STMTS (store)
704 : vNULL, last_store))
706 res = false;
707 break;
710 /* Unset the visited flag. */
711 if (store)
712 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
713 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], false);
715 return res;
718 /* Record in VINFO the base alignment guarantee given by DRB. STMT is
719 the statement that contains DRB, which is useful for recording in the
720 dump file. */
722 static void
723 vect_record_base_alignment (vec_info *vinfo, gimple *stmt,
724 innermost_loop_behavior *drb)
726 bool existed;
727 innermost_loop_behavior *&entry
728 = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
729 if (!existed || entry->base_alignment < drb->base_alignment)
731 entry = drb;
732 if (dump_enabled_p ())
734 dump_printf_loc (MSG_NOTE, vect_location,
735 "recording new base alignment for ");
736 dump_generic_expr (MSG_NOTE, TDF_SLIM, drb->base_address);
737 dump_printf (MSG_NOTE, "\n");
738 dump_printf_loc (MSG_NOTE, vect_location,
739 " alignment: %d\n", drb->base_alignment);
740 dump_printf_loc (MSG_NOTE, vect_location,
741 " misalignment: %d\n", drb->base_misalignment);
742 dump_printf_loc (MSG_NOTE, vect_location,
743 " based on: ");
744 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
749 /* If the region we're going to vectorize is reached, all unconditional
750 data references occur at least once. We can therefore pool the base
751 alignment guarantees from each unconditional reference. Do this by
752 going through all the data references in VINFO and checking whether
753 the containing statement makes the reference unconditionally. If so,
754 record the alignment of the base address in VINFO so that it can be
755 used for all other references with the same base. */
757 void
758 vect_record_base_alignments (vec_info *vinfo)
760 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
761 struct loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
762 data_reference *dr;
763 unsigned int i;
764 FOR_EACH_VEC_ELT (vinfo->datarefs, i, dr)
765 if (!DR_IS_CONDITIONAL_IN_STMT (dr))
767 gimple *stmt = DR_STMT (dr);
768 vect_record_base_alignment (vinfo, stmt, &DR_INNERMOST (dr));
770 /* If DR is nested in the loop that is being vectorized, we can also
771 record the alignment of the base wrt the outer loop. */
772 if (loop && nested_in_vect_loop_p (loop, stmt))
774 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
775 vect_record_base_alignment
776 (vinfo, stmt, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
781 /* Return the target alignment for the vectorized form of DR. */
783 static unsigned int
784 vect_calculate_target_alignment (struct data_reference *dr)
786 gimple *stmt = DR_STMT (dr);
787 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
788 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
789 return targetm.vectorize.preferred_vector_alignment (vectype);
792 /* Function vect_compute_data_ref_alignment
794 Compute the misalignment of the data reference DR.
796 Output:
797 1. If during the misalignment computation it is found that the data reference
798 cannot be vectorized then false is returned.
799 2. DR_MISALIGNMENT (DR) is defined.
801 FOR NOW: No analysis is actually performed. Misalignment is calculated
802 only for trivial cases. TODO. */
804 bool
805 vect_compute_data_ref_alignment (struct data_reference *dr)
807 gimple *stmt = DR_STMT (dr);
808 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
809 vec_base_alignments *base_alignments = &stmt_info->vinfo->base_alignments;
810 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
811 struct loop *loop = NULL;
812 tree ref = DR_REF (dr);
813 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
815 if (dump_enabled_p ())
816 dump_printf_loc (MSG_NOTE, vect_location,
817 "vect_compute_data_ref_alignment:\n");
819 if (loop_vinfo)
820 loop = LOOP_VINFO_LOOP (loop_vinfo);
822 /* Initialize misalignment to unknown. */
823 SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
825 innermost_loop_behavior *drb = vect_dr_behavior (dr);
826 bool step_preserves_misalignment_p;
828 unsigned HOST_WIDE_INT vector_alignment
829 = vect_calculate_target_alignment (dr) / BITS_PER_UNIT;
830 DR_TARGET_ALIGNMENT (dr) = vector_alignment;
832 /* No step for BB vectorization. */
833 if (!loop)
835 gcc_assert (integer_zerop (drb->step));
836 step_preserves_misalignment_p = true;
839 /* In case the dataref is in an inner-loop of the loop that is being
840 vectorized (LOOP), we use the base and misalignment information
841 relative to the outer-loop (LOOP). This is ok only if the misalignment
842 stays the same throughout the execution of the inner-loop, which is why
843 we have to check that the stride of the dataref in the inner-loop evenly
844 divides by the vector alignment. */
845 else if (nested_in_vect_loop_p (loop, stmt))
847 step_preserves_misalignment_p
848 = (DR_STEP_ALIGNMENT (dr) % vector_alignment) == 0;
850 if (dump_enabled_p ())
852 if (step_preserves_misalignment_p)
853 dump_printf_loc (MSG_NOTE, vect_location,
854 "inner step divides the vector alignment.\n");
855 else
856 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
857 "inner step doesn't divide the vector"
858 " alignment.\n");
862 /* Similarly we can only use base and misalignment information relative to
863 an innermost loop if the misalignment stays the same throughout the
864 execution of the loop. As above, this is the case if the stride of
865 the dataref evenly divides by the alignment. */
866 else
868 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
869 step_preserves_misalignment_p
870 = multiple_p (DR_STEP_ALIGNMENT (dr) * vf, vector_alignment);
872 if (!step_preserves_misalignment_p && dump_enabled_p ())
873 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
874 "step doesn't divide the vector alignment.\n");
877 unsigned int base_alignment = drb->base_alignment;
878 unsigned int base_misalignment = drb->base_misalignment;
880 /* Calculate the maximum of the pooled base address alignment and the
881 alignment that we can compute for DR itself. */
882 innermost_loop_behavior **entry = base_alignments->get (drb->base_address);
883 if (entry && base_alignment < (*entry)->base_alignment)
885 base_alignment = (*entry)->base_alignment;
886 base_misalignment = (*entry)->base_misalignment;
889 if (drb->offset_alignment < vector_alignment
890 || !step_preserves_misalignment_p
891 /* We need to know whether the step wrt the vectorized loop is
892 negative when computing the starting misalignment below. */
893 || TREE_CODE (drb->step) != INTEGER_CST)
895 if (dump_enabled_p ())
897 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
898 "Unknown alignment for access: ");
899 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
900 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
902 return true;
905 if (base_alignment < vector_alignment)
907 tree base = drb->base_address;
908 if (TREE_CODE (base) == ADDR_EXPR)
909 base = TREE_OPERAND (base, 0);
910 if (!vect_can_force_dr_alignment_p (base,
911 vector_alignment * BITS_PER_UNIT))
913 if (dump_enabled_p ())
915 dump_printf_loc (MSG_NOTE, vect_location,
916 "can't force alignment of ref: ");
917 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
918 dump_printf (MSG_NOTE, "\n");
920 return true;
923 /* Force the alignment of the decl.
924 NOTE: This is the only change to the code we make during
925 the analysis phase, before deciding to vectorize the loop. */
926 if (dump_enabled_p ())
928 dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
929 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
930 dump_printf (MSG_NOTE, "\n");
933 DR_VECT_AUX (dr)->base_decl = base;
934 DR_VECT_AUX (dr)->base_misaligned = true;
935 base_misalignment = 0;
937 poly_int64 misalignment
938 = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();
940 /* If this is a backward running DR then first access in the larger
941 vectype actually is N-1 elements before the address in the DR.
942 Adjust misalign accordingly. */
943 if (tree_int_cst_sgn (drb->step) < 0)
944 /* PLUS because STEP is negative. */
945 misalignment += ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
946 * TREE_INT_CST_LOW (drb->step));
948 unsigned int const_misalignment;
949 if (!known_misalignment (misalignment, vector_alignment,
950 &const_misalignment))
952 if (dump_enabled_p ())
954 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
955 "Non-constant misalignment for access: ");
956 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
957 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
959 return true;
962 SET_DR_MISALIGNMENT (dr, const_misalignment);
964 if (dump_enabled_p ())
966 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
967 "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
968 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
969 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
972 return true;
975 /* Function vect_update_misalignment_for_peel.
976 Sets DR's misalignment
977 - to 0 if it has the same alignment as DR_PEEL,
978 - to the misalignment computed using NPEEL if DR's salignment is known,
979 - to -1 (unknown) otherwise.
981 DR - the data reference whose misalignment is to be adjusted.
982 DR_PEEL - the data reference whose misalignment is being made
983 zero in the vector loop by the peel.
984 NPEEL - the number of iterations in the peel loop if the misalignment
985 of DR_PEEL is known at compile time. */
987 static void
988 vect_update_misalignment_for_peel (struct data_reference *dr,
989 struct data_reference *dr_peel, int npeel)
991 unsigned int i;
992 vec<dr_p> same_aligned_drs;
993 struct data_reference *current_dr;
994 int dr_size = vect_get_scalar_dr_size (dr);
995 int dr_peel_size = vect_get_scalar_dr_size (dr_peel);
996 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
997 stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));
999 /* For interleaved data accesses the step in the loop must be multiplied by
1000 the size of the interleaving group. */
1001 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1002 dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
1003 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
1004 dr_peel_size *= GROUP_SIZE (peel_stmt_info);
1006 /* It can be assumed that the data refs with the same alignment as dr_peel
1007 are aligned in the vector loop. */
1008 same_aligned_drs
1009 = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
1010 FOR_EACH_VEC_ELT (same_aligned_drs, i, current_dr)
1012 if (current_dr != dr)
1013 continue;
1014 gcc_assert (!known_alignment_for_access_p (dr)
1015 || !known_alignment_for_access_p (dr_peel)
1016 || (DR_MISALIGNMENT (dr) / dr_size
1017 == DR_MISALIGNMENT (dr_peel) / dr_peel_size));
1018 SET_DR_MISALIGNMENT (dr, 0);
1019 return;
1022 if (known_alignment_for_access_p (dr)
1023 && known_alignment_for_access_p (dr_peel))
1025 bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
1026 int misal = DR_MISALIGNMENT (dr);
1027 misal += negative ? -npeel * dr_size : npeel * dr_size;
1028 misal &= DR_TARGET_ALIGNMENT (dr) - 1;
1029 SET_DR_MISALIGNMENT (dr, misal);
1030 return;
1033 if (dump_enabled_p ())
1034 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1035 "to unknown (-1).\n");
1036 SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
1040 /* Function verify_data_ref_alignment
1042 Return TRUE if DR can be handled with respect to alignment. */
1044 static bool
1045 verify_data_ref_alignment (data_reference_p dr)
1047 enum dr_alignment_support supportable_dr_alignment
1048 = vect_supportable_dr_alignment (dr, false);
1049 if (!supportable_dr_alignment)
1051 if (dump_enabled_p ())
1053 if (DR_IS_READ (dr))
1054 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1055 "not vectorized: unsupported unaligned load.");
1056 else
1057 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1058 "not vectorized: unsupported unaligned "
1059 "store.");
1061 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
1062 DR_REF (dr));
1063 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1065 return false;
1068 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
1069 dump_printf_loc (MSG_NOTE, vect_location,
1070 "Vectorizing an unaligned access.\n");
1072 return true;
1075 /* Function vect_verify_datarefs_alignment
1077 Return TRUE if all data references in the loop can be
1078 handled with respect to alignment. */
1080 bool
1081 vect_verify_datarefs_alignment (loop_vec_info vinfo)
1083 vec<data_reference_p> datarefs = vinfo->datarefs;
1084 struct data_reference *dr;
1085 unsigned int i;
1087 FOR_EACH_VEC_ELT (datarefs, i, dr)
1089 gimple *stmt = DR_STMT (dr);
1090 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1092 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1093 continue;
1095 /* For interleaving, only the alignment of the first access matters. */
1096 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1097 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1098 continue;
1100 /* Strided accesses perform only component accesses, alignment is
1101 irrelevant for them. */
1102 if (STMT_VINFO_STRIDED_P (stmt_info)
1103 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1104 continue;
1106 if (! verify_data_ref_alignment (dr))
1107 return false;
1110 return true;
1113 /* Given an memory reference EXP return whether its alignment is less
1114 than its size. */
1116 static bool
1117 not_size_aligned (tree exp)
1119 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1120 return true;
1122 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1123 > get_object_alignment (exp));
1126 /* Function vector_alignment_reachable_p
1128 Return true if vector alignment for DR is reachable by peeling
1129 a few loop iterations. Return false otherwise. */
1131 static bool
1132 vector_alignment_reachable_p (struct data_reference *dr)
1134 gimple *stmt = DR_STMT (dr);
1135 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1136 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1138 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1140 /* For interleaved access we peel only if number of iterations in
1141 the prolog loop ({VF - misalignment}), is a multiple of the
1142 number of the interleaved accesses. */
1143 int elem_size, mis_in_elements;
1145 /* FORNOW: handle only known alignment. */
1146 if (!known_alignment_for_access_p (dr))
1147 return false;
1149 poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1150 poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
1151 elem_size = vector_element_size (vector_size, nelements);
1152 mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
1154 if (!multiple_p (nelements - mis_in_elements, GROUP_SIZE (stmt_info)))
1155 return false;
1158 /* If misalignment is known at the compile time then allow peeling
1159 only if natural alignment is reachable through peeling. */
1160 if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
1162 HOST_WIDE_INT elmsize =
1163 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1164 if (dump_enabled_p ())
1166 dump_printf_loc (MSG_NOTE, vect_location,
1167 "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1168 dump_printf (MSG_NOTE,
1169 ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
1171 if (DR_MISALIGNMENT (dr) % elmsize)
1173 if (dump_enabled_p ())
1174 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1175 "data size does not divide the misalignment.\n");
1176 return false;
1180 if (!known_alignment_for_access_p (dr))
1182 tree type = TREE_TYPE (DR_REF (dr));
1183 bool is_packed = not_size_aligned (DR_REF (dr));
1184 if (dump_enabled_p ())
1185 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1186 "Unknown misalignment, %snaturally aligned\n",
1187 is_packed ? "not " : "");
1188 return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1191 return true;
1195 /* Calculate the cost of the memory access represented by DR. */
1197 static void
1198 vect_get_data_access_cost (struct data_reference *dr,
1199 unsigned int *inside_cost,
1200 unsigned int *outside_cost,
1201 stmt_vector_for_cost *body_cost_vec)
1203 gimple *stmt = DR_STMT (dr);
1204 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1205 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1206 int ncopies;
1208 if (PURE_SLP_STMT (stmt_info))
1209 ncopies = 1;
1210 else
1211 ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1213 if (DR_IS_READ (dr))
1214 vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1215 NULL, body_cost_vec, false);
1216 else
1217 vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
1219 if (dump_enabled_p ())
1220 dump_printf_loc (MSG_NOTE, vect_location,
1221 "vect_get_data_access_cost: inside_cost = %d, "
1222 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1226 typedef struct _vect_peel_info
1228 struct data_reference *dr;
1229 int npeel;
1230 unsigned int count;
1231 } *vect_peel_info;
1233 typedef struct _vect_peel_extended_info
1235 struct _vect_peel_info peel_info;
1236 unsigned int inside_cost;
1237 unsigned int outside_cost;
1238 } *vect_peel_extended_info;
1241 /* Peeling hashtable helpers. */
1243 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1245 static inline hashval_t hash (const _vect_peel_info *);
1246 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1249 inline hashval_t
1250 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1252 return (hashval_t) peel_info->npeel;
1255 inline bool
1256 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1258 return (a->npeel == b->npeel);
1262 /* Insert DR into peeling hash table with NPEEL as key. */
1264 static void
1265 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1266 loop_vec_info loop_vinfo, struct data_reference *dr,
1267 int npeel)
1269 struct _vect_peel_info elem, *slot;
1270 _vect_peel_info **new_slot;
1271 bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1273 elem.npeel = npeel;
1274 slot = peeling_htab->find (&elem);
1275 if (slot)
1276 slot->count++;
1277 else
1279 slot = XNEW (struct _vect_peel_info);
1280 slot->npeel = npeel;
1281 slot->dr = dr;
1282 slot->count = 1;
1283 new_slot = peeling_htab->find_slot (slot, INSERT);
1284 *new_slot = slot;
1287 if (!supportable_dr_alignment
1288 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1289 slot->count += VECT_MAX_COST;
1293 /* Traverse peeling hash table to find peeling option that aligns maximum
1294 number of data accesses. */
1297 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1298 _vect_peel_extended_info *max)
1300 vect_peel_info elem = *slot;
1302 if (elem->count > max->peel_info.count
1303 || (elem->count == max->peel_info.count
1304 && max->peel_info.npeel > elem->npeel))
1306 max->peel_info.npeel = elem->npeel;
1307 max->peel_info.count = elem->count;
1308 max->peel_info.dr = elem->dr;
1311 return 1;
1314 /* Get the costs of peeling NPEEL iterations checking data access costs
1315 for all data refs. If UNKNOWN_MISALIGNMENT is true, we assume DR0's
1316 misalignment will be zero after peeling. */
1318 static void
1319 vect_get_peeling_costs_all_drs (vec<data_reference_p> datarefs,
1320 struct data_reference *dr0,
1321 unsigned int *inside_cost,
1322 unsigned int *outside_cost,
1323 stmt_vector_for_cost *body_cost_vec,
1324 unsigned int npeel,
1325 bool unknown_misalignment)
1327 unsigned i;
1328 data_reference *dr;
1330 FOR_EACH_VEC_ELT (datarefs, i, dr)
1332 gimple *stmt = DR_STMT (dr);
1333 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1334 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1335 continue;
1337 /* For interleaving, only the alignment of the first access
1338 matters. */
1339 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1340 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1341 continue;
1343 /* Strided accesses perform only component accesses, alignment is
1344 irrelevant for them. */
1345 if (STMT_VINFO_STRIDED_P (stmt_info)
1346 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1347 continue;
1349 int save_misalignment;
1350 save_misalignment = DR_MISALIGNMENT (dr);
1351 if (npeel == 0)
1353 else if (unknown_misalignment && dr == dr0)
1354 SET_DR_MISALIGNMENT (dr, 0);
1355 else
1356 vect_update_misalignment_for_peel (dr, dr0, npeel);
1357 vect_get_data_access_cost (dr, inside_cost, outside_cost,
1358 body_cost_vec);
1359 SET_DR_MISALIGNMENT (dr, save_misalignment);
1363 /* Traverse peeling hash table and calculate cost for each peeling option.
1364 Find the one with the lowest cost. */
1367 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1368 _vect_peel_extended_info *min)
1370 vect_peel_info elem = *slot;
1371 int dummy;
1372 unsigned int inside_cost = 0, outside_cost = 0;
1373 gimple *stmt = DR_STMT (elem->dr);
1374 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1375 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1376 stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1377 epilogue_cost_vec;
1379 prologue_cost_vec.create (2);
1380 body_cost_vec.create (2);
1381 epilogue_cost_vec.create (2);
1383 vect_get_peeling_costs_all_drs (LOOP_VINFO_DATAREFS (loop_vinfo),
1384 elem->dr, &inside_cost, &outside_cost,
1385 &body_cost_vec, elem->npeel, false);
1387 body_cost_vec.release ();
1389 outside_cost += vect_get_known_peeling_cost
1390 (loop_vinfo, elem->npeel, &dummy,
1391 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1392 &prologue_cost_vec, &epilogue_cost_vec);
1394 /* Prologue and epilogue costs are added to the target model later.
1395 These costs depend only on the scalar iteration cost, the
1396 number of peeling iterations finally chosen, and the number of
1397 misaligned statements. So discard the information found here. */
1398 prologue_cost_vec.release ();
1399 epilogue_cost_vec.release ();
1401 if (inside_cost < min->inside_cost
1402 || (inside_cost == min->inside_cost
1403 && outside_cost < min->outside_cost))
1405 min->inside_cost = inside_cost;
1406 min->outside_cost = outside_cost;
1407 min->peel_info.dr = elem->dr;
1408 min->peel_info.npeel = elem->npeel;
1409 min->peel_info.count = elem->count;
1412 return 1;
1416 /* Choose best peeling option by traversing peeling hash table and either
1417 choosing an option with the lowest cost (if cost model is enabled) or the
1418 option that aligns as many accesses as possible. */
1420 static struct _vect_peel_extended_info
1421 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1422 loop_vec_info loop_vinfo)
1424 struct _vect_peel_extended_info res;
1426 res.peel_info.dr = NULL;
1428 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1430 res.inside_cost = INT_MAX;
1431 res.outside_cost = INT_MAX;
1432 peeling_htab->traverse <_vect_peel_extended_info *,
1433 vect_peeling_hash_get_lowest_cost> (&res);
1435 else
1437 res.peel_info.count = 0;
1438 peeling_htab->traverse <_vect_peel_extended_info *,
1439 vect_peeling_hash_get_most_frequent> (&res);
1440 res.inside_cost = 0;
1441 res.outside_cost = 0;
1444 return res;
1447 /* Return true if the new peeling NPEEL is supported. */
1449 static bool
1450 vect_peeling_supportable (loop_vec_info loop_vinfo, struct data_reference *dr0,
1451 unsigned npeel)
1453 unsigned i;
1454 struct data_reference *dr = NULL;
1455 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1456 gimple *stmt;
1457 stmt_vec_info stmt_info;
1458 enum dr_alignment_support supportable_dr_alignment;
1460 /* Ensure that all data refs can be vectorized after the peel. */
1461 FOR_EACH_VEC_ELT (datarefs, i, dr)
1463 int save_misalignment;
1465 if (dr == dr0)
1466 continue;
1468 stmt = DR_STMT (dr);
1469 stmt_info = vinfo_for_stmt (stmt);
1470 /* For interleaving, only the alignment of the first access
1471 matters. */
1472 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1473 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1474 continue;
1476 /* Strided accesses perform only component accesses, alignment is
1477 irrelevant for them. */
1478 if (STMT_VINFO_STRIDED_P (stmt_info)
1479 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1480 continue;
1482 save_misalignment = DR_MISALIGNMENT (dr);
1483 vect_update_misalignment_for_peel (dr, dr0, npeel);
1484 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1485 SET_DR_MISALIGNMENT (dr, save_misalignment);
1487 if (!supportable_dr_alignment)
1488 return false;
1491 return true;
1494 /* Function vect_enhance_data_refs_alignment
1496 This pass will use loop versioning and loop peeling in order to enhance
1497 the alignment of data references in the loop.
1499 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1500 original loop is to be vectorized. Any other loops that are created by
1501 the transformations performed in this pass - are not supposed to be
1502 vectorized. This restriction will be relaxed.
1504 This pass will require a cost model to guide it whether to apply peeling
1505 or versioning or a combination of the two. For example, the scheme that
1506 intel uses when given a loop with several memory accesses, is as follows:
1507 choose one memory access ('p') which alignment you want to force by doing
1508 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1509 other accesses are not necessarily aligned, or (2) use loop versioning to
1510 generate one loop in which all accesses are aligned, and another loop in
1511 which only 'p' is necessarily aligned.
1513 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1514 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Ximmin Tian, International
1515 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1517 Devising a cost model is the most critical aspect of this work. It will
1518 guide us on which access to peel for, whether to use loop versioning, how
1519 many versions to create, etc. The cost model will probably consist of
1520 generic considerations as well as target specific considerations (on
1521 powerpc for example, misaligned stores are more painful than misaligned
1522 loads).
1524 Here are the general steps involved in alignment enhancements:
1526 -- original loop, before alignment analysis:
1527 for (i=0; i<N; i++){
1528 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1529 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1532 -- After vect_compute_data_refs_alignment:
1533 for (i=0; i<N; i++){
1534 x = q[i]; # DR_MISALIGNMENT(q) = 3
1535 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1538 -- Possibility 1: we do loop versioning:
1539 if (p is aligned) {
1540 for (i=0; i<N; i++){ # loop 1A
1541 x = q[i]; # DR_MISALIGNMENT(q) = 3
1542 p[i] = y; # DR_MISALIGNMENT(p) = 0
1545 else {
1546 for (i=0; i<N; i++){ # loop 1B
1547 x = q[i]; # DR_MISALIGNMENT(q) = 3
1548 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1552 -- Possibility 2: we do loop peeling:
1553 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1554 x = q[i];
1555 p[i] = y;
1557 for (i = 3; i < N; i++){ # loop 2A
1558 x = q[i]; # DR_MISALIGNMENT(q) = 0
1559 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1562 -- Possibility 3: combination of loop peeling and versioning:
1563 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1564 x = q[i];
1565 p[i] = y;
1567 if (p is aligned) {
1568 for (i = 3; i<N; i++){ # loop 3A
1569 x = q[i]; # DR_MISALIGNMENT(q) = 0
1570 p[i] = y; # DR_MISALIGNMENT(p) = 0
1573 else {
1574 for (i = 3; i<N; i++){ # loop 3B
1575 x = q[i]; # DR_MISALIGNMENT(q) = 0
1576 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1580 These loops are later passed to loop_transform to be vectorized. The
1581 vectorizer will use the alignment information to guide the transformation
1582 (whether to generate regular loads/stores, or with special handling for
1583 misalignment). */
1585 bool
1586 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1588 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1589 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1590 enum dr_alignment_support supportable_dr_alignment;
1591 struct data_reference *dr0 = NULL, *first_store = NULL;
1592 struct data_reference *dr;
1593 unsigned int i, j;
1594 bool do_peeling = false;
1595 bool do_versioning = false;
1596 bool stat;
1597 gimple *stmt;
1598 stmt_vec_info stmt_info;
1599 unsigned int npeel = 0;
1600 bool one_misalignment_known = false;
1601 bool one_misalignment_unknown = false;
1602 bool one_dr_unsupportable = false;
1603 struct data_reference *unsupportable_dr = NULL;
1604 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1605 unsigned possible_npeel_number = 1;
1606 tree vectype;
1607 unsigned int mis, same_align_drs_max = 0;
1608 hash_table<peel_info_hasher> peeling_htab (1);
1610 if (dump_enabled_p ())
1611 dump_printf_loc (MSG_NOTE, vect_location,
1612 "=== vect_enhance_data_refs_alignment ===\n");
1614 /* Reset data so we can safely be called multiple times. */
1615 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1616 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1618 /* While cost model enhancements are expected in the future, the high level
1619 view of the code at this time is as follows:
1621 A) If there is a misaligned access then see if peeling to align
1622 this access can make all data references satisfy
1623 vect_supportable_dr_alignment. If so, update data structures
1624 as needed and return true.
1626 B) If peeling wasn't possible and there is a data reference with an
1627 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1628 then see if loop versioning checks can be used to make all data
1629 references satisfy vect_supportable_dr_alignment. If so, update
1630 data structures as needed and return true.
1632 C) If neither peeling nor versioning were successful then return false if
1633 any data reference does not satisfy vect_supportable_dr_alignment.
1635 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1637 Note, Possibility 3 above (which is peeling and versioning together) is not
1638 being done at this time. */
1640 /* (1) Peeling to force alignment. */
1642 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1643 Considerations:
1644 + How many accesses will become aligned due to the peeling
1645 - How many accesses will become unaligned due to the peeling,
1646 and the cost of misaligned accesses.
1647 - The cost of peeling (the extra runtime checks, the increase
1648 in code size). */
1650 FOR_EACH_VEC_ELT (datarefs, i, dr)
1652 stmt = DR_STMT (dr);
1653 stmt_info = vinfo_for_stmt (stmt);
1655 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1656 continue;
1658 /* For interleaving, only the alignment of the first access
1659 matters. */
1660 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1661 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1662 continue;
1664 /* For invariant accesses there is nothing to enhance. */
1665 if (integer_zerop (DR_STEP (dr)))
1666 continue;
1668 /* Strided accesses perform only component accesses, alignment is
1669 irrelevant for them. */
1670 if (STMT_VINFO_STRIDED_P (stmt_info)
1671 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1672 continue;
1674 supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1675 do_peeling = vector_alignment_reachable_p (dr);
1676 if (do_peeling)
1678 if (known_alignment_for_access_p (dr))
1680 unsigned int npeel_tmp = 0;
1681 bool negative = tree_int_cst_compare (DR_STEP (dr),
1682 size_zero_node) < 0;
1684 vectype = STMT_VINFO_VECTYPE (stmt_info);
1685 unsigned int target_align = DR_TARGET_ALIGNMENT (dr);
1686 unsigned int dr_size = vect_get_scalar_dr_size (dr);
1687 mis = (negative ? DR_MISALIGNMENT (dr) : -DR_MISALIGNMENT (dr));
1688 if (DR_MISALIGNMENT (dr) != 0)
1689 npeel_tmp = (mis & (target_align - 1)) / dr_size;
1691 /* For multiple types, it is possible that the bigger type access
1692 will have more than one peeling option. E.g., a loop with two
1693 types: one of size (vector size / 4), and the other one of
1694 size (vector size / 8). Vectorization factor will 8. If both
1695 accesses are misaligned by 3, the first one needs one scalar
1696 iteration to be aligned, and the second one needs 5. But the
1697 first one will be aligned also by peeling 5 scalar
1698 iterations, and in that case both accesses will be aligned.
1699 Hence, except for the immediate peeling amount, we also want
1700 to try to add full vector size, while we don't exceed
1701 vectorization factor.
1702 We do this automatically for cost model, since we calculate
1703 cost for every peeling option. */
1704 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1706 poly_uint64 nscalars = (STMT_SLP_TYPE (stmt_info)
1707 ? vf * GROUP_SIZE (stmt_info) : vf);
1708 possible_npeel_number
1709 = vect_get_num_vectors (nscalars, vectype);
1711 /* NPEEL_TMP is 0 when there is no misalignment, but also
1712 allow peeling NELEMENTS. */
1713 if (DR_MISALIGNMENT (dr) == 0)
1714 possible_npeel_number++;
1717 /* Save info about DR in the hash table. Also include peeling
1718 amounts according to the explanation above. */
1719 for (j = 0; j < possible_npeel_number; j++)
1721 vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1722 dr, npeel_tmp);
1723 npeel_tmp += target_align / dr_size;
1726 one_misalignment_known = true;
1728 else
1730 /* If we don't know any misalignment values, we prefer
1731 peeling for data-ref that has the maximum number of data-refs
1732 with the same alignment, unless the target prefers to align
1733 stores over load. */
1734 unsigned same_align_drs
1735 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1736 if (!dr0
1737 || same_align_drs_max < same_align_drs)
1739 same_align_drs_max = same_align_drs;
1740 dr0 = dr;
1742 /* For data-refs with the same number of related
1743 accesses prefer the one where the misalign
1744 computation will be invariant in the outermost loop. */
1745 else if (same_align_drs_max == same_align_drs)
1747 struct loop *ivloop0, *ivloop;
1748 ivloop0 = outermost_invariant_loop_for_expr
1749 (loop, DR_BASE_ADDRESS (dr0));
1750 ivloop = outermost_invariant_loop_for_expr
1751 (loop, DR_BASE_ADDRESS (dr));
1752 if ((ivloop && !ivloop0)
1753 || (ivloop && ivloop0
1754 && flow_loop_nested_p (ivloop, ivloop0)))
1755 dr0 = dr;
1758 one_misalignment_unknown = true;
1760 /* Check for data refs with unsupportable alignment that
1761 can be peeled. */
1762 if (!supportable_dr_alignment)
1764 one_dr_unsupportable = true;
1765 unsupportable_dr = dr;
1768 if (!first_store && DR_IS_WRITE (dr))
1769 first_store = dr;
1772 else
1774 if (!aligned_access_p (dr))
1776 if (dump_enabled_p ())
1777 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1778 "vector alignment may not be reachable\n");
1779 break;
1784 /* Check if we can possibly peel the loop. */
1785 if (!vect_can_advance_ivs_p (loop_vinfo)
1786 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1787 || loop->inner)
1788 do_peeling = false;
1790 struct _vect_peel_extended_info peel_for_known_alignment;
1791 struct _vect_peel_extended_info peel_for_unknown_alignment;
1792 struct _vect_peel_extended_info best_peel;
1794 peel_for_unknown_alignment.inside_cost = INT_MAX;
1795 peel_for_unknown_alignment.outside_cost = INT_MAX;
1796 peel_for_unknown_alignment.peel_info.count = 0;
1798 if (do_peeling
1799 && one_misalignment_unknown)
1801 /* Check if the target requires to prefer stores over loads, i.e., if
1802 misaligned stores are more expensive than misaligned loads (taking
1803 drs with same alignment into account). */
1804 unsigned int load_inside_cost = 0;
1805 unsigned int load_outside_cost = 0;
1806 unsigned int store_inside_cost = 0;
1807 unsigned int store_outside_cost = 0;
1808 unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;
1810 stmt_vector_for_cost dummy;
1811 dummy.create (2);
1812 vect_get_peeling_costs_all_drs (datarefs, dr0,
1813 &load_inside_cost,
1814 &load_outside_cost,
1815 &dummy, estimated_npeels, true);
1816 dummy.release ();
1818 if (first_store)
1820 dummy.create (2);
1821 vect_get_peeling_costs_all_drs (datarefs, first_store,
1822 &store_inside_cost,
1823 &store_outside_cost,
1824 &dummy, estimated_npeels, true);
1825 dummy.release ();
1827 else
1829 store_inside_cost = INT_MAX;
1830 store_outside_cost = INT_MAX;
1833 if (load_inside_cost > store_inside_cost
1834 || (load_inside_cost == store_inside_cost
1835 && load_outside_cost > store_outside_cost))
1837 dr0 = first_store;
1838 peel_for_unknown_alignment.inside_cost = store_inside_cost;
1839 peel_for_unknown_alignment.outside_cost = store_outside_cost;
1841 else
1843 peel_for_unknown_alignment.inside_cost = load_inside_cost;
1844 peel_for_unknown_alignment.outside_cost = load_outside_cost;
1847 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1848 prologue_cost_vec.create (2);
1849 epilogue_cost_vec.create (2);
1851 int dummy2;
1852 peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
1853 (loop_vinfo, estimated_npeels, &dummy2,
1854 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1855 &prologue_cost_vec, &epilogue_cost_vec);
1857 prologue_cost_vec.release ();
1858 epilogue_cost_vec.release ();
1860 peel_for_unknown_alignment.peel_info.count = 1
1861 + STMT_VINFO_SAME_ALIGN_REFS
1862 (vinfo_for_stmt (DR_STMT (dr0))).length ();
1865 peel_for_unknown_alignment.peel_info.npeel = 0;
1866 peel_for_unknown_alignment.peel_info.dr = dr0;
1868 best_peel = peel_for_unknown_alignment;
1870 peel_for_known_alignment.inside_cost = INT_MAX;
1871 peel_for_known_alignment.outside_cost = INT_MAX;
1872 peel_for_known_alignment.peel_info.count = 0;
1873 peel_for_known_alignment.peel_info.dr = NULL;
1875 if (do_peeling && one_misalignment_known)
1877 /* Peeling is possible, but there is no data access that is not supported
1878 unless aligned. So we try to choose the best possible peeling from
1879 the hash table. */
1880 peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
1881 (&peeling_htab, loop_vinfo);
1884 /* Compare costs of peeling for known and unknown alignment. */
1885 if (peel_for_known_alignment.peel_info.dr != NULL
1886 && peel_for_unknown_alignment.inside_cost
1887 >= peel_for_known_alignment.inside_cost)
1889 best_peel = peel_for_known_alignment;
1891 /* If the best peeling for known alignment has NPEEL == 0, perform no
1892 peeling at all except if there is an unsupportable dr that we can
1893 align. */
1894 if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
1895 do_peeling = false;
1898 /* If there is an unsupportable data ref, prefer this over all choices so far
1899 since we'd have to discard a chosen peeling except when it accidentally
1900 aligned the unsupportable data ref. */
1901 if (one_dr_unsupportable)
1902 dr0 = unsupportable_dr;
1903 else if (do_peeling)
1905 /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
1906 TODO: Use nopeel_outside_cost or get rid of it? */
1907 unsigned nopeel_inside_cost = 0;
1908 unsigned nopeel_outside_cost = 0;
1910 stmt_vector_for_cost dummy;
1911 dummy.create (2);
1912 vect_get_peeling_costs_all_drs (datarefs, NULL, &nopeel_inside_cost,
1913 &nopeel_outside_cost, &dummy, 0, false);
1914 dummy.release ();
1916 /* Add epilogue costs. As we do not peel for alignment here, no prologue
1917 costs will be recorded. */
1918 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1919 prologue_cost_vec.create (2);
1920 epilogue_cost_vec.create (2);
1922 int dummy2;
1923 nopeel_outside_cost += vect_get_known_peeling_cost
1924 (loop_vinfo, 0, &dummy2,
1925 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1926 &prologue_cost_vec, &epilogue_cost_vec);
1928 prologue_cost_vec.release ();
1929 epilogue_cost_vec.release ();
1931 npeel = best_peel.peel_info.npeel;
1932 dr0 = best_peel.peel_info.dr;
1934 /* If no peeling is not more expensive than the best peeling we
1935 have so far, don't perform any peeling. */
1936 if (nopeel_inside_cost <= best_peel.inside_cost)
1937 do_peeling = false;
1940 if (do_peeling)
1942 stmt = DR_STMT (dr0);
1943 stmt_info = vinfo_for_stmt (stmt);
1944 vectype = STMT_VINFO_VECTYPE (stmt_info);
1946 if (known_alignment_for_access_p (dr0))
1948 bool negative = tree_int_cst_compare (DR_STEP (dr0),
1949 size_zero_node) < 0;
1950 if (!npeel)
1952 /* Since it's known at compile time, compute the number of
1953 iterations in the peeled loop (the peeling factor) for use in
1954 updating DR_MISALIGNMENT values. The peeling factor is the
1955 vectorization factor minus the misalignment as an element
1956 count. */
1957 mis = negative ? DR_MISALIGNMENT (dr0) : -DR_MISALIGNMENT (dr0);
1958 unsigned int target_align = DR_TARGET_ALIGNMENT (dr0);
1959 npeel = ((mis & (target_align - 1))
1960 / vect_get_scalar_dr_size (dr0));
1963 /* For interleaved data access every iteration accesses all the
1964 members of the group, therefore we divide the number of iterations
1965 by the group size. */
1966 stmt_info = vinfo_for_stmt (DR_STMT (dr0));
1967 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1968 npeel /= GROUP_SIZE (stmt_info);
1970 if (dump_enabled_p ())
1971 dump_printf_loc (MSG_NOTE, vect_location,
1972 "Try peeling by %d\n", npeel);
1975 /* Ensure that all datarefs can be vectorized after the peel. */
1976 if (!vect_peeling_supportable (loop_vinfo, dr0, npeel))
1977 do_peeling = false;
1979 /* Check if all datarefs are supportable and log. */
1980 if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
1982 stat = vect_verify_datarefs_alignment (loop_vinfo);
1983 if (!stat)
1984 do_peeling = false;
1985 else
1986 return stat;
1989 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
1990 if (do_peeling)
1992 unsigned max_allowed_peel
1993 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
1994 if (max_allowed_peel != (unsigned)-1)
1996 unsigned max_peel = npeel;
1997 if (max_peel == 0)
1999 unsigned int target_align = DR_TARGET_ALIGNMENT (dr0);
2000 max_peel = target_align / vect_get_scalar_dr_size (dr0) - 1;
2002 if (max_peel > max_allowed_peel)
2004 do_peeling = false;
2005 if (dump_enabled_p ())
2006 dump_printf_loc (MSG_NOTE, vect_location,
2007 "Disable peeling, max peels reached: %d\n", max_peel);
2012 /* Cost model #2 - if peeling may result in a remaining loop not
2013 iterating enough to be vectorized then do not peel. Since this
2014 is a cost heuristic rather than a correctness decision, use the
2015 most likely runtime value for variable vectorization factors. */
2016 if (do_peeling
2017 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2019 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2020 unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
2021 if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
2022 < assumed_vf + max_peel)
2023 do_peeling = false;
2026 if (do_peeling)
2028 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2029 If the misalignment of DR_i is identical to that of dr0 then set
2030 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
2031 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2032 by the peeling factor times the element size of DR_i (MOD the
2033 vectorization factor times the size). Otherwise, the
2034 misalignment of DR_i must be set to unknown. */
2035 FOR_EACH_VEC_ELT (datarefs, i, dr)
2036 if (dr != dr0)
2038 /* Strided accesses perform only component accesses, alignment
2039 is irrelevant for them. */
2040 stmt_info = vinfo_for_stmt (DR_STMT (dr));
2041 if (STMT_VINFO_STRIDED_P (stmt_info)
2042 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2043 continue;
2045 vect_update_misalignment_for_peel (dr, dr0, npeel);
2048 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
2049 if (npeel)
2050 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2051 else
2052 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2053 = DR_MISALIGNMENT (dr0);
2054 SET_DR_MISALIGNMENT (dr0, 0);
2055 if (dump_enabled_p ())
2057 dump_printf_loc (MSG_NOTE, vect_location,
2058 "Alignment of access forced using peeling.\n");
2059 dump_printf_loc (MSG_NOTE, vect_location,
2060 "Peeling for alignment will be applied.\n");
2063 /* The inside-loop cost will be accounted for in vectorizable_load
2064 and vectorizable_store correctly with adjusted alignments.
2065 Drop the body_cst_vec on the floor here. */
2066 stat = vect_verify_datarefs_alignment (loop_vinfo);
2067 gcc_assert (stat);
2068 return stat;
2072 /* (2) Versioning to force alignment. */
2074 /* Try versioning if:
2075 1) optimize loop for speed
2076 2) there is at least one unsupported misaligned data ref with an unknown
2077 misalignment, and
2078 3) all misaligned data refs with a known misalignment are supported, and
2079 4) the number of runtime alignment checks is within reason. */
2081 do_versioning =
2082 optimize_loop_nest_for_speed_p (loop)
2083 && (!loop->inner); /* FORNOW */
2085 if (do_versioning)
2087 FOR_EACH_VEC_ELT (datarefs, i, dr)
2089 stmt = DR_STMT (dr);
2090 stmt_info = vinfo_for_stmt (stmt);
2092 /* For interleaving, only the alignment of the first access
2093 matters. */
2094 if (aligned_access_p (dr)
2095 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2096 && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
2097 continue;
2099 if (STMT_VINFO_STRIDED_P (stmt_info))
2101 /* Strided loads perform only component accesses, alignment is
2102 irrelevant for them. */
2103 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
2104 continue;
2105 do_versioning = false;
2106 break;
2109 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
2111 if (!supportable_dr_alignment)
2113 gimple *stmt;
2114 int mask;
2115 tree vectype;
2117 if (known_alignment_for_access_p (dr)
2118 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2119 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
2121 do_versioning = false;
2122 break;
2125 stmt = DR_STMT (dr);
2126 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
2127 gcc_assert (vectype);
2129 /* At present we don't support versioning for alignment
2130 with variable VF, since there's no guarantee that the
2131 VF is a power of two. We could relax this if we added
2132 a way of enforcing a power-of-two size. */
2133 unsigned HOST_WIDE_INT size;
2134 if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
2136 do_versioning = false;
2137 break;
2140 /* The rightmost bits of an aligned address must be zeros.
2141 Construct the mask needed for this test. For example,
2142 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2143 mask must be 15 = 0xf. */
2144 mask = size - 1;
2146 /* FORNOW: use the same mask to test all potentially unaligned
2147 references in the loop. The vectorizer currently supports
2148 a single vector size, see the reference to
2149 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
2150 vectorization factor is computed. */
2151 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
2152 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
2153 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2154 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
2155 DR_STMT (dr));
2159 /* Versioning requires at least one misaligned data reference. */
2160 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2161 do_versioning = false;
2162 else if (!do_versioning)
2163 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2166 if (do_versioning)
2168 vec<gimple *> may_misalign_stmts
2169 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2170 gimple *stmt;
2172 /* It can now be assumed that the data references in the statements
2173 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2174 of the loop being vectorized. */
2175 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
2177 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2178 dr = STMT_VINFO_DATA_REF (stmt_info);
2179 SET_DR_MISALIGNMENT (dr, 0);
2180 if (dump_enabled_p ())
2181 dump_printf_loc (MSG_NOTE, vect_location,
2182 "Alignment of access forced using versioning.\n");
2185 if (dump_enabled_p ())
2186 dump_printf_loc (MSG_NOTE, vect_location,
2187 "Versioning for alignment will be applied.\n");
2189 /* Peeling and versioning can't be done together at this time. */
2190 gcc_assert (! (do_peeling && do_versioning));
2192 stat = vect_verify_datarefs_alignment (loop_vinfo);
2193 gcc_assert (stat);
2194 return stat;
2197 /* This point is reached if neither peeling nor versioning is being done. */
2198 gcc_assert (! (do_peeling || do_versioning));
2200 stat = vect_verify_datarefs_alignment (loop_vinfo);
2201 return stat;
2205 /* Function vect_find_same_alignment_drs.
2207 Update group and alignment relations according to the chosen
2208 vectorization factor. */
2210 static void
2211 vect_find_same_alignment_drs (struct data_dependence_relation *ddr)
2213 struct data_reference *dra = DDR_A (ddr);
2214 struct data_reference *drb = DDR_B (ddr);
2215 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2216 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2218 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
2219 return;
2221 if (dra == drb)
2222 return;
2224 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)
2225 || !operand_equal_p (DR_OFFSET (dra), DR_OFFSET (drb), 0)
2226 || !operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2227 return;
2229 /* Two references with distance zero have the same alignment. */
2230 poly_offset_int diff = (wi::to_poly_offset (DR_INIT (dra))
2231 - wi::to_poly_offset (DR_INIT (drb)));
2232 if (maybe_ne (diff, 0))
2234 /* Get the wider of the two alignments. */
2235 unsigned int align_a = (vect_calculate_target_alignment (dra)
2236 / BITS_PER_UNIT);
2237 unsigned int align_b = (vect_calculate_target_alignment (drb)
2238 / BITS_PER_UNIT);
2239 unsigned int max_align = MAX (align_a, align_b);
2241 /* Require the gap to be a multiple of the larger vector alignment. */
2242 if (!multiple_p (diff, max_align))
2243 return;
2246 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2247 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
2248 if (dump_enabled_p ())
2250 dump_printf_loc (MSG_NOTE, vect_location,
2251 "accesses have the same alignment: ");
2252 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2253 dump_printf (MSG_NOTE, " and ");
2254 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2255 dump_printf (MSG_NOTE, "\n");
2260 /* Function vect_analyze_data_refs_alignment
2262 Analyze the alignment of the data-references in the loop.
2263 Return FALSE if a data reference is found that cannot be vectorized. */
2265 bool
2266 vect_analyze_data_refs_alignment (loop_vec_info vinfo)
2268 if (dump_enabled_p ())
2269 dump_printf_loc (MSG_NOTE, vect_location,
2270 "=== vect_analyze_data_refs_alignment ===\n");
2272 /* Mark groups of data references with same alignment using
2273 data dependence information. */
2274 vec<ddr_p> ddrs = vinfo->ddrs;
2275 struct data_dependence_relation *ddr;
2276 unsigned int i;
2278 FOR_EACH_VEC_ELT (ddrs, i, ddr)
2279 vect_find_same_alignment_drs (ddr);
2281 vec<data_reference_p> datarefs = vinfo->datarefs;
2282 struct data_reference *dr;
2284 vect_record_base_alignments (vinfo);
2285 FOR_EACH_VEC_ELT (datarefs, i, dr)
2287 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
2288 if (STMT_VINFO_VECTORIZABLE (stmt_info)
2289 && !vect_compute_data_ref_alignment (dr))
2291 /* Strided accesses perform only component accesses, misalignment
2292 information is irrelevant for them. */
2293 if (STMT_VINFO_STRIDED_P (stmt_info)
2294 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2295 continue;
2297 if (dump_enabled_p ())
2298 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2299 "not vectorized: can't calculate alignment "
2300 "for data ref.\n");
2302 return false;
2306 return true;
2310 /* Analyze alignment of DRs of stmts in NODE. */
2312 static bool
2313 vect_slp_analyze_and_verify_node_alignment (slp_tree node)
2315 /* We vectorize from the first scalar stmt in the node unless
2316 the node is permuted in which case we start from the first
2317 element in the group. */
2318 gimple *first_stmt = SLP_TREE_SCALAR_STMTS (node)[0];
2319 data_reference_p first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2320 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2321 first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_stmt));
2323 data_reference_p dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2324 if (! vect_compute_data_ref_alignment (dr)
2325 /* For creating the data-ref pointer we need alignment of the
2326 first element anyway. */
2327 || (dr != first_dr
2328 && ! vect_compute_data_ref_alignment (first_dr))
2329 || ! verify_data_ref_alignment (dr))
2331 if (dump_enabled_p ())
2332 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2333 "not vectorized: bad data alignment in basic "
2334 "block.\n");
2335 return false;
2338 return true;
2341 /* Function vect_slp_analyze_instance_alignment
2343 Analyze the alignment of the data-references in the SLP instance.
2344 Return FALSE if a data reference is found that cannot be vectorized. */
2346 bool
2347 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
2349 if (dump_enabled_p ())
2350 dump_printf_loc (MSG_NOTE, vect_location,
2351 "=== vect_slp_analyze_and_verify_instance_alignment ===\n");
2353 slp_tree node;
2354 unsigned i;
2355 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2356 if (! vect_slp_analyze_and_verify_node_alignment (node))
2357 return false;
2359 node = SLP_INSTANCE_TREE (instance);
2360 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]))
2361 && ! vect_slp_analyze_and_verify_node_alignment
2362 (SLP_INSTANCE_TREE (instance)))
2363 return false;
2365 return true;
2369 /* Analyze groups of accesses: check that DR belongs to a group of
2370 accesses of legal size, step, etc. Detect gaps, single element
2371 interleaving, and other special cases. Set grouped access info.
2372 Collect groups of strided stores for further use in SLP analysis.
2373 Worker for vect_analyze_group_access. */
2375 static bool
2376 vect_analyze_group_access_1 (struct data_reference *dr)
2378 tree step = DR_STEP (dr);
2379 tree scalar_type = TREE_TYPE (DR_REF (dr));
2380 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2381 gimple *stmt = DR_STMT (dr);
2382 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2383 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2384 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2385 HOST_WIDE_INT dr_step = -1;
2386 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2387 bool slp_impossible = false;
2389 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2390 size of the interleaving group (including gaps). */
2391 if (tree_fits_shwi_p (step))
2393 dr_step = tree_to_shwi (step);
2394 /* Check that STEP is a multiple of type size. Otherwise there is
2395 a non-element-sized gap at the end of the group which we
2396 cannot represent in GROUP_GAP or GROUP_SIZE.
2397 ??? As we can handle non-constant step fine here we should
2398 simply remove uses of GROUP_GAP between the last and first
2399 element and instead rely on DR_STEP. GROUP_SIZE then would
2400 simply not include that gap. */
2401 if ((dr_step % type_size) != 0)
2403 if (dump_enabled_p ())
2405 dump_printf_loc (MSG_NOTE, vect_location,
2406 "Step ");
2407 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2408 dump_printf (MSG_NOTE,
2409 " is not a multiple of the element size for ");
2410 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2411 dump_printf (MSG_NOTE, "\n");
2413 return false;
2415 groupsize = absu_hwi (dr_step) / type_size;
2417 else
2418 groupsize = 0;
2420 /* Not consecutive access is possible only if it is a part of interleaving. */
2421 if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2423 /* Check if it this DR is a part of interleaving, and is a single
2424 element of the group that is accessed in the loop. */
2426 /* Gaps are supported only for loads. STEP must be a multiple of the type
2427 size. The size of the group must be a power of 2. */
2428 if (DR_IS_READ (dr)
2429 && (dr_step % type_size) == 0
2430 && groupsize > 0
2431 && pow2p_hwi (groupsize))
2433 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2434 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2435 GROUP_GAP (stmt_info) = groupsize - 1;
2436 if (dump_enabled_p ())
2438 dump_printf_loc (MSG_NOTE, vect_location,
2439 "Detected single element interleaving ");
2440 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2441 dump_printf (MSG_NOTE, " step ");
2442 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2443 dump_printf (MSG_NOTE, "\n");
2446 return true;
2449 if (dump_enabled_p ())
2451 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2452 "not consecutive access ");
2453 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2456 if (bb_vinfo)
2458 /* Mark the statement as unvectorizable. */
2459 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2460 return true;
2463 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2464 STMT_VINFO_STRIDED_P (stmt_info) = true;
2465 return true;
2468 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2470 /* First stmt in the interleaving chain. Check the chain. */
2471 gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2472 struct data_reference *data_ref = dr;
2473 unsigned int count = 1;
2474 tree prev_init = DR_INIT (data_ref);
2475 gimple *prev = stmt;
2476 HOST_WIDE_INT diff, gaps = 0;
2478 /* By construction, all group members have INTEGER_CST DR_INITs. */
2479 while (next)
2481 /* Skip same data-refs. In case that two or more stmts share
2482 data-ref (supported only for loads), we vectorize only the first
2483 stmt, and the rest get their vectorized loads from the first
2484 one. */
2485 if (!tree_int_cst_compare (DR_INIT (data_ref),
2486 DR_INIT (STMT_VINFO_DATA_REF (
2487 vinfo_for_stmt (next)))))
2489 if (DR_IS_WRITE (data_ref))
2491 if (dump_enabled_p ())
2492 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2493 "Two store stmts share the same dr.\n");
2494 return false;
2497 if (dump_enabled_p ())
2498 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2499 "Two or more load stmts share the same dr.\n");
2501 /* For load use the same data-ref load. */
2502 GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2504 prev = next;
2505 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2506 continue;
2509 prev = next;
2510 data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2512 /* All group members have the same STEP by construction. */
2513 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2515 /* Check that the distance between two accesses is equal to the type
2516 size. Otherwise, we have gaps. */
2517 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2518 - TREE_INT_CST_LOW (prev_init)) / type_size;
2519 if (diff != 1)
2521 /* FORNOW: SLP of accesses with gaps is not supported. */
2522 slp_impossible = true;
2523 if (DR_IS_WRITE (data_ref))
2525 if (dump_enabled_p ())
2526 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2527 "interleaved store with gaps\n");
2528 return false;
2531 gaps += diff - 1;
2534 last_accessed_element += diff;
2536 /* Store the gap from the previous member of the group. If there is no
2537 gap in the access, GROUP_GAP is always 1. */
2538 GROUP_GAP (vinfo_for_stmt (next)) = diff;
2540 prev_init = DR_INIT (data_ref);
2541 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2542 /* Count the number of data-refs in the chain. */
2543 count++;
2546 if (groupsize == 0)
2547 groupsize = count + gaps;
2549 /* This could be UINT_MAX but as we are generating code in a very
2550 inefficient way we have to cap earlier. See PR78699 for example. */
2551 if (groupsize > 4096)
2553 if (dump_enabled_p ())
2554 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2555 "group is too large\n");
2556 return false;
2559 /* Check that the size of the interleaving is equal to count for stores,
2560 i.e., that there are no gaps. */
2561 if (groupsize != count
2562 && !DR_IS_READ (dr))
2564 if (dump_enabled_p ())
2565 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2566 "interleaved store with gaps\n");
2567 return false;
2570 /* If there is a gap after the last load in the group it is the
2571 difference between the groupsize and the last accessed
2572 element.
2573 When there is no gap, this difference should be 0. */
2574 GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - last_accessed_element;
2576 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2577 if (dump_enabled_p ())
2579 dump_printf_loc (MSG_NOTE, vect_location,
2580 "Detected interleaving ");
2581 if (DR_IS_READ (dr))
2582 dump_printf (MSG_NOTE, "load ");
2583 else
2584 dump_printf (MSG_NOTE, "store ");
2585 dump_printf (MSG_NOTE, "of size %u starting with ",
2586 (unsigned)groupsize);
2587 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
2588 if (GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
2589 dump_printf_loc (MSG_NOTE, vect_location,
2590 "There is a gap of %u elements after the group\n",
2591 GROUP_GAP (vinfo_for_stmt (stmt)));
2594 /* SLP: create an SLP data structure for every interleaving group of
2595 stores for further analysis in vect_analyse_slp. */
2596 if (DR_IS_WRITE (dr) && !slp_impossible)
2598 if (loop_vinfo)
2599 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2600 if (bb_vinfo)
2601 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2605 return true;
2608 /* Analyze groups of accesses: check that DR belongs to a group of
2609 accesses of legal size, step, etc. Detect gaps, single element
2610 interleaving, and other special cases. Set grouped access info.
2611 Collect groups of strided stores for further use in SLP analysis. */
2613 static bool
2614 vect_analyze_group_access (struct data_reference *dr)
2616 if (!vect_analyze_group_access_1 (dr))
2618 /* Dissolve the group if present. */
2619 gimple *next;
2620 gimple *stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dr)));
2621 while (stmt)
2623 stmt_vec_info vinfo = vinfo_for_stmt (stmt);
2624 next = GROUP_NEXT_ELEMENT (vinfo);
2625 GROUP_FIRST_ELEMENT (vinfo) = NULL;
2626 GROUP_NEXT_ELEMENT (vinfo) = NULL;
2627 stmt = next;
2629 return false;
2631 return true;
2634 /* Analyze the access pattern of the data-reference DR.
2635 In case of non-consecutive accesses call vect_analyze_group_access() to
2636 analyze groups of accesses. */
2638 static bool
2639 vect_analyze_data_ref_access (struct data_reference *dr)
2641 tree step = DR_STEP (dr);
2642 tree scalar_type = TREE_TYPE (DR_REF (dr));
2643 gimple *stmt = DR_STMT (dr);
2644 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2645 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2646 struct loop *loop = NULL;
2648 if (loop_vinfo)
2649 loop = LOOP_VINFO_LOOP (loop_vinfo);
2651 if (loop_vinfo && !step)
2653 if (dump_enabled_p ())
2654 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2655 "bad data-ref access in loop\n");
2656 return false;
2659 /* Allow loads with zero step in inner-loop vectorization. */
2660 if (loop_vinfo && integer_zerop (step))
2662 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2663 if (!nested_in_vect_loop_p (loop, stmt))
2664 return DR_IS_READ (dr);
2665 /* Allow references with zero step for outer loops marked
2666 with pragma omp simd only - it guarantees absence of
2667 loop-carried dependencies between inner loop iterations. */
2668 if (!loop->force_vectorize)
2670 if (dump_enabled_p ())
2671 dump_printf_loc (MSG_NOTE, vect_location,
2672 "zero step in inner loop of nest\n");
2673 return false;
2677 if (loop && nested_in_vect_loop_p (loop, stmt))
2679 /* Interleaved accesses are not yet supported within outer-loop
2680 vectorization for references in the inner-loop. */
2681 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2683 /* For the rest of the analysis we use the outer-loop step. */
2684 step = STMT_VINFO_DR_STEP (stmt_info);
2685 if (integer_zerop (step))
2687 if (dump_enabled_p ())
2688 dump_printf_loc (MSG_NOTE, vect_location,
2689 "zero step in outer loop.\n");
2690 return DR_IS_READ (dr);
2694 /* Consecutive? */
2695 if (TREE_CODE (step) == INTEGER_CST)
2697 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2698 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2699 || (dr_step < 0
2700 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2702 /* Mark that it is not interleaving. */
2703 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2704 return true;
2708 if (loop && nested_in_vect_loop_p (loop, stmt))
2710 if (dump_enabled_p ())
2711 dump_printf_loc (MSG_NOTE, vect_location,
2712 "grouped access in outer loop.\n");
2713 return false;
2717 /* Assume this is a DR handled by non-constant strided load case. */
2718 if (TREE_CODE (step) != INTEGER_CST)
2719 return (STMT_VINFO_STRIDED_P (stmt_info)
2720 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2721 || vect_analyze_group_access (dr)));
2723 /* Not consecutive access - check if it's a part of interleaving group. */
2724 return vect_analyze_group_access (dr);
2727 /* Compare two data-references DRA and DRB to group them into chunks
2728 suitable for grouping. */
2730 static int
2731 dr_group_sort_cmp (const void *dra_, const void *drb_)
2733 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2734 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2735 int cmp;
2737 /* Stabilize sort. */
2738 if (dra == drb)
2739 return 0;
2741 /* DRs in different loops never belong to the same group. */
2742 loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father;
2743 loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father;
2744 if (loopa != loopb)
2745 return loopa->num < loopb->num ? -1 : 1;
2747 /* Ordering of DRs according to base. */
2748 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2749 DR_BASE_ADDRESS (drb));
2750 if (cmp != 0)
2751 return cmp;
2753 /* And according to DR_OFFSET. */
2754 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2755 if (cmp != 0)
2756 return cmp;
2758 /* Put reads before writes. */
2759 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2760 return DR_IS_READ (dra) ? -1 : 1;
2762 /* Then sort after access size. */
2763 cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2764 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2765 if (cmp != 0)
2766 return cmp;
2768 /* And after step. */
2769 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2770 if (cmp != 0)
2771 return cmp;
2773 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
2774 cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
2775 if (cmp == 0)
2776 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2777 return cmp;
2780 /* Function vect_analyze_data_ref_accesses.
2782 Analyze the access pattern of all the data references in the loop.
2784 FORNOW: the only access pattern that is considered vectorizable is a
2785 simple step 1 (consecutive) access.
2787 FORNOW: handle only arrays and pointer accesses. */
2789 bool
2790 vect_analyze_data_ref_accesses (vec_info *vinfo)
2792 unsigned int i;
2793 vec<data_reference_p> datarefs = vinfo->datarefs;
2794 struct data_reference *dr;
2796 if (dump_enabled_p ())
2797 dump_printf_loc (MSG_NOTE, vect_location,
2798 "=== vect_analyze_data_ref_accesses ===\n");
2800 if (datarefs.is_empty ())
2801 return true;
2803 /* Sort the array of datarefs to make building the interleaving chains
2804 linear. Don't modify the original vector's order, it is needed for
2805 determining what dependencies are reversed. */
2806 vec<data_reference_p> datarefs_copy = datarefs.copy ();
2807 datarefs_copy.qsort (dr_group_sort_cmp);
2809 /* Build the interleaving chains. */
2810 for (i = 0; i < datarefs_copy.length () - 1;)
2812 data_reference_p dra = datarefs_copy[i];
2813 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2814 stmt_vec_info lastinfo = NULL;
2815 if (! STMT_VINFO_VECTORIZABLE (stmtinfo_a))
2817 ++i;
2818 continue;
2820 for (i = i + 1; i < datarefs_copy.length (); ++i)
2822 data_reference_p drb = datarefs_copy[i];
2823 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2824 if (! STMT_VINFO_VECTORIZABLE (stmtinfo_b))
2825 break;
2827 /* ??? Imperfect sorting (non-compatible types, non-modulo
2828 accesses, same accesses) can lead to a group to be artificially
2829 split here as we don't just skip over those. If it really
2830 matters we can push those to a worklist and re-iterate
2831 over them. The we can just skip ahead to the next DR here. */
2833 /* DRs in a different loop should not be put into the same
2834 interleaving group. */
2835 if (gimple_bb (DR_STMT (dra))->loop_father
2836 != gimple_bb (DR_STMT (drb))->loop_father)
2837 break;
2839 /* Check that the data-refs have same first location (except init)
2840 and they are both either store or load (not load and store,
2841 not masked loads or stores). */
2842 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2843 || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2844 DR_BASE_ADDRESS (drb)) != 0
2845 || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
2846 || !gimple_assign_single_p (DR_STMT (dra))
2847 || !gimple_assign_single_p (DR_STMT (drb)))
2848 break;
2850 /* Check that the data-refs have the same constant size. */
2851 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2852 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2853 if (!tree_fits_uhwi_p (sza)
2854 || !tree_fits_uhwi_p (szb)
2855 || !tree_int_cst_equal (sza, szb))
2856 break;
2858 /* Check that the data-refs have the same step. */
2859 if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
2860 break;
2862 /* Check the types are compatible.
2863 ??? We don't distinguish this during sorting. */
2864 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2865 TREE_TYPE (DR_REF (drb))))
2866 break;
2868 /* Check that the DR_INITs are compile-time constants. */
2869 if (TREE_CODE (DR_INIT (dra)) != INTEGER_CST
2870 || TREE_CODE (DR_INIT (drb)) != INTEGER_CST)
2871 break;
2873 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
2874 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2875 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2876 HOST_WIDE_INT init_prev
2877 = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]));
2878 gcc_assert (init_a <= init_b
2879 && init_a <= init_prev
2880 && init_prev <= init_b);
2882 /* Do not place the same access in the interleaving chain twice. */
2883 if (init_b == init_prev)
2885 gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]))
2886 < gimple_uid (DR_STMT (drb)));
2887 /* ??? For now we simply "drop" the later reference which is
2888 otherwise the same rather than finishing off this group.
2889 In the end we'd want to re-process duplicates forming
2890 multiple groups from the refs, likely by just collecting
2891 all candidates (including duplicates and split points
2892 below) in a vector and then process them together. */
2893 continue;
2896 /* If init_b == init_a + the size of the type * k, we have an
2897 interleaving, and DRA is accessed before DRB. */
2898 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
2899 if (type_size_a == 0
2900 || (init_b - init_a) % type_size_a != 0)
2901 break;
2903 /* If we have a store, the accesses are adjacent. This splits
2904 groups into chunks we support (we don't support vectorization
2905 of stores with gaps). */
2906 if (!DR_IS_READ (dra) && init_b - init_prev != type_size_a)
2907 break;
2909 /* If the step (if not zero or non-constant) is greater than the
2910 difference between data-refs' inits this splits groups into
2911 suitable sizes. */
2912 if (tree_fits_shwi_p (DR_STEP (dra)))
2914 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
2915 if (step != 0 && step <= (init_b - init_a))
2916 break;
2919 if (dump_enabled_p ())
2921 dump_printf_loc (MSG_NOTE, vect_location,
2922 "Detected interleaving ");
2923 if (DR_IS_READ (dra))
2924 dump_printf (MSG_NOTE, "load ");
2925 else
2926 dump_printf (MSG_NOTE, "store ");
2927 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2928 dump_printf (MSG_NOTE, " and ");
2929 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2930 dump_printf (MSG_NOTE, "\n");
2933 /* Link the found element into the group list. */
2934 if (!GROUP_FIRST_ELEMENT (stmtinfo_a))
2936 GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
2937 lastinfo = stmtinfo_a;
2939 GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
2940 GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
2941 lastinfo = stmtinfo_b;
2945 FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
2946 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
2947 && !vect_analyze_data_ref_access (dr))
2949 if (dump_enabled_p ())
2950 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2951 "not vectorized: complicated access pattern.\n");
2953 if (is_a <bb_vec_info> (vinfo))
2955 /* Mark the statement as not vectorizable. */
2956 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2957 continue;
2959 else
2961 datarefs_copy.release ();
2962 return false;
2966 datarefs_copy.release ();
2967 return true;
2970 /* Function vect_vfa_segment_size.
2972 Create an expression that computes the size of segment
2973 that will be accessed for a data reference. The functions takes into
2974 account that realignment loads may access one more vector.
2976 Input:
2977 DR: The data reference.
2978 LENGTH_FACTOR: segment length to consider.
2980 Return an expression whose value is the size of segment which will be
2981 accessed by DR. */
2983 static tree
2984 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
2986 tree segment_length;
2988 if (integer_zerop (DR_STEP (dr)))
2989 segment_length = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
2990 else
2991 segment_length = size_binop (MULT_EXPR,
2992 fold_convert (sizetype, DR_STEP (dr)),
2993 fold_convert (sizetype, length_factor));
2995 if (vect_supportable_dr_alignment (dr, false)
2996 == dr_explicit_realign_optimized)
2998 tree vector_size = TYPE_SIZE_UNIT
2999 (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))));
3001 segment_length = size_binop (PLUS_EXPR, segment_length, vector_size);
3003 return segment_length;
3006 /* Function vect_no_alias_p.
3008 Given data references A and B with equal base and offset, see whether
3009 the alias relation can be decided at compilation time. Return 1 if
3010 it can and the references alias, 0 if it can and the references do
3011 not alias, and -1 if we cannot decide at compile time. SEGMENT_LENGTH_A
3012 and SEGMENT_LENGTH_B are the memory lengths accessed by A and B
3013 respectively. */
3015 static int
3016 vect_compile_time_alias (struct data_reference *a, struct data_reference *b,
3017 tree segment_length_a, tree segment_length_b)
3019 poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a));
3020 poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b));
3021 poly_uint64 const_length_a;
3022 poly_uint64 const_length_b;
3024 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3025 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3026 [a, a+12) */
3027 if (tree_int_cst_compare (DR_STEP (a), size_zero_node) < 0)
3029 const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
3030 offset_a = (offset_a + vect_get_scalar_dr_size (a)) - const_length_a;
3032 else
3033 const_length_a = tree_to_poly_uint64 (segment_length_a);
3034 if (tree_int_cst_compare (DR_STEP (b), size_zero_node) < 0)
3036 const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
3037 offset_b = (offset_b + vect_get_scalar_dr_size (b)) - const_length_b;
3039 else
3040 const_length_b = tree_to_poly_uint64 (segment_length_b);
3042 if (ranges_known_overlap_p (offset_a, const_length_a,
3043 offset_b, const_length_b))
3044 return 1;
3046 if (!ranges_maybe_overlap_p (offset_a, const_length_a,
3047 offset_b, const_length_b))
3048 return 0;
3050 return -1;
3053 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3054 in DDR is >= VF. */
3056 static bool
3057 dependence_distance_ge_vf (data_dependence_relation *ddr,
3058 unsigned int loop_depth, poly_uint64 vf)
3060 if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3061 || DDR_NUM_DIST_VECTS (ddr) == 0)
3062 return false;
3064 /* If the dependence is exact, we should have limited the VF instead. */
3065 gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3067 unsigned int i;
3068 lambda_vector dist_v;
3069 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3071 HOST_WIDE_INT dist = dist_v[loop_depth];
3072 if (dist != 0
3073 && !(dist > 0 && DDR_REVERSED_P (ddr))
3074 && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
3075 return false;
3078 if (dump_enabled_p ())
3080 dump_printf_loc (MSG_NOTE, vect_location,
3081 "dependence distance between ");
3082 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
3083 dump_printf (MSG_NOTE, " and ");
3084 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
3085 dump_printf (MSG_NOTE, " is >= VF\n");
3088 return true;
3091 /* Function vect_prune_runtime_alias_test_list.
3093 Prune a list of ddrs to be tested at run-time by versioning for alias.
3094 Merge several alias checks into one if possible.
3095 Return FALSE if resulting list of ddrs is longer then allowed by
3096 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
3098 bool
3099 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3101 typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3102 hash_set <tree_pair_hash> compared_objects;
3104 vec<ddr_p> may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3105 vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3106 = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3107 vec<vec_object_pair> &check_unequal_addrs
3108 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3109 poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3110 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3112 ddr_p ddr;
3113 unsigned int i;
3114 tree length_factor;
3116 if (dump_enabled_p ())
3117 dump_printf_loc (MSG_NOTE, vect_location,
3118 "=== vect_prune_runtime_alias_test_list ===\n");
3120 if (may_alias_ddrs.is_empty ())
3121 return true;
3123 comp_alias_ddrs.create (may_alias_ddrs.length ());
3125 unsigned int loop_depth
3126 = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3127 LOOP_VINFO_LOOP_NEST (loop_vinfo));
3129 /* First, we collect all data ref pairs for aliasing checks. */
3130 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3132 int comp_res;
3133 struct data_reference *dr_a, *dr_b;
3134 gimple *dr_group_first_a, *dr_group_first_b;
3135 tree segment_length_a, segment_length_b;
3136 gimple *stmt_a, *stmt_b;
3138 /* Ignore the alias if the VF we chose ended up being no greater
3139 than the dependence distance. */
3140 if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3141 continue;
3143 if (DDR_OBJECT_A (ddr))
3145 vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3146 if (!compared_objects.add (new_pair))
3148 if (dump_enabled_p ())
3150 dump_printf_loc (MSG_NOTE, vect_location, "checking that ");
3151 dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.first);
3152 dump_printf (MSG_NOTE, " and ");
3153 dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.second);
3154 dump_printf (MSG_NOTE, " have different addresses\n");
3156 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3158 continue;
3161 dr_a = DDR_A (ddr);
3162 stmt_a = DR_STMT (DDR_A (ddr));
3163 dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
3164 if (dr_group_first_a)
3166 stmt_a = dr_group_first_a;
3167 dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
3170 dr_b = DDR_B (ddr);
3171 stmt_b = DR_STMT (DDR_B (ddr));
3172 dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
3173 if (dr_group_first_b)
3175 stmt_b = dr_group_first_b;
3176 dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
3179 if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
3180 length_factor = scalar_loop_iters;
3181 else
3182 length_factor = size_int (vect_factor);
3183 segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
3184 segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
3186 comp_res = data_ref_compare_tree (DR_BASE_ADDRESS (dr_a),
3187 DR_BASE_ADDRESS (dr_b));
3188 if (comp_res == 0)
3189 comp_res = data_ref_compare_tree (DR_OFFSET (dr_a),
3190 DR_OFFSET (dr_b));
3192 /* See whether the alias is known at compilation time. */
3193 if (comp_res == 0
3194 && TREE_CODE (DR_STEP (dr_a)) == INTEGER_CST
3195 && TREE_CODE (DR_STEP (dr_b)) == INTEGER_CST
3196 && poly_int_tree_p (segment_length_a)
3197 && poly_int_tree_p (segment_length_b))
3199 int res = vect_compile_time_alias (dr_a, dr_b,
3200 segment_length_a,
3201 segment_length_b);
3202 if (res == 0)
3203 continue;
3205 if (res == 1)
3207 if (dump_enabled_p ())
3208 dump_printf_loc (MSG_NOTE, vect_location,
3209 "not vectorized: compilation time alias.\n");
3210 return false;
3214 dr_with_seg_len_pair_t dr_with_seg_len_pair
3215 (dr_with_seg_len (dr_a, segment_length_a),
3216 dr_with_seg_len (dr_b, segment_length_b));
3218 /* Canonicalize pairs by sorting the two DR members. */
3219 if (comp_res > 0)
3220 std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
3222 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3225 prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);
3227 unsigned int count = (comp_alias_ddrs.length ()
3228 + check_unequal_addrs.length ());
3229 dump_printf_loc (MSG_NOTE, vect_location,
3230 "improved number of alias checks from %d to %d\n",
3231 may_alias_ddrs.length (), count);
3232 if ((int) count > PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
3234 if (dump_enabled_p ())
3235 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3236 "number of versioning for alias "
3237 "run-time tests exceeds %d "
3238 "(--param vect-max-version-for-alias-checks)\n",
3239 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
3240 return false;
3243 return true;
3246 /* Return true if a non-affine read or write in STMT is suitable for a
3247 gather load or scatter store. Describe the operation in *INFO if so. */
3249 bool
3250 vect_check_gather_scatter (gimple *stmt, loop_vec_info loop_vinfo,
3251 gather_scatter_info *info)
3253 HOST_WIDE_INT scale = 1;
3254 poly_int64 pbitpos, pbitsize;
3255 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3256 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3257 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3258 tree offtype = NULL_TREE;
3259 tree decl, base, off;
3260 machine_mode pmode;
3261 int punsignedp, reversep, pvolatilep = 0;
3263 base = DR_REF (dr);
3264 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3265 see if we can use the def stmt of the address. */
3266 if (is_gimple_call (stmt)
3267 && gimple_call_internal_p (stmt)
3268 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
3269 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
3270 && TREE_CODE (base) == MEM_REF
3271 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3272 && integer_zerop (TREE_OPERAND (base, 1))
3273 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3275 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3276 if (is_gimple_assign (def_stmt)
3277 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3278 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3281 /* The gather and scatter builtins need address of the form
3282 loop_invariant + vector * {1, 2, 4, 8}
3284 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3285 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3286 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3287 multiplications and additions in it. To get a vector, we need
3288 a single SSA_NAME that will be defined in the loop and will
3289 contain everything that is not loop invariant and that can be
3290 vectorized. The following code attempts to find such a preexistng
3291 SSA_NAME OFF and put the loop invariants into a tree BASE
3292 that can be gimplified before the loop. */
3293 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3294 &punsignedp, &reversep, &pvolatilep);
3295 gcc_assert (base && !reversep);
3296 poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
3298 if (TREE_CODE (base) == MEM_REF)
3300 if (!integer_zerop (TREE_OPERAND (base, 1)))
3302 if (off == NULL_TREE)
3303 off = wide_int_to_tree (sizetype, mem_ref_offset (base));
3304 else
3305 off = size_binop (PLUS_EXPR, off,
3306 fold_convert (sizetype, TREE_OPERAND (base, 1)));
3308 base = TREE_OPERAND (base, 0);
3310 else
3311 base = build_fold_addr_expr (base);
3313 if (off == NULL_TREE)
3314 off = size_zero_node;
3316 /* If base is not loop invariant, either off is 0, then we start with just
3317 the constant offset in the loop invariant BASE and continue with base
3318 as OFF, otherwise give up.
3319 We could handle that case by gimplifying the addition of base + off
3320 into some SSA_NAME and use that as off, but for now punt. */
3321 if (!expr_invariant_in_loop_p (loop, base))
3323 if (!integer_zerop (off))
3324 return false;
3325 off = base;
3326 base = size_int (pbytepos);
3328 /* Otherwise put base + constant offset into the loop invariant BASE
3329 and continue with OFF. */
3330 else
3332 base = fold_convert (sizetype, base);
3333 base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
3336 /* OFF at this point may be either a SSA_NAME or some tree expression
3337 from get_inner_reference. Try to peel off loop invariants from it
3338 into BASE as long as possible. */
3339 STRIP_NOPS (off);
3340 while (offtype == NULL_TREE)
3342 enum tree_code code;
3343 tree op0, op1, add = NULL_TREE;
3345 if (TREE_CODE (off) == SSA_NAME)
3347 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3349 if (expr_invariant_in_loop_p (loop, off))
3350 return false;
3352 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3353 break;
3355 op0 = gimple_assign_rhs1 (def_stmt);
3356 code = gimple_assign_rhs_code (def_stmt);
3357 op1 = gimple_assign_rhs2 (def_stmt);
3359 else
3361 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3362 return false;
3363 code = TREE_CODE (off);
3364 extract_ops_from_tree (off, &code, &op0, &op1);
3366 switch (code)
3368 case POINTER_PLUS_EXPR:
3369 case PLUS_EXPR:
3370 if (expr_invariant_in_loop_p (loop, op0))
3372 add = op0;
3373 off = op1;
3374 do_add:
3375 add = fold_convert (sizetype, add);
3376 if (scale != 1)
3377 add = size_binop (MULT_EXPR, add, size_int (scale));
3378 base = size_binop (PLUS_EXPR, base, add);
3379 continue;
3381 if (expr_invariant_in_loop_p (loop, op1))
3383 add = op1;
3384 off = op0;
3385 goto do_add;
3387 break;
3388 case MINUS_EXPR:
3389 if (expr_invariant_in_loop_p (loop, op1))
3391 add = fold_convert (sizetype, op1);
3392 add = size_binop (MINUS_EXPR, size_zero_node, add);
3393 off = op0;
3394 goto do_add;
3396 break;
3397 case MULT_EXPR:
3398 if (scale == 1 && tree_fits_shwi_p (op1))
3400 scale = tree_to_shwi (op1);
3401 off = op0;
3402 continue;
3404 break;
3405 case SSA_NAME:
3406 off = op0;
3407 continue;
3408 CASE_CONVERT:
3409 if (!POINTER_TYPE_P (TREE_TYPE (op0))
3410 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3411 break;
3412 if (TYPE_PRECISION (TREE_TYPE (op0))
3413 == TYPE_PRECISION (TREE_TYPE (off)))
3415 off = op0;
3416 continue;
3418 if (TYPE_PRECISION (TREE_TYPE (op0))
3419 < TYPE_PRECISION (TREE_TYPE (off)))
3421 off = op0;
3422 offtype = TREE_TYPE (off);
3423 STRIP_NOPS (off);
3424 continue;
3426 break;
3427 default:
3428 break;
3430 break;
3433 /* If at the end OFF still isn't a SSA_NAME or isn't
3434 defined in the loop, punt. */
3435 if (TREE_CODE (off) != SSA_NAME
3436 || expr_invariant_in_loop_p (loop, off))
3437 return false;
3439 if (offtype == NULL_TREE)
3440 offtype = TREE_TYPE (off);
3442 if (DR_IS_READ (dr))
3443 decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info),
3444 offtype, scale);
3445 else
3446 decl = targetm.vectorize.builtin_scatter (STMT_VINFO_VECTYPE (stmt_info),
3447 offtype, scale);
3449 if (decl == NULL_TREE)
3450 return false;
3452 info->decl = decl;
3453 info->base = base;
3454 info->offset = off;
3455 info->offset_dt = vect_unknown_def_type;
3456 info->offset_vectype = NULL_TREE;
3457 info->scale = scale;
3458 return true;
3461 /* Function vect_analyze_data_refs.
3463 Find all the data references in the loop or basic block.
3465 The general structure of the analysis of data refs in the vectorizer is as
3466 follows:
3467 1- vect_analyze_data_refs(loop/bb): call
3468 compute_data_dependences_for_loop/bb to find and analyze all data-refs
3469 in the loop/bb and their dependences.
3470 2- vect_analyze_dependences(): apply dependence testing using ddrs.
3471 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
3472 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
3476 bool
3477 vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf)
3479 struct loop *loop = NULL;
3480 unsigned int i;
3481 struct data_reference *dr;
3482 tree scalar_type;
3484 if (dump_enabled_p ())
3485 dump_printf_loc (MSG_NOTE, vect_location,
3486 "=== vect_analyze_data_refs ===\n");
3488 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3489 loop = LOOP_VINFO_LOOP (loop_vinfo);
3491 /* Go through the data-refs, check that the analysis succeeded. Update
3492 pointer from stmt_vec_info struct to DR and vectype. */
3494 vec<data_reference_p> datarefs = vinfo->datarefs;
3495 FOR_EACH_VEC_ELT (datarefs, i, dr)
3497 gimple *stmt;
3498 stmt_vec_info stmt_info;
3499 tree base, offset, init;
3500 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
3501 bool simd_lane_access = false;
3502 poly_uint64 vf;
3504 again:
3505 if (!dr || !DR_REF (dr))
3507 if (dump_enabled_p ())
3508 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3509 "not vectorized: unhandled data-ref\n");
3510 return false;
3513 stmt = DR_STMT (dr);
3514 stmt_info = vinfo_for_stmt (stmt);
3516 /* Discard clobbers from the dataref vector. We will remove
3517 clobber stmts during vectorization. */
3518 if (gimple_clobber_p (stmt))
3520 free_data_ref (dr);
3521 if (i == datarefs.length () - 1)
3523 datarefs.pop ();
3524 break;
3526 datarefs.ordered_remove (i);
3527 dr = datarefs[i];
3528 goto again;
3531 /* Check that analysis of the data-ref succeeded. */
3532 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
3533 || !DR_STEP (dr))
3535 bool maybe_gather
3536 = DR_IS_READ (dr)
3537 && !TREE_THIS_VOLATILE (DR_REF (dr))
3538 && targetm.vectorize.builtin_gather != NULL;
3539 bool maybe_scatter
3540 = DR_IS_WRITE (dr)
3541 && !TREE_THIS_VOLATILE (DR_REF (dr))
3542 && targetm.vectorize.builtin_scatter != NULL;
3543 bool maybe_simd_lane_access
3544 = is_a <loop_vec_info> (vinfo) && loop->simduid;
3546 /* If target supports vector gather loads or scatter stores, or if
3547 this might be a SIMD lane access, see if they can't be used. */
3548 if (is_a <loop_vec_info> (vinfo)
3549 && (maybe_gather || maybe_scatter || maybe_simd_lane_access)
3550 && !nested_in_vect_loop_p (loop, stmt))
3552 struct data_reference *newdr
3553 = create_data_ref (NULL, loop_containing_stmt (stmt),
3554 DR_REF (dr), stmt, !maybe_scatter,
3555 DR_IS_CONDITIONAL_IN_STMT (dr));
3556 gcc_assert (newdr != NULL && DR_REF (newdr));
3557 if (DR_BASE_ADDRESS (newdr)
3558 && DR_OFFSET (newdr)
3559 && DR_INIT (newdr)
3560 && DR_STEP (newdr)
3561 && integer_zerop (DR_STEP (newdr)))
3563 if (maybe_simd_lane_access)
3565 tree off = DR_OFFSET (newdr);
3566 STRIP_NOPS (off);
3567 if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
3568 && TREE_CODE (off) == MULT_EXPR
3569 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
3571 tree step = TREE_OPERAND (off, 1);
3572 off = TREE_OPERAND (off, 0);
3573 STRIP_NOPS (off);
3574 if (CONVERT_EXPR_P (off)
3575 && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
3576 0)))
3577 < TYPE_PRECISION (TREE_TYPE (off)))
3578 off = TREE_OPERAND (off, 0);
3579 if (TREE_CODE (off) == SSA_NAME)
3581 gimple *def = SSA_NAME_DEF_STMT (off);
3582 tree reft = TREE_TYPE (DR_REF (newdr));
3583 if (is_gimple_call (def)
3584 && gimple_call_internal_p (def)
3585 && (gimple_call_internal_fn (def)
3586 == IFN_GOMP_SIMD_LANE))
3588 tree arg = gimple_call_arg (def, 0);
3589 gcc_assert (TREE_CODE (arg) == SSA_NAME);
3590 arg = SSA_NAME_VAR (arg);
3591 if (arg == loop->simduid
3592 /* For now. */
3593 && tree_int_cst_equal
3594 (TYPE_SIZE_UNIT (reft),
3595 step))
3597 DR_OFFSET (newdr) = ssize_int (0);
3598 DR_STEP (newdr) = step;
3599 DR_OFFSET_ALIGNMENT (newdr)
3600 = BIGGEST_ALIGNMENT;
3601 DR_STEP_ALIGNMENT (newdr)
3602 = highest_pow2_factor (step);
3603 dr = newdr;
3604 simd_lane_access = true;
3610 if (!simd_lane_access && (maybe_gather || maybe_scatter))
3612 dr = newdr;
3613 if (maybe_gather)
3614 gatherscatter = GATHER;
3615 else
3616 gatherscatter = SCATTER;
3619 if (gatherscatter == SG_NONE && !simd_lane_access)
3620 free_data_ref (newdr);
3623 if (gatherscatter == SG_NONE && !simd_lane_access)
3625 if (dump_enabled_p ())
3627 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3628 "not vectorized: data ref analysis "
3629 "failed ");
3630 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3633 if (is_a <bb_vec_info> (vinfo))
3634 break;
3636 return false;
3640 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
3642 if (dump_enabled_p ())
3643 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3644 "not vectorized: base addr of dr is a "
3645 "constant\n");
3647 if (is_a <bb_vec_info> (vinfo))
3648 break;
3650 if (gatherscatter != SG_NONE || simd_lane_access)
3651 free_data_ref (dr);
3652 return false;
3655 if (TREE_THIS_VOLATILE (DR_REF (dr)))
3657 if (dump_enabled_p ())
3659 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3660 "not vectorized: volatile type ");
3661 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3664 if (is_a <bb_vec_info> (vinfo))
3665 break;
3667 return false;
3670 if (stmt_can_throw_internal (stmt))
3672 if (dump_enabled_p ())
3674 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3675 "not vectorized: statement can throw an "
3676 "exception ");
3677 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3680 if (is_a <bb_vec_info> (vinfo))
3681 break;
3683 if (gatherscatter != SG_NONE || simd_lane_access)
3684 free_data_ref (dr);
3685 return false;
3688 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
3689 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
3691 if (dump_enabled_p ())
3693 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3694 "not vectorized: statement is bitfield "
3695 "access ");
3696 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3699 if (is_a <bb_vec_info> (vinfo))
3700 break;
3702 if (gatherscatter != SG_NONE || simd_lane_access)
3703 free_data_ref (dr);
3704 return false;
3707 base = unshare_expr (DR_BASE_ADDRESS (dr));
3708 offset = unshare_expr (DR_OFFSET (dr));
3709 init = unshare_expr (DR_INIT (dr));
3711 if (is_gimple_call (stmt)
3712 && (!gimple_call_internal_p (stmt)
3713 || (gimple_call_internal_fn (stmt) != IFN_MASK_LOAD
3714 && gimple_call_internal_fn (stmt) != IFN_MASK_STORE)))
3716 if (dump_enabled_p ())
3718 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3719 "not vectorized: dr in a call ");
3720 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3723 if (is_a <bb_vec_info> (vinfo))
3724 break;
3726 if (gatherscatter != SG_NONE || simd_lane_access)
3727 free_data_ref (dr);
3728 return false;
3731 /* Update DR field in stmt_vec_info struct. */
3733 /* If the dataref is in an inner-loop of the loop that is considered for
3734 for vectorization, we also want to analyze the access relative to
3735 the outer-loop (DR contains information only relative to the
3736 inner-most enclosing loop). We do that by building a reference to the
3737 first location accessed by the inner-loop, and analyze it relative to
3738 the outer-loop. */
3739 if (loop && nested_in_vect_loop_p (loop, stmt))
3741 /* Build a reference to the first location accessed by the
3742 inner loop: *(BASE + INIT + OFFSET). By construction,
3743 this address must be invariant in the inner loop, so we
3744 can consider it as being used in the outer loop. */
3745 tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
3746 init, offset);
3747 tree init_addr = fold_build_pointer_plus (base, init_offset);
3748 tree init_ref = build_fold_indirect_ref (init_addr);
3750 if (dump_enabled_p ())
3752 dump_printf_loc (MSG_NOTE, vect_location,
3753 "analyze in outer loop: ");
3754 dump_generic_expr (MSG_NOTE, TDF_SLIM, init_ref);
3755 dump_printf (MSG_NOTE, "\n");
3758 if (!dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
3759 init_ref, loop))
3760 /* dr_analyze_innermost already explained the failure. */
3761 return false;
3763 if (dump_enabled_p ())
3765 dump_printf_loc (MSG_NOTE, vect_location,
3766 "\touter base_address: ");
3767 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3768 STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3769 dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
3770 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3771 STMT_VINFO_DR_OFFSET (stmt_info));
3772 dump_printf (MSG_NOTE,
3773 "\n\touter constant offset from base address: ");
3774 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3775 STMT_VINFO_DR_INIT (stmt_info));
3776 dump_printf (MSG_NOTE, "\n\touter step: ");
3777 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3778 STMT_VINFO_DR_STEP (stmt_info));
3779 dump_printf (MSG_NOTE, "\n\touter base alignment: %d\n",
3780 STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info));
3781 dump_printf (MSG_NOTE, "\n\touter base misalignment: %d\n",
3782 STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info));
3783 dump_printf (MSG_NOTE, "\n\touter offset alignment: %d\n",
3784 STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info));
3785 dump_printf (MSG_NOTE, "\n\touter step alignment: %d\n",
3786 STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
3790 if (STMT_VINFO_DATA_REF (stmt_info))
3792 if (dump_enabled_p ())
3794 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3795 "not vectorized: more than one data ref "
3796 "in stmt: ");
3797 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3800 if (is_a <bb_vec_info> (vinfo))
3801 break;
3803 if (gatherscatter != SG_NONE || simd_lane_access)
3804 free_data_ref (dr);
3805 return false;
3808 STMT_VINFO_DATA_REF (stmt_info) = dr;
3809 if (simd_lane_access)
3811 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
3812 free_data_ref (datarefs[i]);
3813 datarefs[i] = dr;
3816 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == ADDR_EXPR
3817 && VAR_P (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0))
3818 && DECL_NONALIASED (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0)))
3820 if (dump_enabled_p ())
3822 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3823 "not vectorized: base object not addressable "
3824 "for stmt: ");
3825 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3827 if (is_a <bb_vec_info> (vinfo))
3829 /* In BB vectorization the ref can still participate
3830 in dependence analysis, we just can't vectorize it. */
3831 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
3832 continue;
3834 return false;
3837 /* Set vectype for STMT. */
3838 scalar_type = TREE_TYPE (DR_REF (dr));
3839 STMT_VINFO_VECTYPE (stmt_info)
3840 = get_vectype_for_scalar_type (scalar_type);
3841 if (!STMT_VINFO_VECTYPE (stmt_info))
3843 if (dump_enabled_p ())
3845 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3846 "not vectorized: no vectype for stmt: ");
3847 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3848 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
3849 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
3850 scalar_type);
3851 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3854 if (is_a <bb_vec_info> (vinfo))
3856 /* No vector type is fine, the ref can still participate
3857 in dependence analysis, we just can't vectorize it. */
3858 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
3859 continue;
3862 if (gatherscatter != SG_NONE || simd_lane_access)
3864 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3865 if (gatherscatter != SG_NONE)
3866 free_data_ref (dr);
3868 return false;
3870 else
3872 if (dump_enabled_p ())
3874 dump_printf_loc (MSG_NOTE, vect_location,
3875 "got vectype for stmt: ");
3876 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3877 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3878 STMT_VINFO_VECTYPE (stmt_info));
3879 dump_printf (MSG_NOTE, "\n");
3883 /* Adjust the minimal vectorization factor according to the
3884 vector type. */
3885 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
3886 *min_vf = upper_bound (*min_vf, vf);
3888 if (gatherscatter != SG_NONE)
3890 gather_scatter_info gs_info;
3891 if (!vect_check_gather_scatter (stmt, as_a <loop_vec_info> (vinfo),
3892 &gs_info)
3893 || !get_vectype_for_scalar_type (TREE_TYPE (gs_info.offset)))
3895 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3896 free_data_ref (dr);
3897 if (dump_enabled_p ())
3899 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3900 (gatherscatter == GATHER) ?
3901 "not vectorized: not suitable for gather "
3902 "load " :
3903 "not vectorized: not suitable for scatter "
3904 "store ");
3905 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3907 return false;
3910 free_data_ref (datarefs[i]);
3911 datarefs[i] = dr;
3912 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
3915 else if (is_a <loop_vec_info> (vinfo)
3916 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
3918 if (nested_in_vect_loop_p (loop, stmt))
3920 if (dump_enabled_p ())
3922 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3923 "not vectorized: not suitable for strided "
3924 "load ");
3925 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3927 return false;
3929 STMT_VINFO_STRIDED_P (stmt_info) = true;
3933 /* If we stopped analysis at the first dataref we could not analyze
3934 when trying to vectorize a basic-block mark the rest of the datarefs
3935 as not vectorizable and truncate the vector of datarefs. That
3936 avoids spending useless time in analyzing their dependence. */
3937 if (i != datarefs.length ())
3939 gcc_assert (is_a <bb_vec_info> (vinfo));
3940 for (unsigned j = i; j < datarefs.length (); ++j)
3942 data_reference_p dr = datarefs[j];
3943 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
3944 free_data_ref (dr);
3946 datarefs.truncate (i);
3949 return true;
3953 /* Function vect_get_new_vect_var.
3955 Returns a name for a new variable. The current naming scheme appends the
3956 prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
3957 the name of vectorizer generated variables, and appends that to NAME if
3958 provided. */
3960 tree
3961 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
3963 const char *prefix;
3964 tree new_vect_var;
3966 switch (var_kind)
3968 case vect_simple_var:
3969 prefix = "vect";
3970 break;
3971 case vect_scalar_var:
3972 prefix = "stmp";
3973 break;
3974 case vect_mask_var:
3975 prefix = "mask";
3976 break;
3977 case vect_pointer_var:
3978 prefix = "vectp";
3979 break;
3980 default:
3981 gcc_unreachable ();
3984 if (name)
3986 char* tmp = concat (prefix, "_", name, NULL);
3987 new_vect_var = create_tmp_reg (type, tmp);
3988 free (tmp);
3990 else
3991 new_vect_var = create_tmp_reg (type, prefix);
3993 return new_vect_var;
3996 /* Like vect_get_new_vect_var but return an SSA name. */
3998 tree
3999 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
4001 const char *prefix;
4002 tree new_vect_var;
4004 switch (var_kind)
4006 case vect_simple_var:
4007 prefix = "vect";
4008 break;
4009 case vect_scalar_var:
4010 prefix = "stmp";
4011 break;
4012 case vect_pointer_var:
4013 prefix = "vectp";
4014 break;
4015 default:
4016 gcc_unreachable ();
4019 if (name)
4021 char* tmp = concat (prefix, "_", name, NULL);
4022 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4023 free (tmp);
4025 else
4026 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4028 return new_vect_var;
4031 /* Duplicate ptr info and set alignment/misaligment on NAME from DR. */
4033 static void
4034 vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr)
4036 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr));
4037 int misalign = DR_MISALIGNMENT (dr);
4038 if (misalign == DR_MISALIGNMENT_UNKNOWN)
4039 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4040 else
4041 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name),
4042 DR_TARGET_ALIGNMENT (dr), misalign);
4045 /* Function vect_create_addr_base_for_vector_ref.
4047 Create an expression that computes the address of the first memory location
4048 that will be accessed for a data reference.
4050 Input:
4051 STMT: The statement containing the data reference.
4052 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4053 OFFSET: Optional. If supplied, it is be added to the initial address.
4054 LOOP: Specify relative to which loop-nest should the address be computed.
4055 For example, when the dataref is in an inner-loop nested in an
4056 outer-loop that is now being vectorized, LOOP can be either the
4057 outer-loop, or the inner-loop. The first memory location accessed
4058 by the following dataref ('in' points to short):
4060 for (i=0; i<N; i++)
4061 for (j=0; j<M; j++)
4062 s += in[i+j]
4064 is as follows:
4065 if LOOP=i_loop: &in (relative to i_loop)
4066 if LOOP=j_loop: &in+i*2B (relative to j_loop)
4067 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
4068 initial address. Unlike OFFSET, which is number of elements to
4069 be added, BYTE_OFFSET is measured in bytes.
4071 Output:
4072 1. Return an SSA_NAME whose value is the address of the memory location of
4073 the first vector of the data reference.
4074 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4075 these statement(s) which define the returned SSA_NAME.
4077 FORNOW: We are only handling array accesses with step 1. */
4079 tree
4080 vect_create_addr_base_for_vector_ref (gimple *stmt,
4081 gimple_seq *new_stmt_list,
4082 tree offset,
4083 tree byte_offset)
4085 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4086 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4087 const char *base_name;
4088 tree addr_base;
4089 tree dest;
4090 gimple_seq seq = NULL;
4091 tree vect_ptr_type;
4092 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
4093 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4094 innermost_loop_behavior *drb = vect_dr_behavior (dr);
4096 tree data_ref_base = unshare_expr (drb->base_address);
4097 tree base_offset = unshare_expr (drb->offset);
4098 tree init = unshare_expr (drb->init);
4100 if (loop_vinfo)
4101 base_name = get_name (data_ref_base);
4102 else
4104 base_offset = ssize_int (0);
4105 init = ssize_int (0);
4106 base_name = get_name (DR_REF (dr));
4109 /* Create base_offset */
4110 base_offset = size_binop (PLUS_EXPR,
4111 fold_convert (sizetype, base_offset),
4112 fold_convert (sizetype, init));
4114 if (offset)
4116 offset = fold_build2 (MULT_EXPR, sizetype,
4117 fold_convert (sizetype, offset), step);
4118 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4119 base_offset, offset);
4121 if (byte_offset)
4123 byte_offset = fold_convert (sizetype, byte_offset);
4124 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4125 base_offset, byte_offset);
4128 /* base + base_offset */
4129 if (loop_vinfo)
4130 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4131 else
4133 addr_base = build1 (ADDR_EXPR,
4134 build_pointer_type (TREE_TYPE (DR_REF (dr))),
4135 unshare_expr (DR_REF (dr)));
4138 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4139 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4140 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4141 gimple_seq_add_seq (new_stmt_list, seq);
4143 if (DR_PTR_INFO (dr)
4144 && TREE_CODE (addr_base) == SSA_NAME
4145 && !SSA_NAME_PTR_INFO (addr_base))
4147 vect_duplicate_ssa_name_ptr_info (addr_base, dr);
4148 if (offset || byte_offset)
4149 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4152 if (dump_enabled_p ())
4154 dump_printf_loc (MSG_NOTE, vect_location, "created ");
4155 dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
4156 dump_printf (MSG_NOTE, "\n");
4159 return addr_base;
4163 /* Function vect_create_data_ref_ptr.
4165 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4166 location accessed in the loop by STMT, along with the def-use update
4167 chain to appropriately advance the pointer through the loop iterations.
4168 Also set aliasing information for the pointer. This pointer is used by
4169 the callers to this function to create a memory reference expression for
4170 vector load/store access.
4172 Input:
4173 1. STMT: a stmt that references memory. Expected to be of the form
4174 GIMPLE_ASSIGN <name, data-ref> or
4175 GIMPLE_ASSIGN <data-ref, name>.
4176 2. AGGR_TYPE: the type of the reference, which should be either a vector
4177 or an array.
4178 3. AT_LOOP: the loop where the vector memref is to be created.
4179 4. OFFSET (optional): an offset to be added to the initial address accessed
4180 by the data-ref in STMT.
4181 5. BSI: location where the new stmts are to be placed if there is no loop
4182 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4183 pointing to the initial address.
4184 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4185 to the initial address accessed by the data-ref in STMT. This is
4186 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4187 in bytes.
4189 Output:
4190 1. Declare a new ptr to vector_type, and have it point to the base of the
4191 data reference (initial addressed accessed by the data reference).
4192 For example, for vector of type V8HI, the following code is generated:
4194 v8hi *ap;
4195 ap = (v8hi *)initial_address;
4197 if OFFSET is not supplied:
4198 initial_address = &a[init];
4199 if OFFSET is supplied:
4200 initial_address = &a[init + OFFSET];
4201 if BYTE_OFFSET is supplied:
4202 initial_address = &a[init] + BYTE_OFFSET;
4204 Return the initial_address in INITIAL_ADDRESS.
4206 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4207 update the pointer in each iteration of the loop.
4209 Return the increment stmt that updates the pointer in PTR_INCR.
4211 3. Set INV_P to true if the access pattern of the data reference in the
4212 vectorized loop is invariant. Set it to false otherwise.
4214 4. Return the pointer. */
4216 tree
4217 vect_create_data_ref_ptr (gimple *stmt, tree aggr_type, struct loop *at_loop,
4218 tree offset, tree *initial_address,
4219 gimple_stmt_iterator *gsi, gimple **ptr_incr,
4220 bool only_init, bool *inv_p, tree byte_offset)
4222 const char *base_name;
4223 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4224 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4225 struct loop *loop = NULL;
4226 bool nested_in_vect_loop = false;
4227 struct loop *containing_loop = NULL;
4228 tree aggr_ptr_type;
4229 tree aggr_ptr;
4230 tree new_temp;
4231 gimple_seq new_stmt_list = NULL;
4232 edge pe = NULL;
4233 basic_block new_bb;
4234 tree aggr_ptr_init;
4235 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4236 tree aptr;
4237 gimple_stmt_iterator incr_gsi;
4238 bool insert_after;
4239 tree indx_before_incr, indx_after_incr;
4240 gimple *incr;
4241 tree step;
4242 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4244 gcc_assert (TREE_CODE (aggr_type) == ARRAY_TYPE
4245 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4247 if (loop_vinfo)
4249 loop = LOOP_VINFO_LOOP (loop_vinfo);
4250 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4251 containing_loop = (gimple_bb (stmt))->loop_father;
4252 pe = loop_preheader_edge (loop);
4254 else
4256 gcc_assert (bb_vinfo);
4257 only_init = true;
4258 *ptr_incr = NULL;
4261 /* Check the step (evolution) of the load in LOOP, and record
4262 whether it's invariant. */
4263 step = vect_dr_behavior (dr)->step;
4264 if (integer_zerop (step))
4265 *inv_p = true;
4266 else
4267 *inv_p = false;
4269 /* Create an expression for the first address accessed by this load
4270 in LOOP. */
4271 base_name = get_name (DR_BASE_ADDRESS (dr));
4273 if (dump_enabled_p ())
4275 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4276 dump_printf_loc (MSG_NOTE, vect_location,
4277 "create %s-pointer variable to type: ",
4278 get_tree_code_name (TREE_CODE (aggr_type)));
4279 dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4280 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4281 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4282 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4283 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4284 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4285 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4286 else
4287 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4288 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4289 dump_printf (MSG_NOTE, "\n");
4292 /* (1) Create the new aggregate-pointer variable.
4293 Vector and array types inherit the alias set of their component
4294 type by default so we need to use a ref-all pointer if the data
4295 reference does not conflict with the created aggregated data
4296 reference because it is not addressable. */
4297 bool need_ref_all = false;
4298 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4299 get_alias_set (DR_REF (dr))))
4300 need_ref_all = true;
4301 /* Likewise for any of the data references in the stmt group. */
4302 else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
4304 gimple *orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
4307 stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
4308 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4309 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4310 get_alias_set (DR_REF (sdr))))
4312 need_ref_all = true;
4313 break;
4315 orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);
4317 while (orig_stmt);
4319 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4320 need_ref_all);
4321 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4324 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4325 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4326 def-use update cycles for the pointer: one relative to the outer-loop
4327 (LOOP), which is what steps (3) and (4) below do. The other is relative
4328 to the inner-loop (which is the inner-most loop containing the dataref),
4329 and this is done be step (5) below.
4331 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4332 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4333 redundant. Steps (3),(4) create the following:
4335 vp0 = &base_addr;
4336 LOOP: vp1 = phi(vp0,vp2)
4339 vp2 = vp1 + step
4340 goto LOOP
4342 If there is an inner-loop nested in loop, then step (5) will also be
4343 applied, and an additional update in the inner-loop will be created:
4345 vp0 = &base_addr;
4346 LOOP: vp1 = phi(vp0,vp2)
4348 inner: vp3 = phi(vp1,vp4)
4349 vp4 = vp3 + inner_step
4350 if () goto inner
4352 vp2 = vp1 + step
4353 if () goto LOOP */
4355 /* (2) Calculate the initial address of the aggregate-pointer, and set
4356 the aggregate-pointer to point to it before the loop. */
4358 /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader. */
4360 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
4361 offset, byte_offset);
4362 if (new_stmt_list)
4364 if (pe)
4366 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4367 gcc_assert (!new_bb);
4369 else
4370 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4373 *initial_address = new_temp;
4374 aggr_ptr_init = new_temp;
4376 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4377 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4378 inner-loop nested in LOOP (during outer-loop vectorization). */
4380 /* No update in loop is required. */
4381 if (only_init && (!loop_vinfo || at_loop == loop))
4382 aptr = aggr_ptr_init;
4383 else
4385 /* The step of the aggregate pointer is the type size. */
4386 tree iv_step = TYPE_SIZE_UNIT (aggr_type);
4387 /* One exception to the above is when the scalar step of the load in
4388 LOOP is zero. In this case the step here is also zero. */
4389 if (*inv_p)
4390 iv_step = size_zero_node;
4391 else if (tree_int_cst_sgn (step) == -1)
4392 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4394 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4396 create_iv (aggr_ptr_init,
4397 fold_convert (aggr_ptr_type, iv_step),
4398 aggr_ptr, loop, &incr_gsi, insert_after,
4399 &indx_before_incr, &indx_after_incr);
4400 incr = gsi_stmt (incr_gsi);
4401 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4403 /* Copy the points-to information if it exists. */
4404 if (DR_PTR_INFO (dr))
4406 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr);
4407 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr);
4409 if (ptr_incr)
4410 *ptr_incr = incr;
4412 aptr = indx_before_incr;
4415 if (!nested_in_vect_loop || only_init)
4416 return aptr;
4419 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4420 nested in LOOP, if exists. */
4422 gcc_assert (nested_in_vect_loop);
4423 if (!only_init)
4425 standard_iv_increment_position (containing_loop, &incr_gsi,
4426 &insert_after);
4427 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4428 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4429 &indx_after_incr);
4430 incr = gsi_stmt (incr_gsi);
4431 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4433 /* Copy the points-to information if it exists. */
4434 if (DR_PTR_INFO (dr))
4436 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr);
4437 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr);
4439 if (ptr_incr)
4440 *ptr_incr = incr;
4442 return indx_before_incr;
4444 else
4445 gcc_unreachable ();
4449 /* Function bump_vector_ptr
4451 Increment a pointer (to a vector type) by vector-size. If requested,
4452 i.e. if PTR-INCR is given, then also connect the new increment stmt
4453 to the existing def-use update-chain of the pointer, by modifying
4454 the PTR_INCR as illustrated below:
4456 The pointer def-use update-chain before this function:
4457 DATAREF_PTR = phi (p_0, p_2)
4458 ....
4459 PTR_INCR: p_2 = DATAREF_PTR + step
4461 The pointer def-use update-chain after this function:
4462 DATAREF_PTR = phi (p_0, p_2)
4463 ....
4464 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4465 ....
4466 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4468 Input:
4469 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4470 in the loop.
4471 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4472 the loop. The increment amount across iterations is expected
4473 to be vector_size.
4474 BSI - location where the new update stmt is to be placed.
4475 STMT - the original scalar memory-access stmt that is being vectorized.
4476 BUMP - optional. The offset by which to bump the pointer. If not given,
4477 the offset is assumed to be vector_size.
4479 Output: Return NEW_DATAREF_PTR as illustrated above.
4483 tree
4484 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4485 gimple *stmt, tree bump)
4487 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4488 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4489 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4490 tree update = TYPE_SIZE_UNIT (vectype);
4491 gassign *incr_stmt;
4492 ssa_op_iter iter;
4493 use_operand_p use_p;
4494 tree new_dataref_ptr;
4496 if (bump)
4497 update = bump;
4499 if (TREE_CODE (dataref_ptr) == SSA_NAME)
4500 new_dataref_ptr = copy_ssa_name (dataref_ptr);
4501 else
4502 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4503 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4504 dataref_ptr, update);
4505 vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4507 /* Copy the points-to information if it exists. */
4508 if (DR_PTR_INFO (dr))
4510 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4511 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4514 if (!ptr_incr)
4515 return new_dataref_ptr;
4517 /* Update the vector-pointer's cross-iteration increment. */
4518 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4520 tree use = USE_FROM_PTR (use_p);
4522 if (use == dataref_ptr)
4523 SET_USE (use_p, new_dataref_ptr);
4524 else
4525 gcc_assert (tree_int_cst_compare (use, update) == 0);
4528 return new_dataref_ptr;
4532 /* Function vect_create_destination_var.
4534 Create a new temporary of type VECTYPE. */
4536 tree
4537 vect_create_destination_var (tree scalar_dest, tree vectype)
4539 tree vec_dest;
4540 const char *name;
4541 char *new_name;
4542 tree type;
4543 enum vect_var_kind kind;
4545 kind = vectype
4546 ? VECTOR_BOOLEAN_TYPE_P (vectype)
4547 ? vect_mask_var
4548 : vect_simple_var
4549 : vect_scalar_var;
4550 type = vectype ? vectype : TREE_TYPE (scalar_dest);
4552 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
4554 name = get_name (scalar_dest);
4555 if (name)
4556 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
4557 else
4558 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
4559 vec_dest = vect_get_new_vect_var (type, kind, new_name);
4560 free (new_name);
4562 return vec_dest;
4565 /* Function vect_grouped_store_supported.
4567 Returns TRUE if interleave high and interleave low permutations
4568 are supported, and FALSE otherwise. */
4570 bool
4571 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
4573 machine_mode mode = TYPE_MODE (vectype);
4575 /* vect_permute_store_chain requires the group size to be equal to 3 or
4576 be a power of two. */
4577 if (count != 3 && exact_log2 (count) == -1)
4579 if (dump_enabled_p ())
4580 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4581 "the size of the group of accesses"
4582 " is not a power of 2 or not eqaul to 3\n");
4583 return false;
4586 /* Check that the permutation is supported. */
4587 if (VECTOR_MODE_P (mode))
4589 unsigned int i;
4590 if (count == 3)
4592 unsigned int j0 = 0, j1 = 0, j2 = 0;
4593 unsigned int i, j;
4595 unsigned int nelt;
4596 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
4598 if (dump_enabled_p ())
4599 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4600 "cannot handle groups of 3 stores for"
4601 " variable-length vectors\n");
4602 return false;
4605 vec_perm_builder sel (nelt, nelt, 1);
4606 sel.quick_grow (nelt);
4607 vec_perm_indices indices;
4608 for (j = 0; j < 3; j++)
4610 int nelt0 = ((3 - j) * nelt) % 3;
4611 int nelt1 = ((3 - j) * nelt + 1) % 3;
4612 int nelt2 = ((3 - j) * nelt + 2) % 3;
4613 for (i = 0; i < nelt; i++)
4615 if (3 * i + nelt0 < nelt)
4616 sel[3 * i + nelt0] = j0++;
4617 if (3 * i + nelt1 < nelt)
4618 sel[3 * i + nelt1] = nelt + j1++;
4619 if (3 * i + nelt2 < nelt)
4620 sel[3 * i + nelt2] = 0;
4622 indices.new_vector (sel, 2, nelt);
4623 if (!can_vec_perm_const_p (mode, indices))
4625 if (dump_enabled_p ())
4626 dump_printf (MSG_MISSED_OPTIMIZATION,
4627 "permutation op not supported by target.\n");
4628 return false;
4631 for (i = 0; i < nelt; i++)
4633 if (3 * i + nelt0 < nelt)
4634 sel[3 * i + nelt0] = 3 * i + nelt0;
4635 if (3 * i + nelt1 < nelt)
4636 sel[3 * i + nelt1] = 3 * i + nelt1;
4637 if (3 * i + nelt2 < nelt)
4638 sel[3 * i + nelt2] = nelt + j2++;
4640 indices.new_vector (sel, 2, nelt);
4641 if (!can_vec_perm_const_p (mode, indices))
4643 if (dump_enabled_p ())
4644 dump_printf (MSG_MISSED_OPTIMIZATION,
4645 "permutation op not supported by target.\n");
4646 return false;
4649 return true;
4651 else
4653 /* If length is not equal to 3 then only power of 2 is supported. */
4654 gcc_assert (pow2p_hwi (count));
4655 poly_uint64 nelt = GET_MODE_NUNITS (mode);
4657 /* The encoding has 2 interleaved stepped patterns. */
4658 vec_perm_builder sel (nelt, 2, 3);
4659 sel.quick_grow (6);
4660 for (i = 0; i < 3; i++)
4662 sel[i * 2] = i;
4663 sel[i * 2 + 1] = i + nelt;
4665 vec_perm_indices indices (sel, 2, nelt);
4666 if (can_vec_perm_const_p (mode, indices))
4668 for (i = 0; i < 6; i++)
4669 sel[i] += exact_div (nelt, 2);
4670 indices.new_vector (sel, 2, nelt);
4671 if (can_vec_perm_const_p (mode, indices))
4672 return true;
4677 if (dump_enabled_p ())
4678 dump_printf (MSG_MISSED_OPTIMIZATION,
4679 "permutaion op not supported by target.\n");
4680 return false;
4684 /* Return TRUE if vec_store_lanes is available for COUNT vectors of
4685 type VECTYPE. */
4687 bool
4688 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
4690 return vect_lanes_optab_supported_p ("vec_store_lanes",
4691 vec_store_lanes_optab,
4692 vectype, count);
4696 /* Function vect_permute_store_chain.
4698 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4699 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
4700 the data correctly for the stores. Return the final references for stores
4701 in RESULT_CHAIN.
4703 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4704 The input is 4 vectors each containing 8 elements. We assign a number to
4705 each element, the input sequence is:
4707 1st vec: 0 1 2 3 4 5 6 7
4708 2nd vec: 8 9 10 11 12 13 14 15
4709 3rd vec: 16 17 18 19 20 21 22 23
4710 4th vec: 24 25 26 27 28 29 30 31
4712 The output sequence should be:
4714 1st vec: 0 8 16 24 1 9 17 25
4715 2nd vec: 2 10 18 26 3 11 19 27
4716 3rd vec: 4 12 20 28 5 13 21 30
4717 4th vec: 6 14 22 30 7 15 23 31
4719 i.e., we interleave the contents of the four vectors in their order.
4721 We use interleave_high/low instructions to create such output. The input of
4722 each interleave_high/low operation is two vectors:
4723 1st vec 2nd vec
4724 0 1 2 3 4 5 6 7
4725 the even elements of the result vector are obtained left-to-right from the
4726 high/low elements of the first vector. The odd elements of the result are
4727 obtained left-to-right from the high/low elements of the second vector.
4728 The output of interleave_high will be: 0 4 1 5
4729 and of interleave_low: 2 6 3 7
4732 The permutation is done in log LENGTH stages. In each stage interleave_high
4733 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4734 where the first argument is taken from the first half of DR_CHAIN and the
4735 second argument from it's second half.
4736 In our example,
4738 I1: interleave_high (1st vec, 3rd vec)
4739 I2: interleave_low (1st vec, 3rd vec)
4740 I3: interleave_high (2nd vec, 4th vec)
4741 I4: interleave_low (2nd vec, 4th vec)
4743 The output for the first stage is:
4745 I1: 0 16 1 17 2 18 3 19
4746 I2: 4 20 5 21 6 22 7 23
4747 I3: 8 24 9 25 10 26 11 27
4748 I4: 12 28 13 29 14 30 15 31
4750 The output of the second stage, i.e. the final result is:
4752 I1: 0 8 16 24 1 9 17 25
4753 I2: 2 10 18 26 3 11 19 27
4754 I3: 4 12 20 28 5 13 21 30
4755 I4: 6 14 22 30 7 15 23 31. */
4757 void
4758 vect_permute_store_chain (vec<tree> dr_chain,
4759 unsigned int length,
4760 gimple *stmt,
4761 gimple_stmt_iterator *gsi,
4762 vec<tree> *result_chain)
4764 tree vect1, vect2, high, low;
4765 gimple *perm_stmt;
4766 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4767 tree perm_mask_low, perm_mask_high;
4768 tree data_ref;
4769 tree perm3_mask_low, perm3_mask_high;
4770 unsigned int i, j, n, log_length = exact_log2 (length);
4772 result_chain->quick_grow (length);
4773 memcpy (result_chain->address (), dr_chain.address (),
4774 length * sizeof (tree));
4776 if (length == 3)
4778 /* vect_grouped_store_supported ensures that this is constant. */
4779 unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
4780 unsigned int j0 = 0, j1 = 0, j2 = 0;
4782 vec_perm_builder sel (nelt, nelt, 1);
4783 sel.quick_grow (nelt);
4784 vec_perm_indices indices;
4785 for (j = 0; j < 3; j++)
4787 int nelt0 = ((3 - j) * nelt) % 3;
4788 int nelt1 = ((3 - j) * nelt + 1) % 3;
4789 int nelt2 = ((3 - j) * nelt + 2) % 3;
4791 for (i = 0; i < nelt; i++)
4793 if (3 * i + nelt0 < nelt)
4794 sel[3 * i + nelt0] = j0++;
4795 if (3 * i + nelt1 < nelt)
4796 sel[3 * i + nelt1] = nelt + j1++;
4797 if (3 * i + nelt2 < nelt)
4798 sel[3 * i + nelt2] = 0;
4800 indices.new_vector (sel, 2, nelt);
4801 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
4803 for (i = 0; i < nelt; i++)
4805 if (3 * i + nelt0 < nelt)
4806 sel[3 * i + nelt0] = 3 * i + nelt0;
4807 if (3 * i + nelt1 < nelt)
4808 sel[3 * i + nelt1] = 3 * i + nelt1;
4809 if (3 * i + nelt2 < nelt)
4810 sel[3 * i + nelt2] = nelt + j2++;
4812 indices.new_vector (sel, 2, nelt);
4813 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
4815 vect1 = dr_chain[0];
4816 vect2 = dr_chain[1];
4818 /* Create interleaving stmt:
4819 low = VEC_PERM_EXPR <vect1, vect2,
4820 {j, nelt, *, j + 1, nelt + j + 1, *,
4821 j + 2, nelt + j + 2, *, ...}> */
4822 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
4823 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4824 vect2, perm3_mask_low);
4825 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4827 vect1 = data_ref;
4828 vect2 = dr_chain[2];
4829 /* Create interleaving stmt:
4830 low = VEC_PERM_EXPR <vect1, vect2,
4831 {0, 1, nelt + j, 3, 4, nelt + j + 1,
4832 6, 7, nelt + j + 2, ...}> */
4833 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
4834 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4835 vect2, perm3_mask_high);
4836 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4837 (*result_chain)[j] = data_ref;
4840 else
4842 /* If length is not equal to 3 then only power of 2 is supported. */
4843 gcc_assert (pow2p_hwi (length));
4845 /* The encoding has 2 interleaved stepped patterns. */
4846 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
4847 vec_perm_builder sel (nelt, 2, 3);
4848 sel.quick_grow (6);
4849 for (i = 0; i < 3; i++)
4851 sel[i * 2] = i;
4852 sel[i * 2 + 1] = i + nelt;
4854 vec_perm_indices indices (sel, 2, nelt);
4855 perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
4857 for (i = 0; i < 6; i++)
4858 sel[i] += exact_div (nelt, 2);
4859 indices.new_vector (sel, 2, nelt);
4860 perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
4862 for (i = 0, n = log_length; i < n; i++)
4864 for (j = 0; j < length/2; j++)
4866 vect1 = dr_chain[j];
4867 vect2 = dr_chain[j+length/2];
4869 /* Create interleaving stmt:
4870 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
4871 ...}> */
4872 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
4873 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
4874 vect2, perm_mask_high);
4875 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4876 (*result_chain)[2*j] = high;
4878 /* Create interleaving stmt:
4879 low = VEC_PERM_EXPR <vect1, vect2,
4880 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
4881 ...}> */
4882 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
4883 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
4884 vect2, perm_mask_low);
4885 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4886 (*result_chain)[2*j+1] = low;
4888 memcpy (dr_chain.address (), result_chain->address (),
4889 length * sizeof (tree));
4894 /* Function vect_setup_realignment
4896 This function is called when vectorizing an unaligned load using
4897 the dr_explicit_realign[_optimized] scheme.
4898 This function generates the following code at the loop prolog:
4900 p = initial_addr;
4901 x msq_init = *(floor(p)); # prolog load
4902 realignment_token = call target_builtin;
4903 loop:
4904 x msq = phi (msq_init, ---)
4906 The stmts marked with x are generated only for the case of
4907 dr_explicit_realign_optimized.
4909 The code above sets up a new (vector) pointer, pointing to the first
4910 location accessed by STMT, and a "floor-aligned" load using that pointer.
4911 It also generates code to compute the "realignment-token" (if the relevant
4912 target hook was defined), and creates a phi-node at the loop-header bb
4913 whose arguments are the result of the prolog-load (created by this
4914 function) and the result of a load that takes place in the loop (to be
4915 created by the caller to this function).
4917 For the case of dr_explicit_realign_optimized:
4918 The caller to this function uses the phi-result (msq) to create the
4919 realignment code inside the loop, and sets up the missing phi argument,
4920 as follows:
4921 loop:
4922 msq = phi (msq_init, lsq)
4923 lsq = *(floor(p')); # load in loop
4924 result = realign_load (msq, lsq, realignment_token);
4926 For the case of dr_explicit_realign:
4927 loop:
4928 msq = *(floor(p)); # load in loop
4929 p' = p + (VS-1);
4930 lsq = *(floor(p')); # load in loop
4931 result = realign_load (msq, lsq, realignment_token);
4933 Input:
4934 STMT - (scalar) load stmt to be vectorized. This load accesses
4935 a memory location that may be unaligned.
4936 BSI - place where new code is to be inserted.
4937 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
4938 is used.
4940 Output:
4941 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
4942 target hook, if defined.
4943 Return value - the result of the loop-header phi node. */
4945 tree
4946 vect_setup_realignment (gimple *stmt, gimple_stmt_iterator *gsi,
4947 tree *realignment_token,
4948 enum dr_alignment_support alignment_support_scheme,
4949 tree init_addr,
4950 struct loop **at_loop)
4952 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4953 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4954 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4955 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4956 struct loop *loop = NULL;
4957 edge pe = NULL;
4958 tree scalar_dest = gimple_assign_lhs (stmt);
4959 tree vec_dest;
4960 gimple *inc;
4961 tree ptr;
4962 tree data_ref;
4963 basic_block new_bb;
4964 tree msq_init = NULL_TREE;
4965 tree new_temp;
4966 gphi *phi_stmt;
4967 tree msq = NULL_TREE;
4968 gimple_seq stmts = NULL;
4969 bool inv_p;
4970 bool compute_in_loop = false;
4971 bool nested_in_vect_loop = false;
4972 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
4973 struct loop *loop_for_initial_load = NULL;
4975 if (loop_vinfo)
4977 loop = LOOP_VINFO_LOOP (loop_vinfo);
4978 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4981 gcc_assert (alignment_support_scheme == dr_explicit_realign
4982 || alignment_support_scheme == dr_explicit_realign_optimized);
4984 /* We need to generate three things:
4985 1. the misalignment computation
4986 2. the extra vector load (for the optimized realignment scheme).
4987 3. the phi node for the two vectors from which the realignment is
4988 done (for the optimized realignment scheme). */
4990 /* 1. Determine where to generate the misalignment computation.
4992 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
4993 calculation will be generated by this function, outside the loop (in the
4994 preheader). Otherwise, INIT_ADDR had already been computed for us by the
4995 caller, inside the loop.
4997 Background: If the misalignment remains fixed throughout the iterations of
4998 the loop, then both realignment schemes are applicable, and also the
4999 misalignment computation can be done outside LOOP. This is because we are
5000 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5001 are a multiple of VS (the Vector Size), and therefore the misalignment in
5002 different vectorized LOOP iterations is always the same.
5003 The problem arises only if the memory access is in an inner-loop nested
5004 inside LOOP, which is now being vectorized using outer-loop vectorization.
5005 This is the only case when the misalignment of the memory access may not
5006 remain fixed throughout the iterations of the inner-loop (as explained in
5007 detail in vect_supportable_dr_alignment). In this case, not only is the
5008 optimized realignment scheme not applicable, but also the misalignment
5009 computation (and generation of the realignment token that is passed to
5010 REALIGN_LOAD) have to be done inside the loop.
5012 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5013 or not, which in turn determines if the misalignment is computed inside
5014 the inner-loop, or outside LOOP. */
5016 if (init_addr != NULL_TREE || !loop_vinfo)
5018 compute_in_loop = true;
5019 gcc_assert (alignment_support_scheme == dr_explicit_realign);
5023 /* 2. Determine where to generate the extra vector load.
5025 For the optimized realignment scheme, instead of generating two vector
5026 loads in each iteration, we generate a single extra vector load in the
5027 preheader of the loop, and in each iteration reuse the result of the
5028 vector load from the previous iteration. In case the memory access is in
5029 an inner-loop nested inside LOOP, which is now being vectorized using
5030 outer-loop vectorization, we need to determine whether this initial vector
5031 load should be generated at the preheader of the inner-loop, or can be
5032 generated at the preheader of LOOP. If the memory access has no evolution
5033 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5034 to be generated inside LOOP (in the preheader of the inner-loop). */
5036 if (nested_in_vect_loop)
5038 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5039 bool invariant_in_outerloop =
5040 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5041 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5043 else
5044 loop_for_initial_load = loop;
5045 if (at_loop)
5046 *at_loop = loop_for_initial_load;
5048 if (loop_for_initial_load)
5049 pe = loop_preheader_edge (loop_for_initial_load);
5051 /* 3. For the case of the optimized realignment, create the first vector
5052 load at the loop preheader. */
5054 if (alignment_support_scheme == dr_explicit_realign_optimized)
5056 /* Create msq_init = *(floor(p1)) in the loop preheader */
5057 gassign *new_stmt;
5059 gcc_assert (!compute_in_loop);
5060 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5061 ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
5062 NULL_TREE, &init_addr, NULL, &inc,
5063 true, &inv_p);
5064 if (TREE_CODE (ptr) == SSA_NAME)
5065 new_temp = copy_ssa_name (ptr);
5066 else
5067 new_temp = make_ssa_name (TREE_TYPE (ptr));
5068 unsigned int align = DR_TARGET_ALIGNMENT (dr);
5069 new_stmt = gimple_build_assign
5070 (new_temp, BIT_AND_EXPR, ptr,
5071 build_int_cst (TREE_TYPE (ptr), -(HOST_WIDE_INT) align));
5072 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5073 gcc_assert (!new_bb);
5074 data_ref
5075 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5076 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5077 new_stmt = gimple_build_assign (vec_dest, data_ref);
5078 new_temp = make_ssa_name (vec_dest, new_stmt);
5079 gimple_assign_set_lhs (new_stmt, new_temp);
5080 if (pe)
5082 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5083 gcc_assert (!new_bb);
5085 else
5086 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5088 msq_init = gimple_assign_lhs (new_stmt);
5091 /* 4. Create realignment token using a target builtin, if available.
5092 It is done either inside the containing loop, or before LOOP (as
5093 determined above). */
5095 if (targetm.vectorize.builtin_mask_for_load)
5097 gcall *new_stmt;
5098 tree builtin_decl;
5100 /* Compute INIT_ADDR - the initial addressed accessed by this memref. */
5101 if (!init_addr)
5103 /* Generate the INIT_ADDR computation outside LOOP. */
5104 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
5105 NULL_TREE);
5106 if (loop)
5108 pe = loop_preheader_edge (loop);
5109 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5110 gcc_assert (!new_bb);
5112 else
5113 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5116 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5117 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5118 vec_dest =
5119 vect_create_destination_var (scalar_dest,
5120 gimple_call_return_type (new_stmt));
5121 new_temp = make_ssa_name (vec_dest, new_stmt);
5122 gimple_call_set_lhs (new_stmt, new_temp);
5124 if (compute_in_loop)
5125 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5126 else
5128 /* Generate the misalignment computation outside LOOP. */
5129 pe = loop_preheader_edge (loop);
5130 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5131 gcc_assert (!new_bb);
5134 *realignment_token = gimple_call_lhs (new_stmt);
5136 /* The result of the CALL_EXPR to this builtin is determined from
5137 the value of the parameter and no global variables are touched
5138 which makes the builtin a "const" function. Requiring the
5139 builtin to have the "const" attribute makes it unnecessary
5140 to call mark_call_clobbered. */
5141 gcc_assert (TREE_READONLY (builtin_decl));
5144 if (alignment_support_scheme == dr_explicit_realign)
5145 return msq;
5147 gcc_assert (!compute_in_loop);
5148 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5151 /* 5. Create msq = phi <msq_init, lsq> in loop */
5153 pe = loop_preheader_edge (containing_loop);
5154 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5155 msq = make_ssa_name (vec_dest);
5156 phi_stmt = create_phi_node (msq, containing_loop->header);
5157 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5159 return msq;
5163 /* Function vect_grouped_load_supported.
5165 COUNT is the size of the load group (the number of statements plus the
5166 number of gaps). SINGLE_ELEMENT_P is true if there is actually
5167 only one statement, with a gap of COUNT - 1.
5169 Returns true if a suitable permute exists. */
5171 bool
5172 vect_grouped_load_supported (tree vectype, bool single_element_p,
5173 unsigned HOST_WIDE_INT count)
5175 machine_mode mode = TYPE_MODE (vectype);
5177 /* If this is single-element interleaving with an element distance
5178 that leaves unused vector loads around punt - we at least create
5179 very sub-optimal code in that case (and blow up memory,
5180 see PR65518). */
5181 if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype)))
5183 if (dump_enabled_p ())
5184 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5185 "single-element interleaving not supported "
5186 "for not adjacent vector loads\n");
5187 return false;
5190 /* vect_permute_load_chain requires the group size to be equal to 3 or
5191 be a power of two. */
5192 if (count != 3 && exact_log2 (count) == -1)
5194 if (dump_enabled_p ())
5195 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5196 "the size of the group of accesses"
5197 " is not a power of 2 or not equal to 3\n");
5198 return false;
5201 /* Check that the permutation is supported. */
5202 if (VECTOR_MODE_P (mode))
5204 unsigned int i, j;
5205 if (count == 3)
5207 unsigned int nelt;
5208 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5210 if (dump_enabled_p ())
5211 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5212 "cannot handle groups of 3 loads for"
5213 " variable-length vectors\n");
5214 return false;
5217 vec_perm_builder sel (nelt, nelt, 1);
5218 sel.quick_grow (nelt);
5219 vec_perm_indices indices;
5220 unsigned int k;
5221 for (k = 0; k < 3; k++)
5223 for (i = 0; i < nelt; i++)
5224 if (3 * i + k < 2 * nelt)
5225 sel[i] = 3 * i + k;
5226 else
5227 sel[i] = 0;
5228 indices.new_vector (sel, 2, nelt);
5229 if (!can_vec_perm_const_p (mode, indices))
5231 if (dump_enabled_p ())
5232 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5233 "shuffle of 3 loads is not supported by"
5234 " target\n");
5235 return false;
5237 for (i = 0, j = 0; i < nelt; i++)
5238 if (3 * i + k < 2 * nelt)
5239 sel[i] = i;
5240 else
5241 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5242 indices.new_vector (sel, 2, nelt);
5243 if (!can_vec_perm_const_p (mode, indices))
5245 if (dump_enabled_p ())
5246 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5247 "shuffle of 3 loads is not supported by"
5248 " target\n");
5249 return false;
5252 return true;
5254 else
5256 /* If length is not equal to 3 then only power of 2 is supported. */
5257 gcc_assert (pow2p_hwi (count));
5258 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5260 /* The encoding has a single stepped pattern. */
5261 vec_perm_builder sel (nelt, 1, 3);
5262 sel.quick_grow (3);
5263 for (i = 0; i < 3; i++)
5264 sel[i] = i * 2;
5265 vec_perm_indices indices (sel, 2, nelt);
5266 if (can_vec_perm_const_p (mode, indices))
5268 for (i = 0; i < 3; i++)
5269 sel[i] = i * 2 + 1;
5270 indices.new_vector (sel, 2, nelt);
5271 if (can_vec_perm_const_p (mode, indices))
5272 return true;
5277 if (dump_enabled_p ())
5278 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5279 "extract even/odd not supported by target\n");
5280 return false;
5283 /* Return TRUE if vec_load_lanes is available for COUNT vectors of
5284 type VECTYPE. */
5286 bool
5287 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
5289 return vect_lanes_optab_supported_p ("vec_load_lanes",
5290 vec_load_lanes_optab,
5291 vectype, count);
5294 /* Function vect_permute_load_chain.
5296 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5297 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5298 the input data correctly. Return the final references for loads in
5299 RESULT_CHAIN.
5301 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5302 The input is 4 vectors each containing 8 elements. We assign a number to each
5303 element, the input sequence is:
5305 1st vec: 0 1 2 3 4 5 6 7
5306 2nd vec: 8 9 10 11 12 13 14 15
5307 3rd vec: 16 17 18 19 20 21 22 23
5308 4th vec: 24 25 26 27 28 29 30 31
5310 The output sequence should be:
5312 1st vec: 0 4 8 12 16 20 24 28
5313 2nd vec: 1 5 9 13 17 21 25 29
5314 3rd vec: 2 6 10 14 18 22 26 30
5315 4th vec: 3 7 11 15 19 23 27 31
5317 i.e., the first output vector should contain the first elements of each
5318 interleaving group, etc.
5320 We use extract_even/odd instructions to create such output. The input of
5321 each extract_even/odd operation is two vectors
5322 1st vec 2nd vec
5323 0 1 2 3 4 5 6 7
5325 and the output is the vector of extracted even/odd elements. The output of
5326 extract_even will be: 0 2 4 6
5327 and of extract_odd: 1 3 5 7
5330 The permutation is done in log LENGTH stages. In each stage extract_even
5331 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5332 their order. In our example,
5334 E1: extract_even (1st vec, 2nd vec)
5335 E2: extract_odd (1st vec, 2nd vec)
5336 E3: extract_even (3rd vec, 4th vec)
5337 E4: extract_odd (3rd vec, 4th vec)
5339 The output for the first stage will be:
5341 E1: 0 2 4 6 8 10 12 14
5342 E2: 1 3 5 7 9 11 13 15
5343 E3: 16 18 20 22 24 26 28 30
5344 E4: 17 19 21 23 25 27 29 31
5346 In order to proceed and create the correct sequence for the next stage (or
5347 for the correct output, if the second stage is the last one, as in our
5348 example), we first put the output of extract_even operation and then the
5349 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5350 The input for the second stage is:
5352 1st vec (E1): 0 2 4 6 8 10 12 14
5353 2nd vec (E3): 16 18 20 22 24 26 28 30
5354 3rd vec (E2): 1 3 5 7 9 11 13 15
5355 4th vec (E4): 17 19 21 23 25 27 29 31
5357 The output of the second stage:
5359 E1: 0 4 8 12 16 20 24 28
5360 E2: 2 6 10 14 18 22 26 30
5361 E3: 1 5 9 13 17 21 25 29
5362 E4: 3 7 11 15 19 23 27 31
5364 And RESULT_CHAIN after reordering:
5366 1st vec (E1): 0 4 8 12 16 20 24 28
5367 2nd vec (E3): 1 5 9 13 17 21 25 29
5368 3rd vec (E2): 2 6 10 14 18 22 26 30
5369 4th vec (E4): 3 7 11 15 19 23 27 31. */
5371 static void
5372 vect_permute_load_chain (vec<tree> dr_chain,
5373 unsigned int length,
5374 gimple *stmt,
5375 gimple_stmt_iterator *gsi,
5376 vec<tree> *result_chain)
5378 tree data_ref, first_vect, second_vect;
5379 tree perm_mask_even, perm_mask_odd;
5380 tree perm3_mask_low, perm3_mask_high;
5381 gimple *perm_stmt;
5382 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5383 unsigned int i, j, log_length = exact_log2 (length);
5385 result_chain->quick_grow (length);
5386 memcpy (result_chain->address (), dr_chain.address (),
5387 length * sizeof (tree));
5389 if (length == 3)
5391 /* vect_grouped_load_supported ensures that this is constant. */
5392 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5393 unsigned int k;
5395 vec_perm_builder sel (nelt, nelt, 1);
5396 sel.quick_grow (nelt);
5397 vec_perm_indices indices;
5398 for (k = 0; k < 3; k++)
5400 for (i = 0; i < nelt; i++)
5401 if (3 * i + k < 2 * nelt)
5402 sel[i] = 3 * i + k;
5403 else
5404 sel[i] = 0;
5405 indices.new_vector (sel, 2, nelt);
5406 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5408 for (i = 0, j = 0; i < nelt; i++)
5409 if (3 * i + k < 2 * nelt)
5410 sel[i] = i;
5411 else
5412 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5413 indices.new_vector (sel, 2, nelt);
5414 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5416 first_vect = dr_chain[0];
5417 second_vect = dr_chain[1];
5419 /* Create interleaving stmt (low part of):
5420 low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5421 ...}> */
5422 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5423 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5424 second_vect, perm3_mask_low);
5425 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5427 /* Create interleaving stmt (high part of):
5428 high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5429 ...}> */
5430 first_vect = data_ref;
5431 second_vect = dr_chain[2];
5432 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5433 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5434 second_vect, perm3_mask_high);
5435 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5436 (*result_chain)[k] = data_ref;
5439 else
5441 /* If length is not equal to 3 then only power of 2 is supported. */
5442 gcc_assert (pow2p_hwi (length));
5444 /* The encoding has a single stepped pattern. */
5445 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5446 vec_perm_builder sel (nelt, 1, 3);
5447 sel.quick_grow (3);
5448 for (i = 0; i < 3; ++i)
5449 sel[i] = i * 2;
5450 vec_perm_indices indices (sel, 2, nelt);
5451 perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);
5453 for (i = 0; i < 3; ++i)
5454 sel[i] = i * 2 + 1;
5455 indices.new_vector (sel, 2, nelt);
5456 perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
5458 for (i = 0; i < log_length; i++)
5460 for (j = 0; j < length; j += 2)
5462 first_vect = dr_chain[j];
5463 second_vect = dr_chain[j+1];
5465 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5466 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5467 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5468 first_vect, second_vect,
5469 perm_mask_even);
5470 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5471 (*result_chain)[j/2] = data_ref;
5473 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5474 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5475 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5476 first_vect, second_vect,
5477 perm_mask_odd);
5478 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5479 (*result_chain)[j/2+length/2] = data_ref;
5481 memcpy (dr_chain.address (), result_chain->address (),
5482 length * sizeof (tree));
5487 /* Function vect_shift_permute_load_chain.
5489 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
5490 sequence of stmts to reorder the input data accordingly.
5491 Return the final references for loads in RESULT_CHAIN.
5492 Return true if successed, false otherwise.
5494 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5495 The input is 3 vectors each containing 8 elements. We assign a
5496 number to each element, the input sequence is:
5498 1st vec: 0 1 2 3 4 5 6 7
5499 2nd vec: 8 9 10 11 12 13 14 15
5500 3rd vec: 16 17 18 19 20 21 22 23
5502 The output sequence should be:
5504 1st vec: 0 3 6 9 12 15 18 21
5505 2nd vec: 1 4 7 10 13 16 19 22
5506 3rd vec: 2 5 8 11 14 17 20 23
5508 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
5510 First we shuffle all 3 vectors to get correct elements order:
5512 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
5513 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
5514 3rd vec: (16 19 22) (17 20 23) (18 21)
5516 Next we unite and shift vector 3 times:
5518 1st step:
5519 shift right by 6 the concatenation of:
5520 "1st vec" and "2nd vec"
5521 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
5522 "2nd vec" and "3rd vec"
5523 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
5524 "3rd vec" and "1st vec"
5525 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
5526 | New vectors |
5528 So that now new vectors are:
5530 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
5531 2nd vec: (10 13) (16 19 22) (17 20 23)
5532 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
5534 2nd step:
5535 shift right by 5 the concatenation of:
5536 "1st vec" and "3rd vec"
5537 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
5538 "2nd vec" and "1st vec"
5539 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
5540 "3rd vec" and "2nd vec"
5541 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
5542 | New vectors |
5544 So that now new vectors are:
5546 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
5547 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
5548 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
5550 3rd step:
5551 shift right by 5 the concatenation of:
5552 "1st vec" and "1st vec"
5553 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
5554 shift right by 3 the concatenation of:
5555 "2nd vec" and "2nd vec"
5556 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
5557 | New vectors |
5559 So that now all vectors are READY:
5560 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
5561 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
5562 3rd vec: ( 1 4 7) (10 13) (16 19 22)
5564 This algorithm is faster than one in vect_permute_load_chain if:
5565 1. "shift of a concatination" is faster than general permutation.
5566 This is usually so.
5567 2. The TARGET machine can't execute vector instructions in parallel.
5568 This is because each step of the algorithm depends on previous.
5569 The algorithm in vect_permute_load_chain is much more parallel.
5571 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
5574 static bool
5575 vect_shift_permute_load_chain (vec<tree> dr_chain,
5576 unsigned int length,
5577 gimple *stmt,
5578 gimple_stmt_iterator *gsi,
5579 vec<tree> *result_chain)
5581 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
5582 tree perm2_mask1, perm2_mask2, perm3_mask;
5583 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
5584 gimple *perm_stmt;
5586 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5587 unsigned int i;
5588 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5589 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5591 unsigned HOST_WIDE_INT nelt, vf;
5592 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt)
5593 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
5594 /* Not supported for variable-length vectors. */
5595 return false;
5597 vec_perm_builder sel (nelt, nelt, 1);
5598 sel.quick_grow (nelt);
5600 result_chain->quick_grow (length);
5601 memcpy (result_chain->address (), dr_chain.address (),
5602 length * sizeof (tree));
5604 if (pow2p_hwi (length) && vf > 4)
5606 unsigned int j, log_length = exact_log2 (length);
5607 for (i = 0; i < nelt / 2; ++i)
5608 sel[i] = i * 2;
5609 for (i = 0; i < nelt / 2; ++i)
5610 sel[nelt / 2 + i] = i * 2 + 1;
5611 vec_perm_indices indices (sel, 2, nelt);
5612 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5614 if (dump_enabled_p ())
5615 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5616 "shuffle of 2 fields structure is not \
5617 supported by target\n");
5618 return false;
5620 perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);
5622 for (i = 0; i < nelt / 2; ++i)
5623 sel[i] = i * 2 + 1;
5624 for (i = 0; i < nelt / 2; ++i)
5625 sel[nelt / 2 + i] = i * 2;
5626 indices.new_vector (sel, 2, nelt);
5627 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5629 if (dump_enabled_p ())
5630 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5631 "shuffle of 2 fields structure is not \
5632 supported by target\n");
5633 return false;
5635 perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);
5637 /* Generating permutation constant to shift all elements.
5638 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
5639 for (i = 0; i < nelt; i++)
5640 sel[i] = nelt / 2 + i;
5641 indices.new_vector (sel, 2, nelt);
5642 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5644 if (dump_enabled_p ())
5645 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5646 "shift permutation is not supported by target\n");
5647 return false;
5649 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
5651 /* Generating permutation constant to select vector from 2.
5652 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
5653 for (i = 0; i < nelt / 2; i++)
5654 sel[i] = i;
5655 for (i = nelt / 2; i < nelt; i++)
5656 sel[i] = nelt + i;
5657 indices.new_vector (sel, 2, nelt);
5658 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5660 if (dump_enabled_p ())
5661 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5662 "select is not supported by target\n");
5663 return false;
5665 select_mask = vect_gen_perm_mask_checked (vectype, indices);
5667 for (i = 0; i < log_length; i++)
5669 for (j = 0; j < length; j += 2)
5671 first_vect = dr_chain[j];
5672 second_vect = dr_chain[j + 1];
5674 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5675 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5676 first_vect, first_vect,
5677 perm2_mask1);
5678 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5679 vect[0] = data_ref;
5681 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5682 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5683 second_vect, second_vect,
5684 perm2_mask2);
5685 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5686 vect[1] = data_ref;
5688 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
5689 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5690 vect[0], vect[1], shift1_mask);
5691 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5692 (*result_chain)[j/2 + length/2] = data_ref;
5694 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
5695 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5696 vect[0], vect[1], select_mask);
5697 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5698 (*result_chain)[j/2] = data_ref;
5700 memcpy (dr_chain.address (), result_chain->address (),
5701 length * sizeof (tree));
5703 return true;
5705 if (length == 3 && vf > 2)
5707 unsigned int k = 0, l = 0;
5709 /* Generating permutation constant to get all elements in rigth order.
5710 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
5711 for (i = 0; i < nelt; i++)
5713 if (3 * k + (l % 3) >= nelt)
5715 k = 0;
5716 l += (3 - (nelt % 3));
5718 sel[i] = 3 * k + (l % 3);
5719 k++;
5721 vec_perm_indices indices (sel, 2, nelt);
5722 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5724 if (dump_enabled_p ())
5725 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5726 "shuffle of 3 fields structure is not \
5727 supported by target\n");
5728 return false;
5730 perm3_mask = vect_gen_perm_mask_checked (vectype, indices);
5732 /* Generating permutation constant to shift all elements.
5733 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
5734 for (i = 0; i < nelt; i++)
5735 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
5736 indices.new_vector (sel, 2, nelt);
5737 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5739 if (dump_enabled_p ())
5740 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5741 "shift permutation is not supported by target\n");
5742 return false;
5744 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
5746 /* Generating permutation constant to shift all elements.
5747 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5748 for (i = 0; i < nelt; i++)
5749 sel[i] = 2 * (nelt / 3) + 1 + i;
5750 indices.new_vector (sel, 2, nelt);
5751 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5753 if (dump_enabled_p ())
5754 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5755 "shift permutation is not supported by target\n");
5756 return false;
5758 shift2_mask = vect_gen_perm_mask_checked (vectype, indices);
5760 /* Generating permutation constant to shift all elements.
5761 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
5762 for (i = 0; i < nelt; i++)
5763 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
5764 indices.new_vector (sel, 2, nelt);
5765 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5767 if (dump_enabled_p ())
5768 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5769 "shift permutation is not supported by target\n");
5770 return false;
5772 shift3_mask = vect_gen_perm_mask_checked (vectype, indices);
5774 /* Generating permutation constant to shift all elements.
5775 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5776 for (i = 0; i < nelt; i++)
5777 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
5778 indices.new_vector (sel, 2, nelt);
5779 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5781 if (dump_enabled_p ())
5782 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5783 "shift permutation is not supported by target\n");
5784 return false;
5786 shift4_mask = vect_gen_perm_mask_checked (vectype, indices);
5788 for (k = 0; k < 3; k++)
5790 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
5791 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5792 dr_chain[k], dr_chain[k],
5793 perm3_mask);
5794 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5795 vect[k] = data_ref;
5798 for (k = 0; k < 3; k++)
5800 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
5801 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5802 vect[k % 3], vect[(k + 1) % 3],
5803 shift1_mask);
5804 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5805 vect_shift[k] = data_ref;
5808 for (k = 0; k < 3; k++)
5810 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
5811 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5812 vect_shift[(4 - k) % 3],
5813 vect_shift[(3 - k) % 3],
5814 shift2_mask);
5815 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5816 vect[k] = data_ref;
5819 (*result_chain)[3 - (nelt % 3)] = vect[2];
5821 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
5822 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
5823 vect[0], shift3_mask);
5824 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5825 (*result_chain)[nelt % 3] = data_ref;
5827 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
5828 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
5829 vect[1], shift4_mask);
5830 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5831 (*result_chain)[0] = data_ref;
5832 return true;
5834 return false;
5837 /* Function vect_transform_grouped_load.
5839 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
5840 to perform their permutation and ascribe the result vectorized statements to
5841 the scalar statements.
5844 void
5845 vect_transform_grouped_load (gimple *stmt, vec<tree> dr_chain, int size,
5846 gimple_stmt_iterator *gsi)
5848 machine_mode mode;
5849 vec<tree> result_chain = vNULL;
5851 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
5852 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
5853 vectors, that are ready for vector computation. */
5854 result_chain.create (size);
5856 /* If reassociation width for vector type is 2 or greater target machine can
5857 execute 2 or more vector instructions in parallel. Otherwise try to
5858 get chain for loads group using vect_shift_permute_load_chain. */
5859 mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
5860 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
5861 || pow2p_hwi (size)
5862 || !vect_shift_permute_load_chain (dr_chain, size, stmt,
5863 gsi, &result_chain))
5864 vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
5865 vect_record_grouped_load_vectors (stmt, result_chain);
5866 result_chain.release ();
5869 /* RESULT_CHAIN contains the output of a group of grouped loads that were
5870 generated as part of the vectorization of STMT. Assign the statement
5871 for each vector to the associated scalar statement. */
5873 void
5874 vect_record_grouped_load_vectors (gimple *stmt, vec<tree> result_chain)
5876 gimple *first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
5877 gimple *next_stmt, *new_stmt;
5878 unsigned int i, gap_count;
5879 tree tmp_data_ref;
5881 /* Put a permuted data-ref in the VECTORIZED_STMT field.
5882 Since we scan the chain starting from it's first node, their order
5883 corresponds the order of data-refs in RESULT_CHAIN. */
5884 next_stmt = first_stmt;
5885 gap_count = 1;
5886 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
5888 if (!next_stmt)
5889 break;
5891 /* Skip the gaps. Loads created for the gaps will be removed by dead
5892 code elimination pass later. No need to check for the first stmt in
5893 the group, since it always exists.
5894 GROUP_GAP is the number of steps in elements from the previous
5895 access (if there is no gap GROUP_GAP is 1). We skip loads that
5896 correspond to the gaps. */
5897 if (next_stmt != first_stmt
5898 && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
5900 gap_count++;
5901 continue;
5904 while (next_stmt)
5906 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
5907 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
5908 copies, and we put the new vector statement in the first available
5909 RELATED_STMT. */
5910 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
5911 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
5912 else
5914 if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5916 gimple *prev_stmt =
5917 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
5918 gimple *rel_stmt =
5919 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
5920 while (rel_stmt)
5922 prev_stmt = rel_stmt;
5923 rel_stmt =
5924 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
5927 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
5928 new_stmt;
5932 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
5933 gap_count = 1;
5934 /* If NEXT_STMT accesses the same DR as the previous statement,
5935 put the same TMP_DATA_REF as its vectorized statement; otherwise
5936 get the next data-ref from RESULT_CHAIN. */
5937 if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5938 break;
5943 /* Function vect_force_dr_alignment_p.
5945 Returns whether the alignment of a DECL can be forced to be aligned
5946 on ALIGNMENT bit boundary. */
5948 bool
5949 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
5951 if (!VAR_P (decl))
5952 return false;
5954 if (decl_in_symtab_p (decl)
5955 && !symtab_node::get (decl)->can_increase_alignment_p ())
5956 return false;
5958 if (TREE_STATIC (decl))
5959 return (alignment <= MAX_OFILE_ALIGNMENT);
5960 else
5961 return (alignment <= MAX_STACK_ALIGNMENT);
5965 /* Return whether the data reference DR is supported with respect to its
5966 alignment.
5967 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
5968 it is aligned, i.e., check if it is possible to vectorize it with different
5969 alignment. */
5971 enum dr_alignment_support
5972 vect_supportable_dr_alignment (struct data_reference *dr,
5973 bool check_aligned_accesses)
5975 gimple *stmt = DR_STMT (dr);
5976 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5977 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5978 machine_mode mode = TYPE_MODE (vectype);
5979 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5980 struct loop *vect_loop = NULL;
5981 bool nested_in_vect_loop = false;
5983 if (aligned_access_p (dr) && !check_aligned_accesses)
5984 return dr_aligned;
5986 /* For now assume all conditional loads/stores support unaligned
5987 access without any special code. */
5988 if (is_gimple_call (stmt)
5989 && gimple_call_internal_p (stmt)
5990 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
5991 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
5992 return dr_unaligned_supported;
5994 if (loop_vinfo)
5996 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
5997 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
6000 /* Possibly unaligned access. */
6002 /* We can choose between using the implicit realignment scheme (generating
6003 a misaligned_move stmt) and the explicit realignment scheme (generating
6004 aligned loads with a REALIGN_LOAD). There are two variants to the
6005 explicit realignment scheme: optimized, and unoptimized.
6006 We can optimize the realignment only if the step between consecutive
6007 vector loads is equal to the vector size. Since the vector memory
6008 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
6009 is guaranteed that the misalignment amount remains the same throughout the
6010 execution of the vectorized loop. Therefore, we can create the
6011 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
6012 at the loop preheader.
6014 However, in the case of outer-loop vectorization, when vectorizing a
6015 memory access in the inner-loop nested within the LOOP that is now being
6016 vectorized, while it is guaranteed that the misalignment of the
6017 vectorized memory access will remain the same in different outer-loop
6018 iterations, it is *not* guaranteed that is will remain the same throughout
6019 the execution of the inner-loop. This is because the inner-loop advances
6020 with the original scalar step (and not in steps of VS). If the inner-loop
6021 step happens to be a multiple of VS, then the misalignment remains fixed
6022 and we can use the optimized realignment scheme. For example:
6024 for (i=0; i<N; i++)
6025 for (j=0; j<M; j++)
6026 s += a[i+j];
6028 When vectorizing the i-loop in the above example, the step between
6029 consecutive vector loads is 1, and so the misalignment does not remain
6030 fixed across the execution of the inner-loop, and the realignment cannot
6031 be optimized (as illustrated in the following pseudo vectorized loop):
6033 for (i=0; i<N; i+=4)
6034 for (j=0; j<M; j++){
6035 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
6036 // when j is {0,1,2,3,4,5,6,7,...} respectively.
6037 // (assuming that we start from an aligned address).
6040 We therefore have to use the unoptimized realignment scheme:
6042 for (i=0; i<N; i+=4)
6043 for (j=k; j<M; j+=4)
6044 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
6045 // that the misalignment of the initial address is
6046 // 0).
6048 The loop can then be vectorized as follows:
6050 for (k=0; k<4; k++){
6051 rt = get_realignment_token (&vp[k]);
6052 for (i=0; i<N; i+=4){
6053 v1 = vp[i+k];
6054 for (j=k; j<M; j+=4){
6055 v2 = vp[i+j+VS-1];
6056 va = REALIGN_LOAD <v1,v2,rt>;
6057 vs += va;
6058 v1 = v2;
6061 } */
6063 if (DR_IS_READ (dr))
6065 bool is_packed = false;
6066 tree type = (TREE_TYPE (DR_REF (dr)));
6068 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
6069 && (!targetm.vectorize.builtin_mask_for_load
6070 || targetm.vectorize.builtin_mask_for_load ()))
6072 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6074 /* If we are doing SLP then the accesses need not have the
6075 same alignment, instead it depends on the SLP group size. */
6076 if (loop_vinfo
6077 && STMT_SLP_TYPE (stmt_info)
6078 && !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6079 * GROUP_SIZE (vinfo_for_stmt
6080 (GROUP_FIRST_ELEMENT (stmt_info))),
6081 TYPE_VECTOR_SUBPARTS (vectype)))
6083 else if (!loop_vinfo
6084 || (nested_in_vect_loop
6085 && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
6086 GET_MODE_SIZE (TYPE_MODE (vectype)))))
6087 return dr_explicit_realign;
6088 else
6089 return dr_explicit_realign_optimized;
6091 if (!known_alignment_for_access_p (dr))
6092 is_packed = not_size_aligned (DR_REF (dr));
6094 if (targetm.vectorize.support_vector_misalignment
6095 (mode, type, DR_MISALIGNMENT (dr), is_packed))
6096 /* Can't software pipeline the loads, but can at least do them. */
6097 return dr_unaligned_supported;
6099 else
6101 bool is_packed = false;
6102 tree type = (TREE_TYPE (DR_REF (dr)));
6104 if (!known_alignment_for_access_p (dr))
6105 is_packed = not_size_aligned (DR_REF (dr));
6107 if (targetm.vectorize.support_vector_misalignment
6108 (mode, type, DR_MISALIGNMENT (dr), is_packed))
6109 return dr_unaligned_supported;
6112 /* Unsupported. */
6113 return dr_unaligned_unsupported;