2017-09-15 Paul Thomas <pault@gcc.gnu.org>
[official-gcc.git] / gcc / tree-vect-data-refs.c
blobc409dc71a49ec6e55fcf6199304aec3017c7afb3
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2017 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "memmodel.h"
32 #include "tm_p.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "cgraph.h"
36 #include "dumpfile.h"
37 #include "alias.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "tree-eh.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop.h"
47 #include "cfgloop.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
50 #include "expr.h"
51 #include "builtins.h"
52 #include "params.h"
53 #include "tree-cfg.h"
54 #include "tree-hash-traits.h"
56 /* Return true if load- or store-lanes optab OPTAB is implemented for
57 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
59 static bool
60 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
61 tree vectype, unsigned HOST_WIDE_INT count)
63 machine_mode mode;
64 scalar_int_mode array_mode;
65 bool limit_p;
67 mode = TYPE_MODE (vectype);
68 limit_p = !targetm.array_mode_supported_p (mode, count);
69 if (!int_mode_for_size (count * GET_MODE_BITSIZE (mode),
70 limit_p).exists (&array_mode))
72 if (dump_enabled_p ())
73 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
74 "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC "]\n",
75 GET_MODE_NAME (mode), count);
76 return false;
79 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
81 if (dump_enabled_p ())
82 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
83 "cannot use %s<%s><%s>\n", name,
84 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
85 return false;
88 if (dump_enabled_p ())
89 dump_printf_loc (MSG_NOTE, vect_location,
90 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
91 GET_MODE_NAME (mode));
93 return true;
97 /* Return the smallest scalar part of STMT.
98 This is used to determine the vectype of the stmt. We generally set the
99 vectype according to the type of the result (lhs). For stmts whose
100 result-type is different than the type of the arguments (e.g., demotion,
101 promotion), vectype will be reset appropriately (later). Note that we have
102 to visit the smallest datatype in this function, because that determines the
103 VF. If the smallest datatype in the loop is present only as the rhs of a
104 promotion operation - we'd miss it.
105 Such a case, where a variable of this datatype does not appear in the lhs
106 anywhere in the loop, can only occur if it's an invariant: e.g.:
107 'int_x = (int) short_inv', which we'd expect to have been optimized away by
108 invariant motion. However, we cannot rely on invariant motion to always
109 take invariants out of the loop, and so in the case of promotion we also
110 have to check the rhs.
111 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
112 types. */
114 tree
115 vect_get_smallest_scalar_type (gimple *stmt, HOST_WIDE_INT *lhs_size_unit,
116 HOST_WIDE_INT *rhs_size_unit)
118 tree scalar_type = gimple_expr_type (stmt);
119 HOST_WIDE_INT lhs, rhs;
121 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
123 if (is_gimple_assign (stmt)
124 && (gimple_assign_cast_p (stmt)
125 || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
126 || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
127 || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
129 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
131 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
132 if (rhs < lhs)
133 scalar_type = rhs_type;
136 *lhs_size_unit = lhs;
137 *rhs_size_unit = rhs;
138 return scalar_type;
142 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
143 tested at run-time. Return TRUE if DDR was successfully inserted.
144 Return false if versioning is not supported. */
146 static bool
147 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
149 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
151 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
152 return false;
154 if (!runtime_alias_check_p (ddr, loop,
155 optimize_loop_nest_for_speed_p (loop)))
156 return false;
158 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
159 return true;
163 /* A subroutine of vect_analyze_data_ref_dependence. Handle
164 DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
165 distances. These distances are conservatively correct but they don't
166 reflect a guaranteed dependence.
168 Return true if this function does all the work necessary to avoid
169 an alias or false if the caller should use the dependence distances
170 to limit the vectorization factor in the usual way. LOOP_DEPTH is
171 the depth of the loop described by LOOP_VINFO and the other arguments
172 are as for vect_analyze_data_ref_dependence. */
174 static bool
175 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
176 loop_vec_info loop_vinfo,
177 int loop_depth, int *max_vf)
179 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
180 lambda_vector dist_v;
181 unsigned int i;
182 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
184 int dist = dist_v[loop_depth];
185 if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
187 /* If the user asserted safelen >= DIST consecutive iterations
188 can be executed concurrently, assume independence.
190 ??? An alternative would be to add the alias check even
191 in this case, and vectorize the fallback loop with the
192 maximum VF set to safelen. However, if the user has
193 explicitly given a length, it's less likely that that
194 would be a win. */
195 if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
197 if (loop->safelen < *max_vf)
198 *max_vf = loop->safelen;
199 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
200 continue;
203 /* For dependence distances of 2 or more, we have the option
204 of limiting VF or checking for an alias at runtime.
205 Prefer to check at runtime if we can, to avoid limiting
206 the VF unnecessarily when the bases are in fact independent.
208 Note that the alias checks will be removed if the VF ends up
209 being small enough. */
210 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
213 return true;
217 /* Function vect_analyze_data_ref_dependence.
219 Return TRUE if there (might) exist a dependence between a memory-reference
220 DRA and a memory-reference DRB. When versioning for alias may check a
221 dependence at run-time, return FALSE. Adjust *MAX_VF according to
222 the data dependence. */
224 static bool
225 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
226 loop_vec_info loop_vinfo, int *max_vf)
228 unsigned int i;
229 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
230 struct data_reference *dra = DDR_A (ddr);
231 struct data_reference *drb = DDR_B (ddr);
232 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
233 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
234 lambda_vector dist_v;
235 unsigned int loop_depth;
237 /* In loop analysis all data references should be vectorizable. */
238 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
239 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
240 gcc_unreachable ();
242 /* Independent data accesses. */
243 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
244 return false;
246 if (dra == drb
247 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
248 return false;
250 /* We do not have to consider dependences between accesses that belong
251 to the same group. */
252 if (GROUP_FIRST_ELEMENT (stmtinfo_a)
253 && GROUP_FIRST_ELEMENT (stmtinfo_a) == GROUP_FIRST_ELEMENT (stmtinfo_b))
254 return false;
256 /* Even if we have an anti-dependence then, as the vectorized loop covers at
257 least two scalar iterations, there is always also a true dependence.
258 As the vectorizer does not re-order loads and stores we can ignore
259 the anti-dependence if TBAA can disambiguate both DRs similar to the
260 case with known negative distance anti-dependences (positive
261 distance anti-dependences would violate TBAA constraints). */
262 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
263 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
264 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
265 get_alias_set (DR_REF (drb))))
266 return false;
268 /* Unknown data dependence. */
269 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
271 /* If user asserted safelen consecutive iterations can be
272 executed concurrently, assume independence. */
273 if (loop->safelen >= 2)
275 if (loop->safelen < *max_vf)
276 *max_vf = loop->safelen;
277 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
278 return false;
281 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
282 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
284 if (dump_enabled_p ())
286 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
287 "versioning for alias not supported for: "
288 "can't determine dependence between ");
289 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
290 DR_REF (dra));
291 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
292 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
293 DR_REF (drb));
294 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
296 return true;
299 if (dump_enabled_p ())
301 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
302 "versioning for alias required: "
303 "can't determine dependence between ");
304 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
305 DR_REF (dra));
306 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
307 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
308 DR_REF (drb));
309 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
312 /* Add to list of ddrs that need to be tested at run-time. */
313 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
316 /* Known data dependence. */
317 if (DDR_NUM_DIST_VECTS (ddr) == 0)
319 /* If user asserted safelen consecutive iterations can be
320 executed concurrently, assume independence. */
321 if (loop->safelen >= 2)
323 if (loop->safelen < *max_vf)
324 *max_vf = loop->safelen;
325 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
326 return false;
329 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
330 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
332 if (dump_enabled_p ())
334 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
335 "versioning for alias not supported for: "
336 "bad dist vector for ");
337 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
338 DR_REF (dra));
339 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
340 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
341 DR_REF (drb));
342 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
344 return true;
347 if (dump_enabled_p ())
349 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
350 "versioning for alias required: "
351 "bad dist vector for ");
352 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
353 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
354 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
355 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
357 /* Add to list of ddrs that need to be tested at run-time. */
358 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
361 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
363 if (DDR_COULD_BE_INDEPENDENT_P (ddr)
364 && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
365 loop_depth, max_vf))
366 return false;
368 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
370 int dist = dist_v[loop_depth];
372 if (dump_enabled_p ())
373 dump_printf_loc (MSG_NOTE, vect_location,
374 "dependence distance = %d.\n", dist);
376 if (dist == 0)
378 if (dump_enabled_p ())
380 dump_printf_loc (MSG_NOTE, vect_location,
381 "dependence distance == 0 between ");
382 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
383 dump_printf (MSG_NOTE, " and ");
384 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
385 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
388 /* When we perform grouped accesses and perform implicit CSE
389 by detecting equal accesses and doing disambiguation with
390 runtime alias tests like for
391 .. = a[i];
392 .. = a[i+1];
393 a[i] = ..;
394 a[i+1] = ..;
395 *p = ..;
396 .. = a[i];
397 .. = a[i+1];
398 where we will end up loading { a[i], a[i+1] } once, make
399 sure that inserting group loads before the first load and
400 stores after the last store will do the right thing.
401 Similar for groups like
402 a[i] = ...;
403 ... = a[i];
404 a[i+1] = ...;
405 where loads from the group interleave with the store. */
406 if (STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
407 || STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
409 gimple *earlier_stmt;
410 earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
411 if (DR_IS_WRITE
412 (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
414 if (dump_enabled_p ())
415 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
416 "READ_WRITE dependence in interleaving."
417 "\n");
418 return true;
422 continue;
425 if (dist > 0 && DDR_REVERSED_P (ddr))
427 /* If DDR_REVERSED_P the order of the data-refs in DDR was
428 reversed (to make distance vector positive), and the actual
429 distance is negative. */
430 if (dump_enabled_p ())
431 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
432 "dependence distance negative.\n");
433 /* Record a negative dependence distance to later limit the
434 amount of stmt copying / unrolling we can perform.
435 Only need to handle read-after-write dependence. */
436 if (DR_IS_READ (drb)
437 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
438 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
439 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
440 continue;
443 if (abs (dist) >= 2
444 && abs (dist) < *max_vf)
446 /* The dependence distance requires reduction of the maximal
447 vectorization factor. */
448 *max_vf = abs (dist);
449 if (dump_enabled_p ())
450 dump_printf_loc (MSG_NOTE, vect_location,
451 "adjusting maximal vectorization factor to %i\n",
452 *max_vf);
455 if (abs (dist) >= *max_vf)
457 /* Dependence distance does not create dependence, as far as
458 vectorization is concerned, in this case. */
459 if (dump_enabled_p ())
460 dump_printf_loc (MSG_NOTE, vect_location,
461 "dependence distance >= VF.\n");
462 continue;
465 if (dump_enabled_p ())
467 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
468 "not vectorized, possible dependence "
469 "between data-refs ");
470 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
471 dump_printf (MSG_NOTE, " and ");
472 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
473 dump_printf (MSG_NOTE, "\n");
476 return true;
479 return false;
482 /* Function vect_analyze_data_ref_dependences.
484 Examine all the data references in the loop, and make sure there do not
485 exist any data dependences between them. Set *MAX_VF according to
486 the maximum vectorization factor the data dependences allow. */
488 bool
489 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo, int *max_vf)
491 unsigned int i;
492 struct data_dependence_relation *ddr;
494 if (dump_enabled_p ())
495 dump_printf_loc (MSG_NOTE, vect_location,
496 "=== vect_analyze_data_ref_dependences ===\n");
498 LOOP_VINFO_DDRS (loop_vinfo)
499 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
500 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
501 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
502 /* We need read-read dependences to compute STMT_VINFO_SAME_ALIGN_REFS. */
503 if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
504 &LOOP_VINFO_DDRS (loop_vinfo),
505 LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
506 return false;
508 /* For epilogues we either have no aliases or alias versioning
509 was applied to original loop. Therefore we may just get max_vf
510 using VF of original loop. */
511 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
512 *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
513 else
514 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
515 if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
516 return false;
518 return true;
522 /* Function vect_slp_analyze_data_ref_dependence.
524 Return TRUE if there (might) exist a dependence between a memory-reference
525 DRA and a memory-reference DRB. When versioning for alias may check a
526 dependence at run-time, return FALSE. Adjust *MAX_VF according to
527 the data dependence. */
529 static bool
530 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
532 struct data_reference *dra = DDR_A (ddr);
533 struct data_reference *drb = DDR_B (ddr);
535 /* We need to check dependences of statements marked as unvectorizable
536 as well, they still can prohibit vectorization. */
538 /* Independent data accesses. */
539 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
540 return false;
542 if (dra == drb)
543 return false;
545 /* Read-read is OK. */
546 if (DR_IS_READ (dra) && DR_IS_READ (drb))
547 return false;
549 /* If dra and drb are part of the same interleaving chain consider
550 them independent. */
551 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
552 && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
553 == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
554 return false;
556 /* Unknown data dependence. */
557 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
559 if (dump_enabled_p ())
561 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
562 "can't determine dependence between ");
563 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
564 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
565 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
566 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
569 else if (dump_enabled_p ())
571 dump_printf_loc (MSG_NOTE, vect_location,
572 "determined dependence between ");
573 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
574 dump_printf (MSG_NOTE, " and ");
575 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
576 dump_printf (MSG_NOTE, "\n");
579 return true;
583 /* Analyze dependences involved in the transform of SLP NODE. STORES
584 contain the vector of scalar stores of this instance if we are
585 disambiguating the loads. */
587 static bool
588 vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
589 vec<gimple *> stores, gimple *last_store)
591 /* This walks over all stmts involved in the SLP load/store done
592 in NODE verifying we can sink them up to the last stmt in the
593 group. */
594 gimple *last_access = vect_find_last_scalar_stmt_in_slp (node);
595 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
597 gimple *access = SLP_TREE_SCALAR_STMTS (node)[k];
598 if (access == last_access)
599 continue;
600 data_reference *dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (access));
601 for (gimple_stmt_iterator gsi = gsi_for_stmt (access);
602 gsi_stmt (gsi) != last_access; gsi_next (&gsi))
604 gimple *stmt = gsi_stmt (gsi);
605 if (! gimple_vuse (stmt)
606 || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
607 continue;
609 /* If we couldn't record a (single) data reference for this
610 stmt we have to give up. */
611 /* ??? Here and below if dependence analysis fails we can resort
612 to the alias oracle which can handle more kinds of stmts. */
613 data_reference *dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
614 if (!dr_b)
615 return false;
617 bool dependent = false;
618 /* If we run into a store of this same instance (we've just
619 marked those) then delay dependence checking until we run
620 into the last store because this is where it will have
621 been sunk to (and we verify if we can do that as well). */
622 if (gimple_visited_p (stmt))
624 if (stmt != last_store)
625 continue;
626 unsigned i;
627 gimple *store;
628 FOR_EACH_VEC_ELT (stores, i, store)
630 data_reference *store_dr
631 = STMT_VINFO_DATA_REF (vinfo_for_stmt (store));
632 ddr_p ddr = initialize_data_dependence_relation
633 (dr_a, store_dr, vNULL);
634 dependent = vect_slp_analyze_data_ref_dependence (ddr);
635 free_dependence_relation (ddr);
636 if (dependent)
637 break;
640 else
642 ddr_p ddr = initialize_data_dependence_relation (dr_a,
643 dr_b, vNULL);
644 dependent = vect_slp_analyze_data_ref_dependence (ddr);
645 free_dependence_relation (ddr);
647 if (dependent)
648 return false;
651 return true;
655 /* Function vect_analyze_data_ref_dependences.
657 Examine all the data references in the basic-block, and make sure there
658 do not exist any data dependences between them. Set *MAX_VF according to
659 the maximum vectorization factor the data dependences allow. */
661 bool
662 vect_slp_analyze_instance_dependence (slp_instance instance)
664 if (dump_enabled_p ())
665 dump_printf_loc (MSG_NOTE, vect_location,
666 "=== vect_slp_analyze_instance_dependence ===\n");
668 /* The stores of this instance are at the root of the SLP tree. */
669 slp_tree store = SLP_INSTANCE_TREE (instance);
670 if (! STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (store)[0])))
671 store = NULL;
673 /* Verify we can sink stores to the vectorized stmt insert location. */
674 gimple *last_store = NULL;
675 if (store)
677 if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
678 return false;
680 /* Mark stores in this instance and remember the last one. */
681 last_store = vect_find_last_scalar_stmt_in_slp (store);
682 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
683 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], true);
686 bool res = true;
688 /* Verify we can sink loads to the vectorized stmt insert location,
689 special-casing stores of this instance. */
690 slp_tree load;
691 unsigned int i;
692 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
693 if (! vect_slp_analyze_node_dependences (instance, load,
694 store
695 ? SLP_TREE_SCALAR_STMTS (store)
696 : vNULL, last_store))
698 res = false;
699 break;
702 /* Unset the visited flag. */
703 if (store)
704 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
705 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], false);
707 return res;
710 /* Record in VINFO the base alignment guarantee given by DRB. STMT is
711 the statement that contains DRB, which is useful for recording in the
712 dump file. */
714 static void
715 vect_record_base_alignment (vec_info *vinfo, gimple *stmt,
716 innermost_loop_behavior *drb)
718 bool existed;
719 innermost_loop_behavior *&entry
720 = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
721 if (!existed || entry->base_alignment < drb->base_alignment)
723 entry = drb;
724 if (dump_enabled_p ())
726 dump_printf_loc (MSG_NOTE, vect_location,
727 "recording new base alignment for ");
728 dump_generic_expr (MSG_NOTE, TDF_SLIM, drb->base_address);
729 dump_printf (MSG_NOTE, "\n");
730 dump_printf_loc (MSG_NOTE, vect_location,
731 " alignment: %d\n", drb->base_alignment);
732 dump_printf_loc (MSG_NOTE, vect_location,
733 " misalignment: %d\n", drb->base_misalignment);
734 dump_printf_loc (MSG_NOTE, vect_location,
735 " based on: ");
736 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
741 /* If the region we're going to vectorize is reached, all unconditional
742 data references occur at least once. We can therefore pool the base
743 alignment guarantees from each unconditional reference. Do this by
744 going through all the data references in VINFO and checking whether
745 the containing statement makes the reference unconditionally. If so,
746 record the alignment of the base address in VINFO so that it can be
747 used for all other references with the same base. */
749 void
750 vect_record_base_alignments (vec_info *vinfo)
752 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
753 struct loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
754 data_reference *dr;
755 unsigned int i;
756 FOR_EACH_VEC_ELT (vinfo->datarefs, i, dr)
757 if (!DR_IS_CONDITIONAL_IN_STMT (dr))
759 gimple *stmt = DR_STMT (dr);
760 vect_record_base_alignment (vinfo, stmt, &DR_INNERMOST (dr));
762 /* If DR is nested in the loop that is being vectorized, we can also
763 record the alignment of the base wrt the outer loop. */
764 if (loop && nested_in_vect_loop_p (loop, stmt))
766 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
767 vect_record_base_alignment
768 (vinfo, stmt, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
773 /* Function vect_compute_data_ref_alignment
775 Compute the misalignment of the data reference DR.
777 Output:
778 1. If during the misalignment computation it is found that the data reference
779 cannot be vectorized then false is returned.
780 2. DR_MISALIGNMENT (DR) is defined.
782 FOR NOW: No analysis is actually performed. Misalignment is calculated
783 only for trivial cases. TODO. */
785 bool
786 vect_compute_data_ref_alignment (struct data_reference *dr)
788 gimple *stmt = DR_STMT (dr);
789 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
790 vec_base_alignments *base_alignments = &stmt_info->vinfo->base_alignments;
791 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
792 struct loop *loop = NULL;
793 tree ref = DR_REF (dr);
794 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
796 if (dump_enabled_p ())
797 dump_printf_loc (MSG_NOTE, vect_location,
798 "vect_compute_data_ref_alignment:\n");
800 if (loop_vinfo)
801 loop = LOOP_VINFO_LOOP (loop_vinfo);
803 /* Initialize misalignment to unknown. */
804 SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
806 innermost_loop_behavior *drb = vect_dr_behavior (dr);
807 bool step_preserves_misalignment_p;
809 /* No step for BB vectorization. */
810 if (!loop)
812 gcc_assert (integer_zerop (drb->step));
813 step_preserves_misalignment_p = true;
816 /* In case the dataref is in an inner-loop of the loop that is being
817 vectorized (LOOP), we use the base and misalignment information
818 relative to the outer-loop (LOOP). This is ok only if the misalignment
819 stays the same throughout the execution of the inner-loop, which is why
820 we have to check that the stride of the dataref in the inner-loop evenly
821 divides by the vector size. */
822 else if (nested_in_vect_loop_p (loop, stmt))
824 step_preserves_misalignment_p
825 = (DR_STEP_ALIGNMENT (dr)
826 % GET_MODE_SIZE (TYPE_MODE (vectype))) == 0;
828 if (dump_enabled_p ())
830 if (step_preserves_misalignment_p)
831 dump_printf_loc (MSG_NOTE, vect_location,
832 "inner step divides the vector-size.\n");
833 else
834 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
835 "inner step doesn't divide the vector-size.\n");
839 /* Similarly we can only use base and misalignment information relative to
840 an innermost loop if the misalignment stays the same throughout the
841 execution of the loop. As above, this is the case if the stride of
842 the dataref evenly divides by the vector size. */
843 else
845 unsigned vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
846 step_preserves_misalignment_p
847 = ((DR_STEP_ALIGNMENT (dr) * vf)
848 % GET_MODE_SIZE (TYPE_MODE (vectype))) == 0;
850 if (!step_preserves_misalignment_p && dump_enabled_p ())
851 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
852 "step doesn't divide the vector-size.\n");
855 unsigned int base_alignment = drb->base_alignment;
856 unsigned int base_misalignment = drb->base_misalignment;
857 unsigned HOST_WIDE_INT vector_alignment = TYPE_ALIGN_UNIT (vectype);
859 /* Calculate the maximum of the pooled base address alignment and the
860 alignment that we can compute for DR itself. */
861 innermost_loop_behavior **entry = base_alignments->get (drb->base_address);
862 if (entry && base_alignment < (*entry)->base_alignment)
864 base_alignment = (*entry)->base_alignment;
865 base_misalignment = (*entry)->base_misalignment;
868 if (drb->offset_alignment < vector_alignment
869 || !step_preserves_misalignment_p
870 /* We need to know whether the step wrt the vectorized loop is
871 negative when computing the starting misalignment below. */
872 || TREE_CODE (drb->step) != INTEGER_CST)
874 if (dump_enabled_p ())
876 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
877 "Unknown alignment for access: ");
878 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
879 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
881 return true;
884 if (base_alignment < vector_alignment)
886 tree base = drb->base_address;
887 if (TREE_CODE (base) == ADDR_EXPR)
888 base = TREE_OPERAND (base, 0);
889 if (!vect_can_force_dr_alignment_p (base,
890 vector_alignment * BITS_PER_UNIT))
892 if (dump_enabled_p ())
894 dump_printf_loc (MSG_NOTE, vect_location,
895 "can't force alignment of ref: ");
896 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
897 dump_printf (MSG_NOTE, "\n");
899 return true;
902 if (DECL_USER_ALIGN (base))
904 if (dump_enabled_p ())
906 dump_printf_loc (MSG_NOTE, vect_location,
907 "not forcing alignment of user-aligned "
908 "variable: ");
909 dump_generic_expr (MSG_NOTE, TDF_SLIM, base);
910 dump_printf (MSG_NOTE, "\n");
912 return true;
915 /* Force the alignment of the decl.
916 NOTE: This is the only change to the code we make during
917 the analysis phase, before deciding to vectorize the loop. */
918 if (dump_enabled_p ())
920 dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
921 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
922 dump_printf (MSG_NOTE, "\n");
925 DR_VECT_AUX (dr)->base_decl = base;
926 DR_VECT_AUX (dr)->base_misaligned = true;
927 base_misalignment = 0;
929 unsigned int misalignment = (base_misalignment
930 + TREE_INT_CST_LOW (drb->init));
932 /* If this is a backward running DR then first access in the larger
933 vectype actually is N-1 elements before the address in the DR.
934 Adjust misalign accordingly. */
935 if (tree_int_cst_sgn (drb->step) < 0)
936 /* PLUS because STEP is negative. */
937 misalignment += ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
938 * TREE_INT_CST_LOW (drb->step));
940 SET_DR_MISALIGNMENT (dr, misalignment & (vector_alignment - 1));
942 if (dump_enabled_p ())
944 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
945 "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
946 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
947 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
950 return true;
954 /* Function vect_update_misalignment_for_peel.
955 Sets DR's misalignment
956 - to 0 if it has the same alignment as DR_PEEL,
957 - to the misalignment computed using NPEEL if DR's salignment is known,
958 - to -1 (unknown) otherwise.
960 DR - the data reference whose misalignment is to be adjusted.
961 DR_PEEL - the data reference whose misalignment is being made
962 zero in the vector loop by the peel.
963 NPEEL - the number of iterations in the peel loop if the misalignment
964 of DR_PEEL is known at compile time. */
966 static void
967 vect_update_misalignment_for_peel (struct data_reference *dr,
968 struct data_reference *dr_peel, int npeel)
970 unsigned int i;
971 vec<dr_p> same_aligned_drs;
972 struct data_reference *current_dr;
973 int dr_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
974 int dr_peel_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr_peel))));
975 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
976 stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));
978 /* For interleaved data accesses the step in the loop must be multiplied by
979 the size of the interleaving group. */
980 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
981 dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
982 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
983 dr_peel_size *= GROUP_SIZE (peel_stmt_info);
985 /* It can be assumed that the data refs with the same alignment as dr_peel
986 are aligned in the vector loop. */
987 same_aligned_drs
988 = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
989 FOR_EACH_VEC_ELT (same_aligned_drs, i, current_dr)
991 if (current_dr != dr)
992 continue;
993 gcc_assert (!known_alignment_for_access_p (dr)
994 || !known_alignment_for_access_p (dr_peel)
995 || (DR_MISALIGNMENT (dr) / dr_size
996 == DR_MISALIGNMENT (dr_peel) / dr_peel_size));
997 SET_DR_MISALIGNMENT (dr, 0);
998 return;
1001 if (known_alignment_for_access_p (dr)
1002 && known_alignment_for_access_p (dr_peel))
1004 bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
1005 int misal = DR_MISALIGNMENT (dr);
1006 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1007 misal += negative ? -npeel * dr_size : npeel * dr_size;
1008 misal &= (TYPE_ALIGN (vectype) / BITS_PER_UNIT) - 1;
1009 SET_DR_MISALIGNMENT (dr, misal);
1010 return;
1013 if (dump_enabled_p ())
1014 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1015 "to unknown (-1).\n");
1016 SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
1020 /* Function verify_data_ref_alignment
1022 Return TRUE if DR can be handled with respect to alignment. */
1024 static bool
1025 verify_data_ref_alignment (data_reference_p dr)
1027 enum dr_alignment_support supportable_dr_alignment
1028 = vect_supportable_dr_alignment (dr, false);
1029 if (!supportable_dr_alignment)
1031 if (dump_enabled_p ())
1033 if (DR_IS_READ (dr))
1034 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1035 "not vectorized: unsupported unaligned load.");
1036 else
1037 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1038 "not vectorized: unsupported unaligned "
1039 "store.");
1041 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
1042 DR_REF (dr));
1043 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1045 return false;
1048 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
1049 dump_printf_loc (MSG_NOTE, vect_location,
1050 "Vectorizing an unaligned access.\n");
1052 return true;
1055 /* Function vect_verify_datarefs_alignment
1057 Return TRUE if all data references in the loop can be
1058 handled with respect to alignment. */
1060 bool
1061 vect_verify_datarefs_alignment (loop_vec_info vinfo)
1063 vec<data_reference_p> datarefs = vinfo->datarefs;
1064 struct data_reference *dr;
1065 unsigned int i;
1067 FOR_EACH_VEC_ELT (datarefs, i, dr)
1069 gimple *stmt = DR_STMT (dr);
1070 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1072 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1073 continue;
1075 /* For interleaving, only the alignment of the first access matters. */
1076 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1077 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1078 continue;
1080 /* Strided accesses perform only component accesses, alignment is
1081 irrelevant for them. */
1082 if (STMT_VINFO_STRIDED_P (stmt_info)
1083 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1084 continue;
1086 if (! verify_data_ref_alignment (dr))
1087 return false;
1090 return true;
1093 /* Given an memory reference EXP return whether its alignment is less
1094 than its size. */
1096 static bool
1097 not_size_aligned (tree exp)
1099 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1100 return true;
1102 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1103 > get_object_alignment (exp));
1106 /* Function vector_alignment_reachable_p
1108 Return true if vector alignment for DR is reachable by peeling
1109 a few loop iterations. Return false otherwise. */
1111 static bool
1112 vector_alignment_reachable_p (struct data_reference *dr)
1114 gimple *stmt = DR_STMT (dr);
1115 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1116 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1118 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1120 /* For interleaved access we peel only if number of iterations in
1121 the prolog loop ({VF - misalignment}), is a multiple of the
1122 number of the interleaved accesses. */
1123 int elem_size, mis_in_elements;
1124 int nelements = TYPE_VECTOR_SUBPARTS (vectype);
1126 /* FORNOW: handle only known alignment. */
1127 if (!known_alignment_for_access_p (dr))
1128 return false;
1130 elem_size = GET_MODE_SIZE (TYPE_MODE (vectype)) / nelements;
1131 mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
1133 if ((nelements - mis_in_elements) % GROUP_SIZE (stmt_info))
1134 return false;
1137 /* If misalignment is known at the compile time then allow peeling
1138 only if natural alignment is reachable through peeling. */
1139 if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
1141 HOST_WIDE_INT elmsize =
1142 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1143 if (dump_enabled_p ())
1145 dump_printf_loc (MSG_NOTE, vect_location,
1146 "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1147 dump_printf (MSG_NOTE,
1148 ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
1150 if (DR_MISALIGNMENT (dr) % elmsize)
1152 if (dump_enabled_p ())
1153 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1154 "data size does not divide the misalignment.\n");
1155 return false;
1159 if (!known_alignment_for_access_p (dr))
1161 tree type = TREE_TYPE (DR_REF (dr));
1162 bool is_packed = not_size_aligned (DR_REF (dr));
1163 if (dump_enabled_p ())
1164 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1165 "Unknown misalignment, %snaturally aligned\n",
1166 is_packed ? "not " : "");
1167 return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1170 return true;
1174 /* Calculate the cost of the memory access represented by DR. */
1176 static void
1177 vect_get_data_access_cost (struct data_reference *dr,
1178 unsigned int *inside_cost,
1179 unsigned int *outside_cost,
1180 stmt_vector_for_cost *body_cost_vec)
1182 gimple *stmt = DR_STMT (dr);
1183 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1184 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1185 int ncopies;
1187 if (PURE_SLP_STMT (stmt_info))
1188 ncopies = 1;
1189 else
1190 ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1192 if (DR_IS_READ (dr))
1193 vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1194 NULL, body_cost_vec, false);
1195 else
1196 vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
1198 if (dump_enabled_p ())
1199 dump_printf_loc (MSG_NOTE, vect_location,
1200 "vect_get_data_access_cost: inside_cost = %d, "
1201 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1205 typedef struct _vect_peel_info
1207 struct data_reference *dr;
1208 int npeel;
1209 unsigned int count;
1210 } *vect_peel_info;
1212 typedef struct _vect_peel_extended_info
1214 struct _vect_peel_info peel_info;
1215 unsigned int inside_cost;
1216 unsigned int outside_cost;
1217 } *vect_peel_extended_info;
1220 /* Peeling hashtable helpers. */
1222 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1224 static inline hashval_t hash (const _vect_peel_info *);
1225 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1228 inline hashval_t
1229 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1231 return (hashval_t) peel_info->npeel;
1234 inline bool
1235 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1237 return (a->npeel == b->npeel);
1241 /* Insert DR into peeling hash table with NPEEL as key. */
1243 static void
1244 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1245 loop_vec_info loop_vinfo, struct data_reference *dr,
1246 int npeel)
1248 struct _vect_peel_info elem, *slot;
1249 _vect_peel_info **new_slot;
1250 bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1252 elem.npeel = npeel;
1253 slot = peeling_htab->find (&elem);
1254 if (slot)
1255 slot->count++;
1256 else
1258 slot = XNEW (struct _vect_peel_info);
1259 slot->npeel = npeel;
1260 slot->dr = dr;
1261 slot->count = 1;
1262 new_slot = peeling_htab->find_slot (slot, INSERT);
1263 *new_slot = slot;
1266 if (!supportable_dr_alignment
1267 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1268 slot->count += VECT_MAX_COST;
1272 /* Traverse peeling hash table to find peeling option that aligns maximum
1273 number of data accesses. */
1276 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1277 _vect_peel_extended_info *max)
1279 vect_peel_info elem = *slot;
1281 if (elem->count > max->peel_info.count
1282 || (elem->count == max->peel_info.count
1283 && max->peel_info.npeel > elem->npeel))
1285 max->peel_info.npeel = elem->npeel;
1286 max->peel_info.count = elem->count;
1287 max->peel_info.dr = elem->dr;
1290 return 1;
1293 /* Get the costs of peeling NPEEL iterations checking data access costs
1294 for all data refs. If UNKNOWN_MISALIGNMENT is true, we assume DR0's
1295 misalignment will be zero after peeling. */
1297 static void
1298 vect_get_peeling_costs_all_drs (vec<data_reference_p> datarefs,
1299 struct data_reference *dr0,
1300 unsigned int *inside_cost,
1301 unsigned int *outside_cost,
1302 stmt_vector_for_cost *body_cost_vec,
1303 unsigned int npeel,
1304 bool unknown_misalignment)
1306 unsigned i;
1307 data_reference *dr;
1309 FOR_EACH_VEC_ELT (datarefs, i, dr)
1311 gimple *stmt = DR_STMT (dr);
1312 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1313 /* For interleaving, only the alignment of the first access
1314 matters. */
1315 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1316 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1317 continue;
1319 /* Strided accesses perform only component accesses, alignment is
1320 irrelevant for them. */
1321 if (STMT_VINFO_STRIDED_P (stmt_info)
1322 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1323 continue;
1325 int save_misalignment;
1326 save_misalignment = DR_MISALIGNMENT (dr);
1327 if (npeel == 0)
1329 else if (unknown_misalignment && dr == dr0)
1330 SET_DR_MISALIGNMENT (dr, 0);
1331 else
1332 vect_update_misalignment_for_peel (dr, dr0, npeel);
1333 vect_get_data_access_cost (dr, inside_cost, outside_cost,
1334 body_cost_vec);
1335 SET_DR_MISALIGNMENT (dr, save_misalignment);
1339 /* Traverse peeling hash table and calculate cost for each peeling option.
1340 Find the one with the lowest cost. */
1343 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1344 _vect_peel_extended_info *min)
1346 vect_peel_info elem = *slot;
1347 int dummy;
1348 unsigned int inside_cost = 0, outside_cost = 0;
1349 gimple *stmt = DR_STMT (elem->dr);
1350 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1351 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1352 stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1353 epilogue_cost_vec;
1355 prologue_cost_vec.create (2);
1356 body_cost_vec.create (2);
1357 epilogue_cost_vec.create (2);
1359 vect_get_peeling_costs_all_drs (LOOP_VINFO_DATAREFS (loop_vinfo),
1360 elem->dr, &inside_cost, &outside_cost,
1361 &body_cost_vec, elem->npeel, false);
1363 body_cost_vec.release ();
1365 outside_cost += vect_get_known_peeling_cost
1366 (loop_vinfo, elem->npeel, &dummy,
1367 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1368 &prologue_cost_vec, &epilogue_cost_vec);
1370 /* Prologue and epilogue costs are added to the target model later.
1371 These costs depend only on the scalar iteration cost, the
1372 number of peeling iterations finally chosen, and the number of
1373 misaligned statements. So discard the information found here. */
1374 prologue_cost_vec.release ();
1375 epilogue_cost_vec.release ();
1377 if (inside_cost < min->inside_cost
1378 || (inside_cost == min->inside_cost
1379 && outside_cost < min->outside_cost))
1381 min->inside_cost = inside_cost;
1382 min->outside_cost = outside_cost;
1383 min->peel_info.dr = elem->dr;
1384 min->peel_info.npeel = elem->npeel;
1385 min->peel_info.count = elem->count;
1388 return 1;
1392 /* Choose best peeling option by traversing peeling hash table and either
1393 choosing an option with the lowest cost (if cost model is enabled) or the
1394 option that aligns as many accesses as possible. */
1396 static struct _vect_peel_extended_info
1397 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1398 loop_vec_info loop_vinfo)
1400 struct _vect_peel_extended_info res;
1402 res.peel_info.dr = NULL;
1404 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1406 res.inside_cost = INT_MAX;
1407 res.outside_cost = INT_MAX;
1408 peeling_htab->traverse <_vect_peel_extended_info *,
1409 vect_peeling_hash_get_lowest_cost> (&res);
1411 else
1413 res.peel_info.count = 0;
1414 peeling_htab->traverse <_vect_peel_extended_info *,
1415 vect_peeling_hash_get_most_frequent> (&res);
1416 res.inside_cost = 0;
1417 res.outside_cost = 0;
1420 return res;
1423 /* Return true if the new peeling NPEEL is supported. */
1425 static bool
1426 vect_peeling_supportable (loop_vec_info loop_vinfo, struct data_reference *dr0,
1427 unsigned npeel)
1429 unsigned i;
1430 struct data_reference *dr = NULL;
1431 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1432 gimple *stmt;
1433 stmt_vec_info stmt_info;
1434 enum dr_alignment_support supportable_dr_alignment;
1436 /* Ensure that all data refs can be vectorized after the peel. */
1437 FOR_EACH_VEC_ELT (datarefs, i, dr)
1439 int save_misalignment;
1441 if (dr == dr0)
1442 continue;
1444 stmt = DR_STMT (dr);
1445 stmt_info = vinfo_for_stmt (stmt);
1446 /* For interleaving, only the alignment of the first access
1447 matters. */
1448 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1449 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1450 continue;
1452 /* Strided accesses perform only component accesses, alignment is
1453 irrelevant for them. */
1454 if (STMT_VINFO_STRIDED_P (stmt_info)
1455 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1456 continue;
1458 save_misalignment = DR_MISALIGNMENT (dr);
1459 vect_update_misalignment_for_peel (dr, dr0, npeel);
1460 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1461 SET_DR_MISALIGNMENT (dr, save_misalignment);
1463 if (!supportable_dr_alignment)
1464 return false;
1467 return true;
1470 /* Function vect_enhance_data_refs_alignment
1472 This pass will use loop versioning and loop peeling in order to enhance
1473 the alignment of data references in the loop.
1475 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1476 original loop is to be vectorized. Any other loops that are created by
1477 the transformations performed in this pass - are not supposed to be
1478 vectorized. This restriction will be relaxed.
1480 This pass will require a cost model to guide it whether to apply peeling
1481 or versioning or a combination of the two. For example, the scheme that
1482 intel uses when given a loop with several memory accesses, is as follows:
1483 choose one memory access ('p') which alignment you want to force by doing
1484 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1485 other accesses are not necessarily aligned, or (2) use loop versioning to
1486 generate one loop in which all accesses are aligned, and another loop in
1487 which only 'p' is necessarily aligned.
1489 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1490 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Ximmin Tian, International
1491 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1493 Devising a cost model is the most critical aspect of this work. It will
1494 guide us on which access to peel for, whether to use loop versioning, how
1495 many versions to create, etc. The cost model will probably consist of
1496 generic considerations as well as target specific considerations (on
1497 powerpc for example, misaligned stores are more painful than misaligned
1498 loads).
1500 Here are the general steps involved in alignment enhancements:
1502 -- original loop, before alignment analysis:
1503 for (i=0; i<N; i++){
1504 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1505 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1508 -- After vect_compute_data_refs_alignment:
1509 for (i=0; i<N; i++){
1510 x = q[i]; # DR_MISALIGNMENT(q) = 3
1511 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1514 -- Possibility 1: we do loop versioning:
1515 if (p is aligned) {
1516 for (i=0; i<N; i++){ # loop 1A
1517 x = q[i]; # DR_MISALIGNMENT(q) = 3
1518 p[i] = y; # DR_MISALIGNMENT(p) = 0
1521 else {
1522 for (i=0; i<N; i++){ # loop 1B
1523 x = q[i]; # DR_MISALIGNMENT(q) = 3
1524 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1528 -- Possibility 2: we do loop peeling:
1529 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1530 x = q[i];
1531 p[i] = y;
1533 for (i = 3; i < N; i++){ # loop 2A
1534 x = q[i]; # DR_MISALIGNMENT(q) = 0
1535 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1538 -- Possibility 3: combination of loop peeling and versioning:
1539 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1540 x = q[i];
1541 p[i] = y;
1543 if (p is aligned) {
1544 for (i = 3; i<N; i++){ # loop 3A
1545 x = q[i]; # DR_MISALIGNMENT(q) = 0
1546 p[i] = y; # DR_MISALIGNMENT(p) = 0
1549 else {
1550 for (i = 3; i<N; i++){ # loop 3B
1551 x = q[i]; # DR_MISALIGNMENT(q) = 0
1552 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1556 These loops are later passed to loop_transform to be vectorized. The
1557 vectorizer will use the alignment information to guide the transformation
1558 (whether to generate regular loads/stores, or with special handling for
1559 misalignment). */
1561 bool
1562 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1564 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1565 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1566 enum dr_alignment_support supportable_dr_alignment;
1567 struct data_reference *dr0 = NULL, *first_store = NULL;
1568 struct data_reference *dr;
1569 unsigned int i, j;
1570 bool do_peeling = false;
1571 bool do_versioning = false;
1572 bool stat;
1573 gimple *stmt;
1574 stmt_vec_info stmt_info;
1575 unsigned int npeel = 0;
1576 bool one_misalignment_known = false;
1577 bool one_misalignment_unknown = false;
1578 bool one_dr_unsupportable = false;
1579 struct data_reference *unsupportable_dr = NULL;
1580 unsigned int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1581 unsigned possible_npeel_number = 1;
1582 tree vectype;
1583 unsigned int nelements, mis, same_align_drs_max = 0;
1584 hash_table<peel_info_hasher> peeling_htab (1);
1586 if (dump_enabled_p ())
1587 dump_printf_loc (MSG_NOTE, vect_location,
1588 "=== vect_enhance_data_refs_alignment ===\n");
1590 /* Reset data so we can safely be called multiple times. */
1591 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1592 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1594 /* While cost model enhancements are expected in the future, the high level
1595 view of the code at this time is as follows:
1597 A) If there is a misaligned access then see if peeling to align
1598 this access can make all data references satisfy
1599 vect_supportable_dr_alignment. If so, update data structures
1600 as needed and return true.
1602 B) If peeling wasn't possible and there is a data reference with an
1603 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1604 then see if loop versioning checks can be used to make all data
1605 references satisfy vect_supportable_dr_alignment. If so, update
1606 data structures as needed and return true.
1608 C) If neither peeling nor versioning were successful then return false if
1609 any data reference does not satisfy vect_supportable_dr_alignment.
1611 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1613 Note, Possibility 3 above (which is peeling and versioning together) is not
1614 being done at this time. */
1616 /* (1) Peeling to force alignment. */
1618 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1619 Considerations:
1620 + How many accesses will become aligned due to the peeling
1621 - How many accesses will become unaligned due to the peeling,
1622 and the cost of misaligned accesses.
1623 - The cost of peeling (the extra runtime checks, the increase
1624 in code size). */
1626 FOR_EACH_VEC_ELT (datarefs, i, dr)
1628 stmt = DR_STMT (dr);
1629 stmt_info = vinfo_for_stmt (stmt);
1631 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1632 continue;
1634 /* For interleaving, only the alignment of the first access
1635 matters. */
1636 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1637 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1638 continue;
1640 /* For invariant accesses there is nothing to enhance. */
1641 if (integer_zerop (DR_STEP (dr)))
1642 continue;
1644 /* Strided accesses perform only component accesses, alignment is
1645 irrelevant for them. */
1646 if (STMT_VINFO_STRIDED_P (stmt_info)
1647 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1648 continue;
1650 supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1651 do_peeling = vector_alignment_reachable_p (dr);
1652 if (do_peeling)
1654 if (known_alignment_for_access_p (dr))
1656 unsigned int npeel_tmp = 0;
1657 bool negative = tree_int_cst_compare (DR_STEP (dr),
1658 size_zero_node) < 0;
1660 vectype = STMT_VINFO_VECTYPE (stmt_info);
1661 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1662 mis = DR_MISALIGNMENT (dr) / GET_MODE_SIZE (TYPE_MODE (
1663 TREE_TYPE (DR_REF (dr))));
1664 if (DR_MISALIGNMENT (dr) != 0)
1665 npeel_tmp = (negative ? (mis - nelements)
1666 : (nelements - mis)) & (nelements - 1);
1668 /* For multiple types, it is possible that the bigger type access
1669 will have more than one peeling option. E.g., a loop with two
1670 types: one of size (vector size / 4), and the other one of
1671 size (vector size / 8). Vectorization factor will 8. If both
1672 accesses are misaligned by 3, the first one needs one scalar
1673 iteration to be aligned, and the second one needs 5. But the
1674 first one will be aligned also by peeling 5 scalar
1675 iterations, and in that case both accesses will be aligned.
1676 Hence, except for the immediate peeling amount, we also want
1677 to try to add full vector size, while we don't exceed
1678 vectorization factor.
1679 We do this automatically for cost model, since we calculate
1680 cost for every peeling option. */
1681 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1683 if (STMT_SLP_TYPE (stmt_info))
1684 possible_npeel_number
1685 = (vf * GROUP_SIZE (stmt_info)) / nelements;
1686 else
1687 possible_npeel_number = vf / nelements;
1689 /* NPEEL_TMP is 0 when there is no misalignment, but also
1690 allow peeling NELEMENTS. */
1691 if (DR_MISALIGNMENT (dr) == 0)
1692 possible_npeel_number++;
1695 /* Save info about DR in the hash table. Also include peeling
1696 amounts according to the explanation above. */
1697 for (j = 0; j < possible_npeel_number; j++)
1699 vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1700 dr, npeel_tmp);
1701 npeel_tmp += nelements;
1704 one_misalignment_known = true;
1706 else
1708 /* If we don't know any misalignment values, we prefer
1709 peeling for data-ref that has the maximum number of data-refs
1710 with the same alignment, unless the target prefers to align
1711 stores over load. */
1712 unsigned same_align_drs
1713 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1714 if (!dr0
1715 || same_align_drs_max < same_align_drs)
1717 same_align_drs_max = same_align_drs;
1718 dr0 = dr;
1720 /* For data-refs with the same number of related
1721 accesses prefer the one where the misalign
1722 computation will be invariant in the outermost loop. */
1723 else if (same_align_drs_max == same_align_drs)
1725 struct loop *ivloop0, *ivloop;
1726 ivloop0 = outermost_invariant_loop_for_expr
1727 (loop, DR_BASE_ADDRESS (dr0));
1728 ivloop = outermost_invariant_loop_for_expr
1729 (loop, DR_BASE_ADDRESS (dr));
1730 if ((ivloop && !ivloop0)
1731 || (ivloop && ivloop0
1732 && flow_loop_nested_p (ivloop, ivloop0)))
1733 dr0 = dr;
1736 one_misalignment_unknown = true;
1738 /* Check for data refs with unsupportable alignment that
1739 can be peeled. */
1740 if (!supportable_dr_alignment)
1742 one_dr_unsupportable = true;
1743 unsupportable_dr = dr;
1746 if (!first_store && DR_IS_WRITE (dr))
1747 first_store = dr;
1750 else
1752 if (!aligned_access_p (dr))
1754 if (dump_enabled_p ())
1755 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1756 "vector alignment may not be reachable\n");
1757 break;
1762 /* Check if we can possibly peel the loop. */
1763 if (!vect_can_advance_ivs_p (loop_vinfo)
1764 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1765 || loop->inner)
1766 do_peeling = false;
1768 struct _vect_peel_extended_info peel_for_known_alignment;
1769 struct _vect_peel_extended_info peel_for_unknown_alignment;
1770 struct _vect_peel_extended_info best_peel;
1772 peel_for_unknown_alignment.inside_cost = INT_MAX;
1773 peel_for_unknown_alignment.outside_cost = INT_MAX;
1774 peel_for_unknown_alignment.peel_info.count = 0;
1776 if (do_peeling
1777 && one_misalignment_unknown)
1779 /* Check if the target requires to prefer stores over loads, i.e., if
1780 misaligned stores are more expensive than misaligned loads (taking
1781 drs with same alignment into account). */
1782 unsigned int load_inside_cost = 0;
1783 unsigned int load_outside_cost = 0;
1784 unsigned int store_inside_cost = 0;
1785 unsigned int store_outside_cost = 0;
1787 stmt_vector_for_cost dummy;
1788 dummy.create (2);
1789 vect_get_peeling_costs_all_drs (datarefs, dr0,
1790 &load_inside_cost,
1791 &load_outside_cost,
1792 &dummy, vf / 2, true);
1793 dummy.release ();
1795 if (first_store)
1797 dummy.create (2);
1798 vect_get_peeling_costs_all_drs (datarefs, first_store,
1799 &store_inside_cost,
1800 &store_outside_cost,
1801 &dummy, vf / 2, true);
1802 dummy.release ();
1804 else
1806 store_inside_cost = INT_MAX;
1807 store_outside_cost = INT_MAX;
1810 if (load_inside_cost > store_inside_cost
1811 || (load_inside_cost == store_inside_cost
1812 && load_outside_cost > store_outside_cost))
1814 dr0 = first_store;
1815 peel_for_unknown_alignment.inside_cost = store_inside_cost;
1816 peel_for_unknown_alignment.outside_cost = store_outside_cost;
1818 else
1820 peel_for_unknown_alignment.inside_cost = load_inside_cost;
1821 peel_for_unknown_alignment.outside_cost = load_outside_cost;
1824 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1825 prologue_cost_vec.create (2);
1826 epilogue_cost_vec.create (2);
1828 int dummy2;
1829 peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
1830 (loop_vinfo, vf / 2, &dummy2,
1831 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1832 &prologue_cost_vec, &epilogue_cost_vec);
1834 prologue_cost_vec.release ();
1835 epilogue_cost_vec.release ();
1837 peel_for_unknown_alignment.peel_info.count = 1
1838 + STMT_VINFO_SAME_ALIGN_REFS
1839 (vinfo_for_stmt (DR_STMT (dr0))).length ();
1842 peel_for_unknown_alignment.peel_info.npeel = 0;
1843 peel_for_unknown_alignment.peel_info.dr = dr0;
1845 best_peel = peel_for_unknown_alignment;
1847 peel_for_known_alignment.inside_cost = INT_MAX;
1848 peel_for_known_alignment.outside_cost = INT_MAX;
1849 peel_for_known_alignment.peel_info.count = 0;
1850 peel_for_known_alignment.peel_info.dr = NULL;
1852 if (do_peeling && one_misalignment_known)
1854 /* Peeling is possible, but there is no data access that is not supported
1855 unless aligned. So we try to choose the best possible peeling from
1856 the hash table. */
1857 peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
1858 (&peeling_htab, loop_vinfo);
1861 /* Compare costs of peeling for known and unknown alignment. */
1862 if (peel_for_known_alignment.peel_info.dr != NULL
1863 && peel_for_unknown_alignment.inside_cost
1864 >= peel_for_known_alignment.inside_cost)
1866 best_peel = peel_for_known_alignment;
1868 /* If the best peeling for known alignment has NPEEL == 0, perform no
1869 peeling at all except if there is an unsupportable dr that we can
1870 align. */
1871 if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
1872 do_peeling = false;
1875 /* If there is an unsupportable data ref, prefer this over all choices so far
1876 since we'd have to discard a chosen peeling except when it accidentally
1877 aligned the unsupportable data ref. */
1878 if (one_dr_unsupportable)
1879 dr0 = unsupportable_dr;
1880 else if (do_peeling)
1882 /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
1883 TODO: Use nopeel_outside_cost or get rid of it? */
1884 unsigned nopeel_inside_cost = 0;
1885 unsigned nopeel_outside_cost = 0;
1887 stmt_vector_for_cost dummy;
1888 dummy.create (2);
1889 vect_get_peeling_costs_all_drs (datarefs, NULL, &nopeel_inside_cost,
1890 &nopeel_outside_cost, &dummy, 0, false);
1891 dummy.release ();
1893 /* Add epilogue costs. As we do not peel for alignment here, no prologue
1894 costs will be recorded. */
1895 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1896 prologue_cost_vec.create (2);
1897 epilogue_cost_vec.create (2);
1899 int dummy2;
1900 nopeel_outside_cost += vect_get_known_peeling_cost
1901 (loop_vinfo, 0, &dummy2,
1902 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1903 &prologue_cost_vec, &epilogue_cost_vec);
1905 prologue_cost_vec.release ();
1906 epilogue_cost_vec.release ();
1908 npeel = best_peel.peel_info.npeel;
1909 dr0 = best_peel.peel_info.dr;
1911 /* If no peeling is not more expensive than the best peeling we
1912 have so far, don't perform any peeling. */
1913 if (nopeel_inside_cost <= best_peel.inside_cost)
1914 do_peeling = false;
1917 if (do_peeling)
1919 stmt = DR_STMT (dr0);
1920 stmt_info = vinfo_for_stmt (stmt);
1921 vectype = STMT_VINFO_VECTYPE (stmt_info);
1922 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1924 if (known_alignment_for_access_p (dr0))
1926 bool negative = tree_int_cst_compare (DR_STEP (dr0),
1927 size_zero_node) < 0;
1928 if (!npeel)
1930 /* Since it's known at compile time, compute the number of
1931 iterations in the peeled loop (the peeling factor) for use in
1932 updating DR_MISALIGNMENT values. The peeling factor is the
1933 vectorization factor minus the misalignment as an element
1934 count. */
1935 mis = DR_MISALIGNMENT (dr0);
1936 mis /= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr0))));
1937 npeel = ((negative ? mis - nelements : nelements - mis)
1938 & (nelements - 1));
1941 /* For interleaved data access every iteration accesses all the
1942 members of the group, therefore we divide the number of iterations
1943 by the group size. */
1944 stmt_info = vinfo_for_stmt (DR_STMT (dr0));
1945 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1946 npeel /= GROUP_SIZE (stmt_info);
1948 if (dump_enabled_p ())
1949 dump_printf_loc (MSG_NOTE, vect_location,
1950 "Try peeling by %d\n", npeel);
1953 /* Ensure that all datarefs can be vectorized after the peel. */
1954 if (!vect_peeling_supportable (loop_vinfo, dr0, npeel))
1955 do_peeling = false;
1957 /* Check if all datarefs are supportable and log. */
1958 if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
1960 stat = vect_verify_datarefs_alignment (loop_vinfo);
1961 if (!stat)
1962 do_peeling = false;
1963 else
1964 return stat;
1967 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
1968 if (do_peeling)
1970 unsigned max_allowed_peel
1971 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
1972 if (max_allowed_peel != (unsigned)-1)
1974 unsigned max_peel = npeel;
1975 if (max_peel == 0)
1977 gimple *dr_stmt = DR_STMT (dr0);
1978 stmt_vec_info vinfo = vinfo_for_stmt (dr_stmt);
1979 tree vtype = STMT_VINFO_VECTYPE (vinfo);
1980 max_peel = TYPE_VECTOR_SUBPARTS (vtype) - 1;
1982 if (max_peel > max_allowed_peel)
1984 do_peeling = false;
1985 if (dump_enabled_p ())
1986 dump_printf_loc (MSG_NOTE, vect_location,
1987 "Disable peeling, max peels reached: %d\n", max_peel);
1992 /* Cost model #2 - if peeling may result in a remaining loop not
1993 iterating enough to be vectorized then do not peel. */
1994 if (do_peeling
1995 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1997 unsigned max_peel
1998 = npeel == 0 ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1 : npeel;
1999 if (LOOP_VINFO_INT_NITERS (loop_vinfo)
2000 < LOOP_VINFO_VECT_FACTOR (loop_vinfo) + max_peel)
2001 do_peeling = false;
2004 if (do_peeling)
2006 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2007 If the misalignment of DR_i is identical to that of dr0 then set
2008 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
2009 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2010 by the peeling factor times the element size of DR_i (MOD the
2011 vectorization factor times the size). Otherwise, the
2012 misalignment of DR_i must be set to unknown. */
2013 FOR_EACH_VEC_ELT (datarefs, i, dr)
2014 if (dr != dr0)
2016 /* Strided accesses perform only component accesses, alignment
2017 is irrelevant for them. */
2018 stmt_info = vinfo_for_stmt (DR_STMT (dr));
2019 if (STMT_VINFO_STRIDED_P (stmt_info)
2020 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2021 continue;
2023 vect_update_misalignment_for_peel (dr, dr0, npeel);
2026 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
2027 if (npeel)
2028 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2029 else
2030 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2031 = DR_MISALIGNMENT (dr0);
2032 SET_DR_MISALIGNMENT (dr0, 0);
2033 if (dump_enabled_p ())
2035 dump_printf_loc (MSG_NOTE, vect_location,
2036 "Alignment of access forced using peeling.\n");
2037 dump_printf_loc (MSG_NOTE, vect_location,
2038 "Peeling for alignment will be applied.\n");
2041 /* The inside-loop cost will be accounted for in vectorizable_load
2042 and vectorizable_store correctly with adjusted alignments.
2043 Drop the body_cst_vec on the floor here. */
2044 stat = vect_verify_datarefs_alignment (loop_vinfo);
2045 gcc_assert (stat);
2046 return stat;
2050 /* (2) Versioning to force alignment. */
2052 /* Try versioning if:
2053 1) optimize loop for speed
2054 2) there is at least one unsupported misaligned data ref with an unknown
2055 misalignment, and
2056 3) all misaligned data refs with a known misalignment are supported, and
2057 4) the number of runtime alignment checks is within reason. */
2059 do_versioning =
2060 optimize_loop_nest_for_speed_p (loop)
2061 && (!loop->inner); /* FORNOW */
2063 if (do_versioning)
2065 FOR_EACH_VEC_ELT (datarefs, i, dr)
2067 stmt = DR_STMT (dr);
2068 stmt_info = vinfo_for_stmt (stmt);
2070 /* For interleaving, only the alignment of the first access
2071 matters. */
2072 if (aligned_access_p (dr)
2073 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2074 && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
2075 continue;
2077 if (STMT_VINFO_STRIDED_P (stmt_info))
2079 /* Strided loads perform only component accesses, alignment is
2080 irrelevant for them. */
2081 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
2082 continue;
2083 do_versioning = false;
2084 break;
2087 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
2089 if (!supportable_dr_alignment)
2091 gimple *stmt;
2092 int mask;
2093 tree vectype;
2095 if (known_alignment_for_access_p (dr)
2096 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2097 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
2099 do_versioning = false;
2100 break;
2103 stmt = DR_STMT (dr);
2104 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
2105 gcc_assert (vectype);
2107 /* The rightmost bits of an aligned address must be zeros.
2108 Construct the mask needed for this test. For example,
2109 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2110 mask must be 15 = 0xf. */
2111 mask = GET_MODE_SIZE (TYPE_MODE (vectype)) - 1;
2113 /* FORNOW: use the same mask to test all potentially unaligned
2114 references in the loop. The vectorizer currently supports
2115 a single vector size, see the reference to
2116 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
2117 vectorization factor is computed. */
2118 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
2119 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
2120 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2121 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
2122 DR_STMT (dr));
2126 /* Versioning requires at least one misaligned data reference. */
2127 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2128 do_versioning = false;
2129 else if (!do_versioning)
2130 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2133 if (do_versioning)
2135 vec<gimple *> may_misalign_stmts
2136 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2137 gimple *stmt;
2139 /* It can now be assumed that the data references in the statements
2140 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2141 of the loop being vectorized. */
2142 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
2144 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2145 dr = STMT_VINFO_DATA_REF (stmt_info);
2146 SET_DR_MISALIGNMENT (dr, 0);
2147 if (dump_enabled_p ())
2148 dump_printf_loc (MSG_NOTE, vect_location,
2149 "Alignment of access forced using versioning.\n");
2152 if (dump_enabled_p ())
2153 dump_printf_loc (MSG_NOTE, vect_location,
2154 "Versioning for alignment will be applied.\n");
2156 /* Peeling and versioning can't be done together at this time. */
2157 gcc_assert (! (do_peeling && do_versioning));
2159 stat = vect_verify_datarefs_alignment (loop_vinfo);
2160 gcc_assert (stat);
2161 return stat;
2164 /* This point is reached if neither peeling nor versioning is being done. */
2165 gcc_assert (! (do_peeling || do_versioning));
2167 stat = vect_verify_datarefs_alignment (loop_vinfo);
2168 return stat;
2172 /* Function vect_find_same_alignment_drs.
2174 Update group and alignment relations according to the chosen
2175 vectorization factor. */
2177 static void
2178 vect_find_same_alignment_drs (struct data_dependence_relation *ddr)
2180 struct data_reference *dra = DDR_A (ddr);
2181 struct data_reference *drb = DDR_B (ddr);
2182 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2183 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2185 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
2186 return;
2188 if (dra == drb)
2189 return;
2191 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)
2192 || !operand_equal_p (DR_OFFSET (dra), DR_OFFSET (drb), 0)
2193 || !operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2194 return;
2196 /* Two references with distance zero have the same alignment. */
2197 offset_int diff = (wi::to_offset (DR_INIT (dra))
2198 - wi::to_offset (DR_INIT (drb)));
2199 if (diff != 0)
2201 /* Get the wider of the two alignments. */
2202 unsigned int align_a = TYPE_ALIGN_UNIT (STMT_VINFO_VECTYPE (stmtinfo_a));
2203 unsigned int align_b = TYPE_ALIGN_UNIT (STMT_VINFO_VECTYPE (stmtinfo_b));
2204 unsigned int max_align = MAX (align_a, align_b);
2206 /* Require the gap to be a multiple of the larger vector alignment. */
2207 if (!wi::multiple_of_p (diff, max_align, SIGNED))
2208 return;
2211 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2212 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
2213 if (dump_enabled_p ())
2215 dump_printf_loc (MSG_NOTE, vect_location,
2216 "accesses have the same alignment: ");
2217 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2218 dump_printf (MSG_NOTE, " and ");
2219 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2220 dump_printf (MSG_NOTE, "\n");
2225 /* Function vect_analyze_data_refs_alignment
2227 Analyze the alignment of the data-references in the loop.
2228 Return FALSE if a data reference is found that cannot be vectorized. */
2230 bool
2231 vect_analyze_data_refs_alignment (loop_vec_info vinfo)
2233 if (dump_enabled_p ())
2234 dump_printf_loc (MSG_NOTE, vect_location,
2235 "=== vect_analyze_data_refs_alignment ===\n");
2237 /* Mark groups of data references with same alignment using
2238 data dependence information. */
2239 vec<ddr_p> ddrs = vinfo->ddrs;
2240 struct data_dependence_relation *ddr;
2241 unsigned int i;
2243 FOR_EACH_VEC_ELT (ddrs, i, ddr)
2244 vect_find_same_alignment_drs (ddr);
2246 vec<data_reference_p> datarefs = vinfo->datarefs;
2247 struct data_reference *dr;
2249 vect_record_base_alignments (vinfo);
2250 FOR_EACH_VEC_ELT (datarefs, i, dr)
2252 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
2253 if (STMT_VINFO_VECTORIZABLE (stmt_info)
2254 && !vect_compute_data_ref_alignment (dr))
2256 /* Strided accesses perform only component accesses, misalignment
2257 information is irrelevant for them. */
2258 if (STMT_VINFO_STRIDED_P (stmt_info)
2259 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2260 continue;
2262 if (dump_enabled_p ())
2263 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2264 "not vectorized: can't calculate alignment "
2265 "for data ref.\n");
2267 return false;
2271 return true;
2275 /* Analyze alignment of DRs of stmts in NODE. */
2277 static bool
2278 vect_slp_analyze_and_verify_node_alignment (slp_tree node)
2280 /* We vectorize from the first scalar stmt in the node unless
2281 the node is permuted in which case we start from the first
2282 element in the group. */
2283 gimple *first_stmt = SLP_TREE_SCALAR_STMTS (node)[0];
2284 data_reference_p first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2285 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2286 first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_stmt));
2288 data_reference_p dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2289 if (! vect_compute_data_ref_alignment (dr)
2290 /* For creating the data-ref pointer we need alignment of the
2291 first element anyway. */
2292 || (dr != first_dr
2293 && ! vect_compute_data_ref_alignment (first_dr))
2294 || ! verify_data_ref_alignment (dr))
2296 if (dump_enabled_p ())
2297 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2298 "not vectorized: bad data alignment in basic "
2299 "block.\n");
2300 return false;
2303 return true;
2306 /* Function vect_slp_analyze_instance_alignment
2308 Analyze the alignment of the data-references in the SLP instance.
2309 Return FALSE if a data reference is found that cannot be vectorized. */
2311 bool
2312 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
2314 if (dump_enabled_p ())
2315 dump_printf_loc (MSG_NOTE, vect_location,
2316 "=== vect_slp_analyze_and_verify_instance_alignment ===\n");
2318 slp_tree node;
2319 unsigned i;
2320 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2321 if (! vect_slp_analyze_and_verify_node_alignment (node))
2322 return false;
2324 node = SLP_INSTANCE_TREE (instance);
2325 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]))
2326 && ! vect_slp_analyze_and_verify_node_alignment
2327 (SLP_INSTANCE_TREE (instance)))
2328 return false;
2330 return true;
2334 /* Analyze groups of accesses: check that DR belongs to a group of
2335 accesses of legal size, step, etc. Detect gaps, single element
2336 interleaving, and other special cases. Set grouped access info.
2337 Collect groups of strided stores for further use in SLP analysis.
2338 Worker for vect_analyze_group_access. */
2340 static bool
2341 vect_analyze_group_access_1 (struct data_reference *dr)
2343 tree step = DR_STEP (dr);
2344 tree scalar_type = TREE_TYPE (DR_REF (dr));
2345 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2346 gimple *stmt = DR_STMT (dr);
2347 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2348 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2349 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2350 HOST_WIDE_INT dr_step = -1;
2351 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2352 bool slp_impossible = false;
2354 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2355 size of the interleaving group (including gaps). */
2356 if (tree_fits_shwi_p (step))
2358 dr_step = tree_to_shwi (step);
2359 /* Check that STEP is a multiple of type size. Otherwise there is
2360 a non-element-sized gap at the end of the group which we
2361 cannot represent in GROUP_GAP or GROUP_SIZE.
2362 ??? As we can handle non-constant step fine here we should
2363 simply remove uses of GROUP_GAP between the last and first
2364 element and instead rely on DR_STEP. GROUP_SIZE then would
2365 simply not include that gap. */
2366 if ((dr_step % type_size) != 0)
2368 if (dump_enabled_p ())
2370 dump_printf_loc (MSG_NOTE, vect_location,
2371 "Step ");
2372 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2373 dump_printf (MSG_NOTE,
2374 " is not a multiple of the element size for ");
2375 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2376 dump_printf (MSG_NOTE, "\n");
2378 return false;
2380 groupsize = absu_hwi (dr_step) / type_size;
2382 else
2383 groupsize = 0;
2385 /* Not consecutive access is possible only if it is a part of interleaving. */
2386 if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2388 /* Check if it this DR is a part of interleaving, and is a single
2389 element of the group that is accessed in the loop. */
2391 /* Gaps are supported only for loads. STEP must be a multiple of the type
2392 size. The size of the group must be a power of 2. */
2393 if (DR_IS_READ (dr)
2394 && (dr_step % type_size) == 0
2395 && groupsize > 0
2396 && pow2p_hwi (groupsize))
2398 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2399 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2400 GROUP_GAP (stmt_info) = groupsize - 1;
2401 if (dump_enabled_p ())
2403 dump_printf_loc (MSG_NOTE, vect_location,
2404 "Detected single element interleaving ");
2405 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2406 dump_printf (MSG_NOTE, " step ");
2407 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2408 dump_printf (MSG_NOTE, "\n");
2411 return true;
2414 if (dump_enabled_p ())
2416 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2417 "not consecutive access ");
2418 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2421 if (bb_vinfo)
2423 /* Mark the statement as unvectorizable. */
2424 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2425 return true;
2428 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2429 STMT_VINFO_STRIDED_P (stmt_info) = true;
2430 return true;
2433 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2435 /* First stmt in the interleaving chain. Check the chain. */
2436 gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2437 struct data_reference *data_ref = dr;
2438 unsigned int count = 1;
2439 tree prev_init = DR_INIT (data_ref);
2440 gimple *prev = stmt;
2441 HOST_WIDE_INT diff, gaps = 0;
2443 while (next)
2445 /* Skip same data-refs. In case that two or more stmts share
2446 data-ref (supported only for loads), we vectorize only the first
2447 stmt, and the rest get their vectorized loads from the first
2448 one. */
2449 if (!tree_int_cst_compare (DR_INIT (data_ref),
2450 DR_INIT (STMT_VINFO_DATA_REF (
2451 vinfo_for_stmt (next)))))
2453 if (DR_IS_WRITE (data_ref))
2455 if (dump_enabled_p ())
2456 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2457 "Two store stmts share the same dr.\n");
2458 return false;
2461 if (dump_enabled_p ())
2462 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2463 "Two or more load stmts share the same dr.\n");
2465 /* For load use the same data-ref load. */
2466 GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2468 prev = next;
2469 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2470 continue;
2473 prev = next;
2474 data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2476 /* All group members have the same STEP by construction. */
2477 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2479 /* Check that the distance between two accesses is equal to the type
2480 size. Otherwise, we have gaps. */
2481 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2482 - TREE_INT_CST_LOW (prev_init)) / type_size;
2483 if (diff != 1)
2485 /* FORNOW: SLP of accesses with gaps is not supported. */
2486 slp_impossible = true;
2487 if (DR_IS_WRITE (data_ref))
2489 if (dump_enabled_p ())
2490 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2491 "interleaved store with gaps\n");
2492 return false;
2495 gaps += diff - 1;
2498 last_accessed_element += diff;
2500 /* Store the gap from the previous member of the group. If there is no
2501 gap in the access, GROUP_GAP is always 1. */
2502 GROUP_GAP (vinfo_for_stmt (next)) = diff;
2504 prev_init = DR_INIT (data_ref);
2505 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2506 /* Count the number of data-refs in the chain. */
2507 count++;
2510 if (groupsize == 0)
2511 groupsize = count + gaps;
2513 /* This could be UINT_MAX but as we are generating code in a very
2514 inefficient way we have to cap earlier. See PR78699 for example. */
2515 if (groupsize > 4096)
2517 if (dump_enabled_p ())
2518 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2519 "group is too large\n");
2520 return false;
2523 /* Check that the size of the interleaving is equal to count for stores,
2524 i.e., that there are no gaps. */
2525 if (groupsize != count
2526 && !DR_IS_READ (dr))
2528 if (dump_enabled_p ())
2529 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2530 "interleaved store with gaps\n");
2531 return false;
2534 /* If there is a gap after the last load in the group it is the
2535 difference between the groupsize and the last accessed
2536 element.
2537 When there is no gap, this difference should be 0. */
2538 GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - last_accessed_element;
2540 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2541 if (dump_enabled_p ())
2543 dump_printf_loc (MSG_NOTE, vect_location,
2544 "Detected interleaving ");
2545 if (DR_IS_READ (dr))
2546 dump_printf (MSG_NOTE, "load ");
2547 else
2548 dump_printf (MSG_NOTE, "store ");
2549 dump_printf (MSG_NOTE, "of size %u starting with ",
2550 (unsigned)groupsize);
2551 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
2552 if (GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
2553 dump_printf_loc (MSG_NOTE, vect_location,
2554 "There is a gap of %u elements after the group\n",
2555 GROUP_GAP (vinfo_for_stmt (stmt)));
2558 /* SLP: create an SLP data structure for every interleaving group of
2559 stores for further analysis in vect_analyse_slp. */
2560 if (DR_IS_WRITE (dr) && !slp_impossible)
2562 if (loop_vinfo)
2563 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2564 if (bb_vinfo)
2565 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2569 return true;
2572 /* Analyze groups of accesses: check that DR belongs to a group of
2573 accesses of legal size, step, etc. Detect gaps, single element
2574 interleaving, and other special cases. Set grouped access info.
2575 Collect groups of strided stores for further use in SLP analysis. */
2577 static bool
2578 vect_analyze_group_access (struct data_reference *dr)
2580 if (!vect_analyze_group_access_1 (dr))
2582 /* Dissolve the group if present. */
2583 gimple *next;
2584 gimple *stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dr)));
2585 while (stmt)
2587 stmt_vec_info vinfo = vinfo_for_stmt (stmt);
2588 next = GROUP_NEXT_ELEMENT (vinfo);
2589 GROUP_FIRST_ELEMENT (vinfo) = NULL;
2590 GROUP_NEXT_ELEMENT (vinfo) = NULL;
2591 stmt = next;
2593 return false;
2595 return true;
2598 /* Analyze the access pattern of the data-reference DR.
2599 In case of non-consecutive accesses call vect_analyze_group_access() to
2600 analyze groups of accesses. */
2602 static bool
2603 vect_analyze_data_ref_access (struct data_reference *dr)
2605 tree step = DR_STEP (dr);
2606 tree scalar_type = TREE_TYPE (DR_REF (dr));
2607 gimple *stmt = DR_STMT (dr);
2608 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2609 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2610 struct loop *loop = NULL;
2612 if (loop_vinfo)
2613 loop = LOOP_VINFO_LOOP (loop_vinfo);
2615 if (loop_vinfo && !step)
2617 if (dump_enabled_p ())
2618 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2619 "bad data-ref access in loop\n");
2620 return false;
2623 /* Allow loads with zero step in inner-loop vectorization. */
2624 if (loop_vinfo && integer_zerop (step))
2626 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2627 if (!nested_in_vect_loop_p (loop, stmt))
2628 return DR_IS_READ (dr);
2629 /* Allow references with zero step for outer loops marked
2630 with pragma omp simd only - it guarantees absence of
2631 loop-carried dependencies between inner loop iterations. */
2632 if (!loop->force_vectorize)
2634 if (dump_enabled_p ())
2635 dump_printf_loc (MSG_NOTE, vect_location,
2636 "zero step in inner loop of nest\n");
2637 return false;
2641 if (loop && nested_in_vect_loop_p (loop, stmt))
2643 /* Interleaved accesses are not yet supported within outer-loop
2644 vectorization for references in the inner-loop. */
2645 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2647 /* For the rest of the analysis we use the outer-loop step. */
2648 step = STMT_VINFO_DR_STEP (stmt_info);
2649 if (integer_zerop (step))
2651 if (dump_enabled_p ())
2652 dump_printf_loc (MSG_NOTE, vect_location,
2653 "zero step in outer loop.\n");
2654 return DR_IS_READ (dr);
2658 /* Consecutive? */
2659 if (TREE_CODE (step) == INTEGER_CST)
2661 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2662 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2663 || (dr_step < 0
2664 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2666 /* Mark that it is not interleaving. */
2667 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2668 return true;
2672 if (loop && nested_in_vect_loop_p (loop, stmt))
2674 if (dump_enabled_p ())
2675 dump_printf_loc (MSG_NOTE, vect_location,
2676 "grouped access in outer loop.\n");
2677 return false;
2681 /* Assume this is a DR handled by non-constant strided load case. */
2682 if (TREE_CODE (step) != INTEGER_CST)
2683 return (STMT_VINFO_STRIDED_P (stmt_info)
2684 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2685 || vect_analyze_group_access (dr)));
2687 /* Not consecutive access - check if it's a part of interleaving group. */
2688 return vect_analyze_group_access (dr);
2691 /* Compare two data-references DRA and DRB to group them into chunks
2692 suitable for grouping. */
2694 static int
2695 dr_group_sort_cmp (const void *dra_, const void *drb_)
2697 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2698 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2699 int cmp;
2701 /* Stabilize sort. */
2702 if (dra == drb)
2703 return 0;
2705 /* DRs in different loops never belong to the same group. */
2706 loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father;
2707 loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father;
2708 if (loopa != loopb)
2709 return loopa->num < loopb->num ? -1 : 1;
2711 /* Ordering of DRs according to base. */
2712 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0))
2714 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2715 DR_BASE_ADDRESS (drb));
2716 if (cmp != 0)
2717 return cmp;
2720 /* And according to DR_OFFSET. */
2721 if (!dr_equal_offsets_p (dra, drb))
2723 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2724 if (cmp != 0)
2725 return cmp;
2728 /* Put reads before writes. */
2729 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2730 return DR_IS_READ (dra) ? -1 : 1;
2732 /* Then sort after access size. */
2733 if (!operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2734 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))), 0))
2736 cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2737 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2738 if (cmp != 0)
2739 return cmp;
2742 /* And after step. */
2743 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2745 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2746 if (cmp != 0)
2747 return cmp;
2750 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
2751 cmp = tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb));
2752 if (cmp == 0)
2753 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2754 return cmp;
2757 /* Function vect_analyze_data_ref_accesses.
2759 Analyze the access pattern of all the data references in the loop.
2761 FORNOW: the only access pattern that is considered vectorizable is a
2762 simple step 1 (consecutive) access.
2764 FORNOW: handle only arrays and pointer accesses. */
2766 bool
2767 vect_analyze_data_ref_accesses (vec_info *vinfo)
2769 unsigned int i;
2770 vec<data_reference_p> datarefs = vinfo->datarefs;
2771 struct data_reference *dr;
2773 if (dump_enabled_p ())
2774 dump_printf_loc (MSG_NOTE, vect_location,
2775 "=== vect_analyze_data_ref_accesses ===\n");
2777 if (datarefs.is_empty ())
2778 return true;
2780 /* Sort the array of datarefs to make building the interleaving chains
2781 linear. Don't modify the original vector's order, it is needed for
2782 determining what dependencies are reversed. */
2783 vec<data_reference_p> datarefs_copy = datarefs.copy ();
2784 datarefs_copy.qsort (dr_group_sort_cmp);
2786 /* Build the interleaving chains. */
2787 for (i = 0; i < datarefs_copy.length () - 1;)
2789 data_reference_p dra = datarefs_copy[i];
2790 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2791 stmt_vec_info lastinfo = NULL;
2792 if (! STMT_VINFO_VECTORIZABLE (stmtinfo_a))
2794 ++i;
2795 continue;
2797 for (i = i + 1; i < datarefs_copy.length (); ++i)
2799 data_reference_p drb = datarefs_copy[i];
2800 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2801 if (! STMT_VINFO_VECTORIZABLE (stmtinfo_b))
2802 break;
2804 /* ??? Imperfect sorting (non-compatible types, non-modulo
2805 accesses, same accesses) can lead to a group to be artificially
2806 split here as we don't just skip over those. If it really
2807 matters we can push those to a worklist and re-iterate
2808 over them. The we can just skip ahead to the next DR here. */
2810 /* DRs in a different loop should not be put into the same
2811 interleaving group. */
2812 if (gimple_bb (DR_STMT (dra))->loop_father
2813 != gimple_bb (DR_STMT (drb))->loop_father)
2814 break;
2816 /* Check that the data-refs have same first location (except init)
2817 and they are both either store or load (not load and store,
2818 not masked loads or stores). */
2819 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2820 || !operand_equal_p (DR_BASE_ADDRESS (dra),
2821 DR_BASE_ADDRESS (drb), 0)
2822 || !dr_equal_offsets_p (dra, drb)
2823 || !gimple_assign_single_p (DR_STMT (dra))
2824 || !gimple_assign_single_p (DR_STMT (drb)))
2825 break;
2827 /* Check that the data-refs have the same constant size. */
2828 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2829 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2830 if (!tree_fits_uhwi_p (sza)
2831 || !tree_fits_uhwi_p (szb)
2832 || !tree_int_cst_equal (sza, szb))
2833 break;
2835 /* Check that the data-refs have the same step. */
2836 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2837 break;
2839 /* Do not place the same access in the interleaving chain twice. */
2840 if (tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb)) == 0)
2841 break;
2843 /* Check the types are compatible.
2844 ??? We don't distinguish this during sorting. */
2845 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2846 TREE_TYPE (DR_REF (drb))))
2847 break;
2849 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
2850 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2851 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2852 gcc_assert (init_a <= init_b);
2854 /* If init_b == init_a + the size of the type * k, we have an
2855 interleaving, and DRA is accessed before DRB. */
2856 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
2857 if (type_size_a == 0
2858 || (init_b - init_a) % type_size_a != 0)
2859 break;
2861 /* If we have a store, the accesses are adjacent. This splits
2862 groups into chunks we support (we don't support vectorization
2863 of stores with gaps). */
2864 if (!DR_IS_READ (dra)
2865 && (init_b - (HOST_WIDE_INT) TREE_INT_CST_LOW
2866 (DR_INIT (datarefs_copy[i-1]))
2867 != type_size_a))
2868 break;
2870 /* If the step (if not zero or non-constant) is greater than the
2871 difference between data-refs' inits this splits groups into
2872 suitable sizes. */
2873 if (tree_fits_shwi_p (DR_STEP (dra)))
2875 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
2876 if (step != 0 && step <= (init_b - init_a))
2877 break;
2880 if (dump_enabled_p ())
2882 dump_printf_loc (MSG_NOTE, vect_location,
2883 "Detected interleaving ");
2884 if (DR_IS_READ (dra))
2885 dump_printf (MSG_NOTE, "load ");
2886 else
2887 dump_printf (MSG_NOTE, "store ");
2888 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2889 dump_printf (MSG_NOTE, " and ");
2890 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2891 dump_printf (MSG_NOTE, "\n");
2894 /* Link the found element into the group list. */
2895 if (!GROUP_FIRST_ELEMENT (stmtinfo_a))
2897 GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
2898 lastinfo = stmtinfo_a;
2900 GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
2901 GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
2902 lastinfo = stmtinfo_b;
2906 FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
2907 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
2908 && !vect_analyze_data_ref_access (dr))
2910 if (dump_enabled_p ())
2911 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2912 "not vectorized: complicated access pattern.\n");
2914 if (is_a <bb_vec_info> (vinfo))
2916 /* Mark the statement as not vectorizable. */
2917 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2918 continue;
2920 else
2922 datarefs_copy.release ();
2923 return false;
2927 datarefs_copy.release ();
2928 return true;
2931 /* Function vect_vfa_segment_size.
2933 Create an expression that computes the size of segment
2934 that will be accessed for a data reference. The functions takes into
2935 account that realignment loads may access one more vector.
2937 Input:
2938 DR: The data reference.
2939 LENGTH_FACTOR: segment length to consider.
2941 Return an expression whose value is the size of segment which will be
2942 accessed by DR. */
2944 static tree
2945 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
2947 tree segment_length;
2949 if (integer_zerop (DR_STEP (dr)))
2950 segment_length = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
2951 else
2952 segment_length = size_binop (MULT_EXPR,
2953 fold_convert (sizetype, DR_STEP (dr)),
2954 fold_convert (sizetype, length_factor));
2956 if (vect_supportable_dr_alignment (dr, false)
2957 == dr_explicit_realign_optimized)
2959 tree vector_size = TYPE_SIZE_UNIT
2960 (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))));
2962 segment_length = size_binop (PLUS_EXPR, segment_length, vector_size);
2964 return segment_length;
2967 /* Function vect_no_alias_p.
2969 Given data references A and B with equal base and offset, the alias
2970 relation can be decided at compilation time, return TRUE if they do
2971 not alias to each other; return FALSE otherwise. SEGMENT_LENGTH_A
2972 and SEGMENT_LENGTH_B are the memory lengths accessed by A and B
2973 respectively. */
2975 static bool
2976 vect_no_alias_p (struct data_reference *a, struct data_reference *b,
2977 tree segment_length_a, tree segment_length_b)
2979 gcc_assert (TREE_CODE (DR_INIT (a)) == INTEGER_CST
2980 && TREE_CODE (DR_INIT (b)) == INTEGER_CST);
2981 if (tree_int_cst_equal (DR_INIT (a), DR_INIT (b)))
2982 return false;
2984 tree seg_a_min = DR_INIT (a);
2985 tree seg_a_max = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_a_min),
2986 seg_a_min, segment_length_a);
2987 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
2988 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
2989 [a, a+12) */
2990 if (tree_int_cst_compare (DR_STEP (a), size_zero_node) < 0)
2992 tree unit_size = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (a)));
2993 seg_a_min = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_a_max),
2994 seg_a_max, unit_size);
2995 seg_a_max = fold_build2 (PLUS_EXPR, TREE_TYPE (DR_INIT (a)),
2996 DR_INIT (a), unit_size);
2998 tree seg_b_min = DR_INIT (b);
2999 tree seg_b_max = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_b_min),
3000 seg_b_min, segment_length_b);
3001 if (tree_int_cst_compare (DR_STEP (b), size_zero_node) < 0)
3003 tree unit_size = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (b)));
3004 seg_b_min = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_b_max),
3005 seg_b_max, unit_size);
3006 seg_b_max = fold_build2 (PLUS_EXPR, TREE_TYPE (DR_INIT (b)),
3007 DR_INIT (b), unit_size);
3010 if (tree_int_cst_le (seg_a_max, seg_b_min)
3011 || tree_int_cst_le (seg_b_max, seg_a_min))
3012 return true;
3014 return false;
3017 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3018 in DDR is >= VF. */
3020 static bool
3021 dependence_distance_ge_vf (data_dependence_relation *ddr,
3022 unsigned int loop_depth, unsigned HOST_WIDE_INT vf)
3024 if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3025 || DDR_NUM_DIST_VECTS (ddr) == 0)
3026 return false;
3028 /* If the dependence is exact, we should have limited the VF instead. */
3029 gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3031 unsigned int i;
3032 lambda_vector dist_v;
3033 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3035 HOST_WIDE_INT dist = dist_v[loop_depth];
3036 if (dist != 0
3037 && !(dist > 0 && DDR_REVERSED_P (ddr))
3038 && (unsigned HOST_WIDE_INT) abs_hwi (dist) < vf)
3039 return false;
3042 if (dump_enabled_p ())
3044 dump_printf_loc (MSG_NOTE, vect_location,
3045 "dependence distance between ");
3046 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
3047 dump_printf (MSG_NOTE, " and ");
3048 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
3049 dump_printf (MSG_NOTE, " is >= VF\n");
3052 return true;
3055 /* Function vect_prune_runtime_alias_test_list.
3057 Prune a list of ddrs to be tested at run-time by versioning for alias.
3058 Merge several alias checks into one if possible.
3059 Return FALSE if resulting list of ddrs is longer then allowed by
3060 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
3062 bool
3063 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3065 typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3066 hash_set <tree_pair_hash> compared_objects;
3068 vec<ddr_p> may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3069 vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3070 = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3071 vec<vec_object_pair> &check_unequal_addrs
3072 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3073 int vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3074 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3076 ddr_p ddr;
3077 unsigned int i;
3078 tree length_factor;
3080 if (dump_enabled_p ())
3081 dump_printf_loc (MSG_NOTE, vect_location,
3082 "=== vect_prune_runtime_alias_test_list ===\n");
3084 if (may_alias_ddrs.is_empty ())
3085 return true;
3087 comp_alias_ddrs.create (may_alias_ddrs.length ());
3089 unsigned int loop_depth
3090 = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3091 LOOP_VINFO_LOOP_NEST (loop_vinfo));
3093 /* First, we collect all data ref pairs for aliasing checks. */
3094 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3096 int comp_res;
3097 struct data_reference *dr_a, *dr_b;
3098 gimple *dr_group_first_a, *dr_group_first_b;
3099 tree segment_length_a, segment_length_b;
3100 gimple *stmt_a, *stmt_b;
3102 /* Ignore the alias if the VF we chose ended up being no greater
3103 than the dependence distance. */
3104 if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3105 continue;
3107 if (DDR_OBJECT_A (ddr))
3109 vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3110 if (!compared_objects.add (new_pair))
3112 if (dump_enabled_p ())
3114 dump_printf_loc (MSG_NOTE, vect_location, "checking that ");
3115 dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.first);
3116 dump_printf (MSG_NOTE, " and ");
3117 dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.second);
3118 dump_printf (MSG_NOTE, " have different addresses\n");
3120 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3122 continue;
3125 dr_a = DDR_A (ddr);
3126 stmt_a = DR_STMT (DDR_A (ddr));
3127 dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
3128 if (dr_group_first_a)
3130 stmt_a = dr_group_first_a;
3131 dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
3134 dr_b = DDR_B (ddr);
3135 stmt_b = DR_STMT (DDR_B (ddr));
3136 dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
3137 if (dr_group_first_b)
3139 stmt_b = dr_group_first_b;
3140 dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
3143 if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
3144 length_factor = scalar_loop_iters;
3145 else
3146 length_factor = size_int (vect_factor);
3147 segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
3148 segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
3150 comp_res = data_ref_compare_tree (DR_BASE_ADDRESS (dr_a),
3151 DR_BASE_ADDRESS (dr_b));
3152 if (comp_res == 0)
3153 comp_res = data_ref_compare_tree (DR_OFFSET (dr_a),
3154 DR_OFFSET (dr_b));
3156 /* Alias is known at compilation time. */
3157 if (comp_res == 0
3158 && TREE_CODE (DR_STEP (dr_a)) == INTEGER_CST
3159 && TREE_CODE (DR_STEP (dr_b)) == INTEGER_CST
3160 && TREE_CODE (segment_length_a) == INTEGER_CST
3161 && TREE_CODE (segment_length_b) == INTEGER_CST)
3163 if (vect_no_alias_p (dr_a, dr_b, segment_length_a, segment_length_b))
3164 continue;
3166 if (dump_enabled_p ())
3167 dump_printf_loc (MSG_NOTE, vect_location,
3168 "not vectorized: compilation time alias.\n");
3170 return false;
3173 dr_with_seg_len_pair_t dr_with_seg_len_pair
3174 (dr_with_seg_len (dr_a, segment_length_a),
3175 dr_with_seg_len (dr_b, segment_length_b));
3177 /* Canonicalize pairs by sorting the two DR members. */
3178 if (comp_res > 0)
3179 std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
3181 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3184 prune_runtime_alias_test_list (&comp_alias_ddrs,
3185 (unsigned HOST_WIDE_INT) vect_factor);
3187 unsigned int count = (comp_alias_ddrs.length ()
3188 + check_unequal_addrs.length ());
3189 dump_printf_loc (MSG_NOTE, vect_location,
3190 "improved number of alias checks from %d to %d\n",
3191 may_alias_ddrs.length (), count);
3192 if ((int) count > PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
3194 if (dump_enabled_p ())
3195 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3196 "number of versioning for alias "
3197 "run-time tests exceeds %d "
3198 "(--param vect-max-version-for-alias-checks)\n",
3199 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
3200 return false;
3203 return true;
3206 /* Return true if a non-affine read or write in STMT is suitable for a
3207 gather load or scatter store. Describe the operation in *INFO if so. */
3209 bool
3210 vect_check_gather_scatter (gimple *stmt, loop_vec_info loop_vinfo,
3211 gather_scatter_info *info)
3213 HOST_WIDE_INT scale = 1, pbitpos, pbitsize;
3214 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3215 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3216 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3217 tree offtype = NULL_TREE;
3218 tree decl, base, off;
3219 machine_mode pmode;
3220 int punsignedp, reversep, pvolatilep = 0;
3222 base = DR_REF (dr);
3223 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3224 see if we can use the def stmt of the address. */
3225 if (is_gimple_call (stmt)
3226 && gimple_call_internal_p (stmt)
3227 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
3228 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
3229 && TREE_CODE (base) == MEM_REF
3230 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3231 && integer_zerop (TREE_OPERAND (base, 1))
3232 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3234 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3235 if (is_gimple_assign (def_stmt)
3236 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3237 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3240 /* The gather and scatter builtins need address of the form
3241 loop_invariant + vector * {1, 2, 4, 8}
3243 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3244 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3245 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3246 multiplications and additions in it. To get a vector, we need
3247 a single SSA_NAME that will be defined in the loop and will
3248 contain everything that is not loop invariant and that can be
3249 vectorized. The following code attempts to find such a preexistng
3250 SSA_NAME OFF and put the loop invariants into a tree BASE
3251 that can be gimplified before the loop. */
3252 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3253 &punsignedp, &reversep, &pvolatilep);
3254 gcc_assert (base && (pbitpos % BITS_PER_UNIT) == 0 && !reversep);
3256 if (TREE_CODE (base) == MEM_REF)
3258 if (!integer_zerop (TREE_OPERAND (base, 1)))
3260 if (off == NULL_TREE)
3262 offset_int moff = mem_ref_offset (base);
3263 off = wide_int_to_tree (sizetype, moff);
3265 else
3266 off = size_binop (PLUS_EXPR, off,
3267 fold_convert (sizetype, TREE_OPERAND (base, 1)));
3269 base = TREE_OPERAND (base, 0);
3271 else
3272 base = build_fold_addr_expr (base);
3274 if (off == NULL_TREE)
3275 off = size_zero_node;
3277 /* If base is not loop invariant, either off is 0, then we start with just
3278 the constant offset in the loop invariant BASE and continue with base
3279 as OFF, otherwise give up.
3280 We could handle that case by gimplifying the addition of base + off
3281 into some SSA_NAME and use that as off, but for now punt. */
3282 if (!expr_invariant_in_loop_p (loop, base))
3284 if (!integer_zerop (off))
3285 return false;
3286 off = base;
3287 base = size_int (pbitpos / BITS_PER_UNIT);
3289 /* Otherwise put base + constant offset into the loop invariant BASE
3290 and continue with OFF. */
3291 else
3293 base = fold_convert (sizetype, base);
3294 base = size_binop (PLUS_EXPR, base, size_int (pbitpos / BITS_PER_UNIT));
3297 /* OFF at this point may be either a SSA_NAME or some tree expression
3298 from get_inner_reference. Try to peel off loop invariants from it
3299 into BASE as long as possible. */
3300 STRIP_NOPS (off);
3301 while (offtype == NULL_TREE)
3303 enum tree_code code;
3304 tree op0, op1, add = NULL_TREE;
3306 if (TREE_CODE (off) == SSA_NAME)
3308 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3310 if (expr_invariant_in_loop_p (loop, off))
3311 return false;
3313 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3314 break;
3316 op0 = gimple_assign_rhs1 (def_stmt);
3317 code = gimple_assign_rhs_code (def_stmt);
3318 op1 = gimple_assign_rhs2 (def_stmt);
3320 else
3322 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3323 return false;
3324 code = TREE_CODE (off);
3325 extract_ops_from_tree (off, &code, &op0, &op1);
3327 switch (code)
3329 case POINTER_PLUS_EXPR:
3330 case PLUS_EXPR:
3331 if (expr_invariant_in_loop_p (loop, op0))
3333 add = op0;
3334 off = op1;
3335 do_add:
3336 add = fold_convert (sizetype, add);
3337 if (scale != 1)
3338 add = size_binop (MULT_EXPR, add, size_int (scale));
3339 base = size_binop (PLUS_EXPR, base, add);
3340 continue;
3342 if (expr_invariant_in_loop_p (loop, op1))
3344 add = op1;
3345 off = op0;
3346 goto do_add;
3348 break;
3349 case MINUS_EXPR:
3350 if (expr_invariant_in_loop_p (loop, op1))
3352 add = fold_convert (sizetype, op1);
3353 add = size_binop (MINUS_EXPR, size_zero_node, add);
3354 off = op0;
3355 goto do_add;
3357 break;
3358 case MULT_EXPR:
3359 if (scale == 1 && tree_fits_shwi_p (op1))
3361 scale = tree_to_shwi (op1);
3362 off = op0;
3363 continue;
3365 break;
3366 case SSA_NAME:
3367 off = op0;
3368 continue;
3369 CASE_CONVERT:
3370 if (!POINTER_TYPE_P (TREE_TYPE (op0))
3371 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3372 break;
3373 if (TYPE_PRECISION (TREE_TYPE (op0))
3374 == TYPE_PRECISION (TREE_TYPE (off)))
3376 off = op0;
3377 continue;
3379 if (TYPE_PRECISION (TREE_TYPE (op0))
3380 < TYPE_PRECISION (TREE_TYPE (off)))
3382 off = op0;
3383 offtype = TREE_TYPE (off);
3384 STRIP_NOPS (off);
3385 continue;
3387 break;
3388 default:
3389 break;
3391 break;
3394 /* If at the end OFF still isn't a SSA_NAME or isn't
3395 defined in the loop, punt. */
3396 if (TREE_CODE (off) != SSA_NAME
3397 || expr_invariant_in_loop_p (loop, off))
3398 return false;
3400 if (offtype == NULL_TREE)
3401 offtype = TREE_TYPE (off);
3403 if (DR_IS_READ (dr))
3404 decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info),
3405 offtype, scale);
3406 else
3407 decl = targetm.vectorize.builtin_scatter (STMT_VINFO_VECTYPE (stmt_info),
3408 offtype, scale);
3410 if (decl == NULL_TREE)
3411 return false;
3413 info->decl = decl;
3414 info->base = base;
3415 info->offset = off;
3416 info->offset_dt = vect_unknown_def_type;
3417 info->offset_vectype = NULL_TREE;
3418 info->scale = scale;
3419 return true;
3422 /* Function vect_analyze_data_refs.
3424 Find all the data references in the loop or basic block.
3426 The general structure of the analysis of data refs in the vectorizer is as
3427 follows:
3428 1- vect_analyze_data_refs(loop/bb): call
3429 compute_data_dependences_for_loop/bb to find and analyze all data-refs
3430 in the loop/bb and their dependences.
3431 2- vect_analyze_dependences(): apply dependence testing using ddrs.
3432 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
3433 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
3437 bool
3438 vect_analyze_data_refs (vec_info *vinfo, int *min_vf)
3440 struct loop *loop = NULL;
3441 unsigned int i;
3442 struct data_reference *dr;
3443 tree scalar_type;
3445 if (dump_enabled_p ())
3446 dump_printf_loc (MSG_NOTE, vect_location,
3447 "=== vect_analyze_data_refs ===\n");
3449 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3450 loop = LOOP_VINFO_LOOP (loop_vinfo);
3452 /* Go through the data-refs, check that the analysis succeeded. Update
3453 pointer from stmt_vec_info struct to DR and vectype. */
3455 vec<data_reference_p> datarefs = vinfo->datarefs;
3456 FOR_EACH_VEC_ELT (datarefs, i, dr)
3458 gimple *stmt;
3459 stmt_vec_info stmt_info;
3460 tree base, offset, init;
3461 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
3462 bool simd_lane_access = false;
3463 int vf;
3465 again:
3466 if (!dr || !DR_REF (dr))
3468 if (dump_enabled_p ())
3469 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3470 "not vectorized: unhandled data-ref\n");
3471 return false;
3474 stmt = DR_STMT (dr);
3475 stmt_info = vinfo_for_stmt (stmt);
3477 /* Discard clobbers from the dataref vector. We will remove
3478 clobber stmts during vectorization. */
3479 if (gimple_clobber_p (stmt))
3481 free_data_ref (dr);
3482 if (i == datarefs.length () - 1)
3484 datarefs.pop ();
3485 break;
3487 datarefs.ordered_remove (i);
3488 dr = datarefs[i];
3489 goto again;
3492 /* Check that analysis of the data-ref succeeded. */
3493 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
3494 || !DR_STEP (dr))
3496 bool maybe_gather
3497 = DR_IS_READ (dr)
3498 && !TREE_THIS_VOLATILE (DR_REF (dr))
3499 && targetm.vectorize.builtin_gather != NULL;
3500 bool maybe_scatter
3501 = DR_IS_WRITE (dr)
3502 && !TREE_THIS_VOLATILE (DR_REF (dr))
3503 && targetm.vectorize.builtin_scatter != NULL;
3504 bool maybe_simd_lane_access
3505 = is_a <loop_vec_info> (vinfo) && loop->simduid;
3507 /* If target supports vector gather loads or scatter stores, or if
3508 this might be a SIMD lane access, see if they can't be used. */
3509 if (is_a <loop_vec_info> (vinfo)
3510 && (maybe_gather || maybe_scatter || maybe_simd_lane_access)
3511 && !nested_in_vect_loop_p (loop, stmt))
3513 struct data_reference *newdr
3514 = create_data_ref (NULL, loop_containing_stmt (stmt),
3515 DR_REF (dr), stmt, !maybe_scatter,
3516 DR_IS_CONDITIONAL_IN_STMT (dr));
3517 gcc_assert (newdr != NULL && DR_REF (newdr));
3518 if (DR_BASE_ADDRESS (newdr)
3519 && DR_OFFSET (newdr)
3520 && DR_INIT (newdr)
3521 && DR_STEP (newdr)
3522 && integer_zerop (DR_STEP (newdr)))
3524 if (maybe_simd_lane_access)
3526 tree off = DR_OFFSET (newdr);
3527 STRIP_NOPS (off);
3528 if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
3529 && TREE_CODE (off) == MULT_EXPR
3530 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
3532 tree step = TREE_OPERAND (off, 1);
3533 off = TREE_OPERAND (off, 0);
3534 STRIP_NOPS (off);
3535 if (CONVERT_EXPR_P (off)
3536 && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
3537 0)))
3538 < TYPE_PRECISION (TREE_TYPE (off)))
3539 off = TREE_OPERAND (off, 0);
3540 if (TREE_CODE (off) == SSA_NAME)
3542 gimple *def = SSA_NAME_DEF_STMT (off);
3543 tree reft = TREE_TYPE (DR_REF (newdr));
3544 if (is_gimple_call (def)
3545 && gimple_call_internal_p (def)
3546 && (gimple_call_internal_fn (def)
3547 == IFN_GOMP_SIMD_LANE))
3549 tree arg = gimple_call_arg (def, 0);
3550 gcc_assert (TREE_CODE (arg) == SSA_NAME);
3551 arg = SSA_NAME_VAR (arg);
3552 if (arg == loop->simduid
3553 /* For now. */
3554 && tree_int_cst_equal
3555 (TYPE_SIZE_UNIT (reft),
3556 step))
3558 DR_OFFSET (newdr) = ssize_int (0);
3559 DR_STEP (newdr) = step;
3560 DR_OFFSET_ALIGNMENT (newdr)
3561 = BIGGEST_ALIGNMENT;
3562 DR_STEP_ALIGNMENT (newdr)
3563 = highest_pow2_factor (step);
3564 dr = newdr;
3565 simd_lane_access = true;
3571 if (!simd_lane_access && (maybe_gather || maybe_scatter))
3573 dr = newdr;
3574 if (maybe_gather)
3575 gatherscatter = GATHER;
3576 else
3577 gatherscatter = SCATTER;
3580 if (gatherscatter == SG_NONE && !simd_lane_access)
3581 free_data_ref (newdr);
3584 if (gatherscatter == SG_NONE && !simd_lane_access)
3586 if (dump_enabled_p ())
3588 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3589 "not vectorized: data ref analysis "
3590 "failed ");
3591 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3594 if (is_a <bb_vec_info> (vinfo))
3595 break;
3597 return false;
3601 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
3603 if (dump_enabled_p ())
3604 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3605 "not vectorized: base addr of dr is a "
3606 "constant\n");
3608 if (is_a <bb_vec_info> (vinfo))
3609 break;
3611 if (gatherscatter != SG_NONE || simd_lane_access)
3612 free_data_ref (dr);
3613 return false;
3616 if (TREE_THIS_VOLATILE (DR_REF (dr)))
3618 if (dump_enabled_p ())
3620 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3621 "not vectorized: volatile type ");
3622 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3625 if (is_a <bb_vec_info> (vinfo))
3626 break;
3628 return false;
3631 if (stmt_can_throw_internal (stmt))
3633 if (dump_enabled_p ())
3635 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3636 "not vectorized: statement can throw an "
3637 "exception ");
3638 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3641 if (is_a <bb_vec_info> (vinfo))
3642 break;
3644 if (gatherscatter != SG_NONE || simd_lane_access)
3645 free_data_ref (dr);
3646 return false;
3649 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
3650 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
3652 if (dump_enabled_p ())
3654 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3655 "not vectorized: statement is bitfield "
3656 "access ");
3657 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3660 if (is_a <bb_vec_info> (vinfo))
3661 break;
3663 if (gatherscatter != SG_NONE || simd_lane_access)
3664 free_data_ref (dr);
3665 return false;
3668 base = unshare_expr (DR_BASE_ADDRESS (dr));
3669 offset = unshare_expr (DR_OFFSET (dr));
3670 init = unshare_expr (DR_INIT (dr));
3672 if (is_gimple_call (stmt)
3673 && (!gimple_call_internal_p (stmt)
3674 || (gimple_call_internal_fn (stmt) != IFN_MASK_LOAD
3675 && gimple_call_internal_fn (stmt) != IFN_MASK_STORE)))
3677 if (dump_enabled_p ())
3679 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3680 "not vectorized: dr in a call ");
3681 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3684 if (is_a <bb_vec_info> (vinfo))
3685 break;
3687 if (gatherscatter != SG_NONE || simd_lane_access)
3688 free_data_ref (dr);
3689 return false;
3692 /* Update DR field in stmt_vec_info struct. */
3694 /* If the dataref is in an inner-loop of the loop that is considered for
3695 for vectorization, we also want to analyze the access relative to
3696 the outer-loop (DR contains information only relative to the
3697 inner-most enclosing loop). We do that by building a reference to the
3698 first location accessed by the inner-loop, and analyze it relative to
3699 the outer-loop. */
3700 if (loop && nested_in_vect_loop_p (loop, stmt))
3702 /* Build a reference to the first location accessed by the
3703 inner loop: *(BASE + INIT + OFFSET). By construction,
3704 this address must be invariant in the inner loop, so we
3705 can consider it as being used in the outer loop. */
3706 tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
3707 init, offset);
3708 tree init_addr = fold_build_pointer_plus (base, init_offset);
3709 tree init_ref = build_fold_indirect_ref (init_addr);
3711 if (dump_enabled_p ())
3713 dump_printf_loc (MSG_NOTE, vect_location,
3714 "analyze in outer loop: ");
3715 dump_generic_expr (MSG_NOTE, TDF_SLIM, init_ref);
3716 dump_printf (MSG_NOTE, "\n");
3719 if (!dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
3720 init_ref, loop))
3721 /* dr_analyze_innermost already explained the failure. */
3722 return false;
3724 if (dump_enabled_p ())
3726 dump_printf_loc (MSG_NOTE, vect_location,
3727 "\touter base_address: ");
3728 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3729 STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3730 dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
3731 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3732 STMT_VINFO_DR_OFFSET (stmt_info));
3733 dump_printf (MSG_NOTE,
3734 "\n\touter constant offset from base address: ");
3735 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3736 STMT_VINFO_DR_INIT (stmt_info));
3737 dump_printf (MSG_NOTE, "\n\touter step: ");
3738 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3739 STMT_VINFO_DR_STEP (stmt_info));
3740 dump_printf (MSG_NOTE, "\n\touter base alignment: %d\n",
3741 STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info));
3742 dump_printf (MSG_NOTE, "\n\touter base misalignment: %d\n",
3743 STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info));
3744 dump_printf (MSG_NOTE, "\n\touter offset alignment: %d\n",
3745 STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info));
3746 dump_printf (MSG_NOTE, "\n\touter step alignment: %d\n",
3747 STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
3751 if (STMT_VINFO_DATA_REF (stmt_info))
3753 if (dump_enabled_p ())
3755 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3756 "not vectorized: more than one data ref "
3757 "in stmt: ");
3758 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3761 if (is_a <bb_vec_info> (vinfo))
3762 break;
3764 if (gatherscatter != SG_NONE || simd_lane_access)
3765 free_data_ref (dr);
3766 return false;
3769 STMT_VINFO_DATA_REF (stmt_info) = dr;
3770 if (simd_lane_access)
3772 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
3773 free_data_ref (datarefs[i]);
3774 datarefs[i] = dr;
3777 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == ADDR_EXPR
3778 && VAR_P (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0))
3779 && DECL_NONALIASED (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0)))
3781 if (dump_enabled_p ())
3783 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3784 "not vectorized: base object not addressable "
3785 "for stmt: ");
3786 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3788 if (is_a <bb_vec_info> (vinfo))
3790 /* In BB vectorization the ref can still participate
3791 in dependence analysis, we just can't vectorize it. */
3792 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
3793 continue;
3795 return false;
3798 /* Set vectype for STMT. */
3799 scalar_type = TREE_TYPE (DR_REF (dr));
3800 STMT_VINFO_VECTYPE (stmt_info)
3801 = get_vectype_for_scalar_type (scalar_type);
3802 if (!STMT_VINFO_VECTYPE (stmt_info))
3804 if (dump_enabled_p ())
3806 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3807 "not vectorized: no vectype for stmt: ");
3808 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3809 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
3810 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
3811 scalar_type);
3812 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3815 if (is_a <bb_vec_info> (vinfo))
3817 /* No vector type is fine, the ref can still participate
3818 in dependence analysis, we just can't vectorize it. */
3819 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
3820 continue;
3823 if (gatherscatter != SG_NONE || simd_lane_access)
3825 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3826 if (gatherscatter != SG_NONE)
3827 free_data_ref (dr);
3829 return false;
3831 else
3833 if (dump_enabled_p ())
3835 dump_printf_loc (MSG_NOTE, vect_location,
3836 "got vectype for stmt: ");
3837 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3838 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3839 STMT_VINFO_VECTYPE (stmt_info));
3840 dump_printf (MSG_NOTE, "\n");
3844 /* Adjust the minimal vectorization factor according to the
3845 vector type. */
3846 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
3847 if (vf > *min_vf)
3848 *min_vf = vf;
3850 if (gatherscatter != SG_NONE)
3852 gather_scatter_info gs_info;
3853 if (!vect_check_gather_scatter (stmt, as_a <loop_vec_info> (vinfo),
3854 &gs_info)
3855 || !get_vectype_for_scalar_type (TREE_TYPE (gs_info.offset)))
3857 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3858 free_data_ref (dr);
3859 if (dump_enabled_p ())
3861 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3862 (gatherscatter == GATHER) ?
3863 "not vectorized: not suitable for gather "
3864 "load " :
3865 "not vectorized: not suitable for scatter "
3866 "store ");
3867 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3869 return false;
3872 free_data_ref (datarefs[i]);
3873 datarefs[i] = dr;
3874 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
3877 else if (is_a <loop_vec_info> (vinfo)
3878 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
3880 if (nested_in_vect_loop_p (loop, stmt))
3882 if (dump_enabled_p ())
3884 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3885 "not vectorized: not suitable for strided "
3886 "load ");
3887 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3889 return false;
3891 STMT_VINFO_STRIDED_P (stmt_info) = true;
3895 /* If we stopped analysis at the first dataref we could not analyze
3896 when trying to vectorize a basic-block mark the rest of the datarefs
3897 as not vectorizable and truncate the vector of datarefs. That
3898 avoids spending useless time in analyzing their dependence. */
3899 if (i != datarefs.length ())
3901 gcc_assert (is_a <bb_vec_info> (vinfo));
3902 for (unsigned j = i; j < datarefs.length (); ++j)
3904 data_reference_p dr = datarefs[j];
3905 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
3906 free_data_ref (dr);
3908 datarefs.truncate (i);
3911 return true;
3915 /* Function vect_get_new_vect_var.
3917 Returns a name for a new variable. The current naming scheme appends the
3918 prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
3919 the name of vectorizer generated variables, and appends that to NAME if
3920 provided. */
3922 tree
3923 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
3925 const char *prefix;
3926 tree new_vect_var;
3928 switch (var_kind)
3930 case vect_simple_var:
3931 prefix = "vect";
3932 break;
3933 case vect_scalar_var:
3934 prefix = "stmp";
3935 break;
3936 case vect_mask_var:
3937 prefix = "mask";
3938 break;
3939 case vect_pointer_var:
3940 prefix = "vectp";
3941 break;
3942 default:
3943 gcc_unreachable ();
3946 if (name)
3948 char* tmp = concat (prefix, "_", name, NULL);
3949 new_vect_var = create_tmp_reg (type, tmp);
3950 free (tmp);
3952 else
3953 new_vect_var = create_tmp_reg (type, prefix);
3955 return new_vect_var;
3958 /* Like vect_get_new_vect_var but return an SSA name. */
3960 tree
3961 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
3963 const char *prefix;
3964 tree new_vect_var;
3966 switch (var_kind)
3968 case vect_simple_var:
3969 prefix = "vect";
3970 break;
3971 case vect_scalar_var:
3972 prefix = "stmp";
3973 break;
3974 case vect_pointer_var:
3975 prefix = "vectp";
3976 break;
3977 default:
3978 gcc_unreachable ();
3981 if (name)
3983 char* tmp = concat (prefix, "_", name, NULL);
3984 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
3985 free (tmp);
3987 else
3988 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
3990 return new_vect_var;
3993 /* Duplicate ptr info and set alignment/misaligment on NAME from DR. */
3995 static void
3996 vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr,
3997 stmt_vec_info stmt_info)
3999 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr));
4000 unsigned int align = TYPE_ALIGN_UNIT (STMT_VINFO_VECTYPE (stmt_info));
4001 int misalign = DR_MISALIGNMENT (dr);
4002 if (misalign == DR_MISALIGNMENT_UNKNOWN)
4003 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4004 else
4005 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name), align, misalign);
4008 /* Function vect_create_addr_base_for_vector_ref.
4010 Create an expression that computes the address of the first memory location
4011 that will be accessed for a data reference.
4013 Input:
4014 STMT: The statement containing the data reference.
4015 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4016 OFFSET: Optional. If supplied, it is be added to the initial address.
4017 LOOP: Specify relative to which loop-nest should the address be computed.
4018 For example, when the dataref is in an inner-loop nested in an
4019 outer-loop that is now being vectorized, LOOP can be either the
4020 outer-loop, or the inner-loop. The first memory location accessed
4021 by the following dataref ('in' points to short):
4023 for (i=0; i<N; i++)
4024 for (j=0; j<M; j++)
4025 s += in[i+j]
4027 is as follows:
4028 if LOOP=i_loop: &in (relative to i_loop)
4029 if LOOP=j_loop: &in+i*2B (relative to j_loop)
4030 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
4031 initial address. Unlike OFFSET, which is number of elements to
4032 be added, BYTE_OFFSET is measured in bytes.
4034 Output:
4035 1. Return an SSA_NAME whose value is the address of the memory location of
4036 the first vector of the data reference.
4037 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4038 these statement(s) which define the returned SSA_NAME.
4040 FORNOW: We are only handling array accesses with step 1. */
4042 tree
4043 vect_create_addr_base_for_vector_ref (gimple *stmt,
4044 gimple_seq *new_stmt_list,
4045 tree offset,
4046 tree byte_offset)
4048 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4049 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4050 const char *base_name;
4051 tree addr_base;
4052 tree dest;
4053 gimple_seq seq = NULL;
4054 tree vect_ptr_type;
4055 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
4056 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4057 innermost_loop_behavior *drb = vect_dr_behavior (dr);
4059 tree data_ref_base = unshare_expr (drb->base_address);
4060 tree base_offset = unshare_expr (drb->offset);
4061 tree init = unshare_expr (drb->init);
4063 if (loop_vinfo)
4064 base_name = get_name (data_ref_base);
4065 else
4067 base_offset = ssize_int (0);
4068 init = ssize_int (0);
4069 base_name = get_name (DR_REF (dr));
4072 /* Create base_offset */
4073 base_offset = size_binop (PLUS_EXPR,
4074 fold_convert (sizetype, base_offset),
4075 fold_convert (sizetype, init));
4077 if (offset)
4079 offset = fold_build2 (MULT_EXPR, sizetype,
4080 fold_convert (sizetype, offset), step);
4081 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4082 base_offset, offset);
4084 if (byte_offset)
4086 byte_offset = fold_convert (sizetype, byte_offset);
4087 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4088 base_offset, byte_offset);
4091 /* base + base_offset */
4092 if (loop_vinfo)
4093 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4094 else
4096 addr_base = build1 (ADDR_EXPR,
4097 build_pointer_type (TREE_TYPE (DR_REF (dr))),
4098 unshare_expr (DR_REF (dr)));
4101 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4102 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4103 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4104 gimple_seq_add_seq (new_stmt_list, seq);
4106 if (DR_PTR_INFO (dr)
4107 && TREE_CODE (addr_base) == SSA_NAME
4108 && !SSA_NAME_PTR_INFO (addr_base))
4110 vect_duplicate_ssa_name_ptr_info (addr_base, dr, stmt_info);
4111 if (offset || byte_offset)
4112 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4115 if (dump_enabled_p ())
4117 dump_printf_loc (MSG_NOTE, vect_location, "created ");
4118 dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
4119 dump_printf (MSG_NOTE, "\n");
4122 return addr_base;
4126 /* Function vect_create_data_ref_ptr.
4128 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4129 location accessed in the loop by STMT, along with the def-use update
4130 chain to appropriately advance the pointer through the loop iterations.
4131 Also set aliasing information for the pointer. This pointer is used by
4132 the callers to this function to create a memory reference expression for
4133 vector load/store access.
4135 Input:
4136 1. STMT: a stmt that references memory. Expected to be of the form
4137 GIMPLE_ASSIGN <name, data-ref> or
4138 GIMPLE_ASSIGN <data-ref, name>.
4139 2. AGGR_TYPE: the type of the reference, which should be either a vector
4140 or an array.
4141 3. AT_LOOP: the loop where the vector memref is to be created.
4142 4. OFFSET (optional): an offset to be added to the initial address accessed
4143 by the data-ref in STMT.
4144 5. BSI: location where the new stmts are to be placed if there is no loop
4145 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4146 pointing to the initial address.
4147 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4148 to the initial address accessed by the data-ref in STMT. This is
4149 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4150 in bytes.
4152 Output:
4153 1. Declare a new ptr to vector_type, and have it point to the base of the
4154 data reference (initial addressed accessed by the data reference).
4155 For example, for vector of type V8HI, the following code is generated:
4157 v8hi *ap;
4158 ap = (v8hi *)initial_address;
4160 if OFFSET is not supplied:
4161 initial_address = &a[init];
4162 if OFFSET is supplied:
4163 initial_address = &a[init + OFFSET];
4164 if BYTE_OFFSET is supplied:
4165 initial_address = &a[init] + BYTE_OFFSET;
4167 Return the initial_address in INITIAL_ADDRESS.
4169 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4170 update the pointer in each iteration of the loop.
4172 Return the increment stmt that updates the pointer in PTR_INCR.
4174 3. Set INV_P to true if the access pattern of the data reference in the
4175 vectorized loop is invariant. Set it to false otherwise.
4177 4. Return the pointer. */
4179 tree
4180 vect_create_data_ref_ptr (gimple *stmt, tree aggr_type, struct loop *at_loop,
4181 tree offset, tree *initial_address,
4182 gimple_stmt_iterator *gsi, gimple **ptr_incr,
4183 bool only_init, bool *inv_p, tree byte_offset)
4185 const char *base_name;
4186 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4187 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4188 struct loop *loop = NULL;
4189 bool nested_in_vect_loop = false;
4190 struct loop *containing_loop = NULL;
4191 tree aggr_ptr_type;
4192 tree aggr_ptr;
4193 tree new_temp;
4194 gimple_seq new_stmt_list = NULL;
4195 edge pe = NULL;
4196 basic_block new_bb;
4197 tree aggr_ptr_init;
4198 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4199 tree aptr;
4200 gimple_stmt_iterator incr_gsi;
4201 bool insert_after;
4202 tree indx_before_incr, indx_after_incr;
4203 gimple *incr;
4204 tree step;
4205 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4207 gcc_assert (TREE_CODE (aggr_type) == ARRAY_TYPE
4208 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4210 if (loop_vinfo)
4212 loop = LOOP_VINFO_LOOP (loop_vinfo);
4213 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4214 containing_loop = (gimple_bb (stmt))->loop_father;
4215 pe = loop_preheader_edge (loop);
4217 else
4219 gcc_assert (bb_vinfo);
4220 only_init = true;
4221 *ptr_incr = NULL;
4224 /* Check the step (evolution) of the load in LOOP, and record
4225 whether it's invariant. */
4226 step = vect_dr_behavior (dr)->step;
4227 if (integer_zerop (step))
4228 *inv_p = true;
4229 else
4230 *inv_p = false;
4232 /* Create an expression for the first address accessed by this load
4233 in LOOP. */
4234 base_name = get_name (DR_BASE_ADDRESS (dr));
4236 if (dump_enabled_p ())
4238 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4239 dump_printf_loc (MSG_NOTE, vect_location,
4240 "create %s-pointer variable to type: ",
4241 get_tree_code_name (TREE_CODE (aggr_type)));
4242 dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4243 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4244 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4245 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4246 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4247 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4248 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4249 else
4250 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4251 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4252 dump_printf (MSG_NOTE, "\n");
4255 /* (1) Create the new aggregate-pointer variable.
4256 Vector and array types inherit the alias set of their component
4257 type by default so we need to use a ref-all pointer if the data
4258 reference does not conflict with the created aggregated data
4259 reference because it is not addressable. */
4260 bool need_ref_all = false;
4261 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4262 get_alias_set (DR_REF (dr))))
4263 need_ref_all = true;
4264 /* Likewise for any of the data references in the stmt group. */
4265 else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
4267 gimple *orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
4270 stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
4271 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4272 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4273 get_alias_set (DR_REF (sdr))))
4275 need_ref_all = true;
4276 break;
4278 orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);
4280 while (orig_stmt);
4282 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4283 need_ref_all);
4284 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4287 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4288 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4289 def-use update cycles for the pointer: one relative to the outer-loop
4290 (LOOP), which is what steps (3) and (4) below do. The other is relative
4291 to the inner-loop (which is the inner-most loop containing the dataref),
4292 and this is done be step (5) below.
4294 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4295 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4296 redundant. Steps (3),(4) create the following:
4298 vp0 = &base_addr;
4299 LOOP: vp1 = phi(vp0,vp2)
4302 vp2 = vp1 + step
4303 goto LOOP
4305 If there is an inner-loop nested in loop, then step (5) will also be
4306 applied, and an additional update in the inner-loop will be created:
4308 vp0 = &base_addr;
4309 LOOP: vp1 = phi(vp0,vp2)
4311 inner: vp3 = phi(vp1,vp4)
4312 vp4 = vp3 + inner_step
4313 if () goto inner
4315 vp2 = vp1 + step
4316 if () goto LOOP */
4318 /* (2) Calculate the initial address of the aggregate-pointer, and set
4319 the aggregate-pointer to point to it before the loop. */
4321 /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader. */
4323 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
4324 offset, byte_offset);
4325 if (new_stmt_list)
4327 if (pe)
4329 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4330 gcc_assert (!new_bb);
4332 else
4333 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4336 *initial_address = new_temp;
4337 aggr_ptr_init = new_temp;
4339 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4340 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4341 inner-loop nested in LOOP (during outer-loop vectorization). */
4343 /* No update in loop is required. */
4344 if (only_init && (!loop_vinfo || at_loop == loop))
4345 aptr = aggr_ptr_init;
4346 else
4348 /* The step of the aggregate pointer is the type size. */
4349 tree iv_step = TYPE_SIZE_UNIT (aggr_type);
4350 /* One exception to the above is when the scalar step of the load in
4351 LOOP is zero. In this case the step here is also zero. */
4352 if (*inv_p)
4353 iv_step = size_zero_node;
4354 else if (tree_int_cst_sgn (step) == -1)
4355 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4357 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4359 create_iv (aggr_ptr_init,
4360 fold_convert (aggr_ptr_type, iv_step),
4361 aggr_ptr, loop, &incr_gsi, insert_after,
4362 &indx_before_incr, &indx_after_incr);
4363 incr = gsi_stmt (incr_gsi);
4364 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4366 /* Copy the points-to information if it exists. */
4367 if (DR_PTR_INFO (dr))
4369 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info);
4370 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info);
4372 if (ptr_incr)
4373 *ptr_incr = incr;
4375 aptr = indx_before_incr;
4378 if (!nested_in_vect_loop || only_init)
4379 return aptr;
4382 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4383 nested in LOOP, if exists. */
4385 gcc_assert (nested_in_vect_loop);
4386 if (!only_init)
4388 standard_iv_increment_position (containing_loop, &incr_gsi,
4389 &insert_after);
4390 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4391 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4392 &indx_after_incr);
4393 incr = gsi_stmt (incr_gsi);
4394 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4396 /* Copy the points-to information if it exists. */
4397 if (DR_PTR_INFO (dr))
4399 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info);
4400 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info);
4402 if (ptr_incr)
4403 *ptr_incr = incr;
4405 return indx_before_incr;
4407 else
4408 gcc_unreachable ();
4412 /* Function bump_vector_ptr
4414 Increment a pointer (to a vector type) by vector-size. If requested,
4415 i.e. if PTR-INCR is given, then also connect the new increment stmt
4416 to the existing def-use update-chain of the pointer, by modifying
4417 the PTR_INCR as illustrated below:
4419 The pointer def-use update-chain before this function:
4420 DATAREF_PTR = phi (p_0, p_2)
4421 ....
4422 PTR_INCR: p_2 = DATAREF_PTR + step
4424 The pointer def-use update-chain after this function:
4425 DATAREF_PTR = phi (p_0, p_2)
4426 ....
4427 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4428 ....
4429 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4431 Input:
4432 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4433 in the loop.
4434 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4435 the loop. The increment amount across iterations is expected
4436 to be vector_size.
4437 BSI - location where the new update stmt is to be placed.
4438 STMT - the original scalar memory-access stmt that is being vectorized.
4439 BUMP - optional. The offset by which to bump the pointer. If not given,
4440 the offset is assumed to be vector_size.
4442 Output: Return NEW_DATAREF_PTR as illustrated above.
4446 tree
4447 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4448 gimple *stmt, tree bump)
4450 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4451 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4452 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4453 tree update = TYPE_SIZE_UNIT (vectype);
4454 gassign *incr_stmt;
4455 ssa_op_iter iter;
4456 use_operand_p use_p;
4457 tree new_dataref_ptr;
4459 if (bump)
4460 update = bump;
4462 if (TREE_CODE (dataref_ptr) == SSA_NAME)
4463 new_dataref_ptr = copy_ssa_name (dataref_ptr);
4464 else
4465 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4466 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4467 dataref_ptr, update);
4468 vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4470 /* Copy the points-to information if it exists. */
4471 if (DR_PTR_INFO (dr))
4473 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4474 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4477 if (!ptr_incr)
4478 return new_dataref_ptr;
4480 /* Update the vector-pointer's cross-iteration increment. */
4481 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4483 tree use = USE_FROM_PTR (use_p);
4485 if (use == dataref_ptr)
4486 SET_USE (use_p, new_dataref_ptr);
4487 else
4488 gcc_assert (tree_int_cst_compare (use, update) == 0);
4491 return new_dataref_ptr;
4495 /* Function vect_create_destination_var.
4497 Create a new temporary of type VECTYPE. */
4499 tree
4500 vect_create_destination_var (tree scalar_dest, tree vectype)
4502 tree vec_dest;
4503 const char *name;
4504 char *new_name;
4505 tree type;
4506 enum vect_var_kind kind;
4508 kind = vectype
4509 ? VECTOR_BOOLEAN_TYPE_P (vectype)
4510 ? vect_mask_var
4511 : vect_simple_var
4512 : vect_scalar_var;
4513 type = vectype ? vectype : TREE_TYPE (scalar_dest);
4515 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
4517 name = get_name (scalar_dest);
4518 if (name)
4519 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
4520 else
4521 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
4522 vec_dest = vect_get_new_vect_var (type, kind, new_name);
4523 free (new_name);
4525 return vec_dest;
4528 /* Function vect_grouped_store_supported.
4530 Returns TRUE if interleave high and interleave low permutations
4531 are supported, and FALSE otherwise. */
4533 bool
4534 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
4536 machine_mode mode = TYPE_MODE (vectype);
4538 /* vect_permute_store_chain requires the group size to be equal to 3 or
4539 be a power of two. */
4540 if (count != 3 && exact_log2 (count) == -1)
4542 if (dump_enabled_p ())
4543 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4544 "the size of the group of accesses"
4545 " is not a power of 2 or not eqaul to 3\n");
4546 return false;
4549 /* Check that the permutation is supported. */
4550 if (VECTOR_MODE_P (mode))
4552 unsigned int i, nelt = GET_MODE_NUNITS (mode);
4553 auto_vec_perm_indices sel (nelt);
4554 sel.quick_grow (nelt);
4556 if (count == 3)
4558 unsigned int j0 = 0, j1 = 0, j2 = 0;
4559 unsigned int i, j;
4561 for (j = 0; j < 3; j++)
4563 int nelt0 = ((3 - j) * nelt) % 3;
4564 int nelt1 = ((3 - j) * nelt + 1) % 3;
4565 int nelt2 = ((3 - j) * nelt + 2) % 3;
4566 for (i = 0; i < nelt; i++)
4568 if (3 * i + nelt0 < nelt)
4569 sel[3 * i + nelt0] = j0++;
4570 if (3 * i + nelt1 < nelt)
4571 sel[3 * i + nelt1] = nelt + j1++;
4572 if (3 * i + nelt2 < nelt)
4573 sel[3 * i + nelt2] = 0;
4575 if (!can_vec_perm_p (mode, false, &sel))
4577 if (dump_enabled_p ())
4578 dump_printf (MSG_MISSED_OPTIMIZATION,
4579 "permutaion op not supported by target.\n");
4580 return false;
4583 for (i = 0; i < nelt; i++)
4585 if (3 * i + nelt0 < nelt)
4586 sel[3 * i + nelt0] = 3 * i + nelt0;
4587 if (3 * i + nelt1 < nelt)
4588 sel[3 * i + nelt1] = 3 * i + nelt1;
4589 if (3 * i + nelt2 < nelt)
4590 sel[3 * i + nelt2] = nelt + j2++;
4592 if (!can_vec_perm_p (mode, false, &sel))
4594 if (dump_enabled_p ())
4595 dump_printf (MSG_MISSED_OPTIMIZATION,
4596 "permutaion op not supported by target.\n");
4597 return false;
4600 return true;
4602 else
4604 /* If length is not equal to 3 then only power of 2 is supported. */
4605 gcc_assert (pow2p_hwi (count));
4607 for (i = 0; i < nelt / 2; i++)
4609 sel[i * 2] = i;
4610 sel[i * 2 + 1] = i + nelt;
4612 if (can_vec_perm_p (mode, false, &sel))
4614 for (i = 0; i < nelt; i++)
4615 sel[i] += nelt / 2;
4616 if (can_vec_perm_p (mode, false, &sel))
4617 return true;
4622 if (dump_enabled_p ())
4623 dump_printf (MSG_MISSED_OPTIMIZATION,
4624 "permutaion op not supported by target.\n");
4625 return false;
4629 /* Return TRUE if vec_store_lanes is available for COUNT vectors of
4630 type VECTYPE. */
4632 bool
4633 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
4635 return vect_lanes_optab_supported_p ("vec_store_lanes",
4636 vec_store_lanes_optab,
4637 vectype, count);
4641 /* Function vect_permute_store_chain.
4643 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4644 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
4645 the data correctly for the stores. Return the final references for stores
4646 in RESULT_CHAIN.
4648 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4649 The input is 4 vectors each containing 8 elements. We assign a number to
4650 each element, the input sequence is:
4652 1st vec: 0 1 2 3 4 5 6 7
4653 2nd vec: 8 9 10 11 12 13 14 15
4654 3rd vec: 16 17 18 19 20 21 22 23
4655 4th vec: 24 25 26 27 28 29 30 31
4657 The output sequence should be:
4659 1st vec: 0 8 16 24 1 9 17 25
4660 2nd vec: 2 10 18 26 3 11 19 27
4661 3rd vec: 4 12 20 28 5 13 21 30
4662 4th vec: 6 14 22 30 7 15 23 31
4664 i.e., we interleave the contents of the four vectors in their order.
4666 We use interleave_high/low instructions to create such output. The input of
4667 each interleave_high/low operation is two vectors:
4668 1st vec 2nd vec
4669 0 1 2 3 4 5 6 7
4670 the even elements of the result vector are obtained left-to-right from the
4671 high/low elements of the first vector. The odd elements of the result are
4672 obtained left-to-right from the high/low elements of the second vector.
4673 The output of interleave_high will be: 0 4 1 5
4674 and of interleave_low: 2 6 3 7
4677 The permutation is done in log LENGTH stages. In each stage interleave_high
4678 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4679 where the first argument is taken from the first half of DR_CHAIN and the
4680 second argument from it's second half.
4681 In our example,
4683 I1: interleave_high (1st vec, 3rd vec)
4684 I2: interleave_low (1st vec, 3rd vec)
4685 I3: interleave_high (2nd vec, 4th vec)
4686 I4: interleave_low (2nd vec, 4th vec)
4688 The output for the first stage is:
4690 I1: 0 16 1 17 2 18 3 19
4691 I2: 4 20 5 21 6 22 7 23
4692 I3: 8 24 9 25 10 26 11 27
4693 I4: 12 28 13 29 14 30 15 31
4695 The output of the second stage, i.e. the final result is:
4697 I1: 0 8 16 24 1 9 17 25
4698 I2: 2 10 18 26 3 11 19 27
4699 I3: 4 12 20 28 5 13 21 30
4700 I4: 6 14 22 30 7 15 23 31. */
4702 void
4703 vect_permute_store_chain (vec<tree> dr_chain,
4704 unsigned int length,
4705 gimple *stmt,
4706 gimple_stmt_iterator *gsi,
4707 vec<tree> *result_chain)
4709 tree vect1, vect2, high, low;
4710 gimple *perm_stmt;
4711 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4712 tree perm_mask_low, perm_mask_high;
4713 tree data_ref;
4714 tree perm3_mask_low, perm3_mask_high;
4715 unsigned int i, n, log_length = exact_log2 (length);
4716 unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype);
4718 auto_vec_perm_indices sel (nelt);
4719 sel.quick_grow (nelt);
4721 result_chain->quick_grow (length);
4722 memcpy (result_chain->address (), dr_chain.address (),
4723 length * sizeof (tree));
4725 if (length == 3)
4727 unsigned int j0 = 0, j1 = 0, j2 = 0;
4729 for (j = 0; j < 3; j++)
4731 int nelt0 = ((3 - j) * nelt) % 3;
4732 int nelt1 = ((3 - j) * nelt + 1) % 3;
4733 int nelt2 = ((3 - j) * nelt + 2) % 3;
4735 for (i = 0; i < nelt; i++)
4737 if (3 * i + nelt0 < nelt)
4738 sel[3 * i + nelt0] = j0++;
4739 if (3 * i + nelt1 < nelt)
4740 sel[3 * i + nelt1] = nelt + j1++;
4741 if (3 * i + nelt2 < nelt)
4742 sel[3 * i + nelt2] = 0;
4744 perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
4746 for (i = 0; i < nelt; i++)
4748 if (3 * i + nelt0 < nelt)
4749 sel[3 * i + nelt0] = 3 * i + nelt0;
4750 if (3 * i + nelt1 < nelt)
4751 sel[3 * i + nelt1] = 3 * i + nelt1;
4752 if (3 * i + nelt2 < nelt)
4753 sel[3 * i + nelt2] = nelt + j2++;
4755 perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
4757 vect1 = dr_chain[0];
4758 vect2 = dr_chain[1];
4760 /* Create interleaving stmt:
4761 low = VEC_PERM_EXPR <vect1, vect2,
4762 {j, nelt, *, j + 1, nelt + j + 1, *,
4763 j + 2, nelt + j + 2, *, ...}> */
4764 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
4765 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4766 vect2, perm3_mask_low);
4767 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4769 vect1 = data_ref;
4770 vect2 = dr_chain[2];
4771 /* Create interleaving stmt:
4772 low = VEC_PERM_EXPR <vect1, vect2,
4773 {0, 1, nelt + j, 3, 4, nelt + j + 1,
4774 6, 7, nelt + j + 2, ...}> */
4775 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
4776 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4777 vect2, perm3_mask_high);
4778 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4779 (*result_chain)[j] = data_ref;
4782 else
4784 /* If length is not equal to 3 then only power of 2 is supported. */
4785 gcc_assert (pow2p_hwi (length));
4787 for (i = 0, n = nelt / 2; i < n; i++)
4789 sel[i * 2] = i;
4790 sel[i * 2 + 1] = i + nelt;
4792 perm_mask_high = vect_gen_perm_mask_checked (vectype, sel);
4794 for (i = 0; i < nelt; i++)
4795 sel[i] += nelt / 2;
4796 perm_mask_low = vect_gen_perm_mask_checked (vectype, sel);
4798 for (i = 0, n = log_length; i < n; i++)
4800 for (j = 0; j < length/2; j++)
4802 vect1 = dr_chain[j];
4803 vect2 = dr_chain[j+length/2];
4805 /* Create interleaving stmt:
4806 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
4807 ...}> */
4808 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
4809 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
4810 vect2, perm_mask_high);
4811 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4812 (*result_chain)[2*j] = high;
4814 /* Create interleaving stmt:
4815 low = VEC_PERM_EXPR <vect1, vect2,
4816 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
4817 ...}> */
4818 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
4819 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
4820 vect2, perm_mask_low);
4821 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4822 (*result_chain)[2*j+1] = low;
4824 memcpy (dr_chain.address (), result_chain->address (),
4825 length * sizeof (tree));
4830 /* Function vect_setup_realignment
4832 This function is called when vectorizing an unaligned load using
4833 the dr_explicit_realign[_optimized] scheme.
4834 This function generates the following code at the loop prolog:
4836 p = initial_addr;
4837 x msq_init = *(floor(p)); # prolog load
4838 realignment_token = call target_builtin;
4839 loop:
4840 x msq = phi (msq_init, ---)
4842 The stmts marked with x are generated only for the case of
4843 dr_explicit_realign_optimized.
4845 The code above sets up a new (vector) pointer, pointing to the first
4846 location accessed by STMT, and a "floor-aligned" load using that pointer.
4847 It also generates code to compute the "realignment-token" (if the relevant
4848 target hook was defined), and creates a phi-node at the loop-header bb
4849 whose arguments are the result of the prolog-load (created by this
4850 function) and the result of a load that takes place in the loop (to be
4851 created by the caller to this function).
4853 For the case of dr_explicit_realign_optimized:
4854 The caller to this function uses the phi-result (msq) to create the
4855 realignment code inside the loop, and sets up the missing phi argument,
4856 as follows:
4857 loop:
4858 msq = phi (msq_init, lsq)
4859 lsq = *(floor(p')); # load in loop
4860 result = realign_load (msq, lsq, realignment_token);
4862 For the case of dr_explicit_realign:
4863 loop:
4864 msq = *(floor(p)); # load in loop
4865 p' = p + (VS-1);
4866 lsq = *(floor(p')); # load in loop
4867 result = realign_load (msq, lsq, realignment_token);
4869 Input:
4870 STMT - (scalar) load stmt to be vectorized. This load accesses
4871 a memory location that may be unaligned.
4872 BSI - place where new code is to be inserted.
4873 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
4874 is used.
4876 Output:
4877 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
4878 target hook, if defined.
4879 Return value - the result of the loop-header phi node. */
4881 tree
4882 vect_setup_realignment (gimple *stmt, gimple_stmt_iterator *gsi,
4883 tree *realignment_token,
4884 enum dr_alignment_support alignment_support_scheme,
4885 tree init_addr,
4886 struct loop **at_loop)
4888 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4889 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4890 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4891 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4892 struct loop *loop = NULL;
4893 edge pe = NULL;
4894 tree scalar_dest = gimple_assign_lhs (stmt);
4895 tree vec_dest;
4896 gimple *inc;
4897 tree ptr;
4898 tree data_ref;
4899 basic_block new_bb;
4900 tree msq_init = NULL_TREE;
4901 tree new_temp;
4902 gphi *phi_stmt;
4903 tree msq = NULL_TREE;
4904 gimple_seq stmts = NULL;
4905 bool inv_p;
4906 bool compute_in_loop = false;
4907 bool nested_in_vect_loop = false;
4908 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
4909 struct loop *loop_for_initial_load = NULL;
4911 if (loop_vinfo)
4913 loop = LOOP_VINFO_LOOP (loop_vinfo);
4914 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4917 gcc_assert (alignment_support_scheme == dr_explicit_realign
4918 || alignment_support_scheme == dr_explicit_realign_optimized);
4920 /* We need to generate three things:
4921 1. the misalignment computation
4922 2. the extra vector load (for the optimized realignment scheme).
4923 3. the phi node for the two vectors from which the realignment is
4924 done (for the optimized realignment scheme). */
4926 /* 1. Determine where to generate the misalignment computation.
4928 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
4929 calculation will be generated by this function, outside the loop (in the
4930 preheader). Otherwise, INIT_ADDR had already been computed for us by the
4931 caller, inside the loop.
4933 Background: If the misalignment remains fixed throughout the iterations of
4934 the loop, then both realignment schemes are applicable, and also the
4935 misalignment computation can be done outside LOOP. This is because we are
4936 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
4937 are a multiple of VS (the Vector Size), and therefore the misalignment in
4938 different vectorized LOOP iterations is always the same.
4939 The problem arises only if the memory access is in an inner-loop nested
4940 inside LOOP, which is now being vectorized using outer-loop vectorization.
4941 This is the only case when the misalignment of the memory access may not
4942 remain fixed throughout the iterations of the inner-loop (as explained in
4943 detail in vect_supportable_dr_alignment). In this case, not only is the
4944 optimized realignment scheme not applicable, but also the misalignment
4945 computation (and generation of the realignment token that is passed to
4946 REALIGN_LOAD) have to be done inside the loop.
4948 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
4949 or not, which in turn determines if the misalignment is computed inside
4950 the inner-loop, or outside LOOP. */
4952 if (init_addr != NULL_TREE || !loop_vinfo)
4954 compute_in_loop = true;
4955 gcc_assert (alignment_support_scheme == dr_explicit_realign);
4959 /* 2. Determine where to generate the extra vector load.
4961 For the optimized realignment scheme, instead of generating two vector
4962 loads in each iteration, we generate a single extra vector load in the
4963 preheader of the loop, and in each iteration reuse the result of the
4964 vector load from the previous iteration. In case the memory access is in
4965 an inner-loop nested inside LOOP, which is now being vectorized using
4966 outer-loop vectorization, we need to determine whether this initial vector
4967 load should be generated at the preheader of the inner-loop, or can be
4968 generated at the preheader of LOOP. If the memory access has no evolution
4969 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
4970 to be generated inside LOOP (in the preheader of the inner-loop). */
4972 if (nested_in_vect_loop)
4974 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
4975 bool invariant_in_outerloop =
4976 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
4977 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
4979 else
4980 loop_for_initial_load = loop;
4981 if (at_loop)
4982 *at_loop = loop_for_initial_load;
4984 if (loop_for_initial_load)
4985 pe = loop_preheader_edge (loop_for_initial_load);
4987 /* 3. For the case of the optimized realignment, create the first vector
4988 load at the loop preheader. */
4990 if (alignment_support_scheme == dr_explicit_realign_optimized)
4992 /* Create msq_init = *(floor(p1)) in the loop preheader */
4993 gassign *new_stmt;
4995 gcc_assert (!compute_in_loop);
4996 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4997 ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
4998 NULL_TREE, &init_addr, NULL, &inc,
4999 true, &inv_p);
5000 if (TREE_CODE (ptr) == SSA_NAME)
5001 new_temp = copy_ssa_name (ptr);
5002 else
5003 new_temp = make_ssa_name (TREE_TYPE (ptr));
5004 new_stmt = gimple_build_assign
5005 (new_temp, BIT_AND_EXPR, ptr,
5006 build_int_cst (TREE_TYPE (ptr),
5007 -(HOST_WIDE_INT)TYPE_ALIGN_UNIT (vectype)));
5008 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5009 gcc_assert (!new_bb);
5010 data_ref
5011 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5012 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5013 new_stmt = gimple_build_assign (vec_dest, data_ref);
5014 new_temp = make_ssa_name (vec_dest, new_stmt);
5015 gimple_assign_set_lhs (new_stmt, new_temp);
5016 if (pe)
5018 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5019 gcc_assert (!new_bb);
5021 else
5022 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5024 msq_init = gimple_assign_lhs (new_stmt);
5027 /* 4. Create realignment token using a target builtin, if available.
5028 It is done either inside the containing loop, or before LOOP (as
5029 determined above). */
5031 if (targetm.vectorize.builtin_mask_for_load)
5033 gcall *new_stmt;
5034 tree builtin_decl;
5036 /* Compute INIT_ADDR - the initial addressed accessed by this memref. */
5037 if (!init_addr)
5039 /* Generate the INIT_ADDR computation outside LOOP. */
5040 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
5041 NULL_TREE);
5042 if (loop)
5044 pe = loop_preheader_edge (loop);
5045 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5046 gcc_assert (!new_bb);
5048 else
5049 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5052 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5053 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5054 vec_dest =
5055 vect_create_destination_var (scalar_dest,
5056 gimple_call_return_type (new_stmt));
5057 new_temp = make_ssa_name (vec_dest, new_stmt);
5058 gimple_call_set_lhs (new_stmt, new_temp);
5060 if (compute_in_loop)
5061 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5062 else
5064 /* Generate the misalignment computation outside LOOP. */
5065 pe = loop_preheader_edge (loop);
5066 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5067 gcc_assert (!new_bb);
5070 *realignment_token = gimple_call_lhs (new_stmt);
5072 /* The result of the CALL_EXPR to this builtin is determined from
5073 the value of the parameter and no global variables are touched
5074 which makes the builtin a "const" function. Requiring the
5075 builtin to have the "const" attribute makes it unnecessary
5076 to call mark_call_clobbered. */
5077 gcc_assert (TREE_READONLY (builtin_decl));
5080 if (alignment_support_scheme == dr_explicit_realign)
5081 return msq;
5083 gcc_assert (!compute_in_loop);
5084 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5087 /* 5. Create msq = phi <msq_init, lsq> in loop */
5089 pe = loop_preheader_edge (containing_loop);
5090 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5091 msq = make_ssa_name (vec_dest);
5092 phi_stmt = create_phi_node (msq, containing_loop->header);
5093 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5095 return msq;
5099 /* Function vect_grouped_load_supported.
5101 COUNT is the size of the load group (the number of statements plus the
5102 number of gaps). SINGLE_ELEMENT_P is true if there is actually
5103 only one statement, with a gap of COUNT - 1.
5105 Returns true if a suitable permute exists. */
5107 bool
5108 vect_grouped_load_supported (tree vectype, bool single_element_p,
5109 unsigned HOST_WIDE_INT count)
5111 machine_mode mode = TYPE_MODE (vectype);
5113 /* If this is single-element interleaving with an element distance
5114 that leaves unused vector loads around punt - we at least create
5115 very sub-optimal code in that case (and blow up memory,
5116 see PR65518). */
5117 if (single_element_p && count > TYPE_VECTOR_SUBPARTS (vectype))
5119 if (dump_enabled_p ())
5120 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5121 "single-element interleaving not supported "
5122 "for not adjacent vector loads\n");
5123 return false;
5126 /* vect_permute_load_chain requires the group size to be equal to 3 or
5127 be a power of two. */
5128 if (count != 3 && exact_log2 (count) == -1)
5130 if (dump_enabled_p ())
5131 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5132 "the size of the group of accesses"
5133 " is not a power of 2 or not equal to 3\n");
5134 return false;
5137 /* Check that the permutation is supported. */
5138 if (VECTOR_MODE_P (mode))
5140 unsigned int i, j, nelt = GET_MODE_NUNITS (mode);
5141 auto_vec_perm_indices sel (nelt);
5142 sel.quick_grow (nelt);
5144 if (count == 3)
5146 unsigned int k;
5147 for (k = 0; k < 3; k++)
5149 for (i = 0; i < nelt; i++)
5150 if (3 * i + k < 2 * nelt)
5151 sel[i] = 3 * i + k;
5152 else
5153 sel[i] = 0;
5154 if (!can_vec_perm_p (mode, false, &sel))
5156 if (dump_enabled_p ())
5157 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5158 "shuffle of 3 loads is not supported by"
5159 " target\n");
5160 return false;
5162 for (i = 0, j = 0; i < nelt; i++)
5163 if (3 * i + k < 2 * nelt)
5164 sel[i] = i;
5165 else
5166 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5167 if (!can_vec_perm_p (mode, false, &sel))
5169 if (dump_enabled_p ())
5170 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5171 "shuffle of 3 loads is not supported by"
5172 " target\n");
5173 return false;
5176 return true;
5178 else
5180 /* If length is not equal to 3 then only power of 2 is supported. */
5181 gcc_assert (pow2p_hwi (count));
5182 for (i = 0; i < nelt; i++)
5183 sel[i] = i * 2;
5184 if (can_vec_perm_p (mode, false, &sel))
5186 for (i = 0; i < nelt; i++)
5187 sel[i] = i * 2 + 1;
5188 if (can_vec_perm_p (mode, false, &sel))
5189 return true;
5194 if (dump_enabled_p ())
5195 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5196 "extract even/odd not supported by target\n");
5197 return false;
5200 /* Return TRUE if vec_load_lanes is available for COUNT vectors of
5201 type VECTYPE. */
5203 bool
5204 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
5206 return vect_lanes_optab_supported_p ("vec_load_lanes",
5207 vec_load_lanes_optab,
5208 vectype, count);
5211 /* Function vect_permute_load_chain.
5213 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5214 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5215 the input data correctly. Return the final references for loads in
5216 RESULT_CHAIN.
5218 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5219 The input is 4 vectors each containing 8 elements. We assign a number to each
5220 element, the input sequence is:
5222 1st vec: 0 1 2 3 4 5 6 7
5223 2nd vec: 8 9 10 11 12 13 14 15
5224 3rd vec: 16 17 18 19 20 21 22 23
5225 4th vec: 24 25 26 27 28 29 30 31
5227 The output sequence should be:
5229 1st vec: 0 4 8 12 16 20 24 28
5230 2nd vec: 1 5 9 13 17 21 25 29
5231 3rd vec: 2 6 10 14 18 22 26 30
5232 4th vec: 3 7 11 15 19 23 27 31
5234 i.e., the first output vector should contain the first elements of each
5235 interleaving group, etc.
5237 We use extract_even/odd instructions to create such output. The input of
5238 each extract_even/odd operation is two vectors
5239 1st vec 2nd vec
5240 0 1 2 3 4 5 6 7
5242 and the output is the vector of extracted even/odd elements. The output of
5243 extract_even will be: 0 2 4 6
5244 and of extract_odd: 1 3 5 7
5247 The permutation is done in log LENGTH stages. In each stage extract_even
5248 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5249 their order. In our example,
5251 E1: extract_even (1st vec, 2nd vec)
5252 E2: extract_odd (1st vec, 2nd vec)
5253 E3: extract_even (3rd vec, 4th vec)
5254 E4: extract_odd (3rd vec, 4th vec)
5256 The output for the first stage will be:
5258 E1: 0 2 4 6 8 10 12 14
5259 E2: 1 3 5 7 9 11 13 15
5260 E3: 16 18 20 22 24 26 28 30
5261 E4: 17 19 21 23 25 27 29 31
5263 In order to proceed and create the correct sequence for the next stage (or
5264 for the correct output, if the second stage is the last one, as in our
5265 example), we first put the output of extract_even operation and then the
5266 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5267 The input for the second stage is:
5269 1st vec (E1): 0 2 4 6 8 10 12 14
5270 2nd vec (E3): 16 18 20 22 24 26 28 30
5271 3rd vec (E2): 1 3 5 7 9 11 13 15
5272 4th vec (E4): 17 19 21 23 25 27 29 31
5274 The output of the second stage:
5276 E1: 0 4 8 12 16 20 24 28
5277 E2: 2 6 10 14 18 22 26 30
5278 E3: 1 5 9 13 17 21 25 29
5279 E4: 3 7 11 15 19 23 27 31
5281 And RESULT_CHAIN after reordering:
5283 1st vec (E1): 0 4 8 12 16 20 24 28
5284 2nd vec (E3): 1 5 9 13 17 21 25 29
5285 3rd vec (E2): 2 6 10 14 18 22 26 30
5286 4th vec (E4): 3 7 11 15 19 23 27 31. */
5288 static void
5289 vect_permute_load_chain (vec<tree> dr_chain,
5290 unsigned int length,
5291 gimple *stmt,
5292 gimple_stmt_iterator *gsi,
5293 vec<tree> *result_chain)
5295 tree data_ref, first_vect, second_vect;
5296 tree perm_mask_even, perm_mask_odd;
5297 tree perm3_mask_low, perm3_mask_high;
5298 gimple *perm_stmt;
5299 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5300 unsigned int i, j, log_length = exact_log2 (length);
5301 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5303 auto_vec_perm_indices sel (nelt);
5304 sel.quick_grow (nelt);
5306 result_chain->quick_grow (length);
5307 memcpy (result_chain->address (), dr_chain.address (),
5308 length * sizeof (tree));
5310 if (length == 3)
5312 unsigned int k;
5314 for (k = 0; k < 3; k++)
5316 for (i = 0; i < nelt; i++)
5317 if (3 * i + k < 2 * nelt)
5318 sel[i] = 3 * i + k;
5319 else
5320 sel[i] = 0;
5321 perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
5323 for (i = 0, j = 0; i < nelt; i++)
5324 if (3 * i + k < 2 * nelt)
5325 sel[i] = i;
5326 else
5327 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5329 perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
5331 first_vect = dr_chain[0];
5332 second_vect = dr_chain[1];
5334 /* Create interleaving stmt (low part of):
5335 low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5336 ...}> */
5337 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5338 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5339 second_vect, perm3_mask_low);
5340 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5342 /* Create interleaving stmt (high part of):
5343 high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5344 ...}> */
5345 first_vect = data_ref;
5346 second_vect = dr_chain[2];
5347 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5348 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5349 second_vect, perm3_mask_high);
5350 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5351 (*result_chain)[k] = data_ref;
5354 else
5356 /* If length is not equal to 3 then only power of 2 is supported. */
5357 gcc_assert (pow2p_hwi (length));
5359 for (i = 0; i < nelt; ++i)
5360 sel[i] = i * 2;
5361 perm_mask_even = vect_gen_perm_mask_checked (vectype, sel);
5363 for (i = 0; i < nelt; ++i)
5364 sel[i] = i * 2 + 1;
5365 perm_mask_odd = vect_gen_perm_mask_checked (vectype, sel);
5367 for (i = 0; i < log_length; i++)
5369 for (j = 0; j < length; j += 2)
5371 first_vect = dr_chain[j];
5372 second_vect = dr_chain[j+1];
5374 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5375 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5376 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5377 first_vect, second_vect,
5378 perm_mask_even);
5379 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5380 (*result_chain)[j/2] = data_ref;
5382 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5383 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5384 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5385 first_vect, second_vect,
5386 perm_mask_odd);
5387 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5388 (*result_chain)[j/2+length/2] = data_ref;
5390 memcpy (dr_chain.address (), result_chain->address (),
5391 length * sizeof (tree));
5396 /* Function vect_shift_permute_load_chain.
5398 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
5399 sequence of stmts to reorder the input data accordingly.
5400 Return the final references for loads in RESULT_CHAIN.
5401 Return true if successed, false otherwise.
5403 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5404 The input is 3 vectors each containing 8 elements. We assign a
5405 number to each element, the input sequence is:
5407 1st vec: 0 1 2 3 4 5 6 7
5408 2nd vec: 8 9 10 11 12 13 14 15
5409 3rd vec: 16 17 18 19 20 21 22 23
5411 The output sequence should be:
5413 1st vec: 0 3 6 9 12 15 18 21
5414 2nd vec: 1 4 7 10 13 16 19 22
5415 3rd vec: 2 5 8 11 14 17 20 23
5417 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
5419 First we shuffle all 3 vectors to get correct elements order:
5421 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
5422 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
5423 3rd vec: (16 19 22) (17 20 23) (18 21)
5425 Next we unite and shift vector 3 times:
5427 1st step:
5428 shift right by 6 the concatenation of:
5429 "1st vec" and "2nd vec"
5430 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
5431 "2nd vec" and "3rd vec"
5432 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
5433 "3rd vec" and "1st vec"
5434 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
5435 | New vectors |
5437 So that now new vectors are:
5439 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
5440 2nd vec: (10 13) (16 19 22) (17 20 23)
5441 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
5443 2nd step:
5444 shift right by 5 the concatenation of:
5445 "1st vec" and "3rd vec"
5446 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
5447 "2nd vec" and "1st vec"
5448 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
5449 "3rd vec" and "2nd vec"
5450 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
5451 | New vectors |
5453 So that now new vectors are:
5455 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
5456 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
5457 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
5459 3rd step:
5460 shift right by 5 the concatenation of:
5461 "1st vec" and "1st vec"
5462 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
5463 shift right by 3 the concatenation of:
5464 "2nd vec" and "2nd vec"
5465 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
5466 | New vectors |
5468 So that now all vectors are READY:
5469 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
5470 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
5471 3rd vec: ( 1 4 7) (10 13) (16 19 22)
5473 This algorithm is faster than one in vect_permute_load_chain if:
5474 1. "shift of a concatination" is faster than general permutation.
5475 This is usually so.
5476 2. The TARGET machine can't execute vector instructions in parallel.
5477 This is because each step of the algorithm depends on previous.
5478 The algorithm in vect_permute_load_chain is much more parallel.
5480 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
5483 static bool
5484 vect_shift_permute_load_chain (vec<tree> dr_chain,
5485 unsigned int length,
5486 gimple *stmt,
5487 gimple_stmt_iterator *gsi,
5488 vec<tree> *result_chain)
5490 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
5491 tree perm2_mask1, perm2_mask2, perm3_mask;
5492 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
5493 gimple *perm_stmt;
5495 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5496 unsigned int i;
5497 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5498 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5499 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5501 auto_vec_perm_indices sel (nelt);
5502 sel.quick_grow (nelt);
5504 result_chain->quick_grow (length);
5505 memcpy (result_chain->address (), dr_chain.address (),
5506 length * sizeof (tree));
5508 if (pow2p_hwi (length) && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 4)
5510 unsigned int j, log_length = exact_log2 (length);
5511 for (i = 0; i < nelt / 2; ++i)
5512 sel[i] = i * 2;
5513 for (i = 0; i < nelt / 2; ++i)
5514 sel[nelt / 2 + i] = i * 2 + 1;
5515 if (!can_vec_perm_p (TYPE_MODE (vectype), false, &sel))
5517 if (dump_enabled_p ())
5518 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5519 "shuffle of 2 fields structure is not \
5520 supported by target\n");
5521 return false;
5523 perm2_mask1 = vect_gen_perm_mask_checked (vectype, sel);
5525 for (i = 0; i < nelt / 2; ++i)
5526 sel[i] = i * 2 + 1;
5527 for (i = 0; i < nelt / 2; ++i)
5528 sel[nelt / 2 + i] = i * 2;
5529 if (!can_vec_perm_p (TYPE_MODE (vectype), false, &sel))
5531 if (dump_enabled_p ())
5532 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5533 "shuffle of 2 fields structure is not \
5534 supported by target\n");
5535 return false;
5537 perm2_mask2 = vect_gen_perm_mask_checked (vectype, sel);
5539 /* Generating permutation constant to shift all elements.
5540 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
5541 for (i = 0; i < nelt; i++)
5542 sel[i] = nelt / 2 + i;
5543 if (!can_vec_perm_p (TYPE_MODE (vectype), false, &sel))
5545 if (dump_enabled_p ())
5546 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5547 "shift permutation is not supported by target\n");
5548 return false;
5550 shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5552 /* Generating permutation constant to select vector from 2.
5553 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
5554 for (i = 0; i < nelt / 2; i++)
5555 sel[i] = i;
5556 for (i = nelt / 2; i < nelt; i++)
5557 sel[i] = nelt + i;
5558 if (!can_vec_perm_p (TYPE_MODE (vectype), false, &sel))
5560 if (dump_enabled_p ())
5561 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5562 "select is not supported by target\n");
5563 return false;
5565 select_mask = vect_gen_perm_mask_checked (vectype, sel);
5567 for (i = 0; i < log_length; i++)
5569 for (j = 0; j < length; j += 2)
5571 first_vect = dr_chain[j];
5572 second_vect = dr_chain[j + 1];
5574 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5575 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5576 first_vect, first_vect,
5577 perm2_mask1);
5578 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5579 vect[0] = data_ref;
5581 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5582 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5583 second_vect, second_vect,
5584 perm2_mask2);
5585 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5586 vect[1] = data_ref;
5588 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
5589 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5590 vect[0], vect[1], shift1_mask);
5591 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5592 (*result_chain)[j/2 + length/2] = data_ref;
5594 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
5595 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5596 vect[0], vect[1], select_mask);
5597 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5598 (*result_chain)[j/2] = data_ref;
5600 memcpy (dr_chain.address (), result_chain->address (),
5601 length * sizeof (tree));
5603 return true;
5605 if (length == 3 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 2)
5607 unsigned int k = 0, l = 0;
5609 /* Generating permutation constant to get all elements in rigth order.
5610 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
5611 for (i = 0; i < nelt; i++)
5613 if (3 * k + (l % 3) >= nelt)
5615 k = 0;
5616 l += (3 - (nelt % 3));
5618 sel[i] = 3 * k + (l % 3);
5619 k++;
5621 if (!can_vec_perm_p (TYPE_MODE (vectype), false, &sel))
5623 if (dump_enabled_p ())
5624 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5625 "shuffle of 3 fields structure is not \
5626 supported by target\n");
5627 return false;
5629 perm3_mask = vect_gen_perm_mask_checked (vectype, sel);
5631 /* Generating permutation constant to shift all elements.
5632 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
5633 for (i = 0; i < nelt; i++)
5634 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
5635 if (!can_vec_perm_p (TYPE_MODE (vectype), false, &sel))
5637 if (dump_enabled_p ())
5638 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5639 "shift permutation is not supported by target\n");
5640 return false;
5642 shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5644 /* Generating permutation constant to shift all elements.
5645 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5646 for (i = 0; i < nelt; i++)
5647 sel[i] = 2 * (nelt / 3) + 1 + i;
5648 if (!can_vec_perm_p (TYPE_MODE (vectype), false, &sel))
5650 if (dump_enabled_p ())
5651 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5652 "shift permutation is not supported by target\n");
5653 return false;
5655 shift2_mask = vect_gen_perm_mask_checked (vectype, sel);
5657 /* Generating permutation constant to shift all elements.
5658 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
5659 for (i = 0; i < nelt; i++)
5660 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
5661 if (!can_vec_perm_p (TYPE_MODE (vectype), false, &sel))
5663 if (dump_enabled_p ())
5664 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5665 "shift permutation is not supported by target\n");
5666 return false;
5668 shift3_mask = vect_gen_perm_mask_checked (vectype, sel);
5670 /* Generating permutation constant to shift all elements.
5671 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5672 for (i = 0; i < nelt; i++)
5673 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
5674 if (!can_vec_perm_p (TYPE_MODE (vectype), false, &sel))
5676 if (dump_enabled_p ())
5677 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5678 "shift permutation is not supported by target\n");
5679 return false;
5681 shift4_mask = vect_gen_perm_mask_checked (vectype, sel);
5683 for (k = 0; k < 3; k++)
5685 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
5686 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5687 dr_chain[k], dr_chain[k],
5688 perm3_mask);
5689 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5690 vect[k] = data_ref;
5693 for (k = 0; k < 3; k++)
5695 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
5696 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5697 vect[k % 3], vect[(k + 1) % 3],
5698 shift1_mask);
5699 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5700 vect_shift[k] = data_ref;
5703 for (k = 0; k < 3; k++)
5705 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
5706 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5707 vect_shift[(4 - k) % 3],
5708 vect_shift[(3 - k) % 3],
5709 shift2_mask);
5710 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5711 vect[k] = data_ref;
5714 (*result_chain)[3 - (nelt % 3)] = vect[2];
5716 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
5717 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
5718 vect[0], shift3_mask);
5719 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5720 (*result_chain)[nelt % 3] = data_ref;
5722 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
5723 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
5724 vect[1], shift4_mask);
5725 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5726 (*result_chain)[0] = data_ref;
5727 return true;
5729 return false;
5732 /* Function vect_transform_grouped_load.
5734 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
5735 to perform their permutation and ascribe the result vectorized statements to
5736 the scalar statements.
5739 void
5740 vect_transform_grouped_load (gimple *stmt, vec<tree> dr_chain, int size,
5741 gimple_stmt_iterator *gsi)
5743 machine_mode mode;
5744 vec<tree> result_chain = vNULL;
5746 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
5747 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
5748 vectors, that are ready for vector computation. */
5749 result_chain.create (size);
5751 /* If reassociation width for vector type is 2 or greater target machine can
5752 execute 2 or more vector instructions in parallel. Otherwise try to
5753 get chain for loads group using vect_shift_permute_load_chain. */
5754 mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
5755 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
5756 || pow2p_hwi (size)
5757 || !vect_shift_permute_load_chain (dr_chain, size, stmt,
5758 gsi, &result_chain))
5759 vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
5760 vect_record_grouped_load_vectors (stmt, result_chain);
5761 result_chain.release ();
5764 /* RESULT_CHAIN contains the output of a group of grouped loads that were
5765 generated as part of the vectorization of STMT. Assign the statement
5766 for each vector to the associated scalar statement. */
5768 void
5769 vect_record_grouped_load_vectors (gimple *stmt, vec<tree> result_chain)
5771 gimple *first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
5772 gimple *next_stmt, *new_stmt;
5773 unsigned int i, gap_count;
5774 tree tmp_data_ref;
5776 /* Put a permuted data-ref in the VECTORIZED_STMT field.
5777 Since we scan the chain starting from it's first node, their order
5778 corresponds the order of data-refs in RESULT_CHAIN. */
5779 next_stmt = first_stmt;
5780 gap_count = 1;
5781 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
5783 if (!next_stmt)
5784 break;
5786 /* Skip the gaps. Loads created for the gaps will be removed by dead
5787 code elimination pass later. No need to check for the first stmt in
5788 the group, since it always exists.
5789 GROUP_GAP is the number of steps in elements from the previous
5790 access (if there is no gap GROUP_GAP is 1). We skip loads that
5791 correspond to the gaps. */
5792 if (next_stmt != first_stmt
5793 && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
5795 gap_count++;
5796 continue;
5799 while (next_stmt)
5801 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
5802 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
5803 copies, and we put the new vector statement in the first available
5804 RELATED_STMT. */
5805 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
5806 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
5807 else
5809 if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5811 gimple *prev_stmt =
5812 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
5813 gimple *rel_stmt =
5814 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
5815 while (rel_stmt)
5817 prev_stmt = rel_stmt;
5818 rel_stmt =
5819 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
5822 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
5823 new_stmt;
5827 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
5828 gap_count = 1;
5829 /* If NEXT_STMT accesses the same DR as the previous statement,
5830 put the same TMP_DATA_REF as its vectorized statement; otherwise
5831 get the next data-ref from RESULT_CHAIN. */
5832 if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5833 break;
5838 /* Function vect_force_dr_alignment_p.
5840 Returns whether the alignment of a DECL can be forced to be aligned
5841 on ALIGNMENT bit boundary. */
5843 bool
5844 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
5846 if (!VAR_P (decl))
5847 return false;
5849 if (decl_in_symtab_p (decl)
5850 && !symtab_node::get (decl)->can_increase_alignment_p ())
5851 return false;
5853 if (TREE_STATIC (decl))
5854 return (alignment <= MAX_OFILE_ALIGNMENT);
5855 else
5856 return (alignment <= MAX_STACK_ALIGNMENT);
5860 /* Return whether the data reference DR is supported with respect to its
5861 alignment.
5862 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
5863 it is aligned, i.e., check if it is possible to vectorize it with different
5864 alignment. */
5866 enum dr_alignment_support
5867 vect_supportable_dr_alignment (struct data_reference *dr,
5868 bool check_aligned_accesses)
5870 gimple *stmt = DR_STMT (dr);
5871 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5872 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5873 machine_mode mode = TYPE_MODE (vectype);
5874 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5875 struct loop *vect_loop = NULL;
5876 bool nested_in_vect_loop = false;
5878 if (aligned_access_p (dr) && !check_aligned_accesses)
5879 return dr_aligned;
5881 /* For now assume all conditional loads/stores support unaligned
5882 access without any special code. */
5883 if (is_gimple_call (stmt)
5884 && gimple_call_internal_p (stmt)
5885 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
5886 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
5887 return dr_unaligned_supported;
5889 if (loop_vinfo)
5891 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
5892 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
5895 /* Possibly unaligned access. */
5897 /* We can choose between using the implicit realignment scheme (generating
5898 a misaligned_move stmt) and the explicit realignment scheme (generating
5899 aligned loads with a REALIGN_LOAD). There are two variants to the
5900 explicit realignment scheme: optimized, and unoptimized.
5901 We can optimize the realignment only if the step between consecutive
5902 vector loads is equal to the vector size. Since the vector memory
5903 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
5904 is guaranteed that the misalignment amount remains the same throughout the
5905 execution of the vectorized loop. Therefore, we can create the
5906 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
5907 at the loop preheader.
5909 However, in the case of outer-loop vectorization, when vectorizing a
5910 memory access in the inner-loop nested within the LOOP that is now being
5911 vectorized, while it is guaranteed that the misalignment of the
5912 vectorized memory access will remain the same in different outer-loop
5913 iterations, it is *not* guaranteed that is will remain the same throughout
5914 the execution of the inner-loop. This is because the inner-loop advances
5915 with the original scalar step (and not in steps of VS). If the inner-loop
5916 step happens to be a multiple of VS, then the misalignment remains fixed
5917 and we can use the optimized realignment scheme. For example:
5919 for (i=0; i<N; i++)
5920 for (j=0; j<M; j++)
5921 s += a[i+j];
5923 When vectorizing the i-loop in the above example, the step between
5924 consecutive vector loads is 1, and so the misalignment does not remain
5925 fixed across the execution of the inner-loop, and the realignment cannot
5926 be optimized (as illustrated in the following pseudo vectorized loop):
5928 for (i=0; i<N; i+=4)
5929 for (j=0; j<M; j++){
5930 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
5931 // when j is {0,1,2,3,4,5,6,7,...} respectively.
5932 // (assuming that we start from an aligned address).
5935 We therefore have to use the unoptimized realignment scheme:
5937 for (i=0; i<N; i+=4)
5938 for (j=k; j<M; j+=4)
5939 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
5940 // that the misalignment of the initial address is
5941 // 0).
5943 The loop can then be vectorized as follows:
5945 for (k=0; k<4; k++){
5946 rt = get_realignment_token (&vp[k]);
5947 for (i=0; i<N; i+=4){
5948 v1 = vp[i+k];
5949 for (j=k; j<M; j+=4){
5950 v2 = vp[i+j+VS-1];
5951 va = REALIGN_LOAD <v1,v2,rt>;
5952 vs += va;
5953 v1 = v2;
5956 } */
5958 if (DR_IS_READ (dr))
5960 bool is_packed = false;
5961 tree type = (TREE_TYPE (DR_REF (dr)));
5963 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
5964 && (!targetm.vectorize.builtin_mask_for_load
5965 || targetm.vectorize.builtin_mask_for_load ()))
5967 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5969 /* If we are doing SLP then the accesses need not have the
5970 same alignment, instead it depends on the SLP group size. */
5971 if (loop_vinfo
5972 && STMT_SLP_TYPE (stmt_info)
5973 && (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5974 * GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)))
5975 % TYPE_VECTOR_SUBPARTS (vectype) != 0))
5977 else if (!loop_vinfo
5978 || (nested_in_vect_loop
5979 && (TREE_INT_CST_LOW (DR_STEP (dr))
5980 != GET_MODE_SIZE (TYPE_MODE (vectype)))))
5981 return dr_explicit_realign;
5982 else
5983 return dr_explicit_realign_optimized;
5985 if (!known_alignment_for_access_p (dr))
5986 is_packed = not_size_aligned (DR_REF (dr));
5988 if (targetm.vectorize.support_vector_misalignment
5989 (mode, type, DR_MISALIGNMENT (dr), is_packed))
5990 /* Can't software pipeline the loads, but can at least do them. */
5991 return dr_unaligned_supported;
5993 else
5995 bool is_packed = false;
5996 tree type = (TREE_TYPE (DR_REF (dr)));
5998 if (!known_alignment_for_access_p (dr))
5999 is_packed = not_size_aligned (DR_REF (dr));
6001 if (targetm.vectorize.support_vector_misalignment
6002 (mode, type, DR_MISALIGNMENT (dr), is_packed))
6003 return dr_unaligned_supported;
6006 /* Unsupported. */
6007 return dr_unaligned_unsupported;