1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2017 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "memmodel.h"
32 #include "tm_p.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "cgraph.h"
36 #include "dumpfile.h"
37 #include "alias.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "tree-eh.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop.h"
47 #include "cfgloop.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
50 #include "expr.h"
51 #include "builtins.h"
52 #include "params.h"
53 #include "tree-cfg.h"
54 #include "tree-hash-traits.h"
56 /* Return true if load- or store-lanes optab OPTAB is implemented for
57 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
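/* For example, with COUNT == 3 and a four-element VECTYPE this asks whether
the target can load or store three such vectors of interleaved elements
in a single operation (e.g. AArch64's LD3/ST3). */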
59 static bool
60 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
61 tree vectype, unsigned HOST_WIDE_INT count)
63 machine_mode mode;
64 scalar_int_mode array_mode;
65 bool limit_p;
67 mode = TYPE_MODE (vectype);
68 limit_p = !targetm.array_mode_supported_p (mode, count);
69 if (!int_mode_for_size (count * GET_MODE_BITSIZE (mode),
70 limit_p).exists (&array_mode))
72 if (dump_enabled_p ())
73 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
74 "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC "]\n",
75 GET_MODE_NAME (mode), count);
76 return false;
79 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
81 if (dump_enabled_p ())
82 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
83 "cannot use %s<%s><%s>\n", name,
84 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
85 return false;
88 if (dump_enabled_p ())
89 dump_printf_loc (MSG_NOTE, vect_location,
90 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
91 GET_MODE_NAME (mode));
93 return true;
97 /* Return the smallest scalar part of STMT.
98 This is used to determine the vectype of the stmt. We generally set the
99 vectype according to the type of the result (lhs). For stmts whose
100 result-type is different from the type of the arguments (e.g., demotion,
101 promotion), vectype will be reset appropriately (later). Note that we have
102 to visit the smallest datatype in this function, because that determines the
103 VF. If the smallest datatype in the loop is present only as the rhs of a
104 promotion operation, we would miss it by looking at the lhs alone.
105 Such a case, where a variable of this datatype does not appear in the lhs
106 anywhere in the loop, can only occur if it's an invariant: e.g.:
107 'int_x = (int) short_inv', which we'd expect to have been optimized away by
108 invariant motion. However, we cannot rely on invariant motion to always
109 take invariants out of the loop, and so in the case of promotion we also
110 have to check the rhs.
111 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
112 types. */
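/* For example, in a loop mixing char and int operations with 16-byte
vectors, the char type found here yields a VF of 16, and the int
operations are then emitted as four vector statements per iteration. */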
114 tree
115 vect_get_smallest_scalar_type (gimple *stmt, HOST_WIDE_INT *lhs_size_unit,
116 HOST_WIDE_INT *rhs_size_unit)
118 tree scalar_type = gimple_expr_type (stmt);
119 HOST_WIDE_INT lhs, rhs;
121 /* During the analysis phase, this function is called on arbitrary
122 statements that might not have scalar results. */
123 if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
124 return scalar_type;
126 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
128 if (is_gimple_assign (stmt)
129 && (gimple_assign_cast_p (stmt)
130 || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
131 || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
132 || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
134 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
136 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
137 if (rhs < lhs)
138 scalar_type = rhs_type;
141 *lhs_size_unit = lhs;
142 *rhs_size_unit = rhs;
143 return scalar_type;
147 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
148 tested at run-time. Return TRUE if DDR was successfully inserted.
149 Return false if versioning is not supported. */
151 static bool
152 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
154 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
156 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
157 return false;
159 if (!runtime_alias_check_p (ddr, loop,
160 optimize_loop_nest_for_speed_p (loop)))
161 return false;
163 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
164 return true;
168 /* A subroutine of vect_analyze_data_ref_dependence. Handle
169 DDR_COULD_BE_INDEPENDENT_P DDR, which has a known set of dependence
170 distances. These distances are conservatively correct but they don't
171 reflect a guaranteed dependence.
173 Return true if this function does all the work necessary to avoid
174 an alias or false if the caller should use the dependence distances
175 to limit the vectorization factor in the usual way. LOOP_DEPTH is
176 the depth of the loop described by LOOP_VINFO and the other arguments
177 are as for vect_analyze_data_ref_dependence. */
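/* Such DDRs typically arise when the two references use base objects that
the compiler cannot prove distinct; the recorded distances are only
meaningful if those bases do in fact overlap. */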
179 static bool
180 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
181 loop_vec_info loop_vinfo,
182 int loop_depth, int *max_vf)
184 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
185 lambda_vector dist_v;
186 unsigned int i;
187 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
189 int dist = dist_v[loop_depth];
190 if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
192 /* If the user asserted safelen >= DIST consecutive iterations
193 can be executed concurrently, assume independence.
195 ??? An alternative would be to add the alias check even
196 in this case, and vectorize the fallback loop with the
197 maximum VF set to safelen. However, if the user has
198 explicitly given a length, it's less likely that that
199 would be a win. */
200 if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
202 if (loop->safelen < *max_vf)
203 *max_vf = loop->safelen;
204 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
205 continue;
208 /* For dependence distances of 2 or more, we have the option
209 of limiting VF or checking for an alias at runtime.
210 Prefer to check at runtime if we can, to avoid limiting
211 the VF unnecessarily when the bases are in fact independent.
213 Note that the alias checks will be removed if the VF ends up
214 being small enough. */
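/* For example, a known distance of 2 would otherwise cap the VF at 2;
emitting a runtime alias check instead keeps the full VF and falls back
to the other loop version only when the bases really do alias. */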
215 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
218 return true;
222 /* Function vect_analyze_data_ref_dependence.
224 Return TRUE if there (might) exist a dependence between a memory-reference
225 DRA and a memory-reference DRB. If the dependence can instead be checked
226 at run-time by versioning for alias, return FALSE. Adjust *MAX_VF
227 according to the data dependence. */
229 static bool
230 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
231 loop_vec_info loop_vinfo, int *max_vf)
233 unsigned int i;
234 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
235 struct data_reference *dra = DDR_A (ddr);
236 struct data_reference *drb = DDR_B (ddr);
237 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
238 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
239 lambda_vector dist_v;
240 unsigned int loop_depth;
242 /* In loop analysis all data references should be vectorizable. */
243 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
244 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
245 gcc_unreachable ();
247 /* Independent data accesses. */
248 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
249 return false;
251 if (dra == drb
252 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
253 return false;
255 /* We do not have to consider dependences between accesses that belong
256 to the same group. */
257 if (GROUP_FIRST_ELEMENT (stmtinfo_a)
258 && GROUP_FIRST_ELEMENT (stmtinfo_a) == GROUP_FIRST_ELEMENT (stmtinfo_b))
259 return false;
261 /* Even if we have an anti-dependence then, as the vectorized loop covers at
262 least two scalar iterations, there is always also a true dependence.
263 As the vectorizer does not re-order loads and stores we can ignore
264 the anti-dependence if TBAA can disambiguate both DRs similar to the
265 case with known negative distance anti-dependences (positive
266 distance anti-dependences would violate TBAA constraints). */
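/* For instance, an anti-dependence between a read through an 'int *' and a
later write through a 'float *' can be ignored here, because the two
alias sets cannot conflict under TBAA. */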
267 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
268 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
269 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
270 get_alias_set (DR_REF (drb))))
271 return false;
273 /* Unknown data dependence. */
274 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
276 /* If user asserted safelen consecutive iterations can be
277 executed concurrently, assume independence. */
278 if (loop->safelen >= 2)
280 if (loop->safelen < *max_vf)
281 *max_vf = loop->safelen;
282 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
283 return false;
286 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
287 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
289 if (dump_enabled_p ())
291 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
292 "versioning for alias not supported for: "
293 "can't determine dependence between ");
294 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
295 DR_REF (dra));
296 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
297 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
298 DR_REF (drb));
299 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
301 return true;
304 if (dump_enabled_p ())
306 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
307 "versioning for alias required: "
308 "can't determine dependence between ");
309 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
310 DR_REF (dra));
311 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
312 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
313 DR_REF (drb));
314 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
317 /* Add to list of ddrs that need to be tested at run-time. */
318 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
321 /* Known data dependence. */
322 if (DDR_NUM_DIST_VECTS (ddr) == 0)
324 /* If user asserted safelen consecutive iterations can be
325 executed concurrently, assume independence. */
326 if (loop->safelen >= 2)
328 if (loop->safelen < *max_vf)
329 *max_vf = loop->safelen;
330 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
331 return false;
334 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
335 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
337 if (dump_enabled_p ())
339 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
340 "versioning for alias not supported for: "
341 "bad dist vector for ");
342 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
343 DR_REF (dra));
344 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
345 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
346 DR_REF (drb));
347 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
349 return true;
352 if (dump_enabled_p ())
354 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
355 "versioning for alias required: "
356 "bad dist vector for ");
357 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
358 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
359 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
360 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
362 /* Add to list of ddrs that need to be tested at run-time. */
363 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
366 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
368 if (DDR_COULD_BE_INDEPENDENT_P (ddr)
369 && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
370 loop_depth, max_vf))
371 return false;
373 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
375 int dist = dist_v[loop_depth];
377 if (dump_enabled_p ())
378 dump_printf_loc (MSG_NOTE, vect_location,
379 "dependence distance = %d.\n", dist);
381 if (dist == 0)
383 if (dump_enabled_p ())
385 dump_printf_loc (MSG_NOTE, vect_location,
386 "dependence distance == 0 between ");
387 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
388 dump_printf (MSG_NOTE, " and ");
389 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
390 dump_printf (MSG_NOTE, "\n");
393 /* When we perform grouped accesses and perform implicit CSE
394 by detecting equal accesses and doing disambiguation with
395 runtime alias tests like for
396 .. = a[i];
397 .. = a[i+1];
398 a[i] = ..;
399 a[i+1] = ..;
400 *p = ..;
401 .. = a[i];
402 .. = a[i+1];
403 where we will end up loading { a[i], a[i+1] } once, make
404 sure that inserting group loads before the first load and
405 stores after the last store will do the right thing.
406 Similar for groups like
407 a[i] = ...;
408 ... = a[i];
409 a[i+1] = ...;
410 where loads from the group interleave with the store. */
411 if (STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
412 || STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
414 gimple *earlier_stmt;
415 earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
416 if (DR_IS_WRITE
417 (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
419 if (dump_enabled_p ())
420 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
421 "READ_WRITE dependence in interleaving."
422 "\n");
423 return true;
427 continue;
430 if (dist > 0 && DDR_REVERSED_P (ddr))
432 /* If DDR_REVERSED_P the order of the data-refs in DDR was
433 reversed (to make distance vector positive), and the actual
434 distance is negative. */
435 if (dump_enabled_p ())
436 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
437 "dependence distance negative.\n");
438 /* Record a negative dependence distance to later limit the
439 amount of stmt copying / unrolling we can perform.
440 Only need to handle read-after-write dependence. */
441 if (DR_IS_READ (drb)
442 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
443 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
444 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
445 continue;
448 if (abs (dist) >= 2
449 && abs (dist) < *max_vf)
451 /* The dependence distance requires reduction of the maximal
452 vectorization factor. */
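/* E.g. for 'a[i+3] = a[i] + 1' the distance is 3, so at most three
consecutive iterations may be executed together and the maximal
vectorization factor is reduced to 3. */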
453 *max_vf = abs (dist);
454 if (dump_enabled_p ())
455 dump_printf_loc (MSG_NOTE, vect_location,
456 "adjusting maximal vectorization factor to %i\n",
457 *max_vf);
460 if (abs (dist) >= *max_vf)
462 /* In this case the dependence distance does not create a dependence,
463 as far as vectorization is concerned. */
464 if (dump_enabled_p ())
465 dump_printf_loc (MSG_NOTE, vect_location,
466 "dependence distance >= VF.\n");
467 continue;
470 if (dump_enabled_p ())
472 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
473 "not vectorized, possible dependence "
474 "between data-refs ");
475 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
476 dump_printf (MSG_NOTE, " and ");
477 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
478 dump_printf (MSG_NOTE, "\n");
481 return true;
484 return false;
487 /* Function vect_analyze_data_ref_dependences.
489 Examine all the data references in the loop, and make sure there do not
490 exist any data dependences between them. Set *MAX_VF according to
491 the maximum vectorization factor the data dependences allow. */
493 bool
494 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo, int *max_vf)
496 unsigned int i;
497 struct data_dependence_relation *ddr;
499 if (dump_enabled_p ())
500 dump_printf_loc (MSG_NOTE, vect_location,
501 "=== vect_analyze_data_ref_dependences ===\n");
503 LOOP_VINFO_DDRS (loop_vinfo)
504 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
505 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
506 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
507 /* We need read-read dependences to compute STMT_VINFO_SAME_ALIGN_REFS. */
508 if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
509 &LOOP_VINFO_DDRS (loop_vinfo),
510 LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
511 return false;
513 /* For epilogues we either have no aliases or alias versioning
514 was applied to the original loop. Therefore we may just get max_vf
515 using the VF of the original loop. */
516 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
517 *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
518 else
519 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
520 if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
521 return false;
523 return true;
527 /* Function vect_slp_analyze_data_ref_dependence.
529 Return TRUE if there (might) exist a dependence between a memory-reference
530 DRA and a memory-reference DRB. Unlike the loop analysis above, no
531 run-time alias versioning is available here and there is no MAX_VF to
532 adjust; a possible dependence simply prevents vectorization. */
534 static bool
535 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
537 struct data_reference *dra = DDR_A (ddr);
538 struct data_reference *drb = DDR_B (ddr);
540 /* We need to check dependences of statements marked as unvectorizable
541 as well; they can still prohibit vectorization. */
543 /* Independent data accesses. */
544 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
545 return false;
547 if (dra == drb)
548 return false;
550 /* Read-read is OK. */
551 if (DR_IS_READ (dra) && DR_IS_READ (drb))
552 return false;
554 /* If dra and drb are part of the same interleaving chain consider
555 them independent. */
556 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
557 && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
558 == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
559 return false;
561 /* Unknown data dependence. */
562 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
564 if (dump_enabled_p ())
566 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
567 "can't determine dependence between ");
568 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
569 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
570 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
571 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
574 else if (dump_enabled_p ())
576 dump_printf_loc (MSG_NOTE, vect_location,
577 "determined dependence between ");
578 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
579 dump_printf (MSG_NOTE, " and ");
580 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
581 dump_printf (MSG_NOTE, "\n");
584 return true;
588 /* Analyze dependences involved in the transform of SLP NODE. STORES
589 contains the vector of scalar stores of this instance if we are
590 disambiguating the loads, and LAST_STORE is the last of those stores. */
592 static bool
593 vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
594 vec<gimple *> stores, gimple *last_store)
596 /* This walks over all stmts involved in the SLP load/store done
597 in NODE verifying we can sink them up to the last stmt in the
598 group. */
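/* The vector statements for the group will be emitted at the position of
the last scalar statement, so each earlier member must be movable past
every intervening memory access. */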
599 gimple *last_access = vect_find_last_scalar_stmt_in_slp (node);
600 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
602 gimple *access = SLP_TREE_SCALAR_STMTS (node)[k];
603 if (access == last_access)
604 continue;
605 data_reference *dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (access));
606 for (gimple_stmt_iterator gsi = gsi_for_stmt (access);
607 gsi_stmt (gsi) != last_access; gsi_next (&gsi))
609 gimple *stmt = gsi_stmt (gsi);
610 if (! gimple_vuse (stmt)
611 || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
612 continue;
614 /* If we couldn't record a (single) data reference for this
615 stmt we have to give up. */
616 /* ??? Here and below if dependence analysis fails we can resort
617 to the alias oracle which can handle more kinds of stmts. */
618 data_reference *dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
619 if (!dr_b)
620 return false;
622 bool dependent = false;
623 /* If we run into a store of this same instance (we've just
624 marked those) then delay dependence checking until we run
625 into the last store because this is where it will have
626 been sunk to (and we verify if we can do that as well). */
627 if (gimple_visited_p (stmt))
629 if (stmt != last_store)
630 continue;
631 unsigned i;
632 gimple *store;
633 FOR_EACH_VEC_ELT (stores, i, store)
635 data_reference *store_dr
636 = STMT_VINFO_DATA_REF (vinfo_for_stmt (store));
637 ddr_p ddr = initialize_data_dependence_relation
638 (dr_a, store_dr, vNULL);
639 dependent = vect_slp_analyze_data_ref_dependence (ddr);
640 free_dependence_relation (ddr);
641 if (dependent)
642 break;
645 else
647 ddr_p ddr = initialize_data_dependence_relation (dr_a,
648 dr_b, vNULL);
649 dependent = vect_slp_analyze_data_ref_dependence (ddr);
650 free_dependence_relation (ddr);
652 if (dependent)
653 return false;
656 return true;
660 /* Function vect_slp_analyze_instance_dependence.
662 Examine all the data references in the SLP instance, and make sure there
663 do not exist any data dependences between them that would prevent
664 vectorizing the basic-block. */
666 bool
667 vect_slp_analyze_instance_dependence (slp_instance instance)
669 if (dump_enabled_p ())
670 dump_printf_loc (MSG_NOTE, vect_location,
671 "=== vect_slp_analyze_instance_dependence ===\n");
673 /* The stores of this instance are at the root of the SLP tree. */
674 slp_tree store = SLP_INSTANCE_TREE (instance);
675 if (! STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (store)[0])))
676 store = NULL;
678 /* Verify we can sink stores to the vectorized stmt insert location. */
679 gimple *last_store = NULL;
680 if (store)
682 if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
683 return false;
685 /* Mark stores in this instance and remember the last one. */
686 last_store = vect_find_last_scalar_stmt_in_slp (store);
687 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
688 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], true);
691 bool res = true;
693 /* Verify we can sink loads to the vectorized stmt insert location,
694 special-casing stores of this instance. */
695 slp_tree load;
696 unsigned int i;
697 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
698 if (! vect_slp_analyze_node_dependences (instance, load,
699 store
700 ? SLP_TREE_SCALAR_STMTS (store)
701 : vNULL, last_store))
703 res = false;
704 break;
707 /* Unset the visited flag. */
708 if (store)
709 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
710 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], false);
712 return res;
715 /* Record in VINFO the base alignment guarantee given by DRB. STMT is
716 the statement that contains DRB, which is useful for recording in the
717 dump file. */
719 static void
720 vect_record_base_alignment (vec_info *vinfo, gimple *stmt,
721 innermost_loop_behavior *drb)
723 bool existed;
724 innermost_loop_behavior *&entry
725 = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
726 if (!existed || entry->base_alignment < drb->base_alignment)
728 entry = drb;
729 if (dump_enabled_p ())
731 dump_printf_loc (MSG_NOTE, vect_location,
732 "recording new base alignment for ");
733 dump_generic_expr (MSG_NOTE, TDF_SLIM, drb->base_address);
734 dump_printf (MSG_NOTE, "\n");
735 dump_printf_loc (MSG_NOTE, vect_location,
736 " alignment: %d\n", drb->base_alignment);
737 dump_printf_loc (MSG_NOTE, vect_location,
738 " misalignment: %d\n", drb->base_misalignment);
739 dump_printf_loc (MSG_NOTE, vect_location,
740 " based on: ");
741 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
746 /* If the region we're going to vectorize is reached, all unconditional
747 data references occur at least once. We can therefore pool the base
748 alignment guarantees from each unconditional reference. Do this by
749 going through all the data references in VINFO and checking whether
750 the containing statement makes the reference unconditionally. If so,
751 record the alignment of the base address in VINFO so that it can be
752 used for all other references with the same base. */
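/* For example, if one unconditional reference uses a base address known to
be 32-byte aligned, other references with the same base address can reuse
that guarantee even if their own statements establish less. */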
754 void
755 vect_record_base_alignments (vec_info *vinfo)
757 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
758 struct loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
759 data_reference *dr;
760 unsigned int i;
761 FOR_EACH_VEC_ELT (vinfo->datarefs, i, dr)
762 if (!DR_IS_CONDITIONAL_IN_STMT (dr))
764 gimple *stmt = DR_STMT (dr);
765 vect_record_base_alignment (vinfo, stmt, &DR_INNERMOST (dr));
767 /* If DR is nested in the loop that is being vectorized, we can also
768 record the alignment of the base wrt the outer loop. */
769 if (loop && nested_in_vect_loop_p (loop, stmt))
771 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
772 vect_record_base_alignment
773 (vinfo, stmt, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
778 /* Return the target alignment for the vectorized form of DR. */
780 static unsigned int
781 vect_calculate_target_alignment (struct data_reference *dr)
783 gimple *stmt = DR_STMT (dr);
784 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
785 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
786 return targetm.vectorize.preferred_vector_alignment (vectype);
789 /* Function vect_compute_data_ref_alignment
791 Compute the misalignment of the data reference DR.
793 Output:
794 1. If during the misalignment computation it is found that the data reference
795 cannot be vectorized then false is returned.
796 2. DR_MISALIGNMENT (DR) is defined.
798 FOR NOW: No analysis is actually performed. Misalignment is calculated
799 only for trivial cases. TODO. */
801 bool
802 vect_compute_data_ref_alignment (struct data_reference *dr)
804 gimple *stmt = DR_STMT (dr);
805 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
806 vec_base_alignments *base_alignments = &stmt_info->vinfo->base_alignments;
807 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
808 struct loop *loop = NULL;
809 tree ref = DR_REF (dr);
810 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
812 if (dump_enabled_p ())
813 dump_printf_loc (MSG_NOTE, vect_location,
814 "vect_compute_data_ref_alignment:\n");
816 if (loop_vinfo)
817 loop = LOOP_VINFO_LOOP (loop_vinfo);
819 /* Initialize misalignment to unknown. */
820 SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
822 innermost_loop_behavior *drb = vect_dr_behavior (dr);
823 bool step_preserves_misalignment_p;
825 unsigned HOST_WIDE_INT vector_alignment
826 = vect_calculate_target_alignment (dr) / BITS_PER_UNIT;
827 DR_TARGET_ALIGNMENT (dr) = vector_alignment;
829 /* No step for BB vectorization. */
830 if (!loop)
832 gcc_assert (integer_zerop (drb->step));
833 step_preserves_misalignment_p = true;
836 /* In case the dataref is in an inner-loop of the loop that is being
837 vectorized (LOOP), we use the base and misalignment information
838 relative to the outer-loop (LOOP). This is ok only if the misalignment
839 stays the same throughout the execution of the inner-loop, which is why
840 we have to check that the stride of the dataref in the inner-loop is a
841 multiple of the vector alignment. */
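/* E.g. with a 16-byte vector alignment an inner-loop step of 32 bytes keeps
the misalignment unchanged across inner iterations, whereas a step of 4
bytes would not. */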
842 else if (nested_in_vect_loop_p (loop, stmt))
844 step_preserves_misalignment_p
845 = (DR_STEP_ALIGNMENT (dr) % vector_alignment) == 0;
847 if (dump_enabled_p ())
849 if (step_preserves_misalignment_p)
850 dump_printf_loc (MSG_NOTE, vect_location,
851 "inner step divides the vector alignment.\n");
852 else
853 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
854 "inner step doesn't divide the vector"
855 " alignment.\n");
859 /* Similarly we can only use base and misalignment information relative to
860 an innermost loop if the misalignment stays the same throughout the
861 execution of the loop. As above, this is the case if the stride of
862 the dataref, multiplied by the VF, is a multiple of the alignment. */
863 else
865 unsigned vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
866 step_preserves_misalignment_p
867 = ((DR_STEP_ALIGNMENT (dr) * vf) % vector_alignment) == 0;
869 if (!step_preserves_misalignment_p && dump_enabled_p ())
870 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
871 "step doesn't divide the vector alignment.\n");
874 unsigned int base_alignment = drb->base_alignment;
875 unsigned int base_misalignment = drb->base_misalignment;
877 /* Calculate the maximum of the pooled base address alignment and the
878 alignment that we can compute for DR itself. */
879 innermost_loop_behavior **entry = base_alignments->get (drb->base_address);
880 if (entry && base_alignment < (*entry)->base_alignment)
882 base_alignment = (*entry)->base_alignment;
883 base_misalignment = (*entry)->base_misalignment;
886 if (drb->offset_alignment < vector_alignment
887 || !step_preserves_misalignment_p
888 /* We need to know whether the step wrt the vectorized loop is
889 negative when computing the starting misalignment below. */
890 || TREE_CODE (drb->step) != INTEGER_CST)
892 if (dump_enabled_p ())
894 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
895 "Unknown alignment for access: ");
896 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
897 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
899 return true;
902 if (base_alignment < vector_alignment)
904 tree base = drb->base_address;
905 if (TREE_CODE (base) == ADDR_EXPR)
906 base = TREE_OPERAND (base, 0);
907 if (!vect_can_force_dr_alignment_p (base,
908 vector_alignment * BITS_PER_UNIT))
910 if (dump_enabled_p ())
912 dump_printf_loc (MSG_NOTE, vect_location,
913 "can't force alignment of ref: ");
914 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
915 dump_printf (MSG_NOTE, "\n");
917 return true;
920 if (DECL_USER_ALIGN (base))
922 if (dump_enabled_p ())
924 dump_printf_loc (MSG_NOTE, vect_location,
925 "not forcing alignment of user-aligned "
926 "variable: ");
927 dump_generic_expr (MSG_NOTE, TDF_SLIM, base);
928 dump_printf (MSG_NOTE, "\n");
930 return true;
933 /* Force the alignment of the decl.
934 NOTE: This is the only change to the code we make during
935 the analysis phase, before deciding to vectorize the loop. */
936 if (dump_enabled_p ())
938 dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
939 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
940 dump_printf (MSG_NOTE, "\n");
943 DR_VECT_AUX (dr)->base_decl = base;
944 DR_VECT_AUX (dr)->base_misaligned = true;
945 base_misalignment = 0;
947 unsigned int misalignment = (base_misalignment
948 + TREE_INT_CST_LOW (drb->init));
950 /* If this is a backward running DR then the first access in the larger
951 vector type is actually N-1 elements before the address in the DR.
952 Adjust the misalignment accordingly. */
953 if (tree_int_cst_sgn (drb->step) < 0)
954 /* PLUS because STEP is negative. */
955 misalignment += ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
956 * TREE_INT_CST_LOW (drb->step));
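/* E.g. for a four-element vector access with a step of -4 bytes the vector
load starts 3 elements (12 bytes) below DR's address, so 12 is subtracted
here before the misalignment is reduced modulo the target alignment. */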
958 SET_DR_MISALIGNMENT (dr, misalignment & (vector_alignment - 1));
960 if (dump_enabled_p ())
962 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
963 "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
964 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
965 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
968 return true;
971 /* Function vect_update_misalignment_for_peel.
972 Sets DR's misalignment
973 - to 0 if it has the same alignment as DR_PEEL,
974 - to the misalignment computed using NPEEL if DR's misalignment is known,
975 - to -1 (unknown) otherwise.
977 DR - the data reference whose misalignment is to be adjusted.
978 DR_PEEL - the data reference whose misalignment is being made
979 zero in the vector loop by the peel.
980 NPEEL - the number of iterations in the peel loop if the misalignment
981 of DR_PEEL is known at compile time. */
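/* For example, with a 16-byte target alignment, a 4-byte element size and
NPEEL == 3, a forward reference whose misalignment was 4 becomes
(4 + 3 * 4) & 15 == 0, i.e. aligned after the peel. */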
983 static void
984 vect_update_misalignment_for_peel (struct data_reference *dr,
985 struct data_reference *dr_peel, int npeel)
987 unsigned int i;
988 vec<dr_p> same_aligned_drs;
989 struct data_reference *current_dr;
990 int dr_size = vect_get_scalar_dr_size (dr);
991 int dr_peel_size = vect_get_scalar_dr_size (dr_peel);
992 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
993 stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));
995 /* For interleaved data accesses the step in the loop must be multiplied by
996 the size of the interleaving group. */
997 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
998 dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
999 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
1000 dr_peel_size *= GROUP_SIZE (peel_stmt_info);
1002 /* It can be assumed that the data refs with the same alignment as dr_peel
1003 are aligned in the vector loop. */
1004 same_aligned_drs
1005 = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
1006 FOR_EACH_VEC_ELT (same_aligned_drs, i, current_dr)
1008 if (current_dr != dr)
1009 continue;
1010 gcc_assert (!known_alignment_for_access_p (dr)
1011 || !known_alignment_for_access_p (dr_peel)
1012 || (DR_MISALIGNMENT (dr) / dr_size
1013 == DR_MISALIGNMENT (dr_peel) / dr_peel_size));
1014 SET_DR_MISALIGNMENT (dr, 0);
1015 return;
1018 if (known_alignment_for_access_p (dr)
1019 && known_alignment_for_access_p (dr_peel))
1021 bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
1022 int misal = DR_MISALIGNMENT (dr);
1023 misal += negative ? -npeel * dr_size : npeel * dr_size;
1024 misal &= DR_TARGET_ALIGNMENT (dr) - 1;
1025 SET_DR_MISALIGNMENT (dr, misal);
1026 return;
1029 if (dump_enabled_p ())
1030 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1031 "to unknown (-1).\n");
1032 SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
1036 /* Function verify_data_ref_alignment
1038 Return TRUE if DR can be handled with respect to alignment. */
1040 static bool
1041 verify_data_ref_alignment (data_reference_p dr)
1043 enum dr_alignment_support supportable_dr_alignment
1044 = vect_supportable_dr_alignment (dr, false);
1045 if (!supportable_dr_alignment)
1047 if (dump_enabled_p ())
1049 if (DR_IS_READ (dr))
1050 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1051 "not vectorized: unsupported unaligned load.");
1052 else
1053 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1054 "not vectorized: unsupported unaligned "
1055 "store.");
1057 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
1058 DR_REF (dr));
1059 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1061 return false;
1064 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
1065 dump_printf_loc (MSG_NOTE, vect_location,
1066 "Vectorizing an unaligned access.\n");
1068 return true;
1071 /* Function vect_verify_datarefs_alignment
1073 Return TRUE if all data references in the loop can be
1074 handled with respect to alignment. */
1076 bool
1077 vect_verify_datarefs_alignment (loop_vec_info vinfo)
1079 vec<data_reference_p> datarefs = vinfo->datarefs;
1080 struct data_reference *dr;
1081 unsigned int i;
1083 FOR_EACH_VEC_ELT (datarefs, i, dr)
1085 gimple *stmt = DR_STMT (dr);
1086 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1088 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1089 continue;
1091 /* For interleaving, only the alignment of the first access matters. */
1092 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1093 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1094 continue;
1096 /* Strided accesses perform only component accesses, alignment is
1097 irrelevant for them. */
1098 if (STMT_VINFO_STRIDED_P (stmt_info)
1099 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1100 continue;
1102 if (! verify_data_ref_alignment (dr))
1103 return false;
1106 return true;
1109 /* Given a memory reference EXP return whether its alignment is less
1110 than its size. */
1112 static bool
1113 not_size_aligned (tree exp)
1115 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1116 return true;
1118 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1119 > get_object_alignment (exp));
1122 /* Function vector_alignment_reachable_p
1124 Return true if vector alignment for DR is reachable by peeling
1125 a few loop iterations. Return false otherwise. */
1127 static bool
1128 vector_alignment_reachable_p (struct data_reference *dr)
1130 gimple *stmt = DR_STMT (dr);
1131 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1132 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1134 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1136 /* For interleaved access we peel only if the number of iterations in
1137 the prolog loop (VF - misalignment) is a multiple of the
1138 number of interleaved accesses. */
1139 int elem_size, mis_in_elements;
1140 int nelements = TYPE_VECTOR_SUBPARTS (vectype);
1142 /* FORNOW: handle only known alignment. */
1143 if (!known_alignment_for_access_p (dr))
1144 return false;
1146 elem_size = GET_MODE_SIZE (TYPE_MODE (vectype)) / nelements;
1147 mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
1149 if ((nelements - mis_in_elements) % GROUP_SIZE (stmt_info))
1150 return false;
1153 /* If the misalignment is known at compile time then allow peeling
1154 only if natural alignment is reachable through peeling. */
1155 if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
1157 HOST_WIDE_INT elmsize =
1158 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1159 if (dump_enabled_p ())
1161 dump_printf_loc (MSG_NOTE, vect_location,
1162 "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1163 dump_printf (MSG_NOTE,
1164 ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
1166 if (DR_MISALIGNMENT (dr) % elmsize)
1168 if (dump_enabled_p ())
1169 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1170 "data size does not divide the misalignment.\n");
1171 return false;
1175 if (!known_alignment_for_access_p (dr))
1177 tree type = TREE_TYPE (DR_REF (dr));
1178 bool is_packed = not_size_aligned (DR_REF (dr));
1179 if (dump_enabled_p ())
1180 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1181 "Unknown misalignment, %snaturally aligned\n",
1182 is_packed ? "not " : "");
1183 return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1186 return true;
1190 /* Calculate the cost of the memory access represented by DR. */
1192 static void
1193 vect_get_data_access_cost (struct data_reference *dr,
1194 unsigned int *inside_cost,
1195 unsigned int *outside_cost,
1196 stmt_vector_for_cost *body_cost_vec)
1198 gimple *stmt = DR_STMT (dr);
1199 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1200 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1201 int ncopies;
1203 if (PURE_SLP_STMT (stmt_info))
1204 ncopies = 1;
1205 else
1206 ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1208 if (DR_IS_READ (dr))
1209 vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1210 NULL, body_cost_vec, false);
1211 else
1212 vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
1214 if (dump_enabled_p ())
1215 dump_printf_loc (MSG_NOTE, vect_location,
1216 "vect_get_data_access_cost: inside_cost = %d, "
1217 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1221 typedef struct _vect_peel_info
1223 struct data_reference *dr;
1224 int npeel;
1225 unsigned int count;
1226 } *vect_peel_info;
1228 typedef struct _vect_peel_extended_info
1230 struct _vect_peel_info peel_info;
1231 unsigned int inside_cost;
1232 unsigned int outside_cost;
1233 } *vect_peel_extended_info;
1236 /* Peeling hashtable helpers. */
1238 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1240 static inline hashval_t hash (const _vect_peel_info *);
1241 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1244 inline hashval_t
1245 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1247 return (hashval_t) peel_info->npeel;
1250 inline bool
1251 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1253 return (a->npeel == b->npeel);
1257 /* Insert DR into peeling hash table with NPEEL as key. */
1259 static void
1260 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1261 loop_vec_info loop_vinfo, struct data_reference *dr,
1262 int npeel)
1264 struct _vect_peel_info elem, *slot;
1265 _vect_peel_info **new_slot;
1266 bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1268 elem.npeel = npeel;
1269 slot = peeling_htab->find (&elem);
1270 if (slot)
1271 slot->count++;
1272 else
1274 slot = XNEW (struct _vect_peel_info);
1275 slot->npeel = npeel;
1276 slot->dr = dr;
1277 slot->count = 1;
1278 new_slot = peeling_htab->find_slot (slot, INSERT);
1279 *new_slot = slot;
1282 if (!supportable_dr_alignment
1283 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1284 slot->count += VECT_MAX_COST;
1288 /* Traverse the peeling hash table to find the peeling option that aligns
1289 the maximum number of data accesses. */
1291 int
1292 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1293 _vect_peel_extended_info *max)
1295 vect_peel_info elem = *slot;
1297 if (elem->count > max->peel_info.count
1298 || (elem->count == max->peel_info.count
1299 && max->peel_info.npeel > elem->npeel))
1301 max->peel_info.npeel = elem->npeel;
1302 max->peel_info.count = elem->count;
1303 max->peel_info.dr = elem->dr;
1306 return 1;
1309 /* Get the costs of peeling NPEEL iterations checking data access costs
1310 for all data refs. If UNKNOWN_MISALIGNMENT is true, we assume DR0's
1311 misalignment will be zero after peeling. */
1313 static void
1314 vect_get_peeling_costs_all_drs (vec<data_reference_p> datarefs,
1315 struct data_reference *dr0,
1316 unsigned int *inside_cost,
1317 unsigned int *outside_cost,
1318 stmt_vector_for_cost *body_cost_vec,
1319 unsigned int npeel,
1320 bool unknown_misalignment)
1322 unsigned i;
1323 data_reference *dr;
1325 FOR_EACH_VEC_ELT (datarefs, i, dr)
1327 gimple *stmt = DR_STMT (dr);
1328 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1329 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1330 continue;
1332 /* For interleaving, only the alignment of the first access
1333 matters. */
1334 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1335 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1336 continue;
1338 /* Strided accesses perform only component accesses, alignment is
1339 irrelevant for them. */
1340 if (STMT_VINFO_STRIDED_P (stmt_info)
1341 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1342 continue;
1344 int save_misalignment;
1345 save_misalignment = DR_MISALIGNMENT (dr);
1346 if (npeel == 0)
1348 else if (unknown_misalignment && dr == dr0)
1349 SET_DR_MISALIGNMENT (dr, 0);
1350 else
1351 vect_update_misalignment_for_peel (dr, dr0, npeel);
1352 vect_get_data_access_cost (dr, inside_cost, outside_cost,
1353 body_cost_vec);
1354 SET_DR_MISALIGNMENT (dr, save_misalignment);
1358 /* Traverse the peeling hash table and calculate the cost for each peeling
1359 option. Find the one with the lowest cost. */
1361 int
1362 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1363 _vect_peel_extended_info *min)
1365 vect_peel_info elem = *slot;
1366 int dummy;
1367 unsigned int inside_cost = 0, outside_cost = 0;
1368 gimple *stmt = DR_STMT (elem->dr);
1369 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1370 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1371 stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1372 epilogue_cost_vec;
1374 prologue_cost_vec.create (2);
1375 body_cost_vec.create (2);
1376 epilogue_cost_vec.create (2);
1378 vect_get_peeling_costs_all_drs (LOOP_VINFO_DATAREFS (loop_vinfo),
1379 elem->dr, &inside_cost, &outside_cost,
1380 &body_cost_vec, elem->npeel, false);
1382 body_cost_vec.release ();
1384 outside_cost += vect_get_known_peeling_cost
1385 (loop_vinfo, elem->npeel, &dummy,
1386 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1387 &prologue_cost_vec, &epilogue_cost_vec);
1389 /* Prologue and epilogue costs are added to the target model later.
1390 These costs depend only on the scalar iteration cost, the
1391 number of peeling iterations finally chosen, and the number of
1392 misaligned statements. So discard the information found here. */
1393 prologue_cost_vec.release ();
1394 epilogue_cost_vec.release ();
1396 if (inside_cost < min->inside_cost
1397 || (inside_cost == min->inside_cost
1398 && outside_cost < min->outside_cost))
1400 min->inside_cost = inside_cost;
1401 min->outside_cost = outside_cost;
1402 min->peel_info.dr = elem->dr;
1403 min->peel_info.npeel = elem->npeel;
1404 min->peel_info.count = elem->count;
1407 return 1;
1411 /* Choose best peeling option by traversing peeling hash table and either
1412 choosing an option with the lowest cost (if cost model is enabled) or the
1413 option that aligns as many accesses as possible. */
1415 static struct _vect_peel_extended_info
1416 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1417 loop_vec_info loop_vinfo)
1419 struct _vect_peel_extended_info res;
1421 res.peel_info.dr = NULL;
1423 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1425 res.inside_cost = INT_MAX;
1426 res.outside_cost = INT_MAX;
1427 peeling_htab->traverse <_vect_peel_extended_info *,
1428 vect_peeling_hash_get_lowest_cost> (&res);
1430 else
1432 res.peel_info.count = 0;
1433 peeling_htab->traverse <_vect_peel_extended_info *,
1434 vect_peeling_hash_get_most_frequent> (&res);
1435 res.inside_cost = 0;
1436 res.outside_cost = 0;
1439 return res;
1442 /* Return true if peeling NPEEL iterations to align DR0 is supported. */
1444 static bool
1445 vect_peeling_supportable (loop_vec_info loop_vinfo, struct data_reference *dr0,
1446 unsigned npeel)
1448 unsigned i;
1449 struct data_reference *dr = NULL;
1450 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1451 gimple *stmt;
1452 stmt_vec_info stmt_info;
1453 enum dr_alignment_support supportable_dr_alignment;
1455 /* Ensure that all data refs can be vectorized after the peel. */
1456 FOR_EACH_VEC_ELT (datarefs, i, dr)
1458 int save_misalignment;
1460 if (dr == dr0)
1461 continue;
1463 stmt = DR_STMT (dr);
1464 stmt_info = vinfo_for_stmt (stmt);
1465 /* For interleaving, only the alignment of the first access
1466 matters. */
1467 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1468 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1469 continue;
1471 /* Strided accesses perform only component accesses, alignment is
1472 irrelevant for them. */
1473 if (STMT_VINFO_STRIDED_P (stmt_info)
1474 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1475 continue;
1477 save_misalignment = DR_MISALIGNMENT (dr);
1478 vect_update_misalignment_for_peel (dr, dr0, npeel);
1479 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1480 SET_DR_MISALIGNMENT (dr, save_misalignment);
1482 if (!supportable_dr_alignment)
1483 return false;
1486 return true;
1489 /* Function vect_enhance_data_refs_alignment
1491 This pass will use loop versioning and loop peeling in order to enhance
1492 the alignment of data references in the loop.
1494 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1495 original loop is to be vectorized. Any other loops that are created by
1496 the transformations performed in this pass are not supposed to be
1497 vectorized. This restriction will be relaxed.
1499 This pass will require a cost model to guide it whether to apply peeling
1500 or versioning or a combination of the two. For example, the scheme that
1501 Intel uses when given a loop with several memory accesses is as follows:
1502 choose one memory access ('p') whose alignment you want to force by doing
1503 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1504 other accesses are not necessarily aligned, or (2) use loop versioning to
1505 generate one loop in which all accesses are aligned, and another loop in
1506 which only 'p' is necessarily aligned.
1508 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1509 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1510 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1512 Devising a cost model is the most critical aspect of this work. It will
1513 guide us on which access to peel for, whether to use loop versioning, how
1514 many versions to create, etc. The cost model will probably consist of
1515 generic considerations as well as target specific considerations (on
1516 powerpc for example, misaligned stores are more painful than misaligned
1517 loads).
1519 Here are the general steps involved in alignment enhancements:
1521 -- original loop, before alignment analysis:
1522 for (i=0; i<N; i++){
1523 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1524 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1527 -- After vect_compute_data_refs_alignment:
1528 for (i=0; i<N; i++){
1529 x = q[i]; # DR_MISALIGNMENT(q) = 3
1530 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1533 -- Possibility 1: we do loop versioning:
1534 if (p is aligned) {
1535 for (i=0; i<N; i++){ # loop 1A
1536 x = q[i]; # DR_MISALIGNMENT(q) = 3
1537 p[i] = y; # DR_MISALIGNMENT(p) = 0
1540 else {
1541 for (i=0; i<N; i++){ # loop 1B
1542 x = q[i]; # DR_MISALIGNMENT(q) = 3
1543 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1547 -- Possibility 2: we do loop peeling:
1548 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1549 x = q[i];
1550 p[i] = y;
1552 for (i = 3; i < N; i++){ # loop 2A
1553 x = q[i]; # DR_MISALIGNMENT(q) = 0
1554 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1557 -- Possibility 3: combination of loop peeling and versioning:
1558 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1559 x = q[i];
1560 p[i] = y;
1562 if (p is aligned) {
1563 for (i = 3; i<N; i++){ # loop 3A
1564 x = q[i]; # DR_MISALIGNMENT(q) = 0
1565 p[i] = y; # DR_MISALIGNMENT(p) = 0
1568 else {
1569 for (i = 3; i<N; i++){ # loop 3B
1570 x = q[i]; # DR_MISALIGNMENT(q) = 0
1571 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1575 These loops are later passed to loop_transform to be vectorized. The
1576 vectorizer will use the alignment information to guide the transformation
1577 (whether to generate regular loads/stores, or with special handling for
1578 misalignment). */
1580 bool
1581 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1583 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1584 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1585 enum dr_alignment_support supportable_dr_alignment;
1586 struct data_reference *dr0 = NULL, *first_store = NULL;
1587 struct data_reference *dr;
1588 unsigned int i, j;
1589 bool do_peeling = false;
1590 bool do_versioning = false;
1591 bool stat;
1592 gimple *stmt;
1593 stmt_vec_info stmt_info;
1594 unsigned int npeel = 0;
1595 bool one_misalignment_known = false;
1596 bool one_misalignment_unknown = false;
1597 bool one_dr_unsupportable = false;
1598 struct data_reference *unsupportable_dr = NULL;
1599 unsigned int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1600 unsigned possible_npeel_number = 1;
1601 tree vectype;
1602 unsigned int nelements, mis, same_align_drs_max = 0;
1603 hash_table<peel_info_hasher> peeling_htab (1);
1605 if (dump_enabled_p ())
1606 dump_printf_loc (MSG_NOTE, vect_location,
1607 "=== vect_enhance_data_refs_alignment ===\n");
1609 /* Reset data so we can safely be called multiple times. */
1610 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1611 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1613 /* While cost model enhancements are expected in the future, the high level
1614 view of the code at this time is as follows:
1616 A) If there is a misaligned access then see if peeling to align
1617 this access can make all data references satisfy
1618 vect_supportable_dr_alignment. If so, update data structures
1619 as needed and return true.
1621 B) If peeling wasn't possible and there is a data reference with an
1622 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1623 then see if loop versioning checks can be used to make all data
1624 references satisfy vect_supportable_dr_alignment. If so, update
1625 data structures as needed and return true.
1627 C) If neither peeling nor versioning were successful then return false if
1628 any data reference does not satisfy vect_supportable_dr_alignment.
1630 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1632 Note, Possibility 3 above (which is peeling and versioning together) is not
1633 being done at this time. */
1635 /* (1) Peeling to force alignment. */
1637 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1638 Considerations:
1639 + How many accesses will become aligned due to the peeling
1640 - How many accesses will become unaligned due to the peeling,
1641 and the cost of misaligned accesses.
1642 - The cost of peeling (the extra runtime checks, the increase
1643 in code size). */
1645 FOR_EACH_VEC_ELT (datarefs, i, dr)
1647 stmt = DR_STMT (dr);
1648 stmt_info = vinfo_for_stmt (stmt);
1650 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1651 continue;
1653 /* For interleaving, only the alignment of the first access
1654 matters. */
1655 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1656 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1657 continue;
1659 /* For invariant accesses there is nothing to enhance. */
1660 if (integer_zerop (DR_STEP (dr)))
1661 continue;
1663 /* Strided accesses perform only component accesses, alignment is
1664 irrelevant for them. */
1665 if (STMT_VINFO_STRIDED_P (stmt_info)
1666 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1667 continue;
1669 supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1670 do_peeling = vector_alignment_reachable_p (dr);
1671 if (do_peeling)
1673 if (known_alignment_for_access_p (dr))
1675 unsigned int npeel_tmp = 0;
1676 bool negative = tree_int_cst_compare (DR_STEP (dr),
1677 size_zero_node) < 0;
1679 vectype = STMT_VINFO_VECTYPE (stmt_info);
1680 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1681 unsigned int target_align = DR_TARGET_ALIGNMENT (dr);
1682 unsigned int dr_size = vect_get_scalar_dr_size (dr);
1683 mis = (negative ? DR_MISALIGNMENT (dr) : -DR_MISALIGNMENT (dr));
1684 if (DR_MISALIGNMENT (dr) != 0)
1685 npeel_tmp = (mis & (target_align - 1)) / dr_size;
1687 /* For multiple types, it is possible that the bigger type access
1688 will have more than one peeling option. E.g., a loop with two
1689 types: one of size (vector size / 4), and the other one of
1690 size (vector size / 8). The vectorization factor will be 8. If both
1691 accesses are misaligned by 3, the first one needs one scalar
1692 iteration to be aligned, and the second one needs 5. But the
1693 first one will be aligned also by peeling 5 scalar
1694 iterations, and in that case both accesses will be aligned.
1695 Hence, in addition to the immediate peeling amount, we also want
1696 to try adding a full vector's worth of scalar iterations, as long
1697 as we don't exceed the vectorization factor.
1698 We do this automatically when using the cost model, since we
1699 calculate the cost for every peeling option. */
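/* E.g. with a 16-byte target alignment and 4-byte elements, a misalignment
of 12 bytes needs one peel iteration; the further candidates recorded
below are 5, 9, ... up to the limit given by possible_npeel_number. */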
1700 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1702 if (STMT_SLP_TYPE (stmt_info))
1703 possible_npeel_number
1704 = (vf * GROUP_SIZE (stmt_info)) / nelements;
1705 else
1706 possible_npeel_number = vf / nelements;
1708 /* NPEEL_TMP is 0 when there is no misalignment, but also
1709 allow peeling NELEMENTS. */
1710 if (DR_MISALIGNMENT (dr) == 0)
1711 possible_npeel_number++;
1714 /* Save info about DR in the hash table. Also include peeling
1715 amounts according to the explanation above. */
1716 for (j = 0; j < possible_npeel_number; j++)
1718 vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1719 dr, npeel_tmp);
1720 npeel_tmp += target_align / dr_size;
1723 one_misalignment_known = true;
1725 else
1727 /* If we don't know any misalignment values, we prefer
1728 peeling for the data-ref that has the maximum number of data-refs
1729 with the same alignment, unless the target prefers to align
1730 stores over loads. */
1731 unsigned same_align_drs
1732 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1733 if (!dr0
1734 || same_align_drs_max < same_align_drs)
1736 same_align_drs_max = same_align_drs;
1737 dr0 = dr;
1739 /* For data-refs with the same number of related
1740 accesses prefer the one where the misalign
1741 computation will be invariant in the outermost loop. */
1742 else if (same_align_drs_max == same_align_drs)
1744 struct loop *ivloop0, *ivloop;
1745 ivloop0 = outermost_invariant_loop_for_expr
1746 (loop, DR_BASE_ADDRESS (dr0));
1747 ivloop = outermost_invariant_loop_for_expr
1748 (loop, DR_BASE_ADDRESS (dr));
1749 if ((ivloop && !ivloop0)
1750 || (ivloop && ivloop0
1751 && flow_loop_nested_p (ivloop, ivloop0)))
1752 dr0 = dr;
1755 one_misalignment_unknown = true;
1757 /* Check for data refs with unsupportable alignment that
1758 can be peeled. */
1759 if (!supportable_dr_alignment)
1761 one_dr_unsupportable = true;
1762 unsupportable_dr = dr;
1765 if (!first_store && DR_IS_WRITE (dr))
1766 first_store = dr;
1769 else
1771 if (!aligned_access_p (dr))
1773 if (dump_enabled_p ())
1774 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1775 "vector alignment may not be reachable\n");
1776 break;
1781 /* Check if we can possibly peel the loop. */
1782 if (!vect_can_advance_ivs_p (loop_vinfo)
1783 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1784 || loop->inner)
1785 do_peeling = false;
1787 struct _vect_peel_extended_info peel_for_known_alignment;
1788 struct _vect_peel_extended_info peel_for_unknown_alignment;
1789 struct _vect_peel_extended_info best_peel;
1791 peel_for_unknown_alignment.inside_cost = INT_MAX;
1792 peel_for_unknown_alignment.outside_cost = INT_MAX;
1793 peel_for_unknown_alignment.peel_info.count = 0;
1795 if (do_peeling
1796 && one_misalignment_unknown)
1798 /* Check whether the target prefers to align stores over loads, i.e.,
1799 whether misaligned stores are more expensive than misaligned loads
1800 (taking drs with the same alignment into account). */
1801 unsigned int load_inside_cost = 0;
1802 unsigned int load_outside_cost = 0;
1803 unsigned int store_inside_cost = 0;
1804 unsigned int store_outside_cost = 0;
1806 stmt_vector_for_cost dummy;
1807 dummy.create (2);
1808 vect_get_peeling_costs_all_drs (datarefs, dr0,
1809 &load_inside_cost,
1810 &load_outside_cost,
1811 &dummy, vf / 2, true);
1812 dummy.release ();
1814 if (first_store)
1816 dummy.create (2);
1817 vect_get_peeling_costs_all_drs (datarefs, first_store,
1818 &store_inside_cost,
1819 &store_outside_cost,
1820 &dummy, vf / 2, true);
1821 dummy.release ();
1823 else
1825 store_inside_cost = INT_MAX;
1826 store_outside_cost = INT_MAX;
1829 if (load_inside_cost > store_inside_cost
1830 || (load_inside_cost == store_inside_cost
1831 && load_outside_cost > store_outside_cost))
1833 dr0 = first_store;
1834 peel_for_unknown_alignment.inside_cost = store_inside_cost;
1835 peel_for_unknown_alignment.outside_cost = store_outside_cost;
1837 else
1839 peel_for_unknown_alignment.inside_cost = load_inside_cost;
1840 peel_for_unknown_alignment.outside_cost = load_outside_cost;
1843 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1844 prologue_cost_vec.create (2);
1845 epilogue_cost_vec.create (2);
1847 int dummy2;
1848 peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
1849 (loop_vinfo, vf / 2, &dummy2,
1850 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1851 &prologue_cost_vec, &epilogue_cost_vec);
1853 prologue_cost_vec.release ();
1854 epilogue_cost_vec.release ();
1856 peel_for_unknown_alignment.peel_info.count = 1
1857 + STMT_VINFO_SAME_ALIGN_REFS
1858 (vinfo_for_stmt (DR_STMT (dr0))).length ();
1861 peel_for_unknown_alignment.peel_info.npeel = 0;
1862 peel_for_unknown_alignment.peel_info.dr = dr0;
1864 best_peel = peel_for_unknown_alignment;
1866 peel_for_known_alignment.inside_cost = INT_MAX;
1867 peel_for_known_alignment.outside_cost = INT_MAX;
1868 peel_for_known_alignment.peel_info.count = 0;
1869 peel_for_known_alignment.peel_info.dr = NULL;
1871 if (do_peeling && one_misalignment_known)
1873 /* Peeling is possible, and no data access requires alignment in order
1874 to be supported. So we try to choose the best possible peeling from
1875 the hash table. */
1876 peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
1877 (&peeling_htab, loop_vinfo);
1880 /* Compare costs of peeling for known and unknown alignment. */
1881 if (peel_for_known_alignment.peel_info.dr != NULL
1882 && peel_for_unknown_alignment.inside_cost
1883 >= peel_for_known_alignment.inside_cost)
1885 best_peel = peel_for_known_alignment;
1887 /* If the best peeling for known alignment has NPEEL == 0, perform no
1888 peeling at all except if there is an unsupportable dr that we can
1889 align. */
1890 if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
1891 do_peeling = false;
1894 /* If there is an unsupportable data ref, prefer this over all choices so far
1895 since we'd have to discard a chosen peeling anyway, except when it
1896 accidentally aligned the unsupportable data ref as well. */
1897 if (one_dr_unsupportable)
1898 dr0 = unsupportable_dr;
1899 else if (do_peeling)
1901 /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
1902 TODO: Use nopeel_outside_cost or get rid of it? */
1903 unsigned nopeel_inside_cost = 0;
1904 unsigned nopeel_outside_cost = 0;
1906 stmt_vector_for_cost dummy;
1907 dummy.create (2);
1908 vect_get_peeling_costs_all_drs (datarefs, NULL, &nopeel_inside_cost,
1909 &nopeel_outside_cost, &dummy, 0, false);
1910 dummy.release ();
1912 /* Add epilogue costs. As we do not peel for alignment here, no prologue
1913 costs will be recorded. */
1914 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1915 prologue_cost_vec.create (2);
1916 epilogue_cost_vec.create (2);
1918 int dummy2;
1919 nopeel_outside_cost += vect_get_known_peeling_cost
1920 (loop_vinfo, 0, &dummy2,
1921 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1922 &prologue_cost_vec, &epilogue_cost_vec);
1924 prologue_cost_vec.release ();
1925 epilogue_cost_vec.release ();
1927 npeel = best_peel.peel_info.npeel;
1928 dr0 = best_peel.peel_info.dr;
1930 /* If not peeling at all is no more expensive than the best peeling we
1931 have so far, don't perform any peeling. */
1932 if (nopeel_inside_cost <= best_peel.inside_cost)
1933 do_peeling = false;
1936 if (do_peeling)
1938 stmt = DR_STMT (dr0);
1939 stmt_info = vinfo_for_stmt (stmt);
1940 vectype = STMT_VINFO_VECTYPE (stmt_info);
1942 if (known_alignment_for_access_p (dr0))
1944 bool negative = tree_int_cst_compare (DR_STEP (dr0),
1945 size_zero_node) < 0;
1946 if (!npeel)
1948 /* Since it's known at compile time, compute the number of
1949 iterations in the peeled loop (the peeling factor) for use in
1950 updating DR_MISALIGNMENT values. The peeling factor is the
1951 vectorization factor minus the misalignment as an element
1952 count. */
1953 mis = negative ? DR_MISALIGNMENT (dr0) : -DR_MISALIGNMENT (dr0);
1954 unsigned int target_align = DR_TARGET_ALIGNMENT (dr0);
1955 npeel = ((mis & (target_align - 1))
1956 / vect_get_scalar_dr_size (dr0));
1959 /* For interleaved data access every iteration accesses all the
1960 members of the group, therefore we divide the number of iterations
1961 by the group size. */
1962 stmt_info = vinfo_for_stmt (DR_STMT (dr0));
1963 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1964 npeel /= GROUP_SIZE (stmt_info);
1966 if (dump_enabled_p ())
1967 dump_printf_loc (MSG_NOTE, vect_location,
1968 "Try peeling by %d\n", npeel);
1971 /* Ensure that all datarefs can be vectorized after the peel. */
1972 if (!vect_peeling_supportable (loop_vinfo, dr0, npeel))
1973 do_peeling = false;
1975 /* Check if all datarefs are supportable and log. */
1976 if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
1978 stat = vect_verify_datarefs_alignment (loop_vinfo);
1979 if (!stat)
1980 do_peeling = false;
1981 else
1982 return stat;
1985 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
1986 if (do_peeling)
1988 unsigned max_allowed_peel
1989 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
1990 if (max_allowed_peel != (unsigned)-1)
1992 unsigned max_peel = npeel;
1993 if (max_peel == 0)
1995 unsigned int target_align = DR_TARGET_ALIGNMENT (dr0);
1996 max_peel = target_align / vect_get_scalar_dr_size (dr0) - 1;
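/* For illustration (hypothetical values): with a 16-byte target
   alignment and 4-byte elements, an unknown misalignment means we may
   have to peel at most 16 / 4 - 1 = 3 scalar iterations.  */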
1998 if (max_peel > max_allowed_peel)
2000 do_peeling = false;
2001 if (dump_enabled_p ())
2002 dump_printf_loc (MSG_NOTE, vect_location,
2003 "Disable peeling, max peels reached: %d\n", max_peel);
2008 /* Cost model #2 - if peeling may result in a remaining loop not
2009 iterating enough to be vectorized then do not peel. */
2010 if (do_peeling
2011 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2013 unsigned max_peel
2014 = npeel == 0 ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1 : npeel;
2015 if (LOOP_VINFO_INT_NITERS (loop_vinfo)
2016 < LOOP_VINFO_VECT_FACTOR (loop_vinfo) + max_peel)
2017 do_peeling = false;
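/* Hypothetical example: with VF == 4 and max_peel == 3, a loop known to
   run only 6 iterations fails the check (6 < 4 + 3), so peeling is
   rejected because the remaining loop might not reach even a single
   full vector iteration.  */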
2020 if (do_peeling)
2022 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2023 If the misalignment of DR_i is identical to that of dr0 then set
2024 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
2025 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2026 by the peeling factor times the element size of DR_i (MOD the
2027 vectorization factor times the size). Otherwise, the
2028 misalignment of DR_i must be set to unknown. */
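/* A worked example of the rule above (hypothetical numbers): with
   NPEEL == 2, VF == 4 and 4-byte elements, a second data reference
   with a known misalignment of 4 bytes ends up with misalignment
   (4 + 2 * 4) mod 16 = 12, while a DR whose misalignment was unknown
   before peeling stays unknown.  */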
2029 FOR_EACH_VEC_ELT (datarefs, i, dr)
2030 if (dr != dr0)
2032 /* Strided accesses perform only component accesses, alignment
2033 is irrelevant for them. */
2034 stmt_info = vinfo_for_stmt (DR_STMT (dr));
2035 if (STMT_VINFO_STRIDED_P (stmt_info)
2036 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2037 continue;
2039 vect_update_misalignment_for_peel (dr, dr0, npeel);
2042 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
2043 if (npeel)
2044 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2045 else
2046 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2047 = DR_MISALIGNMENT (dr0);
2048 SET_DR_MISALIGNMENT (dr0, 0);
2049 if (dump_enabled_p ())
2051 dump_printf_loc (MSG_NOTE, vect_location,
2052 "Alignment of access forced using peeling.\n");
2053 dump_printf_loc (MSG_NOTE, vect_location,
2054 "Peeling for alignment will be applied.\n");
2057 /* The inside-loop cost will be accounted for in vectorizable_load
2058 and vectorizable_store correctly with adjusted alignments.
2059 Drop the body_cst_vec on the floor here. */
2060 stat = vect_verify_datarefs_alignment (loop_vinfo);
2061 gcc_assert (stat);
2062 return stat;
2066 /* (2) Versioning to force alignment. */
2068 /* Try versioning if:
2069 1) the loop is optimized for speed, and
2070 2) there is at least one unsupported misaligned data ref with an unknown
2071 misalignment, and
2072 3) all misaligned data refs with a known misalignment are supported, and
2073 4) the number of runtime alignment checks is within reason. */
2075 do_versioning =
2076 optimize_loop_nest_for_speed_p (loop)
2077 && (!loop->inner); /* FORNOW */
2079 if (do_versioning)
2081 FOR_EACH_VEC_ELT (datarefs, i, dr)
2083 stmt = DR_STMT (dr);
2084 stmt_info = vinfo_for_stmt (stmt);
2086 /* For interleaving, only the alignment of the first access
2087 matters. */
2088 if (aligned_access_p (dr)
2089 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2090 && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
2091 continue;
2093 if (STMT_VINFO_STRIDED_P (stmt_info))
2095 /* Strided loads perform only component accesses, alignment is
2096 irrelevant for them. */
2097 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
2098 continue;
2099 do_versioning = false;
2100 break;
2103 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
2105 if (!supportable_dr_alignment)
2107 gimple *stmt;
2108 int mask;
2109 tree vectype;
2111 if (known_alignment_for_access_p (dr)
2112 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2113 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
2115 do_versioning = false;
2116 break;
2119 stmt = DR_STMT (dr);
2120 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
2121 gcc_assert (vectype);
2123 /* The rightmost bits of an aligned address must be zeros.
2124 Construct the mask needed for this test. For example,
2125 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2126 mask must be 15 = 0xf. */
2127 mask = GET_MODE_SIZE (TYPE_MODE (vectype)) - 1;
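/* Roughly speaking, the versioned loop guards itself with a runtime
   test of the form (addr & mask) == 0 for the potentially misaligned
   addresses; e.g. for V4SI that is (addr & 0xf) == 0 (illustrative
   sketch only, the actual check is emitted later when the loop is
   versioned).  */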
2129 /* FORNOW: use the same mask to test all potentially unaligned
2130 references in the loop. The vectorizer currently supports
2131 a single vector size, see the reference to
2132 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
2133 vectorization factor is computed. */
2134 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
2135 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
2136 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2137 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
2138 DR_STMT (dr));
2142 /* Versioning requires at least one misaligned data reference. */
2143 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2144 do_versioning = false;
2145 else if (!do_versioning)
2146 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2149 if (do_versioning)
2151 vec<gimple *> may_misalign_stmts
2152 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2153 gimple *stmt;
2155 /* It can now be assumed that the data references in the statements
2156 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2157 of the loop being vectorized. */
2158 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
2160 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2161 dr = STMT_VINFO_DATA_REF (stmt_info);
2162 SET_DR_MISALIGNMENT (dr, 0);
2163 if (dump_enabled_p ())
2164 dump_printf_loc (MSG_NOTE, vect_location,
2165 "Alignment of access forced using versioning.\n");
2168 if (dump_enabled_p ())
2169 dump_printf_loc (MSG_NOTE, vect_location,
2170 "Versioning for alignment will be applied.\n");
2172 /* Peeling and versioning can't be done together at this time. */
2173 gcc_assert (! (do_peeling && do_versioning));
2175 stat = vect_verify_datarefs_alignment (loop_vinfo);
2176 gcc_assert (stat);
2177 return stat;
2180 /* This point is reached if neither peeling nor versioning is being done. */
2181 gcc_assert (! (do_peeling || do_versioning));
2183 stat = vect_verify_datarefs_alignment (loop_vinfo);
2184 return stat;
2188 /* Function vect_find_same_alignment_drs.
2190 Update group and alignment relations according to the chosen
2191 vectorization factor. */
2193 static void
2194 vect_find_same_alignment_drs (struct data_dependence_relation *ddr)
2196 struct data_reference *dra = DDR_A (ddr);
2197 struct data_reference *drb = DDR_B (ddr);
2198 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2199 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2201 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
2202 return;
2204 if (dra == drb)
2205 return;
2207 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)
2208 || !operand_equal_p (DR_OFFSET (dra), DR_OFFSET (drb), 0)
2209 || !operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2210 return;
2212 /* Two references with distance zero have the same alignment. */
2213 offset_int diff = (wi::to_offset (DR_INIT (dra))
2214 - wi::to_offset (DR_INIT (drb)));
2215 if (diff != 0)
2217 /* Get the wider of the two alignments. */
2218 unsigned int align_a = (vect_calculate_target_alignment (dra)
2219 / BITS_PER_UNIT);
2220 unsigned int align_b = (vect_calculate_target_alignment (drb)
2221 / BITS_PER_UNIT);
2222 unsigned int max_align = MAX (align_a, align_b);
2224 /* Require the gap to be a multiple of the larger vector alignment. */
2225 if (!wi::multiple_of_p (diff, max_align, SIGNED))
2226 return;
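/* Illustration (hypothetical values): two accesses whose DR_INITs differ
   by 64 bytes, with target alignments of 16 and 32 bytes, have
   MAX_ALIGN == 32 and 64 % 32 == 0, so they are recorded below as having
   the same alignment; a 48-byte gap would not qualify.  */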
2229 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2230 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
2231 if (dump_enabled_p ())
2233 dump_printf_loc (MSG_NOTE, vect_location,
2234 "accesses have the same alignment: ");
2235 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2236 dump_printf (MSG_NOTE, " and ");
2237 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2238 dump_printf (MSG_NOTE, "\n");
2243 /* Function vect_analyze_data_refs_alignment
2245 Analyze the alignment of the data-references in the loop.
2246 Return FALSE if a data reference is found that cannot be vectorized. */
2248 bool
2249 vect_analyze_data_refs_alignment (loop_vec_info vinfo)
2251 if (dump_enabled_p ())
2252 dump_printf_loc (MSG_NOTE, vect_location,
2253 "=== vect_analyze_data_refs_alignment ===\n");
2255 /* Mark groups of data references with same alignment using
2256 data dependence information. */
2257 vec<ddr_p> ddrs = vinfo->ddrs;
2258 struct data_dependence_relation *ddr;
2259 unsigned int i;
2261 FOR_EACH_VEC_ELT (ddrs, i, ddr)
2262 vect_find_same_alignment_drs (ddr);
2264 vec<data_reference_p> datarefs = vinfo->datarefs;
2265 struct data_reference *dr;
2267 vect_record_base_alignments (vinfo);
2268 FOR_EACH_VEC_ELT (datarefs, i, dr)
2270 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
2271 if (STMT_VINFO_VECTORIZABLE (stmt_info)
2272 && !vect_compute_data_ref_alignment (dr))
2274 /* Strided accesses perform only component accesses, misalignment
2275 information is irrelevant for them. */
2276 if (STMT_VINFO_STRIDED_P (stmt_info)
2277 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2278 continue;
2280 if (dump_enabled_p ())
2281 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2282 "not vectorized: can't calculate alignment "
2283 "for data ref.\n");
2285 return false;
2289 return true;
2293 /* Analyze alignment of DRs of stmts in NODE. */
2295 static bool
2296 vect_slp_analyze_and_verify_node_alignment (slp_tree node)
2298 /* We vectorize from the first scalar stmt in the node unless
2299 the node is permuted in which case we start from the first
2300 element in the group. */
2301 gimple *first_stmt = SLP_TREE_SCALAR_STMTS (node)[0];
2302 data_reference_p first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2303 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2304 first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_stmt));
2306 data_reference_p dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2307 if (! vect_compute_data_ref_alignment (dr)
2308 /* For creating the data-ref pointer we need alignment of the
2309 first element anyway. */
2310 || (dr != first_dr
2311 && ! vect_compute_data_ref_alignment (first_dr))
2312 || ! verify_data_ref_alignment (dr))
2314 if (dump_enabled_p ())
2315 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2316 "not vectorized: bad data alignment in basic "
2317 "block.\n");
2318 return false;
2321 return true;
2324 /* Function vect_slp_analyze_instance_alignment
2326 Analyze the alignment of the data-references in the SLP instance.
2327 Return FALSE if a data reference is found that cannot be vectorized. */
2329 bool
2330 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
2332 if (dump_enabled_p ())
2333 dump_printf_loc (MSG_NOTE, vect_location,
2334 "=== vect_slp_analyze_and_verify_instance_alignment ===\n");
2336 slp_tree node;
2337 unsigned i;
2338 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2339 if (! vect_slp_analyze_and_verify_node_alignment (node))
2340 return false;
2342 node = SLP_INSTANCE_TREE (instance);
2343 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]))
2344 && ! vect_slp_analyze_and_verify_node_alignment
2345 (SLP_INSTANCE_TREE (instance)))
2346 return false;
2348 return true;
2352 /* Analyze groups of accesses: check that DR belongs to a group of
2353 accesses of legal size, step, etc. Detect gaps, single element
2354 interleaving, and other special cases. Set grouped access info.
2355 Collect groups of strided stores for further use in SLP analysis.
2356 Worker for vect_analyze_group_access. */
2358 static bool
2359 vect_analyze_group_access_1 (struct data_reference *dr)
2361 tree step = DR_STEP (dr);
2362 tree scalar_type = TREE_TYPE (DR_REF (dr));
2363 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2364 gimple *stmt = DR_STMT (dr);
2365 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2366 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2367 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2368 HOST_WIDE_INT dr_step = -1;
2369 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2370 bool slp_impossible = false;
2372 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2373 size of the interleaving group (including gaps). */
2374 if (tree_fits_shwi_p (step))
2376 dr_step = tree_to_shwi (step);
2377 /* Check that STEP is a multiple of type size. Otherwise there is
2378 a non-element-sized gap at the end of the group which we
2379 cannot represent in GROUP_GAP or GROUP_SIZE.
2380 ??? As we can handle non-constant step fine here we should
2381 simply remove uses of GROUP_GAP between the last and first
2382 element and instead rely on DR_STEP. GROUP_SIZE then would
2383 simply not include that gap. */
2384 if ((dr_step % type_size) != 0)
2386 if (dump_enabled_p ())
2388 dump_printf_loc (MSG_NOTE, vect_location,
2389 "Step ");
2390 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2391 dump_printf (MSG_NOTE,
2392 " is not a multiple of the element size for ");
2393 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2394 dump_printf (MSG_NOTE, "\n");
2396 return false;
2398 groupsize = absu_hwi (dr_step) / type_size;
2400 else
2401 groupsize = 0;
2403 /* A non-consecutive access is possible only if it is part of an interleaving group. */
2404 if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2406 /* Check if this DR is part of an interleaving group, and is the only
2407 element of the group that is accessed in the loop. */
2409 /* Gaps are supported only for loads. STEP must be a multiple of the type
2410 size. The size of the group must be a power of 2. */
2411 if (DR_IS_READ (dr)
2412 && (dr_step % type_size) == 0
2413 && groupsize > 0
2414 && pow2p_hwi (groupsize))
2416 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2417 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2418 GROUP_GAP (stmt_info) = groupsize - 1;
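/* For example (hypothetical loop): a load of a 4-byte element with
   DR_STEP == 16, such as a[4*i], gives GROUPSIZE == 16 / 4 == 4 and
   GROUP_GAP == 3, i.e. three unused elements per group.  */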
2419 if (dump_enabled_p ())
2421 dump_printf_loc (MSG_NOTE, vect_location,
2422 "Detected single element interleaving ");
2423 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2424 dump_printf (MSG_NOTE, " step ");
2425 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2426 dump_printf (MSG_NOTE, "\n");
2429 return true;
2432 if (dump_enabled_p ())
2434 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2435 "not consecutive access ");
2436 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2439 if (bb_vinfo)
2441 /* Mark the statement as unvectorizable. */
2442 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2443 return true;
2446 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2447 STMT_VINFO_STRIDED_P (stmt_info) = true;
2448 return true;
2451 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2453 /* First stmt in the interleaving chain. Check the chain. */
2454 gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2455 struct data_reference *data_ref = dr;
2456 unsigned int count = 1;
2457 tree prev_init = DR_INIT (data_ref);
2458 gimple *prev = stmt;
2459 HOST_WIDE_INT diff, gaps = 0;
2461 while (next)
2463 /* Skip identical data-refs. In case two or more stmts share the same
2464 data-ref (supported only for loads), we vectorize only the first
2465 stmt, and the rest get their vectorized loads from the first
2466 one. */
2467 if (!tree_int_cst_compare (DR_INIT (data_ref),
2468 DR_INIT (STMT_VINFO_DATA_REF (
2469 vinfo_for_stmt (next)))))
2471 if (DR_IS_WRITE (data_ref))
2473 if (dump_enabled_p ())
2474 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2475 "Two store stmts share the same dr.\n");
2476 return false;
2479 if (dump_enabled_p ())
2480 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2481 "Two or more load stmts share the same dr.\n");
2483 /* For load use the same data-ref load. */
2484 GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2486 prev = next;
2487 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2488 continue;
2491 prev = next;
2492 data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2494 /* All group members have the same STEP by construction. */
2495 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2497 /* Check that the distance between two accesses is equal to the type
2498 size. Otherwise, we have gaps. */
2499 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2500 - TREE_INT_CST_LOW (prev_init)) / type_size;
2501 if (diff != 1)
2503 /* FORNOW: SLP of accesses with gaps is not supported. */
2504 slp_impossible = true;
2505 if (DR_IS_WRITE (data_ref))
2507 if (dump_enabled_p ())
2508 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2509 "interleaved store with gaps\n");
2510 return false;
2513 gaps += diff - 1;
2516 last_accessed_element += diff;
2518 /* Store the gap from the previous member of the group. If there is no
2519 gap in the access, GROUP_GAP is always 1. */
2520 GROUP_GAP (vinfo_for_stmt (next)) = diff;
2522 prev_init = DR_INIT (data_ref);
2523 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2524 /* Count the number of data-refs in the chain. */
2525 count++;
2528 if (groupsize == 0)
2529 groupsize = count + gaps;
2531 /* This could be UINT_MAX but as we are generating code in a very
2532 inefficient way we have to cap earlier. See PR78699 for example. */
2533 if (groupsize > 4096)
2535 if (dump_enabled_p ())
2536 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2537 "group is too large\n");
2538 return false;
2541 /* Check that the size of the interleaving is equal to count for stores,
2542 i.e., that there are no gaps. */
2543 if (groupsize != count
2544 && !DR_IS_READ (dr))
2546 if (dump_enabled_p ())
2547 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2548 "interleaved store with gaps\n");
2549 return false;
2552 /* If there is a gap after the last load in the group it is the
2553 difference between the groupsize and the last accessed
2554 element.
2555 When there is no gap, this difference should be 0. */
2556 GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - last_accessed_element;
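/* Illustration (hypothetical group): loads of a[4*i] and a[4*i+1] with
   4-byte elements and DR_STEP == 16 give GROUPSIZE == 4 and
   LAST_ACCESSED_ELEMENT == 2, so a gap of 4 - 2 == 2 elements is
   recorded after the group.  */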
2558 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2559 if (dump_enabled_p ())
2561 dump_printf_loc (MSG_NOTE, vect_location,
2562 "Detected interleaving ");
2563 if (DR_IS_READ (dr))
2564 dump_printf (MSG_NOTE, "load ");
2565 else
2566 dump_printf (MSG_NOTE, "store ");
2567 dump_printf (MSG_NOTE, "of size %u starting with ",
2568 (unsigned)groupsize);
2569 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
2570 if (GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
2571 dump_printf_loc (MSG_NOTE, vect_location,
2572 "There is a gap of %u elements after the group\n",
2573 GROUP_GAP (vinfo_for_stmt (stmt)));
2576 /* SLP: create an SLP data structure for every interleaving group of
2577 stores for further analysis in vect_analyze_slp. */
2578 if (DR_IS_WRITE (dr) && !slp_impossible)
2580 if (loop_vinfo)
2581 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2582 if (bb_vinfo)
2583 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2587 return true;
2590 /* Analyze groups of accesses: check that DR belongs to a group of
2591 accesses of legal size, step, etc. Detect gaps, single element
2592 interleaving, and other special cases. Set grouped access info.
2593 Collect groups of strided stores for further use in SLP analysis. */
2595 static bool
2596 vect_analyze_group_access (struct data_reference *dr)
2598 if (!vect_analyze_group_access_1 (dr))
2600 /* Dissolve the group if present. */
2601 gimple *next;
2602 gimple *stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dr)));
2603 while (stmt)
2605 stmt_vec_info vinfo = vinfo_for_stmt (stmt);
2606 next = GROUP_NEXT_ELEMENT (vinfo);
2607 GROUP_FIRST_ELEMENT (vinfo) = NULL;
2608 GROUP_NEXT_ELEMENT (vinfo) = NULL;
2609 stmt = next;
2611 return false;
2613 return true;
2616 /* Analyze the access pattern of the data-reference DR.
2617 In case of non-consecutive accesses call vect_analyze_group_access() to
2618 analyze groups of accesses. */
2620 static bool
2621 vect_analyze_data_ref_access (struct data_reference *dr)
2623 tree step = DR_STEP (dr);
2624 tree scalar_type = TREE_TYPE (DR_REF (dr));
2625 gimple *stmt = DR_STMT (dr);
2626 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2627 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2628 struct loop *loop = NULL;
2630 if (loop_vinfo)
2631 loop = LOOP_VINFO_LOOP (loop_vinfo);
2633 if (loop_vinfo && !step)
2635 if (dump_enabled_p ())
2636 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2637 "bad data-ref access in loop\n");
2638 return false;
2641 /* Allow loads with zero step in inner-loop vectorization. */
2642 if (loop_vinfo && integer_zerop (step))
2644 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2645 if (!nested_in_vect_loop_p (loop, stmt))
2646 return DR_IS_READ (dr);
2647 /* Allow references with zero step only for outer loops marked
2648 with pragma omp simd - it guarantees the absence of
2649 loop-carried dependencies between inner loop iterations. */
2650 if (!loop->force_vectorize)
2652 if (dump_enabled_p ())
2653 dump_printf_loc (MSG_NOTE, vect_location,
2654 "zero step in inner loop of nest\n");
2655 return false;
2659 if (loop && nested_in_vect_loop_p (loop, stmt))
2661 /* Interleaved accesses are not yet supported within outer-loop
2662 vectorization for references in the inner-loop. */
2663 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2665 /* For the rest of the analysis we use the outer-loop step. */
2666 step = STMT_VINFO_DR_STEP (stmt_info);
2667 if (integer_zerop (step))
2669 if (dump_enabled_p ())
2670 dump_printf_loc (MSG_NOTE, vect_location,
2671 "zero step in outer loop.\n");
2672 return DR_IS_READ (dr);
2676 /* Consecutive? */
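/* E.g. (hypothetical): a load of a 4-byte int a[i] has DR_STEP == 4,
   which equals TYPE_SIZE_UNIT of the scalar type, so the access is
   consecutive and no interleaving group is needed.  */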
2677 if (TREE_CODE (step) == INTEGER_CST)
2679 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2680 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2681 || (dr_step < 0
2682 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2684 /* Mark that it is not interleaving. */
2685 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2686 return true;
2690 if (loop && nested_in_vect_loop_p (loop, stmt))
2692 if (dump_enabled_p ())
2693 dump_printf_loc (MSG_NOTE, vect_location,
2694 "grouped access in outer loop.\n");
2695 return false;
2699 /* Assume this is a DR handled by non-constant strided load case. */
2700 if (TREE_CODE (step) != INTEGER_CST)
2701 return (STMT_VINFO_STRIDED_P (stmt_info)
2702 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2703 || vect_analyze_group_access (dr)));
2705 /* Non-consecutive access - check whether it is part of an interleaving group. */
2706 return vect_analyze_group_access (dr);
2709 /* Compare two data-references DRA and DRB so that they can be sorted
2710 into chunks suitable for grouping. */
2712 static int
2713 dr_group_sort_cmp (const void *dra_, const void *drb_)
2715 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2716 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2717 int cmp;
2719 /* Stabilize sort. */
2720 if (dra == drb)
2721 return 0;
2723 /* DRs in different loops never belong to the same group. */
2724 loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father;
2725 loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father;
2726 if (loopa != loopb)
2727 return loopa->num < loopb->num ? -1 : 1;
2729 /* Ordering of DRs according to base. */
2730 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0))
2732 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2733 DR_BASE_ADDRESS (drb));
2734 if (cmp != 0)
2735 return cmp;
2738 /* And according to DR_OFFSET. */
2739 if (!dr_equal_offsets_p (dra, drb))
2741 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2742 if (cmp != 0)
2743 return cmp;
2746 /* Put reads before writes. */
2747 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2748 return DR_IS_READ (dra) ? -1 : 1;
2751 /* Then sort by access size. */
2751 if (!operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2752 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))), 0))
2754 cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2755 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2756 if (cmp != 0)
2757 return cmp;
2760 /* And then by step. */
2761 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2763 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2764 if (cmp != 0)
2765 return cmp;
2768 /* Then sort by DR_INIT. In case of identical DRs, sort by stmt UID. */
2769 cmp = tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb));
2770 if (cmp == 0)
2771 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2772 return cmp;
2775 /* Function vect_analyze_data_ref_accesses.
2777 Analyze the access pattern of all the data references in the loop.
2779 FORNOW: the only access pattern that is considered vectorizable is a
2780 simple step 1 (consecutive) access.
2782 FORNOW: handle only arrays and pointer accesses. */
2784 bool
2785 vect_analyze_data_ref_accesses (vec_info *vinfo)
2787 unsigned int i;
2788 vec<data_reference_p> datarefs = vinfo->datarefs;
2789 struct data_reference *dr;
2791 if (dump_enabled_p ())
2792 dump_printf_loc (MSG_NOTE, vect_location,
2793 "=== vect_analyze_data_ref_accesses ===\n");
2795 if (datarefs.is_empty ())
2796 return true;
2798 /* Sort the array of datarefs to make building the interleaving chains
2799 linear. Don't modify the original vector's order; it is needed for
2800 determining what dependencies are reversed. */
2801 vec<data_reference_p> datarefs_copy = datarefs.copy ();
2802 datarefs_copy.qsort (dr_group_sort_cmp);
2804 /* Build the interleaving chains. */
2805 for (i = 0; i < datarefs_copy.length () - 1;)
2807 data_reference_p dra = datarefs_copy[i];
2808 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2809 stmt_vec_info lastinfo = NULL;
2810 if (! STMT_VINFO_VECTORIZABLE (stmtinfo_a))
2812 ++i;
2813 continue;
2815 for (i = i + 1; i < datarefs_copy.length (); ++i)
2817 data_reference_p drb = datarefs_copy[i];
2818 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2819 if (! STMT_VINFO_VECTORIZABLE (stmtinfo_b))
2820 break;
2822 /* ??? Imperfect sorting (non-compatible types, non-modulo
2823 accesses, same accesses) can lead to a group being artificially
2824 split here as we don't just skip over those. If it really
2825 matters we can push those to a worklist and re-iterate
2826 over them. Then we can just skip ahead to the next DR here. */
2828 /* DRs in a different loop should not be put into the same
2829 interleaving group. */
2830 if (gimple_bb (DR_STMT (dra))->loop_father
2831 != gimple_bb (DR_STMT (drb))->loop_father)
2832 break;
2834 /* Check that the data-refs have the same first location (except init)
2835 and that they are both either stores or loads (not one load and one
2836 store, and not masked loads or stores). */
2837 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2838 || !operand_equal_p (DR_BASE_ADDRESS (dra),
2839 DR_BASE_ADDRESS (drb), 0)
2840 || !dr_equal_offsets_p (dra, drb)
2841 || !gimple_assign_single_p (DR_STMT (dra))
2842 || !gimple_assign_single_p (DR_STMT (drb)))
2843 break;
2845 /* Check that the data-refs have the same constant size. */
2846 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2847 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2848 if (!tree_fits_uhwi_p (sza)
2849 || !tree_fits_uhwi_p (szb)
2850 || !tree_int_cst_equal (sza, szb))
2851 break;
2853 /* Check that the data-refs have the same step. */
2854 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2855 break;
2857 /* Do not place the same access in the interleaving chain twice. */
2858 if (tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb)) == 0)
2859 break;
2861 /* Check the types are compatible.
2862 ??? We don't distinguish this during sorting. */
2863 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2864 TREE_TYPE (DR_REF (drb))))
2865 break;
2867 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
2868 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2869 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2870 gcc_assert (init_a <= init_b);
2872 /* If init_b == init_a + the size of the type * k, we have an
2873 interleaving, and DRA is accessed before DRB. */
2874 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
2875 if (type_size_a == 0
2876 || (init_b - init_a) % type_size_a != 0)
2877 break;
2879 /* For stores, require that the accesses be adjacent. This splits
2880 groups into chunks we support (we don't support vectorization
2881 of stores with gaps). */
2882 if (!DR_IS_READ (dra)
2883 && (init_b - (HOST_WIDE_INT) TREE_INT_CST_LOW
2884 (DR_INIT (datarefs_copy[i-1]))
2885 != type_size_a))
2886 break;
2888 /* If the step (when constant and nonzero) is less than or equal to the
2889 difference between the data-refs' inits, DRB cannot belong to DRA's
2890 group; this check splits groups into suitable sizes. */
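/* Illustration (hypothetical): for int accesses a[i] and a[i+1] with
   DR_STEP == 4, INIT_B - INIT_A == 4 and 4 <= 4, so DRB is not added
   to DRA's group; each remains an ordinary consecutive access.  */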
2891 if (tree_fits_shwi_p (DR_STEP (dra)))
2893 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
2894 if (step != 0 && step <= (init_b - init_a))
2895 break;
2898 if (dump_enabled_p ())
2900 dump_printf_loc (MSG_NOTE, vect_location,
2901 "Detected interleaving ");
2902 if (DR_IS_READ (dra))
2903 dump_printf (MSG_NOTE, "load ");
2904 else
2905 dump_printf (MSG_NOTE, "store ");
2906 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2907 dump_printf (MSG_NOTE, " and ");
2908 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2909 dump_printf (MSG_NOTE, "\n");
2912 /* Link the found element into the group list. */
2913 if (!GROUP_FIRST_ELEMENT (stmtinfo_a))
2915 GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
2916 lastinfo = stmtinfo_a;
2918 GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
2919 GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
2920 lastinfo = stmtinfo_b;
2924 FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
2925 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
2926 && !vect_analyze_data_ref_access (dr))
2928 if (dump_enabled_p ())
2929 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2930 "not vectorized: complicated access pattern.\n");
2932 if (is_a <bb_vec_info> (vinfo))
2934 /* Mark the statement as not vectorizable. */
2935 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2936 continue;
2938 else
2940 datarefs_copy.release ();
2941 return false;
2945 datarefs_copy.release ();
2946 return true;
2949 /* Function vect_vfa_segment_size.
2951 Create an expression that computes the size of the segment
2952 that will be accessed for a data reference. The function takes into
2953 account that realignment loads may access one more vector.
2955 Input:
2956 DR: The data reference.
2957 LENGTH_FACTOR: segment length to consider.
2959 Return an expression whose value is the size of segment which will be
2960 accessed by DR. */
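/* As an illustration (hypothetical values): for a 4-byte access with
   DR_STEP == 4 and LENGTH_FACTOR == 4 (the vectorization factor), the
   computed segment length is 16 bytes, plus one extra vector size if
   the access uses a dr_explicit_realign_optimized scheme.  */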
2962 static tree
2963 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
2965 tree segment_length;
2967 if (integer_zerop (DR_STEP (dr)))
2968 segment_length = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
2969 else
2970 segment_length = size_binop (MULT_EXPR,
2971 fold_convert (sizetype, DR_STEP (dr)),
2972 fold_convert (sizetype, length_factor));
2974 if (vect_supportable_dr_alignment (dr, false)
2975 == dr_explicit_realign_optimized)
2977 tree vector_size = TYPE_SIZE_UNIT
2978 (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))));
2980 segment_length = size_binop (PLUS_EXPR, segment_length, vector_size);
2982 return segment_length;
2985 /* Function vect_no_alias_p.
2987 Given data references A and B with equal base and offset, the alias
2988 relation can be decided at compilation time. Return TRUE if they do
2989 not alias each other; return FALSE otherwise. SEGMENT_LENGTH_A
2990 and SEGMENT_LENGTH_B are the memory lengths accessed by A and B
2991 respectively. */
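/* For example (hypothetical, positive steps): with DR_INIT (a) == 0,
   SEGMENT_LENGTH_A == 16, DR_INIT (b) == 16 and SEGMENT_LENGTH_B == 16,
   segment A is [0, 16) and segment B is [16, 32); the segments do not
   overlap, so the references are known not to alias.  */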
2993 static bool
2994 vect_no_alias_p (struct data_reference *a, struct data_reference *b,
2995 tree segment_length_a, tree segment_length_b)
2997 gcc_assert (TREE_CODE (DR_INIT (a)) == INTEGER_CST
2998 && TREE_CODE (DR_INIT (b)) == INTEGER_CST);
2999 if (tree_int_cst_equal (DR_INIT (a), DR_INIT (b)))
3000 return false;
3002 tree seg_a_min = DR_INIT (a);
3003 tree seg_a_max = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_a_min),
3004 seg_a_min, segment_length_a);
3005 /* For a negative step, we need to adjust the address range by TYPE_SIZE_UNIT
3006 bytes, e.g., for int accesses running from a[3] down to a[1] the range
3007 is [a+4, a+16) instead of [a, a+12). */
3008 if (tree_int_cst_compare (DR_STEP (a), size_zero_node) < 0)
3010 tree unit_size = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (a)));
3011 seg_a_min = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_a_max),
3012 seg_a_max, unit_size);
3013 seg_a_max = fold_build2 (PLUS_EXPR, TREE_TYPE (DR_INIT (a)),
3014 DR_INIT (a), unit_size);
3016 tree seg_b_min = DR_INIT (b);
3017 tree seg_b_max = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_b_min),
3018 seg_b_min, segment_length_b);
3019 if (tree_int_cst_compare (DR_STEP (b), size_zero_node) < 0)
3021 tree unit_size = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (b)));
3022 seg_b_min = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_b_max),
3023 seg_b_max, unit_size);
3024 seg_b_max = fold_build2 (PLUS_EXPR, TREE_TYPE (DR_INIT (b)),
3025 DR_INIT (b), unit_size);
3028 if (tree_int_cst_le (seg_a_max, seg_b_min)
3029 || tree_int_cst_le (seg_b_max, seg_a_min))
3030 return true;
3032 return false;
3035 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3036 in DDR is >= VF. */
3038 static bool
3039 dependence_distance_ge_vf (data_dependence_relation *ddr,
3040 unsigned int loop_depth, unsigned HOST_WIDE_INT vf)
3042 if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3043 || DDR_NUM_DIST_VECTS (ddr) == 0)
3044 return false;
3046 /* If the dependence is exact, we should have limited the VF instead. */
3047 gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3049 unsigned int i;
3050 lambda_vector dist_v;
3051 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3053 HOST_WIDE_INT dist = dist_v[loop_depth];
3054 if (dist != 0
3055 && !(dist > 0 && DDR_REVERSED_P (ddr))
3056 && (unsigned HOST_WIDE_INT) abs_hwi (dist) < vf)
3057 return false;
3060 if (dump_enabled_p ())
3062 dump_printf_loc (MSG_NOTE, vect_location,
3063 "dependence distance between ");
3064 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
3065 dump_printf (MSG_NOTE, " and ");
3066 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
3067 dump_printf (MSG_NOTE, " is >= VF\n");
3070 return true;
3073 /* Function vect_prune_runtime_alias_test_list.
3075 Prune a list of ddrs to be tested at run-time by versioning for alias.
3076 Merge several alias checks into one if possible.
3077 Return FALSE if the resulting list of ddrs is longer than allowed by
3078 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
3080 bool
3081 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3083 typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3084 hash_set <tree_pair_hash> compared_objects;
3086 vec<ddr_p> may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3087 vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3088 = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3089 vec<vec_object_pair> &check_unequal_addrs
3090 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3091 int vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3092 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3094 ddr_p ddr;
3095 unsigned int i;
3096 tree length_factor;
3098 if (dump_enabled_p ())
3099 dump_printf_loc (MSG_NOTE, vect_location,
3100 "=== vect_prune_runtime_alias_test_list ===\n");
3102 if (may_alias_ddrs.is_empty ())
3103 return true;
3105 comp_alias_ddrs.create (may_alias_ddrs.length ());
3107 unsigned int loop_depth
3108 = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3109 LOOP_VINFO_LOOP_NEST (loop_vinfo));
3111 /* First, we collect all data ref pairs for aliasing checks. */
3112 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3114 int comp_res;
3115 struct data_reference *dr_a, *dr_b;
3116 gimple *dr_group_first_a, *dr_group_first_b;
3117 tree segment_length_a, segment_length_b;
3118 gimple *stmt_a, *stmt_b;
3120 /* Ignore the alias if the VF we chose ended up being no greater
3121 than the dependence distance. */
3122 if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3123 continue;
3125 if (DDR_OBJECT_A (ddr))
3127 vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3128 if (!compared_objects.add (new_pair))
3130 if (dump_enabled_p ())
3132 dump_printf_loc (MSG_NOTE, vect_location, "checking that ");
3133 dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.first);
3134 dump_printf (MSG_NOTE, " and ");
3135 dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.second);
3136 dump_printf (MSG_NOTE, " have different addresses\n");
3138 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3140 continue;
3143 dr_a = DDR_A (ddr);
3144 stmt_a = DR_STMT (DDR_A (ddr));
3145 dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
3146 if (dr_group_first_a)
3148 stmt_a = dr_group_first_a;
3149 dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
3152 dr_b = DDR_B (ddr);
3153 stmt_b = DR_STMT (DDR_B (ddr));
3154 dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
3155 if (dr_group_first_b)
3157 stmt_b = dr_group_first_b;
3158 dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
3161 if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
3162 length_factor = scalar_loop_iters;
3163 else
3164 length_factor = size_int (vect_factor);
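/* For instance (hypothetical): if both DRs advance by 4 bytes per
   iteration and the vectorization factor is 4, each segment is
   4 * 4 == 16 bytes; with unequal steps the whole scalar iteration
   count is used instead, intuitively because the distance between the
   two access streams then changes from iteration to iteration.  */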
3165 segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
3166 segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
3168 comp_res = data_ref_compare_tree (DR_BASE_ADDRESS (dr_a),
3169 DR_BASE_ADDRESS (dr_b));
3170 if (comp_res == 0)
3171 comp_res = data_ref_compare_tree (DR_OFFSET (dr_a),
3172 DR_OFFSET (dr_b));
3174 /* Alias is known at compilation time. */
3175 if (comp_res == 0
3176 && TREE_CODE (DR_STEP (dr_a)) == INTEGER_CST
3177 && TREE_CODE (DR_STEP (dr_b)) == INTEGER_CST
3178 && TREE_CODE (segment_length_a) == INTEGER_CST
3179 && TREE_CODE (segment_length_b) == INTEGER_CST)
3181 if (vect_no_alias_p (dr_a, dr_b, segment_length_a, segment_length_b))
3182 continue;
3184 if (dump_enabled_p ())
3185 dump_printf_loc (MSG_NOTE, vect_location,
3186 "not vectorized: compilation time alias.\n");
3188 return false;
3191 dr_with_seg_len_pair_t dr_with_seg_len_pair
3192 (dr_with_seg_len (dr_a, segment_length_a),
3193 dr_with_seg_len (dr_b, segment_length_b));
3195 /* Canonicalize pairs by sorting the two DR members. */
3196 if (comp_res > 0)
3197 std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
3199 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3202 prune_runtime_alias_test_list (&comp_alias_ddrs,
3203 (unsigned HOST_WIDE_INT) vect_factor);
3205 unsigned int count = (comp_alias_ddrs.length ()
3206 + check_unequal_addrs.length ());
3207 dump_printf_loc (MSG_NOTE, vect_location,
3208 "improved number of alias checks from %d to %d\n",
3209 may_alias_ddrs.length (), count);
3210 if ((int) count > PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
3212 if (dump_enabled_p ())
3213 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3214 "number of versioning for alias "
3215 "run-time tests exceeds %d "
3216 "(--param vect-max-version-for-alias-checks)\n",
3217 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
3218 return false;
3221 return true;
3224 /* Return true if a non-affine read or write in STMT is suitable for a
3225 gather load or scatter store. Describe the operation in *INFO if so. */
3227 bool
3228 vect_check_gather_scatter (gimple *stmt, loop_vec_info loop_vinfo,
3229 gather_scatter_info *info)
3231 HOST_WIDE_INT scale = 1, pbitpos, pbitsize;
3232 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3233 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3234 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3235 tree offtype = NULL_TREE;
3236 tree decl, base, off;
3237 machine_mode pmode;
3238 int punsignedp, reversep, pvolatilep = 0;
3240 base = DR_REF (dr);
3241 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF;
3242 see if we can use the def stmt of the address. */
3243 if (is_gimple_call (stmt)
3244 && gimple_call_internal_p (stmt)
3245 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
3246 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
3247 && TREE_CODE (base) == MEM_REF
3248 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3249 && integer_zerop (TREE_OPERAND (base, 1))
3250 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3252 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3253 if (is_gimple_assign (def_stmt)
3254 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3255 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3258 /* The gather and scatter builtins need an address of the form
3259 loop_invariant + vector * {1, 2, 4, 8}, or
3261 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3262 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3263 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3264 multiplications and additions in it. To get a vector, we need
3265 a single SSA_NAME that will be defined in the loop and will
3266 contain everything that is not loop invariant and that can be
3267 vectorized. The following code attempts to find such a preexisting
3268 SSA_NAME OFF and put the loop invariants into a tree BASE
3269 that can be gimplified before the loop. */
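/* A hypothetical example of this decomposition: for a gather-style load
   such as x[ind[i]], where x is a loop-invariant array of 8-byte
   elements and ind[i] is an int loaded in the loop, the code below ends
   up (roughly) with BASE being the invariant address of x, OFF being
   the SSA_NAME holding ind[i], and SCALE == 8.  */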
3270 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3271 &punsignedp, &reversep, &pvolatilep);
3272 gcc_assert (base && (pbitpos % BITS_PER_UNIT) == 0 && !reversep);
3274 if (TREE_CODE (base) == MEM_REF)
3276 if (!integer_zerop (TREE_OPERAND (base, 1)))
3278 if (off == NULL_TREE)
3280 offset_int moff = mem_ref_offset (base);
3281 off = wide_int_to_tree (sizetype, moff);
3283 else
3284 off = size_binop (PLUS_EXPR, off,
3285 fold_convert (sizetype, TREE_OPERAND (base, 1)));
3287 base = TREE_OPERAND (base, 0);
3289 else
3290 base = build_fold_addr_expr (base);
3292 if (off == NULL_TREE)
3293 off = size_zero_node;
3295 /* If base is not loop invariant then, if off is 0, we start with just
3296 the constant offset in the loop invariant BASE and continue with base
3297 as OFF; otherwise we give up.
3298 We could handle that case by gimplifying the addition of base + off
3299 into some SSA_NAME and using that as off, but for now we punt. */
3300 if (!expr_invariant_in_loop_p (loop, base))
3302 if (!integer_zerop (off))
3303 return false;
3304 off = base;
3305 base = size_int (pbitpos / BITS_PER_UNIT);
3307 /* Otherwise put base + constant offset into the loop invariant BASE
3308 and continue with OFF. */
3309 else
3311 base = fold_convert (sizetype, base);
3312 base = size_binop (PLUS_EXPR, base, size_int (pbitpos / BITS_PER_UNIT));
3315 /* OFF at this point may be either a SSA_NAME or some tree expression
3316 from get_inner_reference. Try to peel off loop invariants from it
3317 into BASE as long as possible. */
3318 STRIP_NOPS (off);
3319 while (offtype == NULL_TREE)
3321 enum tree_code code;
3322 tree op0, op1, add = NULL_TREE;
3324 if (TREE_CODE (off) == SSA_NAME)
3326 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3328 if (expr_invariant_in_loop_p (loop, off))
3329 return false;
3331 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3332 break;
3334 op0 = gimple_assign_rhs1 (def_stmt);
3335 code = gimple_assign_rhs_code (def_stmt);
3336 op1 = gimple_assign_rhs2 (def_stmt);
3338 else
3340 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3341 return false;
3342 code = TREE_CODE (off);
3343 extract_ops_from_tree (off, &code, &op0, &op1);
3345 switch (code)
3347 case POINTER_PLUS_EXPR:
3348 case PLUS_EXPR:
3349 if (expr_invariant_in_loop_p (loop, op0))
3351 add = op0;
3352 off = op1;
3353 do_add:
3354 add = fold_convert (sizetype, add);
3355 if (scale != 1)
3356 add = size_binop (MULT_EXPR, add, size_int (scale));
3357 base = size_binop (PLUS_EXPR, base, add);
3358 continue;
3360 if (expr_invariant_in_loop_p (loop, op1))
3362 add = op1;
3363 off = op0;
3364 goto do_add;
3366 break;
3367 case MINUS_EXPR:
3368 if (expr_invariant_in_loop_p (loop, op1))
3370 add = fold_convert (sizetype, op1);
3371 add = size_binop (MINUS_EXPR, size_zero_node, add);
3372 off = op0;
3373 goto do_add;
3375 break;
3376 case MULT_EXPR:
3377 if (scale == 1 && tree_fits_shwi_p (op1))
3379 scale = tree_to_shwi (op1);
3380 off = op0;
3381 continue;
3383 break;
3384 case SSA_NAME:
3385 off = op0;
3386 continue;
3387 CASE_CONVERT:
3388 if (!POINTER_TYPE_P (TREE_TYPE (op0))
3389 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3390 break;
3391 if (TYPE_PRECISION (TREE_TYPE (op0))
3392 == TYPE_PRECISION (TREE_TYPE (off)))
3394 off = op0;
3395 continue;
3397 if (TYPE_PRECISION (TREE_TYPE (op0))
3398 < TYPE_PRECISION (TREE_TYPE (off)))
3400 off = op0;
3401 offtype = TREE_TYPE (off);
3402 STRIP_NOPS (off);
3403 continue;
3405 break;
3406 default:
3407 break;
3409 break;
3412 /* If at the end OFF still isn't a SSA_NAME or isn't
3413 defined in the loop, punt. */
3414 if (TREE_CODE (off) != SSA_NAME
3415 || expr_invariant_in_loop_p (loop, off))
3416 return false;
3418 if (offtype == NULL_TREE)
3419 offtype = TREE_TYPE (off);
3421 if (DR_IS_READ (dr))
3422 decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info),
3423 offtype, scale);
3424 else
3425 decl = targetm.vectorize.builtin_scatter (STMT_VINFO_VECTYPE (stmt_info),
3426 offtype, scale);
3428 if (decl == NULL_TREE)
3429 return false;
3431 info->decl = decl;
3432 info->base = base;
3433 info->offset = off;
3434 info->offset_dt = vect_unknown_def_type;
3435 info->offset_vectype = NULL_TREE;
3436 info->scale = scale;
3437 return true;
3440 /* Function vect_analyze_data_refs.
3442 Find all the data references in the loop or basic block.
3444 The general structure of the analysis of data refs in the vectorizer is as
3445 follows:
3446 1- vect_analyze_data_refs(loop/bb): call
3447 compute_data_dependences_for_loop/bb to find and analyze all data-refs
3448 in the loop/bb and their dependences.
3449 2- vect_analyze_dependences(): apply dependence testing using ddrs.
3450 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
3451 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
3455 bool
3456 vect_analyze_data_refs (vec_info *vinfo, int *min_vf)
3458 struct loop *loop = NULL;
3459 unsigned int i;
3460 struct data_reference *dr;
3461 tree scalar_type;
3463 if (dump_enabled_p ())
3464 dump_printf_loc (MSG_NOTE, vect_location,
3465 "=== vect_analyze_data_refs ===\n");
3467 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3468 loop = LOOP_VINFO_LOOP (loop_vinfo);
3470 /* Go through the data-refs, check that the analysis succeeded. Update
3471 pointer from stmt_vec_info struct to DR and vectype. */
3473 vec<data_reference_p> datarefs = vinfo->datarefs;
3474 FOR_EACH_VEC_ELT (datarefs, i, dr)
3476 gimple *stmt;
3477 stmt_vec_info stmt_info;
3478 tree base, offset, init;
3479 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
3480 bool simd_lane_access = false;
3481 int vf;
3483 again:
3484 if (!dr || !DR_REF (dr))
3486 if (dump_enabled_p ())
3487 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3488 "not vectorized: unhandled data-ref\n");
3489 return false;
3492 stmt = DR_STMT (dr);
3493 stmt_info = vinfo_for_stmt (stmt);
3495 /* Discard clobbers from the dataref vector. We will remove
3496 clobber stmts during vectorization. */
3497 if (gimple_clobber_p (stmt))
3499 free_data_ref (dr);
3500 if (i == datarefs.length () - 1)
3502 datarefs.pop ();
3503 break;
3505 datarefs.ordered_remove (i);
3506 dr = datarefs[i];
3507 goto again;
3510 /* Check that analysis of the data-ref succeeded. */
3511 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
3512 || !DR_STEP (dr))
3514 bool maybe_gather
3515 = DR_IS_READ (dr)
3516 && !TREE_THIS_VOLATILE (DR_REF (dr))
3517 && targetm.vectorize.builtin_gather != NULL;
3518 bool maybe_scatter
3519 = DR_IS_WRITE (dr)
3520 && !TREE_THIS_VOLATILE (DR_REF (dr))
3521 && targetm.vectorize.builtin_scatter != NULL;
3522 bool maybe_simd_lane_access
3523 = is_a <loop_vec_info> (vinfo) && loop->simduid;
3525 /* If the target supports vector gather loads or scatter stores, or if
3526 this might be a SIMD lane access, see whether they can be used. */
3527 if (is_a <loop_vec_info> (vinfo)
3528 && (maybe_gather || maybe_scatter || maybe_simd_lane_access)
3529 && !nested_in_vect_loop_p (loop, stmt))
3531 struct data_reference *newdr
3532 = create_data_ref (NULL, loop_containing_stmt (stmt),
3533 DR_REF (dr), stmt, !maybe_scatter,
3534 DR_IS_CONDITIONAL_IN_STMT (dr));
3535 gcc_assert (newdr != NULL && DR_REF (newdr));
3536 if (DR_BASE_ADDRESS (newdr)
3537 && DR_OFFSET (newdr)
3538 && DR_INIT (newdr)
3539 && DR_STEP (newdr)
3540 && integer_zerop (DR_STEP (newdr)))
3542 if (maybe_simd_lane_access)
3544 tree off = DR_OFFSET (newdr);
3545 STRIP_NOPS (off);
3546 if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
3547 && TREE_CODE (off) == MULT_EXPR
3548 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
3550 tree step = TREE_OPERAND (off, 1);
3551 off = TREE_OPERAND (off, 0);
3552 STRIP_NOPS (off);
3553 if (CONVERT_EXPR_P (off)
3554 && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
3555 0)))
3556 < TYPE_PRECISION (TREE_TYPE (off)))
3557 off = TREE_OPERAND (off, 0);
3558 if (TREE_CODE (off) == SSA_NAME)
3560 gimple *def = SSA_NAME_DEF_STMT (off);
3561 tree reft = TREE_TYPE (DR_REF (newdr));
3562 if (is_gimple_call (def)
3563 && gimple_call_internal_p (def)
3564 && (gimple_call_internal_fn (def)
3565 == IFN_GOMP_SIMD_LANE))
3567 tree arg = gimple_call_arg (def, 0);
3568 gcc_assert (TREE_CODE (arg) == SSA_NAME);
3569 arg = SSA_NAME_VAR (arg);
3570 if (arg == loop->simduid
3571 /* For now. */
3572 && tree_int_cst_equal
3573 (TYPE_SIZE_UNIT (reft),
3574 step))
3576 DR_OFFSET (newdr) = ssize_int (0);
3577 DR_STEP (newdr) = step;
3578 DR_OFFSET_ALIGNMENT (newdr)
3579 = BIGGEST_ALIGNMENT;
3580 DR_STEP_ALIGNMENT (newdr)
3581 = highest_pow2_factor (step);
3582 dr = newdr;
3583 simd_lane_access = true;
3589 if (!simd_lane_access && (maybe_gather || maybe_scatter))
3591 dr = newdr;
3592 if (maybe_gather)
3593 gatherscatter = GATHER;
3594 else
3595 gatherscatter = SCATTER;
3598 if (gatherscatter == SG_NONE && !simd_lane_access)
3599 free_data_ref (newdr);
3602 if (gatherscatter == SG_NONE && !simd_lane_access)
3604 if (dump_enabled_p ())
3606 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3607 "not vectorized: data ref analysis "
3608 "failed ");
3609 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3612 if (is_a <bb_vec_info> (vinfo))
3613 break;
3615 return false;
3619 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
3621 if (dump_enabled_p ())
3622 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3623 "not vectorized: base addr of dr is a "
3624 "constant\n");
3626 if (is_a <bb_vec_info> (vinfo))
3627 break;
3629 if (gatherscatter != SG_NONE || simd_lane_access)
3630 free_data_ref (dr);
3631 return false;
3634 if (TREE_THIS_VOLATILE (DR_REF (dr)))
3636 if (dump_enabled_p ())
3638 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3639 "not vectorized: volatile type ");
3640 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3643 if (is_a <bb_vec_info> (vinfo))
3644 break;
3646 return false;
3649 if (stmt_can_throw_internal (stmt))
3651 if (dump_enabled_p ())
3653 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3654 "not vectorized: statement can throw an "
3655 "exception ");
3656 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3659 if (is_a <bb_vec_info> (vinfo))
3660 break;
3662 if (gatherscatter != SG_NONE || simd_lane_access)
3663 free_data_ref (dr);
3664 return false;
3667 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
3668 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
3670 if (dump_enabled_p ())
3672 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3673 "not vectorized: statement is bitfield "
3674 "access ");
3675 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3678 if (is_a <bb_vec_info> (vinfo))
3679 break;
3681 if (gatherscatter != SG_NONE || simd_lane_access)
3682 free_data_ref (dr);
3683 return false;
3686 base = unshare_expr (DR_BASE_ADDRESS (dr));
3687 offset = unshare_expr (DR_OFFSET (dr));
3688 init = unshare_expr (DR_INIT (dr));
3690 if (is_gimple_call (stmt)
3691 && (!gimple_call_internal_p (stmt)
3692 || (gimple_call_internal_fn (stmt) != IFN_MASK_LOAD
3693 && gimple_call_internal_fn (stmt) != IFN_MASK_STORE)))
3695 if (dump_enabled_p ())
3697 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3698 "not vectorized: dr in a call ");
3699 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3702 if (is_a <bb_vec_info> (vinfo))
3703 break;
3705 if (gatherscatter != SG_NONE || simd_lane_access)
3706 free_data_ref (dr);
3707 return false;
3710 /* Update DR field in stmt_vec_info struct. */
3712 /* If the dataref is in an inner-loop of the loop that is considered
3713 for vectorization, we also want to analyze the access relative to
3714 the outer-loop (DR contains information only relative to the
3715 inner-most enclosing loop). We do that by building a reference to the
3716 first location accessed by the inner-loop, and analyze it relative to
3717 the outer-loop. */
3718 if (loop && nested_in_vect_loop_p (loop, stmt))
3720 /* Build a reference to the first location accessed by the
3721 inner loop: *(BASE + INIT + OFFSET). By construction,
3722 this address must be invariant in the inner loop, so we
3723 can consider it as being used in the outer loop. */
3724 tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
3725 init, offset);
3726 tree init_addr = fold_build_pointer_plus (base, init_offset);
3727 tree init_ref = build_fold_indirect_ref (init_addr);
3729 if (dump_enabled_p ())
3731 dump_printf_loc (MSG_NOTE, vect_location,
3732 "analyze in outer loop: ");
3733 dump_generic_expr (MSG_NOTE, TDF_SLIM, init_ref);
3734 dump_printf (MSG_NOTE, "\n");
3737 if (!dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
3738 init_ref, loop))
3739 /* dr_analyze_innermost already explained the failure. */
3740 return false;
3742 if (dump_enabled_p ())
3744 dump_printf_loc (MSG_NOTE, vect_location,
3745 "\touter base_address: ");
3746 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3747 STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3748 dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
3749 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3750 STMT_VINFO_DR_OFFSET (stmt_info));
3751 dump_printf (MSG_NOTE,
3752 "\n\touter constant offset from base address: ");
3753 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3754 STMT_VINFO_DR_INIT (stmt_info));
3755 dump_printf (MSG_NOTE, "\n\touter step: ");
3756 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3757 STMT_VINFO_DR_STEP (stmt_info));
3758 dump_printf (MSG_NOTE, "\n\touter base alignment: %d\n",
3759 STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info));
3760 dump_printf (MSG_NOTE, "\n\touter base misalignment: %d\n",
3761 STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info));
3762 dump_printf (MSG_NOTE, "\n\touter offset alignment: %d\n",
3763 STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info));
3764 dump_printf (MSG_NOTE, "\n\touter step alignment: %d\n",
3765 STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
3769 if (STMT_VINFO_DATA_REF (stmt_info))
3771 if (dump_enabled_p ())
3773 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3774 "not vectorized: more than one data ref "
3775 "in stmt: ");
3776 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3779 if (is_a <bb_vec_info> (vinfo))
3780 break;
3782 if (gatherscatter != SG_NONE || simd_lane_access)
3783 free_data_ref (dr);
3784 return false;
3787 STMT_VINFO_DATA_REF (stmt_info) = dr;
3788 if (simd_lane_access)
3790 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
3791 free_data_ref (datarefs[i]);
3792 datarefs[i] = dr;
3795 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == ADDR_EXPR
3796 && VAR_P (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0))
3797 && DECL_NONALIASED (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0)))
3799 if (dump_enabled_p ())
3801 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3802 "not vectorized: base object not addressable "
3803 "for stmt: ");
3804 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3806 if (is_a <bb_vec_info> (vinfo))
3808 /* In BB vectorization the ref can still participate
3809 in dependence analysis, we just can't vectorize it. */
3810 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
3811 continue;
3813 return false;
3816 /* Set vectype for STMT. */
3817 scalar_type = TREE_TYPE (DR_REF (dr));
3818 STMT_VINFO_VECTYPE (stmt_info)
3819 = get_vectype_for_scalar_type (scalar_type);
3820 if (!STMT_VINFO_VECTYPE (stmt_info))
3822 if (dump_enabled_p ())
3824 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3825 "not vectorized: no vectype for stmt: ");
3826 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3827 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
3828 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
3829 scalar_type);
3830 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3833 if (is_a <bb_vec_info> (vinfo))
3835 /* No vector type is fine, the ref can still participate
3836 in dependence analysis, we just can't vectorize it. */
3837 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
3838 continue;
3841 if (gatherscatter != SG_NONE || simd_lane_access)
3843 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3844 if (gatherscatter != SG_NONE)
3845 free_data_ref (dr);
3847 return false;
3849 else
3851 if (dump_enabled_p ())
3853 dump_printf_loc (MSG_NOTE, vect_location,
3854 "got vectype for stmt: ");
3855 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3856 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3857 STMT_VINFO_VECTYPE (stmt_info));
3858 dump_printf (MSG_NOTE, "\n");
3862 /* Adjust the minimal vectorization factor according to the
3863 vector type. */
3864 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
3865 if (vf > *min_vf)
3866 *min_vf = vf;
3868 if (gatherscatter != SG_NONE)
3870 gather_scatter_info gs_info;
3871 if (!vect_check_gather_scatter (stmt, as_a <loop_vec_info> (vinfo),
3872 &gs_info)
3873 || !get_vectype_for_scalar_type (TREE_TYPE (gs_info.offset)))
3875 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3876 free_data_ref (dr);
3877 if (dump_enabled_p ())
3879 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3880 (gatherscatter == GATHER) ?
3881 "not vectorized: not suitable for gather "
3882 "load " :
3883 "not vectorized: not suitable for scatter "
3884 "store ");
3885 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3887 return false;
3890 free_data_ref (datarefs[i]);
3891 datarefs[i] = dr;
3892 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
3895 else if (is_a <loop_vec_info> (vinfo)
3896 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
3898 if (nested_in_vect_loop_p (loop, stmt))
3900 if (dump_enabled_p ())
3902 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3903 "not vectorized: not suitable for strided "
3904 "load ");
3905 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3907 return false;
3909 STMT_VINFO_STRIDED_P (stmt_info) = true;
3913 /* If we stopped analysis at the first dataref we could not analyze
3914 when trying to vectorize a basic block, mark the rest of the datarefs
3915 as not vectorizable and truncate the vector of datarefs. That
3916 avoids spending useless time in analyzing their dependence. */
3917 if (i != datarefs.length ())
3919 gcc_assert (is_a <bb_vec_info> (vinfo));
3920 for (unsigned j = i; j < datarefs.length (); ++j)
3922 data_reference_p dr = datarefs[j];
3923 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
3924 free_data_ref (dr);
3926 datarefs.truncate (i);
3929 return true;
3933 /* Function vect_get_new_vect_var.
3935 Returns a name for a new variable. The current naming scheme prepends
3936 the prefix "vect_", "stmp_", "mask_" or "vectp_" (depending on the value
3937 of VAR_KIND) to NAME when NAME is provided, and otherwise uses the bare
3938 prefix. */
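   For example (illustrative only), a call like
     vect_get_new_vect_var (ptr_type, vect_pointer_var, "a")
   creates a temporary register whose name is based on "vectp_a" (a unique
   numeric suffix is appended), while passing NAME == NULL yields just the
   bare prefix.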
3940 tree
3941 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
3943 const char *prefix;
3944 tree new_vect_var;
3946 switch (var_kind)
3948 case vect_simple_var:
3949 prefix = "vect";
3950 break;
3951 case vect_scalar_var:
3952 prefix = "stmp";
3953 break;
3954 case vect_mask_var:
3955 prefix = "mask";
3956 break;
3957 case vect_pointer_var:
3958 prefix = "vectp";
3959 break;
3960 default:
3961 gcc_unreachable ();
3964 if (name)
3966 char* tmp = concat (prefix, "_", name, NULL);
3967 new_vect_var = create_tmp_reg (type, tmp);
3968 free (tmp);
3970 else
3971 new_vect_var = create_tmp_reg (type, prefix);
3973 return new_vect_var;
3976 /* Like vect_get_new_vect_var but return an SSA name. */
3978 tree
3979 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
3981 const char *prefix;
3982 tree new_vect_var;
3984 switch (var_kind)
3986 case vect_simple_var:
3987 prefix = "vect";
3988 break;
3989 case vect_scalar_var:
3990 prefix = "stmp";
3991 break;
3992 case vect_pointer_var:
3993 prefix = "vectp";
3994 break;
3995 default:
3996 gcc_unreachable ();
3999 if (name)
4001 char* tmp = concat (prefix, "_", name, NULL);
4002 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4003 free (tmp);
4005 else
4006 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4008 return new_vect_var;
4011 /* Duplicate ptr info and set alignment/misalignment on NAME from DR. */
4013 static void
4014 vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr)
4016 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr));
4017 int misalign = DR_MISALIGNMENT (dr);
4018 if (misalign == DR_MISALIGNMENT_UNKNOWN)
4019 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4020 else
4021 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name),
4022 DR_TARGET_ALIGNMENT (dr), misalign);
4025 /* Function vect_create_addr_base_for_vector_ref.
4027 Create an expression that computes the address of the first memory location
4028 that will be accessed for a data reference.
4030 Input:
4031 STMT: The statement containing the data reference.
4032 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4033 OFFSET: Optional. If supplied, it is added to the initial address.
4034 LOOP: Specify relative to which loop-nest the address should be computed.
4035 For example, when the dataref is in an inner-loop nested in an
4036 outer-loop that is now being vectorized, LOOP can be either the
4037 outer-loop, or the inner-loop. The first memory location accessed
4038 by the following dataref ('in' points to short):
4040 for (i=0; i<N; i++)
4041 for (j=0; j<M; j++)
4042 s += in[i+j]
4044 is as follows:
4045 if LOOP=i_loop: &in (relative to i_loop)
4046 if LOOP=j_loop: &in+i*2B (relative to j_loop)
4047 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
4048 initial address. Unlike OFFSET, which is the number of elements to
4049 be added, BYTE_OFFSET is measured in bytes.
4051 Output:
4052 1. Return an SSA_NAME whose value is the address of the memory location of
4053 the first vector of the data reference.
4054 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4055 these statement(s) which define the returned SSA_NAME.
4057 FORNOW: We are only handling array accesses with step 1. */
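   Schematically (a sketch of the computation performed below, for the
   loop-vectorization case), the returned SSA name holds

     base_address + (offset + init)
       + OFFSET * sizeof (element) + BYTE_OFFSET

   with any statements needed to compute it appended to NEW_STMT_LIST.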
4059 tree
4060 vect_create_addr_base_for_vector_ref (gimple *stmt,
4061 gimple_seq *new_stmt_list,
4062 tree offset,
4063 tree byte_offset)
4065 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4066 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4067 const char *base_name;
4068 tree addr_base;
4069 tree dest;
4070 gimple_seq seq = NULL;
4071 tree vect_ptr_type;
4072 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
4073 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4074 innermost_loop_behavior *drb = vect_dr_behavior (dr);
4076 tree data_ref_base = unshare_expr (drb->base_address);
4077 tree base_offset = unshare_expr (drb->offset);
4078 tree init = unshare_expr (drb->init);
4080 if (loop_vinfo)
4081 base_name = get_name (data_ref_base);
4082 else
4084 base_offset = ssize_int (0);
4085 init = ssize_int (0);
4086 base_name = get_name (DR_REF (dr));
4089 /* Create base_offset */
4090 base_offset = size_binop (PLUS_EXPR,
4091 fold_convert (sizetype, base_offset),
4092 fold_convert (sizetype, init));
4094 if (offset)
4096 offset = fold_build2 (MULT_EXPR, sizetype,
4097 fold_convert (sizetype, offset), step);
4098 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4099 base_offset, offset);
4101 if (byte_offset)
4103 byte_offset = fold_convert (sizetype, byte_offset);
4104 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4105 base_offset, byte_offset);
4108 /* base + base_offset */
4109 if (loop_vinfo)
4110 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4111 else
4113 addr_base = build1 (ADDR_EXPR,
4114 build_pointer_type (TREE_TYPE (DR_REF (dr))),
4115 unshare_expr (DR_REF (dr)));
4118 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4119 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4120 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4121 gimple_seq_add_seq (new_stmt_list, seq);
4123 if (DR_PTR_INFO (dr)
4124 && TREE_CODE (addr_base) == SSA_NAME
4125 && !SSA_NAME_PTR_INFO (addr_base))
4127 vect_duplicate_ssa_name_ptr_info (addr_base, dr);
4128 if (offset || byte_offset)
4129 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4132 if (dump_enabled_p ())
4134 dump_printf_loc (MSG_NOTE, vect_location, "created ");
4135 dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
4136 dump_printf (MSG_NOTE, "\n");
4139 return addr_base;
4143 /* Function vect_create_data_ref_ptr.
4145 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4146 location accessed in the loop by STMT, along with the def-use update
4147 chain to appropriately advance the pointer through the loop iterations.
4148 Also set aliasing information for the pointer. This pointer is used by
4149 the callers to this function to create a memory reference expression for
4150 vector load/store access.
4152 Input:
4153 1. STMT: a stmt that references memory. Expected to be of the form
4154 GIMPLE_ASSIGN <name, data-ref> or
4155 GIMPLE_ASSIGN <data-ref, name>.
4156 2. AGGR_TYPE: the type of the reference, which should be either a vector
4157 or an array.
4158 3. AT_LOOP: the loop where the vector memref is to be created.
4159 4. OFFSET (optional): an offset to be added to the initial address accessed
4160 by the data-ref in STMT.
4161 5. BSI: location where the new stmts are to be placed if there is no loop
4162 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4163 pointing to the initial address.
4164 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4165 to the initial address accessed by the data-ref in STMT. This is
4166 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4167 in bytes.
4169 Output:
4170 1. Declare a new ptr to vector_type, and have it point to the base of the
4171 data reference (initial address accessed by the data reference).
4172 For example, for vector of type V8HI, the following code is generated:
4174 v8hi *ap;
4175 ap = (v8hi *)initial_address;
4177 if OFFSET is not supplied:
4178 initial_address = &a[init];
4179 if OFFSET is supplied:
4180 initial_address = &a[init + OFFSET];
4181 if BYTE_OFFSET is supplied:
4182 initial_address = &a[init] + BYTE_OFFSET;
4184 Return the initial_address in INITIAL_ADDRESS.
4186 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4187 update the pointer in each iteration of the loop.
4189 Return the increment stmt that updates the pointer in PTR_INCR.
4191 3. Set INV_P to true if the access pattern of the data reference in the
4192 vectorized loop is invariant. Set it to false otherwise.
4194 4. Return the pointer. */
4196 tree
4197 vect_create_data_ref_ptr (gimple *stmt, tree aggr_type, struct loop *at_loop,
4198 tree offset, tree *initial_address,
4199 gimple_stmt_iterator *gsi, gimple **ptr_incr,
4200 bool only_init, bool *inv_p, tree byte_offset)
4202 const char *base_name;
4203 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4204 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4205 struct loop *loop = NULL;
4206 bool nested_in_vect_loop = false;
4207 struct loop *containing_loop = NULL;
4208 tree aggr_ptr_type;
4209 tree aggr_ptr;
4210 tree new_temp;
4211 gimple_seq new_stmt_list = NULL;
4212 edge pe = NULL;
4213 basic_block new_bb;
4214 tree aggr_ptr_init;
4215 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4216 tree aptr;
4217 gimple_stmt_iterator incr_gsi;
4218 bool insert_after;
4219 tree indx_before_incr, indx_after_incr;
4220 gimple *incr;
4221 tree step;
4222 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4224 gcc_assert (TREE_CODE (aggr_type) == ARRAY_TYPE
4225 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4227 if (loop_vinfo)
4229 loop = LOOP_VINFO_LOOP (loop_vinfo);
4230 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4231 containing_loop = (gimple_bb (stmt))->loop_father;
4232 pe = loop_preheader_edge (loop);
4234 else
4236 gcc_assert (bb_vinfo);
4237 only_init = true;
4238 *ptr_incr = NULL;
4241 /* Check the step (evolution) of the load in LOOP, and record
4242 whether it's invariant. */
4243 step = vect_dr_behavior (dr)->step;
4244 if (integer_zerop (step))
4245 *inv_p = true;
4246 else
4247 *inv_p = false;
4249 /* Create an expression for the first address accessed by this load
4250 in LOOP. */
4251 base_name = get_name (DR_BASE_ADDRESS (dr));
4253 if (dump_enabled_p ())
4255 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4256 dump_printf_loc (MSG_NOTE, vect_location,
4257 "create %s-pointer variable to type: ",
4258 get_tree_code_name (TREE_CODE (aggr_type)));
4259 dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4260 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4261 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4262 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4263 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4264 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4265 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4266 else
4267 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4268 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4269 dump_printf (MSG_NOTE, "\n");
4272 /* (1) Create the new aggregate-pointer variable.
4273 Vector and array types inherit the alias set of their component
4274 type by default so we need to use a ref-all pointer if the data
4275 reference does not conflict with the created aggregated data
4276 reference because it is not addressable. */
4277 bool need_ref_all = false;
4278 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4279 get_alias_set (DR_REF (dr))))
4280 need_ref_all = true;
4281 /* Likewise for any of the data references in the stmt group. */
4282 else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
4284 gimple *orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
4287 stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
4288 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4289 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4290 get_alias_set (DR_REF (sdr))))
4292 need_ref_all = true;
4293 break;
4295 orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);
4297 while (orig_stmt);
4299 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4300 need_ref_all);
4301 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4304 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4305 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4306 def-use update cycles for the pointer: one relative to the outer-loop
4307 (LOOP), which is what steps (3) and (4) below do. The other is relative
4308 to the inner-loop (which is the inner-most loop containing the dataref),
4309 and this is done by step (5) below.
4311 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4312 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4313 redundant. Steps (3),(4) create the following:
4315 vp0 = &base_addr;
4316 LOOP: vp1 = phi(vp0,vp2)
4319 vp2 = vp1 + step
4320 goto LOOP
4322 If there is an inner-loop nested in loop, then step (5) will also be
4323 applied, and an additional update in the inner-loop will be created:
4325 vp0 = &base_addr;
4326 LOOP: vp1 = phi(vp0,vp2)
4328 inner: vp3 = phi(vp1,vp4)
4329 vp4 = vp3 + inner_step
4330 if () goto inner
4332 vp2 = vp1 + step
4333 if () goto LOOP */
4335 /* (2) Calculate the initial address of the aggregate-pointer, and set
4336 the aggregate-pointer to point to it before the loop. */
4338 /* Create &base[init_val + offset] + byte_offset in the loop preheader. */
4340 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
4341 offset, byte_offset);
4342 if (new_stmt_list)
4344 if (pe)
4346 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4347 gcc_assert (!new_bb);
4349 else
4350 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4353 *initial_address = new_temp;
4354 aggr_ptr_init = new_temp;
4356 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4357 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4358 inner-loop nested in LOOP (during outer-loop vectorization). */
4360 /* No update in loop is required. */
4361 if (only_init && (!loop_vinfo || at_loop == loop))
4362 aptr = aggr_ptr_init;
4363 else
4365 /* The step of the aggregate pointer is the type size. */
4366 tree iv_step = TYPE_SIZE_UNIT (aggr_type);
4367 /* One exception to the above is when the scalar step of the load in
4368 LOOP is zero. In this case the step here is also zero. */
4369 if (*inv_p)
4370 iv_step = size_zero_node;
4371 else if (tree_int_cst_sgn (step) == -1)
4372 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4374 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4376 create_iv (aggr_ptr_init,
4377 fold_convert (aggr_ptr_type, iv_step),
4378 aggr_ptr, loop, &incr_gsi, insert_after,
4379 &indx_before_incr, &indx_after_incr);
4380 incr = gsi_stmt (incr_gsi);
4381 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4383 /* Copy the points-to information if it exists. */
4384 if (DR_PTR_INFO (dr))
4386 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr);
4387 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr);
4389 if (ptr_incr)
4390 *ptr_incr = incr;
4392 aptr = indx_before_incr;
4395 if (!nested_in_vect_loop || only_init)
4396 return aptr;
4399 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4400 nested in LOOP, if it exists. */
4402 gcc_assert (nested_in_vect_loop);
4403 if (!only_init)
4405 standard_iv_increment_position (containing_loop, &incr_gsi,
4406 &insert_after);
4407 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4408 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4409 &indx_after_incr);
4410 incr = gsi_stmt (incr_gsi);
4411 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4413 /* Copy the points-to information if it exists. */
4414 if (DR_PTR_INFO (dr))
4416 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr);
4417 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr);
4419 if (ptr_incr)
4420 *ptr_incr = incr;
4422 return indx_before_incr;
4424 else
4425 gcc_unreachable ();
4429 /* Function bump_vector_ptr
4431 Increment a pointer (to a vector type) by vector-size. If requested,
4432 i.e. if PTR-INCR is given, then also connect the new increment stmt
4433 to the existing def-use update-chain of the pointer, by modifying
4434 the PTR_INCR as illustrated below:
4436 The pointer def-use update-chain before this function:
4437 DATAREF_PTR = phi (p_0, p_2)
4438 ....
4439 PTR_INCR: p_2 = DATAREF_PTR + step
4441 The pointer def-use update-chain after this function:
4442 DATAREF_PTR = phi (p_0, p_2)
4443 ....
4444 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4445 ....
4446 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4448 Input:
4449 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4450 in the loop.
4451 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4452 the loop. The increment amount across iterations is expected
4453 to be vector_size.
4454 BSI - location where the new update stmt is to be placed.
4455 STMT - the original scalar memory-access stmt that is being vectorized.
4456 BUMP - optional. The offset by which to bump the pointer. If not given,
4457 the offset is assumed to be vector_size.
4459 Output: Return NEW_DATAREF_PTR as illustrated above.
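   For example (illustrative), when the vector type is V16QI and no BUMP
   is given, the generated statement is simply
     NEW_DATAREF_PTR = DATAREF_PTR + 16;
   since the default bump is TYPE_SIZE_UNIT of the vector type.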
4463 tree
4464 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4465 gimple *stmt, tree bump)
4467 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4468 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4469 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4470 tree update = TYPE_SIZE_UNIT (vectype);
4471 gassign *incr_stmt;
4472 ssa_op_iter iter;
4473 use_operand_p use_p;
4474 tree new_dataref_ptr;
4476 if (bump)
4477 update = bump;
4479 if (TREE_CODE (dataref_ptr) == SSA_NAME)
4480 new_dataref_ptr = copy_ssa_name (dataref_ptr);
4481 else
4482 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4483 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4484 dataref_ptr, update);
4485 vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4487 /* Copy the points-to information if it exists. */
4488 if (DR_PTR_INFO (dr))
4490 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4491 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4494 if (!ptr_incr)
4495 return new_dataref_ptr;
4497 /* Update the vector-pointer's cross-iteration increment. */
4498 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4500 tree use = USE_FROM_PTR (use_p);
4502 if (use == dataref_ptr)
4503 SET_USE (use_p, new_dataref_ptr);
4504 else
4505 gcc_assert (tree_int_cst_compare (use, update) == 0);
4508 return new_dataref_ptr;
4512 /* Function vect_create_destination_var.
4514 Create a new temporary of type VECTYPE. */
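   For example (illustrative), given a scalar destination named "x" with
   SSA version 7 and a non-boolean VECTYPE, the kind is vect_simple_var and
   the new temporary's name is based on "vect_x_7" (with a unique numeric
   suffix appended).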
4516 tree
4517 vect_create_destination_var (tree scalar_dest, tree vectype)
4519 tree vec_dest;
4520 const char *name;
4521 char *new_name;
4522 tree type;
4523 enum vect_var_kind kind;
4525 kind = vectype
4526 ? VECTOR_BOOLEAN_TYPE_P (vectype)
4527 ? vect_mask_var
4528 : vect_simple_var
4529 : vect_scalar_var;
4530 type = vectype ? vectype : TREE_TYPE (scalar_dest);
4532 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
4534 name = get_name (scalar_dest);
4535 if (name)
4536 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
4537 else
4538 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
4539 vec_dest = vect_get_new_vect_var (type, kind, new_name);
4540 free (new_name);
4542 return vec_dest;
4545 /* Function vect_grouped_store_supported.
4547 Returns TRUE if interleave high and interleave low permutations
4548 are supported, and FALSE otherwise. */
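   For example (illustrative), with a V4SI vector mode (nelt == 4) and a
   power-of-two group size, the two permutation masks tried below are
   {0, 4, 1, 5} and {2, 6, 3, 7}, i.e. the interleave-high and
   interleave-low patterns later used by vect_permute_store_chain.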
4550 bool
4551 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
4553 machine_mode mode = TYPE_MODE (vectype);
4555 /* vect_permute_store_chain requires the group size to be equal to 3 or
4556 be a power of two. */
4557 if (count != 3 && exact_log2 (count) == -1)
4559 if (dump_enabled_p ())
4560 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4561 "the size of the group of accesses"
4562 " is not a power of 2 or not eqaul to 3\n");
4563 return false;
4566 /* Check that the permutation is supported. */
4567 if (VECTOR_MODE_P (mode))
4569 unsigned int i, nelt = GET_MODE_NUNITS (mode);
4570 auto_vec_perm_indices sel (nelt);
4571 sel.quick_grow (nelt);
4573 if (count == 3)
4575 unsigned int j0 = 0, j1 = 0, j2 = 0;
4576 unsigned int i, j;
4578 for (j = 0; j < 3; j++)
4580 int nelt0 = ((3 - j) * nelt) % 3;
4581 int nelt1 = ((3 - j) * nelt + 1) % 3;
4582 int nelt2 = ((3 - j) * nelt + 2) % 3;
4583 for (i = 0; i < nelt; i++)
4585 if (3 * i + nelt0 < nelt)
4586 sel[3 * i + nelt0] = j0++;
4587 if (3 * i + nelt1 < nelt)
4588 sel[3 * i + nelt1] = nelt + j1++;
4589 if (3 * i + nelt2 < nelt)
4590 sel[3 * i + nelt2] = 0;
4592 if (!can_vec_perm_p (mode, false, &sel))
4594 if (dump_enabled_p ())
4595 dump_printf (MSG_MISSED_OPTIMIZATION,
4596 "permutaion op not supported by target.\n");
4597 return false;
4600 for (i = 0; i < nelt; i++)
4602 if (3 * i + nelt0 < nelt)
4603 sel[3 * i + nelt0] = 3 * i + nelt0;
4604 if (3 * i + nelt1 < nelt)
4605 sel[3 * i + nelt1] = 3 * i + nelt1;
4606 if (3 * i + nelt2 < nelt)
4607 sel[3 * i + nelt2] = nelt + j2++;
4609 if (!can_vec_perm_p (mode, false, &sel))
4611 if (dump_enabled_p ())
4612 dump_printf (MSG_MISSED_OPTIMIZATION,
4613 "permutaion op not supported by target.\n");
4614 return false;
4617 return true;
4619 else
4621 /* If length is not equal to 3 then only power of 2 is supported. */
4622 gcc_assert (pow2p_hwi (count));
4624 for (i = 0; i < nelt / 2; i++)
4626 sel[i * 2] = i;
4627 sel[i * 2 + 1] = i + nelt;
4629 if (can_vec_perm_p (mode, false, &sel))
4631 for (i = 0; i < nelt; i++)
4632 sel[i] += nelt / 2;
4633 if (can_vec_perm_p (mode, false, &sel))
4634 return true;
4639 if (dump_enabled_p ())
4640 dump_printf (MSG_MISSED_OPTIMIZATION,
4641 "permutaion op not supported by target.\n");
4642 return false;
4646 /* Return TRUE if vec_store_lanes is available for COUNT vectors of
4647 type VECTYPE. */
4649 bool
4650 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
4652 return vect_lanes_optab_supported_p ("vec_store_lanes",
4653 vec_store_lanes_optab,
4654 vectype, count);
4658 /* Function vect_permute_store_chain.
4660 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4661 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
4662 the data correctly for the stores. Return the final references for stores
4663 in RESULT_CHAIN.
4665 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4666 The input is 4 vectors each containing 8 elements. We assign a number to
4667 each element, the input sequence is:
4669 1st vec: 0 1 2 3 4 5 6 7
4670 2nd vec: 8 9 10 11 12 13 14 15
4671 3rd vec: 16 17 18 19 20 21 22 23
4672 4th vec: 24 25 26 27 28 29 30 31
4674 The output sequence should be:
4676 1st vec: 0 8 16 24 1 9 17 25
4677 2nd vec: 2 10 18 26 3 11 19 27
4678 3rd vec: 4 12 20 28 5 13 21 29
4679 4th vec: 6 14 22 30 7 15 23 31
4681 i.e., we interleave the contents of the four vectors in their order.
4683 We use interleave_high/low instructions to create such output. The input of
4684 each interleave_high/low operation is two vectors:
4685 1st vec 2nd vec
4686 0 1 2 3 4 5 6 7
4687 the even elements of the result vector are obtained left-to-right from the
4688 high/low elements of the first vector. The odd elements of the result are
4689 obtained left-to-right from the high/low elements of the second vector.
4690 The output of interleave_high will be: 0 4 1 5
4691 and of interleave_low: 2 6 3 7
4694 The permutation is done in log LENGTH stages. In each stage interleave_high
4695 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4696 where the first argument is taken from the first half of DR_CHAIN and the
4697 second argument from its second half.
4698 In our example,
4700 I1: interleave_high (1st vec, 3rd vec)
4701 I2: interleave_low (1st vec, 3rd vec)
4702 I3: interleave_high (2nd vec, 4th vec)
4703 I4: interleave_low (2nd vec, 4th vec)
4705 The output for the first stage is:
4707 I1: 0 16 1 17 2 18 3 19
4708 I2: 4 20 5 21 6 22 7 23
4709 I3: 8 24 9 25 10 26 11 27
4710 I4: 12 28 13 29 14 30 15 31
4712 The output of the second stage, i.e. the final result is:
4714 I1: 0 8 16 24 1 9 17 25
4715 I2: 2 10 18 26 3 11 19 27
4716 I3: 4 12 20 28 5 13 21 29
4717 I4: 6 14 22 30 7 15 23 31. */
4719 void
4720 vect_permute_store_chain (vec<tree> dr_chain,
4721 unsigned int length,
4722 gimple *stmt,
4723 gimple_stmt_iterator *gsi,
4724 vec<tree> *result_chain)
4726 tree vect1, vect2, high, low;
4727 gimple *perm_stmt;
4728 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4729 tree perm_mask_low, perm_mask_high;
4730 tree data_ref;
4731 tree perm3_mask_low, perm3_mask_high;
4732 unsigned int i, n, log_length = exact_log2 (length);
4733 unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype);
4735 auto_vec_perm_indices sel (nelt);
4736 sel.quick_grow (nelt);
4738 result_chain->quick_grow (length);
4739 memcpy (result_chain->address (), dr_chain.address (),
4740 length * sizeof (tree));
4742 if (length == 3)
4744 unsigned int j0 = 0, j1 = 0, j2 = 0;
4746 for (j = 0; j < 3; j++)
4748 int nelt0 = ((3 - j) * nelt) % 3;
4749 int nelt1 = ((3 - j) * nelt + 1) % 3;
4750 int nelt2 = ((3 - j) * nelt + 2) % 3;
4752 for (i = 0; i < nelt; i++)
4754 if (3 * i + nelt0 < nelt)
4755 sel[3 * i + nelt0] = j0++;
4756 if (3 * i + nelt1 < nelt)
4757 sel[3 * i + nelt1] = nelt + j1++;
4758 if (3 * i + nelt2 < nelt)
4759 sel[3 * i + nelt2] = 0;
4761 perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
4763 for (i = 0; i < nelt; i++)
4765 if (3 * i + nelt0 < nelt)
4766 sel[3 * i + nelt0] = 3 * i + nelt0;
4767 if (3 * i + nelt1 < nelt)
4768 sel[3 * i + nelt1] = 3 * i + nelt1;
4769 if (3 * i + nelt2 < nelt)
4770 sel[3 * i + nelt2] = nelt + j2++;
4772 perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
4774 vect1 = dr_chain[0];
4775 vect2 = dr_chain[1];
4777 /* Create interleaving stmt:
4778 low = VEC_PERM_EXPR <vect1, vect2,
4779 {j, nelt, *, j + 1, nelt + j + 1, *,
4780 j + 2, nelt + j + 2, *, ...}> */
4781 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
4782 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4783 vect2, perm3_mask_low);
4784 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4786 vect1 = data_ref;
4787 vect2 = dr_chain[2];
4788 /* Create interleaving stmt:
4789 low = VEC_PERM_EXPR <vect1, vect2,
4790 {0, 1, nelt + j, 3, 4, nelt + j + 1,
4791 6, 7, nelt + j + 2, ...}> */
4792 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
4793 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4794 vect2, perm3_mask_high);
4795 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4796 (*result_chain)[j] = data_ref;
4799 else
4801 /* If length is not equal to 3 then only power of 2 is supported. */
4802 gcc_assert (pow2p_hwi (length));
4804 for (i = 0, n = nelt / 2; i < n; i++)
4806 sel[i * 2] = i;
4807 sel[i * 2 + 1] = i + nelt;
4809 perm_mask_high = vect_gen_perm_mask_checked (vectype, sel);
4811 for (i = 0; i < nelt; i++)
4812 sel[i] += nelt / 2;
4813 perm_mask_low = vect_gen_perm_mask_checked (vectype, sel);
4815 for (i = 0, n = log_length; i < n; i++)
4817 for (j = 0; j < length/2; j++)
4819 vect1 = dr_chain[j];
4820 vect2 = dr_chain[j+length/2];
4822 /* Create interleaving stmt:
4823 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
4824 ...}> */
4825 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
4826 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
4827 vect2, perm_mask_high);
4828 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4829 (*result_chain)[2*j] = high;
4831 /* Create interleaving stmt:
4832 low = VEC_PERM_EXPR <vect1, vect2,
4833 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
4834 ...}> */
4835 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
4836 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
4837 vect2, perm_mask_low);
4838 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4839 (*result_chain)[2*j+1] = low;
4841 memcpy (dr_chain.address (), result_chain->address (),
4842 length * sizeof (tree));
4847 /* Function vect_setup_realignment
4849 This function is called when vectorizing an unaligned load using
4850 the dr_explicit_realign[_optimized] scheme.
4851 This function generates the following code at the loop prolog:
4853 p = initial_addr;
4854 x msq_init = *(floor(p)); # prolog load
4855 realignment_token = call target_builtin;
4856 loop:
4857 x msq = phi (msq_init, ---)
4859 The stmts marked with x are generated only for the case of
4860 dr_explicit_realign_optimized.
4862 The code above sets up a new (vector) pointer, pointing to the first
4863 location accessed by STMT, and a "floor-aligned" load using that pointer.
4864 It also generates code to compute the "realignment-token" (if the relevant
4865 target hook was defined), and creates a phi-node at the loop-header bb
4866 whose arguments are the result of the prolog-load (created by this
4867 function) and the result of a load that takes place in the loop (to be
4868 created by the caller to this function).
4870 For the case of dr_explicit_realign_optimized:
4871 The caller to this function uses the phi-result (msq) to create the
4872 realignment code inside the loop, and sets up the missing phi argument,
4873 as follows:
4874 loop:
4875 msq = phi (msq_init, lsq)
4876 lsq = *(floor(p')); # load in loop
4877 result = realign_load (msq, lsq, realignment_token);
4879 For the case of dr_explicit_realign:
4880 loop:
4881 msq = *(floor(p)); # load in loop
4882 p' = p + (VS-1);
4883 lsq = *(floor(p')); # load in loop
4884 result = realign_load (msq, lsq, realignment_token);
4886 Input:
4887 STMT - (scalar) load stmt to be vectorized. This load accesses
4888 a memory location that may be unaligned.
4889 BSI - place where new code is to be inserted.
4890 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
4891 is used.
4893 Output:
4894 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
4895 target hook, if defined.
4896 Return value - the result of the loop-header phi node. */
4898 tree
4899 vect_setup_realignment (gimple *stmt, gimple_stmt_iterator *gsi,
4900 tree *realignment_token,
4901 enum dr_alignment_support alignment_support_scheme,
4902 tree init_addr,
4903 struct loop **at_loop)
4905 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4906 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4907 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4908 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4909 struct loop *loop = NULL;
4910 edge pe = NULL;
4911 tree scalar_dest = gimple_assign_lhs (stmt);
4912 tree vec_dest;
4913 gimple *inc;
4914 tree ptr;
4915 tree data_ref;
4916 basic_block new_bb;
4917 tree msq_init = NULL_TREE;
4918 tree new_temp;
4919 gphi *phi_stmt;
4920 tree msq = NULL_TREE;
4921 gimple_seq stmts = NULL;
4922 bool inv_p;
4923 bool compute_in_loop = false;
4924 bool nested_in_vect_loop = false;
4925 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
4926 struct loop *loop_for_initial_load = NULL;
4928 if (loop_vinfo)
4930 loop = LOOP_VINFO_LOOP (loop_vinfo);
4931 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4934 gcc_assert (alignment_support_scheme == dr_explicit_realign
4935 || alignment_support_scheme == dr_explicit_realign_optimized);
4937 /* We need to generate three things:
4938 1. the misalignment computation
4939 2. the extra vector load (for the optimized realignment scheme).
4940 3. the phi node for the two vectors from which the realignment is
4941 done (for the optimized realignment scheme). */
4943 /* 1. Determine where to generate the misalignment computation.
4945 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
4946 calculation will be generated by this function, outside the loop (in the
4947 preheader). Otherwise, INIT_ADDR had already been computed for us by the
4948 caller, inside the loop.
4950 Background: If the misalignment remains fixed throughout the iterations of
4951 the loop, then both realignment schemes are applicable, and also the
4952 misalignment computation can be done outside LOOP. This is because we are
4953 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
4954 are a multiple of VS (the Vector Size), and therefore the misalignment in
4955 different vectorized LOOP iterations is always the same.
4956 The problem arises only if the memory access is in an inner-loop nested
4957 inside LOOP, which is now being vectorized using outer-loop vectorization.
4958 This is the only case when the misalignment of the memory access may not
4959 remain fixed throughout the iterations of the inner-loop (as explained in
4960 detail in vect_supportable_dr_alignment). In this case, not only is the
4961 optimized realignment scheme not applicable, but also the misalignment
4962 computation (and generation of the realignment token that is passed to
4963 REALIGN_LOAD) have to be done inside the loop.
4965 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
4966 or not, which in turn determines if the misalignment is computed inside
4967 the inner-loop, or outside LOOP. */
4969 if (init_addr != NULL_TREE || !loop_vinfo)
4971 compute_in_loop = true;
4972 gcc_assert (alignment_support_scheme == dr_explicit_realign);
4976 /* 2. Determine where to generate the extra vector load.
4978 For the optimized realignment scheme, instead of generating two vector
4979 loads in each iteration, we generate a single extra vector load in the
4980 preheader of the loop, and in each iteration reuse the result of the
4981 vector load from the previous iteration. In case the memory access is in
4982 an inner-loop nested inside LOOP, which is now being vectorized using
4983 outer-loop vectorization, we need to determine whether this initial vector
4984 load should be generated at the preheader of the inner-loop, or can be
4985 generated at the preheader of LOOP. If the memory access has no evolution
4986 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
4987 to be generated inside LOOP (in the preheader of the inner-loop). */
4989 if (nested_in_vect_loop)
4991 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
4992 bool invariant_in_outerloop =
4993 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
4994 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
4996 else
4997 loop_for_initial_load = loop;
4998 if (at_loop)
4999 *at_loop = loop_for_initial_load;
5001 if (loop_for_initial_load)
5002 pe = loop_preheader_edge (loop_for_initial_load);
5004 /* 3. For the case of the optimized realignment, create the first vector
5005 load at the loop preheader. */
5007 if (alignment_support_scheme == dr_explicit_realign_optimized)
5009 /* Create msq_init = *(floor(p1)) in the loop preheader */
5010 gassign *new_stmt;
5012 gcc_assert (!compute_in_loop);
5013 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5014 ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
5015 NULL_TREE, &init_addr, NULL, &inc,
5016 true, &inv_p);
5017 if (TREE_CODE (ptr) == SSA_NAME)
5018 new_temp = copy_ssa_name (ptr);
5019 else
5020 new_temp = make_ssa_name (TREE_TYPE (ptr));
5021 unsigned int align = DR_TARGET_ALIGNMENT (dr);
5022 new_stmt = gimple_build_assign
5023 (new_temp, BIT_AND_EXPR, ptr,
5024 build_int_cst (TREE_TYPE (ptr), -(HOST_WIDE_INT) align));
5025 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5026 gcc_assert (!new_bb);
5027 data_ref
5028 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5029 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5030 new_stmt = gimple_build_assign (vec_dest, data_ref);
5031 new_temp = make_ssa_name (vec_dest, new_stmt);
5032 gimple_assign_set_lhs (new_stmt, new_temp);
5033 if (pe)
5035 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5036 gcc_assert (!new_bb);
5038 else
5039 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5041 msq_init = gimple_assign_lhs (new_stmt);
5044 /* 4. Create realignment token using a target builtin, if available.
5045 It is done either inside the containing loop, or before LOOP (as
5046 determined above). */
5048 if (targetm.vectorize.builtin_mask_for_load)
5050 gcall *new_stmt;
5051 tree builtin_decl;
5053 /* Compute INIT_ADDR - the initial address accessed by this memref. */
5054 if (!init_addr)
5056 /* Generate the INIT_ADDR computation outside LOOP. */
5057 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
5058 NULL_TREE);
5059 if (loop)
5061 pe = loop_preheader_edge (loop);
5062 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5063 gcc_assert (!new_bb);
5065 else
5066 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5069 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5070 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5071 vec_dest =
5072 vect_create_destination_var (scalar_dest,
5073 gimple_call_return_type (new_stmt));
5074 new_temp = make_ssa_name (vec_dest, new_stmt);
5075 gimple_call_set_lhs (new_stmt, new_temp);
5077 if (compute_in_loop)
5078 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5079 else
5081 /* Generate the misalignment computation outside LOOP. */
5082 pe = loop_preheader_edge (loop);
5083 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5084 gcc_assert (!new_bb);
5087 *realignment_token = gimple_call_lhs (new_stmt);
5089 /* The result of the CALL_EXPR to this builtin is determined from
5090 the value of the parameter and no global variables are touched
5091 which makes the builtin a "const" function. Requiring the
5092 builtin to have the "const" attribute makes it unnecessary
5093 to call mark_call_clobbered. */
5094 gcc_assert (TREE_READONLY (builtin_decl));
5097 if (alignment_support_scheme == dr_explicit_realign)
5098 return msq;
5100 gcc_assert (!compute_in_loop);
5101 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5104 /* 5. Create msq = phi <msq_init, lsq> in loop */
5106 pe = loop_preheader_edge (containing_loop);
5107 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5108 msq = make_ssa_name (vec_dest);
5109 phi_stmt = create_phi_node (msq, containing_loop->header);
5110 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5112 return msq;
5116 /* Function vect_grouped_load_supported.
5118 COUNT is the size of the load group (the number of statements plus the
5119 number of gaps). SINGLE_ELEMENT_P is true if there is actually
5120 only one statement, with a gap of COUNT - 1.
5122 Returns true if a suitable permute exists. */
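   For example (illustrative), with a V4SI vector mode (nelt == 4) and a
   power-of-two group size, the two permutation masks tried below are
   {0, 2, 4, 6} and {1, 3, 5, 7}, i.e. the extract-even and extract-odd
   patterns later used by vect_permute_load_chain.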
5124 bool
5125 vect_grouped_load_supported (tree vectype, bool single_element_p,
5126 unsigned HOST_WIDE_INT count)
5128 machine_mode mode = TYPE_MODE (vectype);
5130 /* If this is single-element interleaving with an element distance
5131 that leaves unused vector loads around, punt - we at least create
5132 very sub-optimal code in that case (and blow up memory,
5133 see PR65518). */
5134 if (single_element_p && count > TYPE_VECTOR_SUBPARTS (vectype))
5136 if (dump_enabled_p ())
5137 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5138 "single-element interleaving not supported "
5139 "for not adjacent vector loads\n");
5140 return false;
5143 /* vect_permute_load_chain requires the group size to be equal to 3 or
5144 be a power of two. */
5145 if (count != 3 && exact_log2 (count) == -1)
5147 if (dump_enabled_p ())
5148 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5149 "the size of the group of accesses"
5150 " is not a power of 2 or not equal to 3\n");
5151 return false;
5154 /* Check that the permutation is supported. */
5155 if (VECTOR_MODE_P (mode))
5157 unsigned int i, j, nelt = GET_MODE_NUNITS (mode);
5158 auto_vec_perm_indices sel (nelt);
5159 sel.quick_grow (nelt);
5161 if (count == 3)
5163 unsigned int k;
5164 for (k = 0; k < 3; k++)
5166 for (i = 0; i < nelt; i++)
5167 if (3 * i + k < 2 * nelt)
5168 sel[i] = 3 * i + k;
5169 else
5170 sel[i] = 0;
5171 if (!can_vec_perm_p (mode, false, &sel))
5173 if (dump_enabled_p ())
5174 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5175 "shuffle of 3 loads is not supported by"
5176 " target\n");
5177 return false;
5179 for (i = 0, j = 0; i < nelt; i++)
5180 if (3 * i + k < 2 * nelt)
5181 sel[i] = i;
5182 else
5183 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5184 if (!can_vec_perm_p (mode, false, &sel))
5186 if (dump_enabled_p ())
5187 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5188 "shuffle of 3 loads is not supported by"
5189 " target\n");
5190 return false;
5193 return true;
5195 else
5197 /* If length is not equal to 3 then only power of 2 is supported. */
5198 gcc_assert (pow2p_hwi (count));
5199 for (i = 0; i < nelt; i++)
5200 sel[i] = i * 2;
5201 if (can_vec_perm_p (mode, false, &sel))
5203 for (i = 0; i < nelt; i++)
5204 sel[i] = i * 2 + 1;
5205 if (can_vec_perm_p (mode, false, &sel))
5206 return true;
5211 if (dump_enabled_p ())
5212 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5213 "extract even/odd not supported by target\n");
5214 return false;
5217 /* Return TRUE if vec_load_lanes is available for COUNT vectors of
5218 type VECTYPE. */
5220 bool
5221 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
5223 return vect_lanes_optab_supported_p ("vec_load_lanes",
5224 vec_load_lanes_optab,
5225 vectype, count);
5228 /* Function vect_permute_load_chain.
5230 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5231 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5232 the input data correctly. Return the final references for loads in
5233 RESULT_CHAIN.
5235 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5236 The input is 4 vectors each containing 8 elements. We assign a number to each
5237 element, the input sequence is:
5239 1st vec: 0 1 2 3 4 5 6 7
5240 2nd vec: 8 9 10 11 12 13 14 15
5241 3rd vec: 16 17 18 19 20 21 22 23
5242 4th vec: 24 25 26 27 28 29 30 31
5244 The output sequence should be:
5246 1st vec: 0 4 8 12 16 20 24 28
5247 2nd vec: 1 5 9 13 17 21 25 29
5248 3rd vec: 2 6 10 14 18 22 26 30
5249 4th vec: 3 7 11 15 19 23 27 31
5251 i.e., the first output vector should contain the first elements of each
5252 interleaving group, etc.
5254 We use extract_even/odd instructions to create such output. The input of
5255 each extract_even/odd operation is two vectors
5256 1st vec 2nd vec
5257 0 1 2 3 4 5 6 7
5259 and the output is the vector of extracted even/odd elements. The output of
5260 extract_even will be: 0 2 4 6
5261 and of extract_odd: 1 3 5 7
5264 The permutation is done in log LENGTH stages. In each stage extract_even
5265 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5266 their order. In our example,
5268 E1: extract_even (1st vec, 2nd vec)
5269 E2: extract_odd (1st vec, 2nd vec)
5270 E3: extract_even (3rd vec, 4th vec)
5271 E4: extract_odd (3rd vec, 4th vec)
5273 The output for the first stage will be:
5275 E1: 0 2 4 6 8 10 12 14
5276 E2: 1 3 5 7 9 11 13 15
5277 E3: 16 18 20 22 24 26 28 30
5278 E4: 17 19 21 23 25 27 29 31
5280 In order to proceed and create the correct sequence for the next stage (or
5281 for the correct output, if the second stage is the last one, as in our
5282 example), we first put the output of extract_even operation and then the
5283 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5284 The input for the second stage is:
5286 1st vec (E1): 0 2 4 6 8 10 12 14
5287 2nd vec (E3): 16 18 20 22 24 26 28 30
5288 3rd vec (E2): 1 3 5 7 9 11 13 15
5289 4th vec (E4): 17 19 21 23 25 27 29 31
5291 The output of the second stage:
5293 E1: 0 4 8 12 16 20 24 28
5294 E2: 2 6 10 14 18 22 26 30
5295 E3: 1 5 9 13 17 21 25 29
5296 E4: 3 7 11 15 19 23 27 31
5298 And RESULT_CHAIN after reordering:
5300 1st vec (E1): 0 4 8 12 16 20 24 28
5301 2nd vec (E3): 1 5 9 13 17 21 25 29
5302 3rd vec (E2): 2 6 10 14 18 22 26 30
5303 4th vec (E4): 3 7 11 15 19 23 27 31. */
5305 static void
5306 vect_permute_load_chain (vec<tree> dr_chain,
5307 unsigned int length,
5308 gimple *stmt,
5309 gimple_stmt_iterator *gsi,
5310 vec<tree> *result_chain)
5312 tree data_ref, first_vect, second_vect;
5313 tree perm_mask_even, perm_mask_odd;
5314 tree perm3_mask_low, perm3_mask_high;
5315 gimple *perm_stmt;
5316 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5317 unsigned int i, j, log_length = exact_log2 (length);
5318 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5320 auto_vec_perm_indices sel (nelt);
5321 sel.quick_grow (nelt);
5323 result_chain->quick_grow (length);
5324 memcpy (result_chain->address (), dr_chain.address (),
5325 length * sizeof (tree));
5327 if (length == 3)
5329 unsigned int k;
5331 for (k = 0; k < 3; k++)
5333 for (i = 0; i < nelt; i++)
5334 if (3 * i + k < 2 * nelt)
5335 sel[i] = 3 * i + k;
5336 else
5337 sel[i] = 0;
5338 perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
5340 for (i = 0, j = 0; i < nelt; i++)
5341 if (3 * i + k < 2 * nelt)
5342 sel[i] = i;
5343 else
5344 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5346 perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
5348 first_vect = dr_chain[0];
5349 second_vect = dr_chain[1];
5351 /* Create interleaving stmt (low part of):
5352 low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5353 ...}> */
5354 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5355 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5356 second_vect, perm3_mask_low);
5357 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5359 /* Create interleaving stmt (high part of):
5360 high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5361 ...}> */
5362 first_vect = data_ref;
5363 second_vect = dr_chain[2];
5364 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5365 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5366 second_vect, perm3_mask_high);
5367 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5368 (*result_chain)[k] = data_ref;
5371 else
5373 /* If length is not equal to 3 then only power of 2 is supported. */
5374 gcc_assert (pow2p_hwi (length));
5376 for (i = 0; i < nelt; ++i)
5377 sel[i] = i * 2;
5378 perm_mask_even = vect_gen_perm_mask_checked (vectype, sel);
5380 for (i = 0; i < nelt; ++i)
5381 sel[i] = i * 2 + 1;
5382 perm_mask_odd = vect_gen_perm_mask_checked (vectype, sel);
5384 for (i = 0; i < log_length; i++)
5386 for (j = 0; j < length; j += 2)
5388 first_vect = dr_chain[j];
5389 second_vect = dr_chain[j+1];
5391 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5392 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5393 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5394 first_vect, second_vect,
5395 perm_mask_even);
5396 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5397 (*result_chain)[j/2] = data_ref;
5399 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5400 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5401 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5402 first_vect, second_vect,
5403 perm_mask_odd);
5404 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5405 (*result_chain)[j/2+length/2] = data_ref;
5407 memcpy (dr_chain.address (), result_chain->address (),
5408 length * sizeof (tree));
5413 /* Function vect_shift_permute_load_chain.
5415 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate a
5416 sequence of stmts to reorder the input data accordingly.
5417 Return the final references for loads in RESULT_CHAIN.
5418 Return true on success, false otherwise.
5420 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5421 The input is 3 vectors each containing 8 elements. We assign a
5422 number to each element, the input sequence is:
5424 1st vec: 0 1 2 3 4 5 6 7
5425 2nd vec: 8 9 10 11 12 13 14 15
5426 3rd vec: 16 17 18 19 20 21 22 23
5428 The output sequence should be:
5430 1st vec: 0 3 6 9 12 15 18 21
5431 2nd vec: 1 4 7 10 13 16 19 22
5432 3rd vec: 2 5 8 11 14 17 20 23
5434 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
5436 First we shuffle all 3 vectors to get correct elements order:
5438 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
5439 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
5440 3rd vec: (16 19 22) (17 20 23) (18 21)
5442 Next we unite and shift the vectors 3 times:
5444 1st step:
5445 shift right by 6 the concatenation of:
5446 "1st vec" and "2nd vec"
5447 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
5448 "2nd vec" and "3rd vec"
5449 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
5450 "3rd vec" and "1st vec"
5451 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
5452 | New vectors |
5454 So the new vectors are now:
5456 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
5457 2nd vec: (10 13) (16 19 22) (17 20 23)
5458 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
5460 2nd step:
5461 shift right by 5 the concatenation of:
5462 "1st vec" and "3rd vec"
5463 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
5464 "2nd vec" and "1st vec"
5465 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
5466 "3rd vec" and "2nd vec"
5467 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
5468 | New vectors |
5470 So the new vectors are now:
5472 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
5473 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
5474 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
5476 3rd step:
5477 shift right by 5 the concatenation of:
5478 "1st vec" and "1st vec"
5479 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
5480 shift right by 3 the concatenation of:
5481 "2nd vec" and "2nd vec"
5482 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
5483 | New vectors |
5485 So now all vectors are READY:
5486 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
5487 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
5488 3rd vec: ( 1 4 7) (10 13) (16 19 22)
5490 This algorithm is faster than the one in vect_permute_load_chain if:
5491 1. "shift of a concatenation" is faster than general permutation.
5492 This is usually so.
5493 2. The TARGET machine can't execute vector instructions in parallel.
5494 This matters because each step of the algorithm depends on the previous one.
5495 The algorithm in vect_permute_load_chain is much more parallel.
5497 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
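/* An illustrative sketch for the V8HI example above (temporary names are
   those passed to make_temp_ssa_name; the SSA version numbers and the
   v0..v2 input names are only placeholders): the LENGTH == 3 path below
   starts with

     vect_shuffle3_1 = VEC_PERM_EXPR <v0, v0, { 0, 3, 6, 1, 4, 7, 2, 5 }>;
     vect_shift1_4   = VEC_PERM_EXPR <vect_shuffle3_1, vect_shuffle3_2,
                                      { 6, 7, 8, 9, 10, 11, 12, 13 }>;

   and continues with the vect_shift2/vect_shift3/vect_shift4 permutes
   that place the reordered vectors into RESULT_CHAIN.  */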
5500 static bool
5501 vect_shift_permute_load_chain (vec<tree> dr_chain,
5502 unsigned int length,
5503 gimple *stmt,
5504 gimple_stmt_iterator *gsi,
5505 vec<tree> *result_chain)
5507 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
5508 tree perm2_mask1, perm2_mask2, perm3_mask;
5509 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
5510 gimple *perm_stmt;
5512 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5513 unsigned int i;
5514 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5515 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5516 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5518 auto_vec_perm_indices sel (nelt);
5519 sel.quick_grow (nelt);
5521 result_chain->quick_grow (length);
5522 memcpy (result_chain->address (), dr_chain.address (),
5523 length * sizeof (tree));
5525 if (pow2p_hwi (length) && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 4)
5527 unsigned int j, log_length = exact_log2 (length);
5528 for (i = 0; i < nelt / 2; ++i)
5529 sel[i] = i * 2;
5530 for (i = 0; i < nelt / 2; ++i)
5531 sel[nelt / 2 + i] = i * 2 + 1;
5532 if (!can_vec_perm_p (TYPE_MODE (vectype), false, &sel))
5534 if (dump_enabled_p ())
5535 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5536 "shuffle of 2 fields structure is not \
5537 supported by target\n");
5538 return false;
5540 perm2_mask1 = vect_gen_perm_mask_checked (vectype, sel);
5542 for (i = 0; i < nelt / 2; ++i)
5543 sel[i] = i * 2 + 1;
5544 for (i = 0; i < nelt / 2; ++i)
5545 sel[nelt / 2 + i] = i * 2;
5546 if (!can_vec_perm_p (TYPE_MODE (vectype), false, &sel))
5548 if (dump_enabled_p ())
5549 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5550 "shuffle of 2 fields structure is not \
5551 supported by target\n");
5552 return false;
5554 perm2_mask2 = vect_gen_perm_mask_checked (vectype, sel);
5556 /* Generating permutation constant to shift all elements.
5557 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
5558 for (i = 0; i < nelt; i++)
5559 sel[i] = nelt / 2 + i;
5560 if (!can_vec_perm_p (TYPE_MODE (vectype), false, &sel))
5562 if (dump_enabled_p ())
5563 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5564 "shift permutation is not supported by target\n");
5565 return false;
5567 shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5569 /* Generating permutation constant to select vector from 2.
5570 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
5571 for (i = 0; i < nelt / 2; i++)
5572 sel[i] = i;
5573 for (i = nelt / 2; i < nelt; i++)
5574 sel[i] = nelt + i;
5575 if (!can_vec_perm_p (TYPE_MODE (vectype), false, &sel))
5577 if (dump_enabled_p ())
5578 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5579 "select is not supported by target\n");
5580 return false;
5582 select_mask = vect_gen_perm_mask_checked (vectype, sel);
5584 for (i = 0; i < log_length; i++)
5586 for (j = 0; j < length; j += 2)
5588 first_vect = dr_chain[j];
5589 second_vect = dr_chain[j + 1];
5591 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5592 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5593 first_vect, first_vect,
5594 perm2_mask1);
5595 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5596 vect[0] = data_ref;
5598 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5599 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5600 second_vect, second_vect,
5601 perm2_mask2);
5602 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5603 vect[1] = data_ref;
5605 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
5606 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5607 vect[0], vect[1], shift1_mask);
5608 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5609 (*result_chain)[j/2 + length/2] = data_ref;
5611 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
5612 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5613 vect[0], vect[1], select_mask);
5614 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5615 (*result_chain)[j/2] = data_ref;
5617 memcpy (dr_chain.address (), result_chain->address (),
5618 length * sizeof (tree));
5620 return true;
5622 if (length == 3 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 2)
5624 unsigned int k = 0, l = 0;
5626 /* Generating permutation constant to get all elements in the right order.
5627 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
5628 for (i = 0; i < nelt; i++)
5630 if (3 * k + (l % 3) >= nelt)
5632 k = 0;
5633 l += (3 - (nelt % 3));
5635 sel[i] = 3 * k + (l % 3);
5636 k++;
5638 if (!can_vec_perm_p (TYPE_MODE (vectype), false, &sel))
5640 if (dump_enabled_p ())
5641 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5642 "shuffle of 3 fields structure is not \
5643 supported by target\n");
5644 return false;
5646 perm3_mask = vect_gen_perm_mask_checked (vectype, sel);
5648 /* Generating permutation constant to shift all elements.
5649 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
5650 for (i = 0; i < nelt; i++)
5651 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
5652 if (!can_vec_perm_p (TYPE_MODE (vectype), false, &sel))
5654 if (dump_enabled_p ())
5655 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5656 "shift permutation is not supported by target\n");
5657 return false;
5659 shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5661 /* Generating permutation constant to shift all elements.
5662 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5663 for (i = 0; i < nelt; i++)
5664 sel[i] = 2 * (nelt / 3) + 1 + i;
5665 if (!can_vec_perm_p (TYPE_MODE (vectype), false, &sel))
5667 if (dump_enabled_p ())
5668 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5669 "shift permutation is not supported by target\n");
5670 return false;
5672 shift2_mask = vect_gen_perm_mask_checked (vectype, sel);
5674 /* Generating permutation constant to shift all elements.
5675 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
5676 for (i = 0; i < nelt; i++)
5677 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
5678 if (!can_vec_perm_p (TYPE_MODE (vectype), false, &sel))
5680 if (dump_enabled_p ())
5681 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5682 "shift permutation is not supported by target\n");
5683 return false;
5685 shift3_mask = vect_gen_perm_mask_checked (vectype, sel);
5687 /* Generating permutation constant to shift all elements.
5688 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5689 for (i = 0; i < nelt; i++)
5690 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
5691 if (!can_vec_perm_p (TYPE_MODE (vectype), false, &sel))
5693 if (dump_enabled_p ())
5694 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5695 "shift permutation is not supported by target\n");
5696 return false;
5698 shift4_mask = vect_gen_perm_mask_checked (vectype, sel);
5700 for (k = 0; k < 3; k++)
5702 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
5703 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5704 dr_chain[k], dr_chain[k],
5705 perm3_mask);
5706 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5707 vect[k] = data_ref;
5710 for (k = 0; k < 3; k++)
5712 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
5713 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5714 vect[k % 3], vect[(k + 1) % 3],
5715 shift1_mask);
5716 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5717 vect_shift[k] = data_ref;
5720 for (k = 0; k < 3; k++)
5722 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
5723 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5724 vect_shift[(4 - k) % 3],
5725 vect_shift[(3 - k) % 3],
5726 shift2_mask);
5727 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5728 vect[k] = data_ref;
5731 (*result_chain)[3 - (nelt % 3)] = vect[2];
5733 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
5734 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
5735 vect[0], shift3_mask);
5736 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5737 (*result_chain)[nelt % 3] = data_ref;
5739 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
5740 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
5741 vect[1], shift4_mask);
5742 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5743 (*result_chain)[0] = data_ref;
5744 return true;
5746 return false;
5749 /* Function vect_transform_grouped_load.
5751 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
5752 to perform their permutation and ascribe the resulting vectorized statements to
5753 the scalar statements.
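/* In outline: vect_permute_load_chain is used whenever the target can
   reassociate VEC_PERM_EXPRs in parallel or SIZE is a power of two;
   only otherwise is vect_shift_permute_load_chain tried, with
   vect_permute_load_chain as the fallback if it fails.  */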
5756 void
5757 vect_transform_grouped_load (gimple *stmt, vec<tree> dr_chain, int size,
5758 gimple_stmt_iterator *gsi)
5760 machine_mode mode;
5761 vec<tree> result_chain = vNULL;
5763 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
5764 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
5765 vectors that are ready for vector computation. */
5766 result_chain.create (size);
5768 /* If the reassociation width for the vector type is 2 or greater, the target
5769 machine can execute 2 or more vector instructions in parallel. Otherwise try
5770 to get the chain for the load group using vect_shift_permute_load_chain. */
5771 mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
5772 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
5773 || pow2p_hwi (size)
5774 || !vect_shift_permute_load_chain (dr_chain, size, stmt,
5775 gsi, &result_chain))
5776 vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
5777 vect_record_grouped_load_vectors (stmt, result_chain);
5778 result_chain.release ();
5781 /* RESULT_CHAIN contains the output of a group of grouped loads that were
5782 generated as part of the vectorization of STMT. Assign the statement
5783 for each vector to the associated scalar statement. */
5785 void
5786 vect_record_grouped_load_vectors (gimple *stmt, vec<tree> result_chain)
5788 gimple *first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
5789 gimple *next_stmt, *new_stmt;
5790 unsigned int i, gap_count;
5791 tree tmp_data_ref;
5793 /* Put a permuted data-ref in the VECTORIZED_STMT field.
5794 Since we scan the chain starting from its first node, their order
5795 corresponds to the order of data-refs in RESULT_CHAIN. */
5796 next_stmt = first_stmt;
5797 gap_count = 1;
5798 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
5800 if (!next_stmt)
5801 break;
5803 /* Skip the gaps. Loads created for the gaps will be removed by the dead
5804 code elimination pass later. No need to check for the first stmt in
5805 the group, since it always exists.
5806 GROUP_GAP is the number of steps in elements from the previous
5807 access (if there is no gap GROUP_GAP is 1). We skip loads that
5808 correspond to the gaps. */
5809 if (next_stmt != first_stmt
5810 && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
5812 gap_count++;
5813 continue;
5816 while (next_stmt)
5818 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
5819 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
5820 copies, and we put the new vector statement in the first available
5821 RELATED_STMT. */
5822 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
5823 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
5824 else
5826 if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5828 gimple *prev_stmt =
5829 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
5830 gimple *rel_stmt =
5831 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
5832 while (rel_stmt)
5834 prev_stmt = rel_stmt;
5835 rel_stmt =
5836 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
5839 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
5840 new_stmt;
5844 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
5845 gap_count = 1;
5846 /* If NEXT_STMT accesses the same DR as the previous statement,
5847 put the same TMP_DATA_REF as its vectorized statement; otherwise
5848 get the next data-ref from RESULT_CHAIN. */
5849 if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5850 break;
5855 /* Function vect_can_force_dr_alignment_p.
5857 Returns whether the alignment of DECL can be forced to an
5858 ALIGNMENT-bit boundary. */
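/* For illustration (a minimal sketch; the 256-element array is chosen
   arbitrarily): a file-scope
     static int a[256];
   can be forced to any alignment up to MAX_OFILE_ALIGNMENT, provided the
   symbol table allows the increase, while an automatic variable is
   limited by MAX_STACK_ALIGNMENT.  */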
5860 bool
5861 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
5863 if (!VAR_P (decl))
5864 return false;
5866 if (decl_in_symtab_p (decl)
5867 && !symtab_node::get (decl)->can_increase_alignment_p ())
5868 return false;
5870 if (TREE_STATIC (decl))
5871 return (alignment <= MAX_OFILE_ALIGNMENT);
5872 else
5873 return (alignment <= MAX_STACK_ALIGNMENT);
5877 /* Return whether the data reference DR is supported with respect to its
5878 alignment.
5879 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
5880 if it is aligned, i.e., check if it is possible to vectorize it with different
5881 alignment. */
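/* In summary, the possible results below are dr_aligned,
   dr_explicit_realign_optimized, dr_explicit_realign,
   dr_unaligned_supported and dr_unaligned_unsupported.  */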
5883 enum dr_alignment_support
5884 vect_supportable_dr_alignment (struct data_reference *dr,
5885 bool check_aligned_accesses)
5887 gimple *stmt = DR_STMT (dr);
5888 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5889 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5890 machine_mode mode = TYPE_MODE (vectype);
5891 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5892 struct loop *vect_loop = NULL;
5893 bool nested_in_vect_loop = false;
5895 if (aligned_access_p (dr) && !check_aligned_accesses)
5896 return dr_aligned;
5898 /* For now assume all conditional loads/stores support unaligned
5899 access without any special code. */
5900 if (is_gimple_call (stmt)
5901 && gimple_call_internal_p (stmt)
5902 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
5903 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
5904 return dr_unaligned_supported;
5906 if (loop_vinfo)
5908 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
5909 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
5912 /* Possibly unaligned access. */
5914 /* We can choose between using the implicit realignment scheme (generating
5915 a misaligned_move stmt) and the explicit realignment scheme (generating
5916 aligned loads with a REALIGN_LOAD). There are two variants to the
5917 explicit realignment scheme: optimized, and unoptimized.
5918 We can optimize the realignment only if the step between consecutive
5919 vector loads is equal to the vector size. Since the vector memory
5920 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
5921 is guaranteed that the misalignment amount remains the same throughout the
5922 execution of the vectorized loop. Therefore, we can create the
5923 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
5924 at the loop preheader.
5926 However, in the case of outer-loop vectorization, when vectorizing a
5927 memory access in the inner-loop nested within the LOOP that is now being
5928 vectorized, while it is guaranteed that the misalignment of the
5929 vectorized memory access will remain the same in different outer-loop
5930 iterations, it is *not* guaranteed that it will remain the same throughout
5931 the execution of the inner-loop. This is because the inner-loop advances
5932 with the original scalar step (and not in steps of VS). If the inner-loop
5933 step happens to be a multiple of VS, then the misalignment remains fixed
5934 and we can use the optimized realignment scheme. For example:
5936 for (i=0; i<N; i++)
5937 for (j=0; j<M; j++)
5938 s += a[i+j];
5940 When vectorizing the i-loop in the above example, the step between
5941 consecutive vector loads is 1, and so the misalignment does not remain
5942 fixed across the execution of the inner-loop, and the realignment cannot
5943 be optimized (as illustrated in the following pseudo vectorized loop):
5945 for (i=0; i<N; i+=4)
5946 for (j=0; j<M; j++){
5947 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
5948 // when j is {0,1,2,3,4,5,6,7,...} respectively.
5949 // (assuming that we start from an aligned address).
5952 We therefore have to use the unoptimized realignment scheme:
5954 for (i=0; i<N; i+=4)
5955 for (j=k; j<M; j+=4)
5956 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
5957 // that the misalignment of the initial address is
5958 // 0).
5960 The loop can then be vectorized as follows:
5962 for (k=0; k<4; k++){
5963 rt = get_realignment_token (&vp[k]);
5964 for (i=0; i<N; i+=4){
5965 v1 = vp[i+k];
5966 for (j=k; j<M; j+=4){
5967 v2 = vp[i+j+VS-1];
5968 va = REALIGN_LOAD <v1,v2,rt>;
5969 vs += va;
5970 v1 = v2;
5973 } */
5975 if (DR_IS_READ (dr))
5977 bool is_packed = false;
5978 tree type = (TREE_TYPE (DR_REF (dr)));
5980 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
5981 && (!targetm.vectorize.builtin_mask_for_load
5982 || targetm.vectorize.builtin_mask_for_load ()))
5984 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5986 /* If we are doing SLP then the accesses need not have the
5987 same alignment, instead it depends on the SLP group size. */
5988 if (loop_vinfo
5989 && STMT_SLP_TYPE (stmt_info)
5990 && (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5991 * GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)))
5992 % TYPE_VECTOR_SUBPARTS (vectype) != 0))
5994 else if (!loop_vinfo
5995 || (nested_in_vect_loop
5996 && (TREE_INT_CST_LOW (DR_STEP (dr))
5997 != GET_MODE_SIZE (TYPE_MODE (vectype)))))
5998 return dr_explicit_realign;
5999 else
6000 return dr_explicit_realign_optimized;
6002 if (!known_alignment_for_access_p (dr))
6003 is_packed = not_size_aligned (DR_REF (dr));
6005 if (targetm.vectorize.support_vector_misalignment
6006 (mode, type, DR_MISALIGNMENT (dr), is_packed))
6007 /* Can't software pipeline the loads, but can at least do them. */
6008 return dr_unaligned_supported;
6010 else
6012 bool is_packed = false;
6013 tree type = (TREE_TYPE (DR_REF (dr)));
6015 if (!known_alignment_for_access_p (dr))
6016 is_packed = not_size_aligned (DR_REF (dr));
6018 if (targetm.vectorize.support_vector_misalignment
6019 (mode, type, DR_MISALIGNMENT (dr), is_packed))
6020 return dr_unaligned_supported;
6023 /* Unsupported. */
6024 return dr_unaligned_unsupported;