gcc/tree-vect-data-refs.c
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2017 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "memmodel.h"
32 #include "tm_p.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "cgraph.h"
36 #include "dumpfile.h"
37 #include "alias.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "tree-eh.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop.h"
47 #include "cfgloop.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
50 #include "expr.h"
51 #include "builtins.h"
52 #include "params.h"
53 #include "tree-cfg.h"
54 #include "tree-hash-traits.h"
56 /* Return true if load- or store-lanes optab OPTAB is implemented for
57 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
59 static bool
60 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
61 tree vectype, unsigned HOST_WIDE_INT count)
63 machine_mode mode;
64 scalar_int_mode array_mode;
65 bool limit_p;
67 mode = TYPE_MODE (vectype);
68 limit_p = !targetm.array_mode_supported_p (mode, count);
69 if (!int_mode_for_size (count * GET_MODE_BITSIZE (mode),
70 limit_p).exists (&array_mode))
72 if (dump_enabled_p ())
73 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
74 "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC "]\n",
75 GET_MODE_NAME (mode), count);
76 return false;
79 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
81 if (dump_enabled_p ())
82 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
83 "cannot use %s<%s><%s>\n", name,
84 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
85 return false;
88 if (dump_enabled_p ())
89 dump_printf_loc (MSG_NOTE, vect_location,
90 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
91 GET_MODE_NAME (mode));
93 return true;
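/* Illustrative sketch, not part of the vectorizer; the example_* name is
   hypothetical.  It shows the kind of source loop whose two-element
   interleaved accesses map onto a load-lanes instruction (e.g. ld2 on
   AArch64) when the vec_load_lanes optab checked above is available.  */
void
example_deinterleave (int *restrict re, int *restrict im,
                      const int *restrict a, int n)
{
  for (int i = 0; i < n; i++)
    {
      re[i] = a[2 * i];       /* lane 0 of each pair */
      im[i] = a[2 * i + 1];   /* lane 1 of each pair */
    }
}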
97 /* Return the smallest scalar part of STMT.
98 This is used to determine the vectype of the stmt. We generally set the
99 vectype according to the type of the result (lhs). For stmts whose
100 result-type is different than the type of the arguments (e.g., demotion,
101 promotion), vectype will be reset appropriately (later). Note that we have
102 to visit the smallest datatype in this function, because that determines the
103 VF. If the smallest datatype in the loop is present only as the rhs of a
104 promotion operation - we'd miss it.
105 Such a case, where a variable of this datatype does not appear in the lhs
106 anywhere in the loop, can only occur if it's an invariant: e.g.:
107 'int_x = (int) short_inv', which we'd expect to have been optimized away by
108 invariant motion. However, we cannot rely on invariant motion to always
109 take invariants out of the loop, and so in the case of promotion we also
110 have to check the rhs.
111 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
112 types. */
114 tree
115 vect_get_smallest_scalar_type (gimple *stmt, HOST_WIDE_INT *lhs_size_unit,
116 HOST_WIDE_INT *rhs_size_unit)
118 tree scalar_type = gimple_expr_type (stmt);
119 HOST_WIDE_INT lhs, rhs;
121 /* During the analysis phase, this function is called on arbitrary
122 statements that might not have scalar results. */
123 if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
124 return scalar_type;
126 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
128 if (is_gimple_assign (stmt)
129 && (gimple_assign_cast_p (stmt)
130 || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
131 || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
132 || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
134 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
136 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
137 if (rhs < lhs)
138 scalar_type = rhs_type;
141 *lhs_size_unit = lhs;
142 *rhs_size_unit = rhs;
143 return scalar_type;
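/* Illustrative sketch (hypothetical user code): the short type appears
   only on the rhs of a widening operation, so the smallest scalar type
   has to be taken from the rhs here, or the vectorization factor would
   be derived from int alone.  */
void
example_widen (int *restrict out, const short *restrict in, int n)
{
  for (int i = 0; i < n; i++)
    out[i] = (int) in[i] * 2;   /* short widened to int on the rhs */
}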
147 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
148 tested at run-time. Return TRUE if DDR was successfully inserted.
149 Return false if versioning is not supported. */
151 static bool
152 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
154 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
156 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
157 return false;
159 if (!runtime_alias_check_p (ddr, loop,
160 optimize_loop_nest_for_speed_p (loop)))
161 return false;
163 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
164 return true;
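/* A minimal sketch (hypothetical helper) of the kind of test that the
   run-time alias checks recorded above expand to: the vectorized loop
   is entered only if the byte ranges accessed through A and B cannot
   overlap.  */
static inline bool
example_ranges_independent_p (const char *a, const char *b,
                              unsigned HOST_WIDE_INT len)
{
  return a + len <= b || b + len <= a;
}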
168 /* A subroutine of vect_analyze_data_ref_dependence. Handle
169 DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
170 distances. These distances are conservatively correct but they don't
171 reflect a guaranteed dependence.
173 Return true if this function does all the work necessary to avoid
174 an alias or false if the caller should use the dependence distances
175 to limit the vectorization factor in the usual way. LOOP_DEPTH is
176 the depth of the loop described by LOOP_VINFO and the other arguments
177 are as for vect_analyze_data_ref_dependence. */
179 static bool
180 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
181 loop_vec_info loop_vinfo,
182 int loop_depth, int *max_vf)
184 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
185 lambda_vector dist_v;
186 unsigned int i;
187 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
189 int dist = dist_v[loop_depth];
190 if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
192 /* If the user asserted safelen >= DIST consecutive iterations
193 can be executed concurrently, assume independence.
195 ??? An alternative would be to add the alias check even
196 in this case, and vectorize the fallback loop with the
197 maximum VF set to safelen. However, if the user has
198 explicitly given a length, it's less likely that that
199 would be a win. */
200 if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
202 if (loop->safelen < *max_vf)
203 *max_vf = loop->safelen;
204 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
205 continue;
208 /* For dependence distances of 2 or more, we have the option
209 of limiting VF or checking for an alias at runtime.
210 Prefer to check at runtime if we can, to avoid limiting
211 the VF unnecessarily when the bases are in fact independent.
213 Note that the alias checks will be removed if the VF ends up
214 being small enough. */
215 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
218 return true;
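/* Illustrative user-level example (hypothetical): safelen(8) asserts
   that up to 8 consecutive iterations may execute concurrently, which
   is valid here because the caller guarantees K >= 8.  The dependence
   of distance K therefore does not limit the VF below 8; the code
   above only clamps *max_vf to loop->safelen.  */
void
example_safelen (float *x, int n, int k)   /* caller guarantees k >= 8 */
{
#pragma omp simd safelen(8)
  for (int i = 0; i < n - k; i++)
    x[i + k] = x[i] + 1.0f;
}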
222 /* Function vect_analyze_data_ref_dependence.
224 Return TRUE if there (might) exist a dependence between a memory-reference
225 DRA and a memory-reference DRB. When the dependence can instead be
226 checked at run time by versioning for alias, return FALSE. Adjust
227 *MAX_VF according to the data dependence. */
229 static bool
230 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
231 loop_vec_info loop_vinfo, int *max_vf)
233 unsigned int i;
234 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
235 struct data_reference *dra = DDR_A (ddr);
236 struct data_reference *drb = DDR_B (ddr);
237 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
238 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
239 lambda_vector dist_v;
240 unsigned int loop_depth;
242 /* In loop analysis all data references should be vectorizable. */
243 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
244 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
245 gcc_unreachable ();
247 /* Independent data accesses. */
248 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
249 return false;
251 if (dra == drb
252 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
253 return false;
255 /* We do not have to consider dependences between accesses that belong
256 to the same group. */
257 if (GROUP_FIRST_ELEMENT (stmtinfo_a)
258 && GROUP_FIRST_ELEMENT (stmtinfo_a) == GROUP_FIRST_ELEMENT (stmtinfo_b))
259 return false;
261 /* Even if we have an anti-dependence then, as the vectorized loop covers at
262 least two scalar iterations, there is always also a true dependence.
263 As the vectorizer does not re-order loads and stores we can ignore
264 the anti-dependence if TBAA can disambiguate both DRs similar to the
265 case with known negative distance anti-dependences (positive
266 distance anti-dependences would violate TBAA constraints). */
267 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
268 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
269 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
270 get_alias_set (DR_REF (drb))))
271 return false;
273 /* Unknown data dependence. */
274 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
276 /* If user asserted safelen consecutive iterations can be
277 executed concurrently, assume independence. */
278 if (loop->safelen >= 2)
280 if (loop->safelen < *max_vf)
281 *max_vf = loop->safelen;
282 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
283 return false;
286 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
287 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
289 if (dump_enabled_p ())
291 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
292 "versioning for alias not supported for: "
293 "can't determine dependence between ");
294 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
295 DR_REF (dra));
296 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
297 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
298 DR_REF (drb));
299 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
301 return true;
304 if (dump_enabled_p ())
306 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
307 "versioning for alias required: "
308 "can't determine dependence between ");
309 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
310 DR_REF (dra));
311 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
312 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
313 DR_REF (drb));
314 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
317 /* Add to list of ddrs that need to be tested at run-time. */
318 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
321 /* Known data dependence. */
322 if (DDR_NUM_DIST_VECTS (ddr) == 0)
324 /* If user asserted safelen consecutive iterations can be
325 executed concurrently, assume independence. */
326 if (loop->safelen >= 2)
328 if (loop->safelen < *max_vf)
329 *max_vf = loop->safelen;
330 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
331 return false;
334 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
335 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
337 if (dump_enabled_p ())
339 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
340 "versioning for alias not supported for: "
341 "bad dist vector for ");
342 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
343 DR_REF (dra));
344 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
345 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
346 DR_REF (drb));
347 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
349 return true;
352 if (dump_enabled_p ())
354 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
355 "versioning for alias required: "
356 "bad dist vector for ");
357 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
358 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
359 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
360 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
362 /* Add to list of ddrs that need to be tested at run-time. */
363 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
366 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
368 if (DDR_COULD_BE_INDEPENDENT_P (ddr)
369 && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
370 loop_depth, max_vf))
371 return false;
373 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
375 int dist = dist_v[loop_depth];
377 if (dump_enabled_p ())
378 dump_printf_loc (MSG_NOTE, vect_location,
379 "dependence distance = %d.\n", dist);
381 if (dist == 0)
383 if (dump_enabled_p ())
385 dump_printf_loc (MSG_NOTE, vect_location,
386 "dependence distance == 0 between ");
387 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
388 dump_printf (MSG_NOTE, " and ");
389 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
390 dump_printf (MSG_NOTE, "\n");
393 /* When we perform grouped accesses and perform implicit CSE
394 by detecting equal accesses and doing disambiguation with
395 runtime alias tests like for
396 .. = a[i];
397 .. = a[i+1];
398 a[i] = ..;
399 a[i+1] = ..;
400 *p = ..;
401 .. = a[i];
402 .. = a[i+1];
403 where we will end up loading { a[i], a[i+1] } once, make
404 sure that inserting group loads before the first load and
405 stores after the last store will do the right thing.
406 Similar for groups like
407 a[i] = ...;
408 ... = a[i];
409 a[i+1] = ...;
410 where loads from the group interleave with the store. */
411 if (STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
412 || STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
414 gimple *earlier_stmt;
415 earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
416 if (DR_IS_WRITE
417 (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
419 if (dump_enabled_p ())
420 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
421 "READ_WRITE dependence in interleaving."
422 "\n");
423 return true;
427 continue;
430 if (dist > 0 && DDR_REVERSED_P (ddr))
432 /* If DDR_REVERSED_P the order of the data-refs in DDR was
433 reversed (to make distance vector positive), and the actual
434 distance is negative. */
435 if (dump_enabled_p ())
436 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
437 "dependence distance negative.\n");
438 /* Record a negative dependence distance to later limit the
439 amount of stmt copying / unrolling we can perform.
440 Only need to handle read-after-write dependence. */
441 if (DR_IS_READ (drb)
442 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
443 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
444 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
445 continue;
448 if (abs (dist) >= 2
449 && abs (dist) < *max_vf)
451 /* The dependence distance requires reduction of the maximal
452 vectorization factor. */
453 *max_vf = abs (dist);
454 if (dump_enabled_p ())
455 dump_printf_loc (MSG_NOTE, vect_location,
456 "adjusting maximal vectorization factor to %i\n",
457 *max_vf);
460 if (abs (dist) >= *max_vf)
462 /* Dependence distance does not create dependence, as far as
463 vectorization is concerned, in this case. */
464 if (dump_enabled_p ())
465 dump_printf_loc (MSG_NOTE, vect_location,
466 "dependence distance >= VF.\n");
467 continue;
470 if (dump_enabled_p ())
472 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
473 "not vectorized, possible dependence "
474 "between data-refs ");
475 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
476 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
477 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
478 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
481 return true;
484 return false;
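/* Illustrative user-level example (hypothetical) of the TBAA-based
   reasoning above: whether F and I overlap is unknown, but the alias
   sets of a float load and an int store do not conflict, so the
   anti-dependence between them is ignored and no run-time alias check
   is needed for this pair.  */
void
example_tbaa (int *i, float *f, int n)
{
  for (int k = 0; k < n; k++)
    i[k] = (int) f[k];         /* read float, then write int */
}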
487 /* Function vect_analyze_data_ref_dependences.
489 Examine all the data references in the loop, and make sure there do not
490 exist any data dependences between them. Set *MAX_VF according to
491 the maximum vectorization factor the data dependences allow. */
493 bool
494 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo, int *max_vf)
496 unsigned int i;
497 struct data_dependence_relation *ddr;
499 if (dump_enabled_p ())
500 dump_printf_loc (MSG_NOTE, vect_location,
501 "=== vect_analyze_data_ref_dependences ===\n");
503 LOOP_VINFO_DDRS (loop_vinfo)
504 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
505 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
506 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
507 /* We need read-read dependences to compute STMT_VINFO_SAME_ALIGN_REFS. */
508 if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
509 &LOOP_VINFO_DDRS (loop_vinfo),
510 LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
511 return false;
513 /* For epilogues we either have no aliases or alias versioning
514 was applied to original loop. Therefore we may just get max_vf
515 using VF of original loop. */
516 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
517 *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
518 else
519 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
520 if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
521 return false;
523 return true;
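/* Illustrative example (hypothetical): the read of a[i] and the write
   of a[i + 2] have dependence distance 2, so the analysis above reduces
   *max_vf to 2; a distance of 16 or more would typically exceed the VF
   and be ignored instead.  */
void
example_distance_two (int *a, int n)
{
  for (int i = 0; i < n - 2; i++)
    a[i + 2] = a[i] + 1;
}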
527 /* Function vect_slp_analyze_data_ref_dependence.
529 Return TRUE if there (might) exist a dependence between a memory-reference
530 DRA and a memory-reference DRB. Unlike the loop variant, this basic-block
531 (SLP) variant cannot fall back to run-time alias checks and has no
532 vectorization factor to adjust; any possible dependence is reported. */
534 static bool
535 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
537 struct data_reference *dra = DDR_A (ddr);
538 struct data_reference *drb = DDR_B (ddr);
540 /* We need to check dependences of statements marked as unvectorizable
541 as well, they still can prohibit vectorization. */
543 /* Independent data accesses. */
544 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
545 return false;
547 if (dra == drb)
548 return false;
550 /* Read-read is OK. */
551 if (DR_IS_READ (dra) && DR_IS_READ (drb))
552 return false;
554 /* If dra and drb are part of the same interleaving chain consider
555 them independent. */
556 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
557 && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
558 == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
559 return false;
561 /* Unknown data dependence. */
562 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
564 if (dump_enabled_p ())
566 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
567 "can't determine dependence between ");
568 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
569 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
570 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
571 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
574 else if (dump_enabled_p ())
576 dump_printf_loc (MSG_NOTE, vect_location,
577 "determined dependence between ");
578 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
579 dump_printf (MSG_NOTE, " and ");
580 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
581 dump_printf (MSG_NOTE, "\n");
584 return true;
588 /* Analyze dependences involved in the transform of SLP NODE. STORES
589 contain the vector of scalar stores of this instance if we are
590 disambiguating the loads. */
592 static bool
593 vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
594 vec<gimple *> stores, gimple *last_store)
596 /* This walks over all stmts involved in the SLP load/store done
597 in NODE verifying we can sink them up to the last stmt in the
598 group. */
599 gimple *last_access = vect_find_last_scalar_stmt_in_slp (node);
600 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
602 gimple *access = SLP_TREE_SCALAR_STMTS (node)[k];
603 if (access == last_access)
604 continue;
605 data_reference *dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (access));
606 for (gimple_stmt_iterator gsi = gsi_for_stmt (access);
607 gsi_stmt (gsi) != last_access; gsi_next (&gsi))
609 gimple *stmt = gsi_stmt (gsi);
610 if (! gimple_vuse (stmt)
611 || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
612 continue;
614 /* If we couldn't record a (single) data reference for this
615 stmt we have to give up. */
616 /* ??? Here and below if dependence analysis fails we can resort
617 to the alias oracle which can handle more kinds of stmts. */
618 data_reference *dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
619 if (!dr_b)
620 return false;
622 bool dependent = false;
623 /* If we run into a store of this same instance (we've just
624 marked those) then delay dependence checking until we run
625 into the last store because this is where it will have
626 been sunk to (and we verify if we can do that as well). */
627 if (gimple_visited_p (stmt))
629 if (stmt != last_store)
630 continue;
631 unsigned i;
632 gimple *store;
633 FOR_EACH_VEC_ELT (stores, i, store)
635 data_reference *store_dr
636 = STMT_VINFO_DATA_REF (vinfo_for_stmt (store));
637 ddr_p ddr = initialize_data_dependence_relation
638 (dr_a, store_dr, vNULL);
639 dependent = vect_slp_analyze_data_ref_dependence (ddr);
640 free_dependence_relation (ddr);
641 if (dependent)
642 break;
645 else
647 ddr_p ddr = initialize_data_dependence_relation (dr_a,
648 dr_b, vNULL);
649 dependent = vect_slp_analyze_data_ref_dependence (ddr);
650 free_dependence_relation (ddr);
652 if (dependent)
653 return false;
656 return true;
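/* Illustrative basic-block example (hypothetical): the loads of a[0]
   and a[1] form one SLP group and are sunk to the position of the last
   group member, so the walk above has to prove that the intervening
   store through P cannot alias them; if it may alias, the instance is
   rejected.  */
void
example_slp_group (int *restrict out, const int *a, int *p)
{
  out[0] = a[0];
  *p = 5;                      /* store between the two grouped loads */
  out[1] = a[1];
}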
660 /* Function vect_slp_analyze_instance_dependence.
662 Examine all the data references involved in the SLP instance INSTANCE
663 and make sure there do not exist any data dependences that would
664 prevent vectorizing the basic-block. */
666 bool
667 vect_slp_analyze_instance_dependence (slp_instance instance)
669 if (dump_enabled_p ())
670 dump_printf_loc (MSG_NOTE, vect_location,
671 "=== vect_slp_analyze_instance_dependence ===\n");
673 /* The stores of this instance are at the root of the SLP tree. */
674 slp_tree store = SLP_INSTANCE_TREE (instance);
675 if (! STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (store)[0])))
676 store = NULL;
678 /* Verify we can sink stores to the vectorized stmt insert location. */
679 gimple *last_store = NULL;
680 if (store)
682 if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
683 return false;
685 /* Mark stores in this instance and remember the last one. */
686 last_store = vect_find_last_scalar_stmt_in_slp (store);
687 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
688 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], true);
691 bool res = true;
693 /* Verify we can sink loads to the vectorized stmt insert location,
694 special-casing stores of this instance. */
695 slp_tree load;
696 unsigned int i;
697 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
698 if (! vect_slp_analyze_node_dependences (instance, load,
699 store
700 ? SLP_TREE_SCALAR_STMTS (store)
701 : vNULL, last_store))
703 res = false;
704 break;
707 /* Unset the visited flag. */
708 if (store)
709 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
710 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], false);
712 return res;
715 /* Record in VINFO the base alignment guarantee given by DRB. STMT is
716 the statement that contains DRB, which is useful for recording in the
717 dump file. */
719 static void
720 vect_record_base_alignment (vec_info *vinfo, gimple *stmt,
721 innermost_loop_behavior *drb)
723 bool existed;
724 innermost_loop_behavior *&entry
725 = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
726 if (!existed || entry->base_alignment < drb->base_alignment)
728 entry = drb;
729 if (dump_enabled_p ())
731 dump_printf_loc (MSG_NOTE, vect_location,
732 "recording new base alignment for ");
733 dump_generic_expr (MSG_NOTE, TDF_SLIM, drb->base_address);
734 dump_printf (MSG_NOTE, "\n");
735 dump_printf_loc (MSG_NOTE, vect_location,
736 " alignment: %d\n", drb->base_alignment);
737 dump_printf_loc (MSG_NOTE, vect_location,
738 " misalignment: %d\n", drb->base_misalignment);
739 dump_printf_loc (MSG_NOTE, vect_location,
740 " based on: ");
741 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
746 /* If the region we're going to vectorize is reached, all unconditional
747 data references occur at least once. We can therefore pool the base
748 alignment guarantees from each unconditional reference. Do this by
749 going through all the data references in VINFO and checking whether
750 the containing statement makes the reference unconditionally. If so,
751 record the alignment of the base address in VINFO so that it can be
752 used for all other references with the same base. */
754 void
755 vect_record_base_alignments (vec_info *vinfo)
757 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
758 struct loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
759 data_reference *dr;
760 unsigned int i;
761 FOR_EACH_VEC_ELT (vinfo->datarefs, i, dr)
762 if (!DR_IS_CONDITIONAL_IN_STMT (dr))
764 gimple *stmt = DR_STMT (dr);
765 vect_record_base_alignment (vinfo, stmt, &DR_INNERMOST (dr));
767 /* If DR is nested in the loop that is being vectorized, we can also
768 record the alignment of the base wrt the outer loop. */
769 if (loop && nested_in_vect_loop_p (loop, stmt))
771 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
772 vect_record_base_alignment
773 (vinfo, stmt, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
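/* Illustrative example (hypothetical): the unconditional update of p[i]
   is executed whenever the loop body is reached, so the base alignment
   it guarantees for P is pooled and can also be used for the
   conditional access to the same base.  */
void
example_pooled_alignment (float *p, int n)
{
  for (int i = 0; i < n; i++)
    {
      p[i] += 1.0f;            /* unconditional reference to base P */
      if (p[i] > 4.0f)
        p[i] = 4.0f;           /* conditional reference, same base */
    }
}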
778 /* Function vect_compute_data_ref_alignment
780 Compute the misalignment of the data reference DR.
782 Output:
783 1. If during the misalignment computation it is found that the data reference
784 cannot be vectorized then false is returned.
785 2. DR_MISALIGNMENT (DR) is defined.
787 FOR NOW: No analysis is actually performed. Misalignment is calculated
788 only for trivial cases. TODO. */
790 bool
791 vect_compute_data_ref_alignment (struct data_reference *dr)
793 gimple *stmt = DR_STMT (dr);
794 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
795 vec_base_alignments *base_alignments = &stmt_info->vinfo->base_alignments;
796 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
797 struct loop *loop = NULL;
798 tree ref = DR_REF (dr);
799 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
801 if (dump_enabled_p ())
802 dump_printf_loc (MSG_NOTE, vect_location,
803 "vect_compute_data_ref_alignment:\n");
805 if (loop_vinfo)
806 loop = LOOP_VINFO_LOOP (loop_vinfo);
808 /* Initialize misalignment to unknown. */
809 SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
811 innermost_loop_behavior *drb = vect_dr_behavior (dr);
812 bool step_preserves_misalignment_p;
814 /* No step for BB vectorization. */
815 if (!loop)
817 gcc_assert (integer_zerop (drb->step));
818 step_preserves_misalignment_p = true;
821 /* In case the dataref is in an inner-loop of the loop that is being
822 vectorized (LOOP), we use the base and misalignment information
823 relative to the outer-loop (LOOP). This is ok only if the misalignment
824 stays the same throughout the execution of the inner-loop, which is why
825 we have to check that the stride of the dataref in the inner-loop evenly
826 divides by the vector size. */
827 else if (nested_in_vect_loop_p (loop, stmt))
829 step_preserves_misalignment_p
830 = (DR_STEP_ALIGNMENT (dr)
831 % GET_MODE_SIZE (TYPE_MODE (vectype))) == 0;
833 if (dump_enabled_p ())
835 if (step_preserves_misalignment_p)
836 dump_printf_loc (MSG_NOTE, vect_location,
837 "inner step divides the vector-size.\n");
838 else
839 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
840 "inner step doesn't divide the vector-size.\n");
844 /* Similarly we can only use base and misalignment information relative to
845 an innermost loop if the misalignment stays the same throughout the
846 execution of the loop. As above, this is the case if the stride of
847 the dataref evenly divides by the vector size. */
848 else
850 unsigned vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
851 step_preserves_misalignment_p
852 = ((DR_STEP_ALIGNMENT (dr) * vf)
853 % GET_MODE_SIZE (TYPE_MODE (vectype))) == 0;
855 if (!step_preserves_misalignment_p && dump_enabled_p ())
856 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
857 "step doesn't divide the vector-size.\n");
860 unsigned int base_alignment = drb->base_alignment;
861 unsigned int base_misalignment = drb->base_misalignment;
862 unsigned HOST_WIDE_INT vector_alignment = TYPE_ALIGN_UNIT (vectype);
864 /* Calculate the maximum of the pooled base address alignment and the
865 alignment that we can compute for DR itself. */
866 innermost_loop_behavior **entry = base_alignments->get (drb->base_address);
867 if (entry && base_alignment < (*entry)->base_alignment)
869 base_alignment = (*entry)->base_alignment;
870 base_misalignment = (*entry)->base_misalignment;
873 if (drb->offset_alignment < vector_alignment
874 || !step_preserves_misalignment_p
875 /* We need to know whether the step wrt the vectorized loop is
876 negative when computing the starting misalignment below. */
877 || TREE_CODE (drb->step) != INTEGER_CST)
879 if (dump_enabled_p ())
881 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
882 "Unknown alignment for access: ");
883 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
884 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
886 return true;
889 if (base_alignment < vector_alignment)
891 tree base = drb->base_address;
892 if (TREE_CODE (base) == ADDR_EXPR)
893 base = TREE_OPERAND (base, 0);
894 if (!vect_can_force_dr_alignment_p (base,
895 vector_alignment * BITS_PER_UNIT))
897 if (dump_enabled_p ())
899 dump_printf_loc (MSG_NOTE, vect_location,
900 "can't force alignment of ref: ");
901 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
902 dump_printf (MSG_NOTE, "\n");
904 return true;
907 if (DECL_USER_ALIGN (base))
909 if (dump_enabled_p ())
911 dump_printf_loc (MSG_NOTE, vect_location,
912 "not forcing alignment of user-aligned "
913 "variable: ");
914 dump_generic_expr (MSG_NOTE, TDF_SLIM, base);
915 dump_printf (MSG_NOTE, "\n");
917 return true;
920 /* Force the alignment of the decl.
921 NOTE: This is the only change to the code we make during
922 the analysis phase, before deciding to vectorize the loop. */
923 if (dump_enabled_p ())
925 dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
926 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
927 dump_printf (MSG_NOTE, "\n");
930 DR_VECT_AUX (dr)->base_decl = base;
931 DR_VECT_AUX (dr)->base_misaligned = true;
932 base_misalignment = 0;
934 unsigned int misalignment = (base_misalignment
935 + TREE_INT_CST_LOW (drb->init));
937 /* If this is a backward running DR then first access in the larger
938 vectype actually is N-1 elements before the address in the DR.
939 Adjust misalign accordingly. */
940 if (tree_int_cst_sgn (drb->step) < 0)
941 /* PLUS because STEP is negative. */
942 misalignment += ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
943 * TREE_INT_CST_LOW (drb->step));
945 SET_DR_MISALIGNMENT (dr, misalignment & (vector_alignment - 1));
947 if (dump_enabled_p ())
949 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
950 "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
951 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
952 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
955 return true;
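/* A standalone sketch (hypothetical helper) of the misalignment
   arithmetic above, with all quantities in bytes: pooled base
   misalignment plus DR_INIT, shifted back by N-1 elements for a
   backward-running DR, reduced modulo the vector alignment.  E.g.
   base_misalignment 4, init 8, 16-byte vectors: (4 + 8) & 15 == 12.  */
static unsigned int
example_compute_misalignment (unsigned int base_misalignment,
                              HOST_WIDE_INT init, HOST_WIDE_INT step,
                              unsigned int nunits,
                              unsigned int vector_alignment)
{
  HOST_WIDE_INT misalignment = base_misalignment + init;
  if (step < 0)
    /* PLUS because STEP is negative, as in the code above.  */
    misalignment += (HOST_WIDE_INT) (nunits - 1) * step;
  return misalignment & (HOST_WIDE_INT) (vector_alignment - 1);
}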
959 /* Function vect_update_misalignment_for_peel.
960 Sets DR's misalignment
961 - to 0 if it has the same alignment as DR_PEEL,
962 - to the misalignment computed using NPEEL if DR's misalignment is known,
963 - to -1 (unknown) otherwise.
965 DR - the data reference whose misalignment is to be adjusted.
966 DR_PEEL - the data reference whose misalignment is being made
967 zero in the vector loop by the peel.
968 NPEEL - the number of iterations in the peel loop if the misalignment
969 of DR_PEEL is known at compile time. */
971 static void
972 vect_update_misalignment_for_peel (struct data_reference *dr,
973 struct data_reference *dr_peel, int npeel)
975 unsigned int i;
976 vec<dr_p> same_aligned_drs;
977 struct data_reference *current_dr;
978 int dr_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
979 int dr_peel_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr_peel))));
980 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
981 stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));
983 /* For interleaved data accesses the step in the loop must be multiplied by
984 the size of the interleaving group. */
985 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
986 dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
987 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
988 dr_peel_size *= GROUP_SIZE (peel_stmt_info);
990 /* It can be assumed that the data refs with the same alignment as dr_peel
991 are aligned in the vector loop. */
992 same_aligned_drs
993 = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
994 FOR_EACH_VEC_ELT (same_aligned_drs, i, current_dr)
996 if (current_dr != dr)
997 continue;
998 gcc_assert (!known_alignment_for_access_p (dr)
999 || !known_alignment_for_access_p (dr_peel)
1000 || (DR_MISALIGNMENT (dr) / dr_size
1001 == DR_MISALIGNMENT (dr_peel) / dr_peel_size));
1002 SET_DR_MISALIGNMENT (dr, 0);
1003 return;
1006 if (known_alignment_for_access_p (dr)
1007 && known_alignment_for_access_p (dr_peel))
1009 bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
1010 int misal = DR_MISALIGNMENT (dr);
1011 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1012 misal += negative ? -npeel * dr_size : npeel * dr_size;
1013 misal &= (TYPE_ALIGN (vectype) / BITS_PER_UNIT) - 1;
1014 SET_DR_MISALIGNMENT (dr, misal);
1015 return;
1018 if (dump_enabled_p ())
1019 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1020 "to unknown (-1).\n");
1021 SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
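/* Worked example (assumed numbers): peeling NPEEL = 3 iterations of a
   forward int access (dr_size = 4 bytes) whose current misalignment
   against a 16-byte vector type is 4 gives (4 + 3 * 4) & 15 == 0, so
   the access becomes aligned in the vector loop; for a backward access
   the same peel would instead give (4 - 12) & 15 == 8.  */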
1025 /* Function verify_data_ref_alignment
1027 Return TRUE if DR can be handled with respect to alignment. */
1029 static bool
1030 verify_data_ref_alignment (data_reference_p dr)
1032 enum dr_alignment_support supportable_dr_alignment
1033 = vect_supportable_dr_alignment (dr, false);
1034 if (!supportable_dr_alignment)
1036 if (dump_enabled_p ())
1038 if (DR_IS_READ (dr))
1039 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1040 "not vectorized: unsupported unaligned load.");
1041 else
1042 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1043 "not vectorized: unsupported unaligned "
1044 "store.");
1046 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
1047 DR_REF (dr));
1048 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1050 return false;
1053 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
1054 dump_printf_loc (MSG_NOTE, vect_location,
1055 "Vectorizing an unaligned access.\n");
1057 return true;
1060 /* Function vect_verify_datarefs_alignment
1062 Return TRUE if all data references in the loop can be
1063 handled with respect to alignment. */
1065 bool
1066 vect_verify_datarefs_alignment (loop_vec_info vinfo)
1068 vec<data_reference_p> datarefs = vinfo->datarefs;
1069 struct data_reference *dr;
1070 unsigned int i;
1072 FOR_EACH_VEC_ELT (datarefs, i, dr)
1074 gimple *stmt = DR_STMT (dr);
1075 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1077 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1078 continue;
1080 /* For interleaving, only the alignment of the first access matters. */
1081 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1082 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1083 continue;
1085 /* Strided accesses perform only component accesses, alignment is
1086 irrelevant for them. */
1087 if (STMT_VINFO_STRIDED_P (stmt_info)
1088 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1089 continue;
1091 if (! verify_data_ref_alignment (dr))
1092 return false;
1095 return true;
1098 /* Given a memory reference EXP, return whether its alignment is less
1099 than its size. */
1101 static bool
1102 not_size_aligned (tree exp)
1104 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1105 return true;
1107 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1108 > get_object_alignment (exp));
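/* Illustrative example (hypothetical): the packed field X below is only
   guaranteed 1-byte alignment while its size is 4 bytes, so
   not_size_aligned returns true for an access to F->x.  */
struct example_packed
{
  char c;
  int x;
} __attribute__ ((packed));

int
example_packed_read (struct example_packed *f)
{
  return f->x;                 /* known alignment (1) < size (4) */
}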
1111 /* Function vector_alignment_reachable_p
1113 Return true if vector alignment for DR is reachable by peeling
1114 a few loop iterations. Return false otherwise. */
1116 static bool
1117 vector_alignment_reachable_p (struct data_reference *dr)
1119 gimple *stmt = DR_STMT (dr);
1120 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1121 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1123 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1125 /* For interleaved access we peel only if number of iterations in
1126 the prolog loop ({VF - misalignment}), is a multiple of the
1127 number of the interleaved accesses. */
1128 int elem_size, mis_in_elements;
1129 int nelements = TYPE_VECTOR_SUBPARTS (vectype);
1131 /* FORNOW: handle only known alignment. */
1132 if (!known_alignment_for_access_p (dr))
1133 return false;
1135 elem_size = GET_MODE_SIZE (TYPE_MODE (vectype)) / nelements;
1136 mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
1138 if ((nelements - mis_in_elements) % GROUP_SIZE (stmt_info))
1139 return false;
1142 /* If misalignment is known at the compile time then allow peeling
1143 only if natural alignment is reachable through peeling. */
1144 if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
1146 HOST_WIDE_INT elmsize =
1147 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1148 if (dump_enabled_p ())
1150 dump_printf_loc (MSG_NOTE, vect_location,
1151 "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1152 dump_printf (MSG_NOTE,
1153 ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
1155 if (DR_MISALIGNMENT (dr) % elmsize)
1157 if (dump_enabled_p ())
1158 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1159 "data size does not divide the misalignment.\n");
1160 return false;
1164 if (!known_alignment_for_access_p (dr))
1166 tree type = TREE_TYPE (DR_REF (dr));
1167 bool is_packed = not_size_aligned (DR_REF (dr));
1168 if (dump_enabled_p ())
1169 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1170 "Unknown misalignment, %snaturally aligned\n",
1171 is_packed ? "not " : "");
1172 return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1175 return true;
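/* Worked example (assumed numbers): for a V4SI access (element size 4)
   with a known misalignment of 2 bytes, 2 % 4 != 0, so no whole number
   of peeled scalar iterations can make the access aligned and the
   function above returns false; a misalignment of 8 bytes would instead
   be cured by peeling two iterations.  */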
1179 /* Calculate the cost of the memory access represented by DR. */
1181 static void
1182 vect_get_data_access_cost (struct data_reference *dr,
1183 unsigned int *inside_cost,
1184 unsigned int *outside_cost,
1185 stmt_vector_for_cost *body_cost_vec)
1187 gimple *stmt = DR_STMT (dr);
1188 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1189 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1190 int ncopies;
1192 if (PURE_SLP_STMT (stmt_info))
1193 ncopies = 1;
1194 else
1195 ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1197 if (DR_IS_READ (dr))
1198 vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1199 NULL, body_cost_vec, false);
1200 else
1201 vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
1203 if (dump_enabled_p ())
1204 dump_printf_loc (MSG_NOTE, vect_location,
1205 "vect_get_data_access_cost: inside_cost = %d, "
1206 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1210 typedef struct _vect_peel_info
1212 struct data_reference *dr;
1213 int npeel;
1214 unsigned int count;
1215 } *vect_peel_info;
1217 typedef struct _vect_peel_extended_info
1219 struct _vect_peel_info peel_info;
1220 unsigned int inside_cost;
1221 unsigned int outside_cost;
1222 } *vect_peel_extended_info;
1225 /* Peeling hashtable helpers. */
1227 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1229 static inline hashval_t hash (const _vect_peel_info *);
1230 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1233 inline hashval_t
1234 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1236 return (hashval_t) peel_info->npeel;
1239 inline bool
1240 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1242 return (a->npeel == b->npeel);
1246 /* Insert DR into peeling hash table with NPEEL as key. */
1248 static void
1249 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1250 loop_vec_info loop_vinfo, struct data_reference *dr,
1251 int npeel)
1253 struct _vect_peel_info elem, *slot;
1254 _vect_peel_info **new_slot;
1255 bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1257 elem.npeel = npeel;
1258 slot = peeling_htab->find (&elem);
1259 if (slot)
1260 slot->count++;
1261 else
1263 slot = XNEW (struct _vect_peel_info);
1264 slot->npeel = npeel;
1265 slot->dr = dr;
1266 slot->count = 1;
1267 new_slot = peeling_htab->find_slot (slot, INSERT);
1268 *new_slot = slot;
1271 if (!supportable_dr_alignment
1272 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1273 slot->count += VECT_MAX_COST;
1277 /* Traverse peeling hash table to find peeling option that aligns maximum
1278 number of data accesses. */
1281 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1282 _vect_peel_extended_info *max)
1284 vect_peel_info elem = *slot;
1286 if (elem->count > max->peel_info.count
1287 || (elem->count == max->peel_info.count
1288 && max->peel_info.npeel > elem->npeel))
1290 max->peel_info.npeel = elem->npeel;
1291 max->peel_info.count = elem->count;
1292 max->peel_info.dr = elem->dr;
1295 return 1;
1298 /* Get the costs of peeling NPEEL iterations checking data access costs
1299 for all data refs. If UNKNOWN_MISALIGNMENT is true, we assume DR0's
1300 misalignment will be zero after peeling. */
1302 static void
1303 vect_get_peeling_costs_all_drs (vec<data_reference_p> datarefs,
1304 struct data_reference *dr0,
1305 unsigned int *inside_cost,
1306 unsigned int *outside_cost,
1307 stmt_vector_for_cost *body_cost_vec,
1308 unsigned int npeel,
1309 bool unknown_misalignment)
1311 unsigned i;
1312 data_reference *dr;
1314 FOR_EACH_VEC_ELT (datarefs, i, dr)
1316 gimple *stmt = DR_STMT (dr);
1317 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1318 /* For interleaving, only the alignment of the first access
1319 matters. */
1320 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1321 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1322 continue;
1324 /* Strided accesses perform only component accesses, alignment is
1325 irrelevant for them. */
1326 if (STMT_VINFO_STRIDED_P (stmt_info)
1327 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1328 continue;
1330 int save_misalignment;
1331 save_misalignment = DR_MISALIGNMENT (dr);
1332 if (npeel == 0)
1334 else if (unknown_misalignment && dr == dr0)
1335 SET_DR_MISALIGNMENT (dr, 0);
1336 else
1337 vect_update_misalignment_for_peel (dr, dr0, npeel);
1338 vect_get_data_access_cost (dr, inside_cost, outside_cost,
1339 body_cost_vec);
1340 SET_DR_MISALIGNMENT (dr, save_misalignment);
1344 /* Traverse peeling hash table and calculate cost for each peeling option.
1345 Find the one with the lowest cost. */
1348 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1349 _vect_peel_extended_info *min)
1351 vect_peel_info elem = *slot;
1352 int dummy;
1353 unsigned int inside_cost = 0, outside_cost = 0;
1354 gimple *stmt = DR_STMT (elem->dr);
1355 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1356 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1357 stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1358 epilogue_cost_vec;
1360 prologue_cost_vec.create (2);
1361 body_cost_vec.create (2);
1362 epilogue_cost_vec.create (2);
1364 vect_get_peeling_costs_all_drs (LOOP_VINFO_DATAREFS (loop_vinfo),
1365 elem->dr, &inside_cost, &outside_cost,
1366 &body_cost_vec, elem->npeel, false);
1368 body_cost_vec.release ();
1370 outside_cost += vect_get_known_peeling_cost
1371 (loop_vinfo, elem->npeel, &dummy,
1372 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1373 &prologue_cost_vec, &epilogue_cost_vec);
1375 /* Prologue and epilogue costs are added to the target model later.
1376 These costs depend only on the scalar iteration cost, the
1377 number of peeling iterations finally chosen, and the number of
1378 misaligned statements. So discard the information found here. */
1379 prologue_cost_vec.release ();
1380 epilogue_cost_vec.release ();
1382 if (inside_cost < min->inside_cost
1383 || (inside_cost == min->inside_cost
1384 && outside_cost < min->outside_cost))
1386 min->inside_cost = inside_cost;
1387 min->outside_cost = outside_cost;
1388 min->peel_info.dr = elem->dr;
1389 min->peel_info.npeel = elem->npeel;
1390 min->peel_info.count = elem->count;
1393 return 1;
1397 /* Choose best peeling option by traversing peeling hash table and either
1398 choosing an option with the lowest cost (if cost model is enabled) or the
1399 option that aligns as many accesses as possible. */
1401 static struct _vect_peel_extended_info
1402 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1403 loop_vec_info loop_vinfo)
1405 struct _vect_peel_extended_info res;
1407 res.peel_info.dr = NULL;
1409 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1411 res.inside_cost = INT_MAX;
1412 res.outside_cost = INT_MAX;
1413 peeling_htab->traverse <_vect_peel_extended_info *,
1414 vect_peeling_hash_get_lowest_cost> (&res);
1416 else
1418 res.peel_info.count = 0;
1419 peeling_htab->traverse <_vect_peel_extended_info *,
1420 vect_peeling_hash_get_most_frequent> (&res);
1421 res.inside_cost = 0;
1422 res.outside_cost = 0;
1425 return res;
1428 /* Return true if the new peeling NPEEL is supported. */
1430 static bool
1431 vect_peeling_supportable (loop_vec_info loop_vinfo, struct data_reference *dr0,
1432 unsigned npeel)
1434 unsigned i;
1435 struct data_reference *dr = NULL;
1436 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1437 gimple *stmt;
1438 stmt_vec_info stmt_info;
1439 enum dr_alignment_support supportable_dr_alignment;
1441 /* Ensure that all data refs can be vectorized after the peel. */
1442 FOR_EACH_VEC_ELT (datarefs, i, dr)
1444 int save_misalignment;
1446 if (dr == dr0)
1447 continue;
1449 stmt = DR_STMT (dr);
1450 stmt_info = vinfo_for_stmt (stmt);
1451 /* For interleaving, only the alignment of the first access
1452 matters. */
1453 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1454 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1455 continue;
1457 /* Strided accesses perform only component accesses, alignment is
1458 irrelevant for them. */
1459 if (STMT_VINFO_STRIDED_P (stmt_info)
1460 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1461 continue;
1463 save_misalignment = DR_MISALIGNMENT (dr);
1464 vect_update_misalignment_for_peel (dr, dr0, npeel);
1465 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1466 SET_DR_MISALIGNMENT (dr, save_misalignment);
1468 if (!supportable_dr_alignment)
1469 return false;
1472 return true;
1475 /* Function vect_enhance_data_refs_alignment
1477 This pass will use loop versioning and loop peeling in order to enhance
1478 the alignment of data references in the loop.
1480 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1481 original loop is to be vectorized. Any other loops that are created by
1482 the transformations performed in this pass - are not supposed to be
1483 vectorized. This restriction will be relaxed.
1485 This pass will require a cost model to guide it whether to apply peeling
1486 or versioning or a combination of the two. For example, the scheme that
1487 Intel uses when given a loop with several memory accesses is as follows:
1488 choose one memory access ('p') whose alignment you want to force by doing
1489 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1490 other accesses are not necessarily aligned, or (2) use loop versioning to
1491 generate one loop in which all accesses are aligned, and another loop in
1492 which only 'p' is necessarily aligned.
1494 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1495 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1496 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1498 Devising a cost model is the most critical aspect of this work. It will
1499 guide us on which access to peel for, whether to use loop versioning, how
1500 many versions to create, etc. The cost model will probably consist of
1501 generic considerations as well as target specific considerations (on
1502 powerpc for example, misaligned stores are more painful than misaligned
1503 loads).
1505 Here are the general steps involved in alignment enhancements:
1507 -- original loop, before alignment analysis:
1508 for (i=0; i<N; i++){
1509 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1510 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1513 -- After vect_compute_data_refs_alignment:
1514 for (i=0; i<N; i++){
1515 x = q[i]; # DR_MISALIGNMENT(q) = 3
1516 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1519 -- Possibility 1: we do loop versioning:
1520 if (p is aligned) {
1521 for (i=0; i<N; i++){ # loop 1A
1522 x = q[i]; # DR_MISALIGNMENT(q) = 3
1523 p[i] = y; # DR_MISALIGNMENT(p) = 0
1526 else {
1527 for (i=0; i<N; i++){ # loop 1B
1528 x = q[i]; # DR_MISALIGNMENT(q) = 3
1529 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1533 -- Possibility 2: we do loop peeling:
1534 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1535 x = q[i];
1536 p[i] = y;
1538 for (i = 3; i < N; i++){ # loop 2A
1539 x = q[i]; # DR_MISALIGNMENT(q) = 0
1540 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1543 -- Possibility 3: combination of loop peeling and versioning:
1544 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1545 x = q[i];
1546 p[i] = y;
1548 if (p is aligned) {
1549 for (i = 3; i<N; i++){ # loop 3A
1550 x = q[i]; # DR_MISALIGNMENT(q) = 0
1551 p[i] = y; # DR_MISALIGNMENT(p) = 0
1554 else {
1555 for (i = 3; i<N; i++){ # loop 3B
1556 x = q[i]; # DR_MISALIGNMENT(q) = 0
1557 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1561 These loops are later passed to loop_transform to be vectorized. The
1562 vectorizer will use the alignment information to guide the transformation
1563 (whether to generate regular loads/stores, or with special handling for
1564 misalignment). */
1566 bool
1567 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1569 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1570 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1571 enum dr_alignment_support supportable_dr_alignment;
1572 struct data_reference *dr0 = NULL, *first_store = NULL;
1573 struct data_reference *dr;
1574 unsigned int i, j;
1575 bool do_peeling = false;
1576 bool do_versioning = false;
1577 bool stat;
1578 gimple *stmt;
1579 stmt_vec_info stmt_info;
1580 unsigned int npeel = 0;
1581 bool one_misalignment_known = false;
1582 bool one_misalignment_unknown = false;
1583 bool one_dr_unsupportable = false;
1584 struct data_reference *unsupportable_dr = NULL;
1585 unsigned int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1586 unsigned possible_npeel_number = 1;
1587 tree vectype;
1588 unsigned int nelements, mis, same_align_drs_max = 0;
1589 hash_table<peel_info_hasher> peeling_htab (1);
1591 if (dump_enabled_p ())
1592 dump_printf_loc (MSG_NOTE, vect_location,
1593 "=== vect_enhance_data_refs_alignment ===\n");
1595 /* Reset data so we can safely be called multiple times. */
1596 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1597 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1599 /* While cost model enhancements are expected in the future, the high level
1600 view of the code at this time is as follows:
1602 A) If there is a misaligned access then see if peeling to align
1603 this access can make all data references satisfy
1604 vect_supportable_dr_alignment. If so, update data structures
1605 as needed and return true.
1607 B) If peeling wasn't possible and there is a data reference with an
1608 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1609 then see if loop versioning checks can be used to make all data
1610 references satisfy vect_supportable_dr_alignment. If so, update
1611 data structures as needed and return true.
1613 C) If neither peeling nor versioning were successful then return false if
1614 any data reference does not satisfy vect_supportable_dr_alignment.
1616 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1618 Note, Possibility 3 above (which is peeling and versioning together) is not
1619 being done at this time. */
1621 /* (1) Peeling to force alignment. */
1623 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1624 Considerations:
1625 + How many accesses will become aligned due to the peeling
1626 - How many accesses will become unaligned due to the peeling,
1627 and the cost of misaligned accesses.
1628 - The cost of peeling (the extra runtime checks, the increase
1629 in code size). */
1631 FOR_EACH_VEC_ELT (datarefs, i, dr)
1633 stmt = DR_STMT (dr);
1634 stmt_info = vinfo_for_stmt (stmt);
1636 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1637 continue;
1639 /* For interleaving, only the alignment of the first access
1640 matters. */
1641 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1642 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1643 continue;
1645 /* For invariant accesses there is nothing to enhance. */
1646 if (integer_zerop (DR_STEP (dr)))
1647 continue;
1649 /* Strided accesses perform only component accesses, alignment is
1650 irrelevant for them. */
1651 if (STMT_VINFO_STRIDED_P (stmt_info)
1652 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1653 continue;
1655 supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1656 do_peeling = vector_alignment_reachable_p (dr);
1657 if (do_peeling)
1659 if (known_alignment_for_access_p (dr))
1661 unsigned int npeel_tmp = 0;
1662 bool negative = tree_int_cst_compare (DR_STEP (dr),
1663 size_zero_node) < 0;
1665 vectype = STMT_VINFO_VECTYPE (stmt_info);
1666 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1667 mis = DR_MISALIGNMENT (dr) / GET_MODE_SIZE (TYPE_MODE (
1668 TREE_TYPE (DR_REF (dr))));
1669 if (DR_MISALIGNMENT (dr) != 0)
1670 npeel_tmp = (negative ? (mis - nelements)
1671 : (nelements - mis)) & (nelements - 1);
1673 /* For multiple types, it is possible that the bigger type access
1674 will have more than one peeling option. E.g., a loop with two
1675 types: one of size (vector size / 4), and the other one of
1676 size (vector size / 8). The vectorization factor will be 8. If both
1677 accesses are misaligned by 3, the first one needs one scalar
1678 iteration to be aligned, and the second one needs 5. But the
1679 first one will be aligned also by peeling 5 scalar
1680 iterations, and in that case both accesses will be aligned.
1681 Hence, except for the immediate peeling amount, we also want
1682 to try to add full vector size, while we don't exceed
1683 vectorization factor.
1684 We do this automatically for cost model, since we calculate
1685 cost for every peeling option. */
1686 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1688 if (STMT_SLP_TYPE (stmt_info))
1689 possible_npeel_number
1690 = (vf * GROUP_SIZE (stmt_info)) / nelements;
1691 else
1692 possible_npeel_number = vf / nelements;
1694 /* NPEEL_TMP is 0 when there is no misalignment, but also
1695 allow peeling NELEMENTS. */
1696 if (DR_MISALIGNMENT (dr) == 0)
1697 possible_npeel_number++;
1700 /* Save info about DR in the hash table. Also include peeling
1701 amounts according to the explanation above. */
1702 for (j = 0; j < possible_npeel_number; j++)
1704 vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1705 dr, npeel_tmp);
1706 npeel_tmp += nelements;
1709 one_misalignment_known = true;
1711 else
1713 /* If we don't know any misalignment values, we prefer
1714 peeling for data-ref that has the maximum number of data-refs
1715 with the same alignment, unless the target prefers to align
1716 stores over loads. */
1717 unsigned same_align_drs
1718 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1719 if (!dr0
1720 || same_align_drs_max < same_align_drs)
1722 same_align_drs_max = same_align_drs;
1723 dr0 = dr;
1725 /* For data-refs with the same number of related
1726 accesses prefer the one where the misalign
1727 computation will be invariant in the outermost loop. */
1728 else if (same_align_drs_max == same_align_drs)
1730 struct loop *ivloop0, *ivloop;
1731 ivloop0 = outermost_invariant_loop_for_expr
1732 (loop, DR_BASE_ADDRESS (dr0));
1733 ivloop = outermost_invariant_loop_for_expr
1734 (loop, DR_BASE_ADDRESS (dr));
1735 if ((ivloop && !ivloop0)
1736 || (ivloop && ivloop0
1737 && flow_loop_nested_p (ivloop, ivloop0)))
1738 dr0 = dr;
1741 one_misalignment_unknown = true;
1743 /* Check for data refs with unsupportable alignment that
1744 can be peeled. */
1745 if (!supportable_dr_alignment)
1747 one_dr_unsupportable = true;
1748 unsupportable_dr = dr;
1751 if (!first_store && DR_IS_WRITE (dr))
1752 first_store = dr;
1755 else
1757 if (!aligned_access_p (dr))
1759 if (dump_enabled_p ())
1760 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1761 "vector alignment may not be reachable\n");
1762 break;
1767 /* Check if we can possibly peel the loop. */
1768 if (!vect_can_advance_ivs_p (loop_vinfo)
1769 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1770 || loop->inner)
1771 do_peeling = false;
1773 struct _vect_peel_extended_info peel_for_known_alignment;
1774 struct _vect_peel_extended_info peel_for_unknown_alignment;
1775 struct _vect_peel_extended_info best_peel;
1777 peel_for_unknown_alignment.inside_cost = INT_MAX;
1778 peel_for_unknown_alignment.outside_cost = INT_MAX;
1779 peel_for_unknown_alignment.peel_info.count = 0;
1781 if (do_peeling
1782 && one_misalignment_unknown)
1784 /* Check if the target requires to prefer stores over loads, i.e., if
1785 misaligned stores are more expensive than misaligned loads (taking
1786 drs with same alignment into account). */
1787 unsigned int load_inside_cost = 0;
1788 unsigned int load_outside_cost = 0;
1789 unsigned int store_inside_cost = 0;
1790 unsigned int store_outside_cost = 0;
1792 stmt_vector_for_cost dummy;
1793 dummy.create (2);
1794 vect_get_peeling_costs_all_drs (datarefs, dr0,
1795 &load_inside_cost,
1796 &load_outside_cost,
1797 &dummy, vf / 2, true);
1798 dummy.release ();
1800 if (first_store)
1802 dummy.create (2);
1803 vect_get_peeling_costs_all_drs (datarefs, first_store,
1804 &store_inside_cost,
1805 &store_outside_cost,
1806 &dummy, vf / 2, true);
1807 dummy.release ();
1809 else
1811 store_inside_cost = INT_MAX;
1812 store_outside_cost = INT_MAX;
1815 if (load_inside_cost > store_inside_cost
1816 || (load_inside_cost == store_inside_cost
1817 && load_outside_cost > store_outside_cost))
1819 dr0 = first_store;
1820 peel_for_unknown_alignment.inside_cost = store_inside_cost;
1821 peel_for_unknown_alignment.outside_cost = store_outside_cost;
1823 else
1825 peel_for_unknown_alignment.inside_cost = load_inside_cost;
1826 peel_for_unknown_alignment.outside_cost = load_outside_cost;
1829 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1830 prologue_cost_vec.create (2);
1831 epilogue_cost_vec.create (2);
1833 int dummy2;
1834 peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
1835 (loop_vinfo, vf / 2, &dummy2,
1836 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1837 &prologue_cost_vec, &epilogue_cost_vec);
1839 prologue_cost_vec.release ();
1840 epilogue_cost_vec.release ();
1842 peel_for_unknown_alignment.peel_info.count = 1
1843 + STMT_VINFO_SAME_ALIGN_REFS
1844 (vinfo_for_stmt (DR_STMT (dr0))).length ();
1847 peel_for_unknown_alignment.peel_info.npeel = 0;
1848 peel_for_unknown_alignment.peel_info.dr = dr0;
1850 best_peel = peel_for_unknown_alignment;
1852 peel_for_known_alignment.inside_cost = INT_MAX;
1853 peel_for_known_alignment.outside_cost = INT_MAX;
1854 peel_for_known_alignment.peel_info.count = 0;
1855 peel_for_known_alignment.peel_info.dr = NULL;
1857 if (do_peeling && one_misalignment_known)
1859 /* Peeling is possible, but there is no data access that is not supported
1860 unless aligned. So we try to choose the best possible peeling from
1861 the hash table. */
1862 peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
1863 (&peeling_htab, loop_vinfo);
1866 /* Compare costs of peeling for known and unknown alignment. */
1867 if (peel_for_known_alignment.peel_info.dr != NULL
1868 && peel_for_unknown_alignment.inside_cost
1869 >= peel_for_known_alignment.inside_cost)
1871 best_peel = peel_for_known_alignment;
1873 /* If the best peeling for known alignment has NPEEL == 0, perform no
1874 peeling at all except if there is an unsupportable dr that we can
1875 align. */
1876 if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
1877 do_peeling = false;
1880 /* If there is an unsupportable data ref, prefer this over all choices so far
1881 since we'd have to discard a chosen peeling except when it accidentally
1882 aligned the unsupportable data ref. */
1883 if (one_dr_unsupportable)
1884 dr0 = unsupportable_dr;
1885 else if (do_peeling)
1887 /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
1888 TODO: Use nopeel_outside_cost or get rid of it? */
1889 unsigned nopeel_inside_cost = 0;
1890 unsigned nopeel_outside_cost = 0;
1892 stmt_vector_for_cost dummy;
1893 dummy.create (2);
1894 vect_get_peeling_costs_all_drs (datarefs, NULL, &nopeel_inside_cost,
1895 &nopeel_outside_cost, &dummy, 0, false);
1896 dummy.release ();
1898 /* Add epilogue costs. As we do not peel for alignment here, no prologue
1899 costs will be recorded. */
1900 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1901 prologue_cost_vec.create (2);
1902 epilogue_cost_vec.create (2);
1904 int dummy2;
1905 nopeel_outside_cost += vect_get_known_peeling_cost
1906 (loop_vinfo, 0, &dummy2,
1907 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1908 &prologue_cost_vec, &epilogue_cost_vec);
1910 prologue_cost_vec.release ();
1911 epilogue_cost_vec.release ();
1913 npeel = best_peel.peel_info.npeel;
1914 dr0 = best_peel.peel_info.dr;
1916 /* If not peeling at all is no more expensive than the best peeling
1917 we have found so far, don't perform any peeling. */
1918 if (nopeel_inside_cost <= best_peel.inside_cost)
1919 do_peeling = false;
1922 if (do_peeling)
1924 stmt = DR_STMT (dr0);
1925 stmt_info = vinfo_for_stmt (stmt);
1926 vectype = STMT_VINFO_VECTYPE (stmt_info);
1927 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1929 if (known_alignment_for_access_p (dr0))
1931 bool negative = tree_int_cst_compare (DR_STEP (dr0),
1932 size_zero_node) < 0;
1933 if (!npeel)
1935 /* Since it's known at compile time, compute the number of
1936 iterations in the peeled loop (the peeling factor) for use in
1937 updating DR_MISALIGNMENT values. The peeling factor is the
1938 vectorization factor minus the misalignment as an element
1939 count. */
1940 mis = DR_MISALIGNMENT (dr0);
1941 mis /= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr0))));
1942 npeel = ((negative ? mis - nelements : nelements - mis)
1943 & (nelements - 1));
1946 /* For interleaved data access every iteration accesses all the
1947 members of the group, therefore we divide the number of iterations
1948 by the group size. */
1949 stmt_info = vinfo_for_stmt (DR_STMT (dr0));
1950 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1951 npeel /= GROUP_SIZE (stmt_info);
1953 if (dump_enabled_p ())
1954 dump_printf_loc (MSG_NOTE, vect_location,
1955 "Try peeling by %d\n", npeel);
1958 /* Ensure that all datarefs can be vectorized after the peel. */
1959 if (!vect_peeling_supportable (loop_vinfo, dr0, npeel))
1960 do_peeling = false;
1962 /* Check if all datarefs are supportable and log. */
1963 if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
1965 stat = vect_verify_datarefs_alignment (loop_vinfo);
1966 if (!stat)
1967 do_peeling = false;
1968 else
1969 return stat;
1972 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
1973 if (do_peeling)
1975 unsigned max_allowed_peel
1976 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
1977 if (max_allowed_peel != (unsigned)-1)
1979 unsigned max_peel = npeel;
1980 if (max_peel == 0)
1982 gimple *dr_stmt = DR_STMT (dr0);
1983 stmt_vec_info vinfo = vinfo_for_stmt (dr_stmt);
1984 tree vtype = STMT_VINFO_VECTYPE (vinfo);
1985 max_peel = TYPE_VECTOR_SUBPARTS (vtype) - 1;
1987 if (max_peel > max_allowed_peel)
1989 do_peeling = false;
1990 if (dump_enabled_p ())
1991 dump_printf_loc (MSG_NOTE, vect_location,
1992 "Disable peeling, max peels reached: %d\n", max_peel);
1997 /* Cost model #2 - if peeling may result in a remaining loop not
1998 iterating enough to be vectorized then do not peel. */
1999 if (do_peeling
2000 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2002 unsigned max_peel
2003 = npeel == 0 ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1 : npeel;
2004 if (LOOP_VINFO_INT_NITERS (loop_vinfo)
2005 < LOOP_VINFO_VECT_FACTOR (loop_vinfo) + max_peel)
2006 do_peeling = false;
2009 if (do_peeling)
2011 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2012 If the misalignment of DR_i is identical to that of dr0 then set
2013 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
2014 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2015 by the peeling factor times the element size of DR_i (MOD the
2016 vectorization factor times the size). Otherwise, the
2017 misalignment of DR_i must be set to unknown. */
2018 FOR_EACH_VEC_ELT (datarefs, i, dr)
2019 if (dr != dr0)
2021 /* Strided accesses perform only component accesses, alignment
2022 is irrelevant for them. */
2023 stmt_info = vinfo_for_stmt (DR_STMT (dr));
2024 if (STMT_VINFO_STRIDED_P (stmt_info)
2025 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2026 continue;
2028 vect_update_misalignment_for_peel (dr, dr0, npeel);
2031 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
2032 if (npeel)
2033 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2034 else
2035 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2036 = DR_MISALIGNMENT (dr0);
2037 SET_DR_MISALIGNMENT (dr0, 0);
2038 if (dump_enabled_p ())
2040 dump_printf_loc (MSG_NOTE, vect_location,
2041 "Alignment of access forced using peeling.\n");
2042 dump_printf_loc (MSG_NOTE, vect_location,
2043 "Peeling for alignment will be applied.\n");
2046 /* The inside-loop cost will be accounted for in vectorizable_load
2047 and vectorizable_store correctly with adjusted alignments.
2048 Drop the body_cst_vec on the floor here. */
2049 stat = vect_verify_datarefs_alignment (loop_vinfo);
2050 gcc_assert (stat);
2051 return stat;
2055 /* (2) Versioning to force alignment. */
2057 /* Try versioning if:
2058 1) the loop nest is optimized for speed, and
2059 2) there is at least one unsupported misaligned data ref with an unknown
2060 misalignment, and
2061 3) all misaligned data refs with a known misalignment are supported, and
2062 4) the number of runtime alignment checks is within reason. */
2064 do_versioning =
2065 optimize_loop_nest_for_speed_p (loop)
2066 && (!loop->inner); /* FORNOW */
2068 if (do_versioning)
2070 FOR_EACH_VEC_ELT (datarefs, i, dr)
2072 stmt = DR_STMT (dr);
2073 stmt_info = vinfo_for_stmt (stmt);
2075 /* For interleaving, only the alignment of the first access
2076 matters. */
2077 if (aligned_access_p (dr)
2078 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2079 && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
2080 continue;
2082 if (STMT_VINFO_STRIDED_P (stmt_info))
2084 /* Strided loads perform only component accesses, alignment is
2085 irrelevant for them. */
2086 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
2087 continue;
2088 do_versioning = false;
2089 break;
2092 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
2094 if (!supportable_dr_alignment)
2096 gimple *stmt;
2097 int mask;
2098 tree vectype;
2100 if (known_alignment_for_access_p (dr)
2101 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2102 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
2104 do_versioning = false;
2105 break;
2108 stmt = DR_STMT (dr);
2109 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
2110 gcc_assert (vectype);
2112 /* The rightmost bits of an aligned address must be zeros.
2113 Construct the mask needed for this test. For example,
2114 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2115 mask must be 15 = 0xf. */
2116 mask = GET_MODE_SIZE (TYPE_MODE (vectype)) - 1;
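/* A minimal sketch of the runtime test that this mask ultimately feeds
   (simplified; the actual check is emitted later by the loop-versioning
   code, and the names below are illustrative only):

     void *addr = ...;                          address of a may-misalign ref
     bool ok = ((uintptr_t) addr & mask) == 0;  low bits must be zero

   With V4SI vectors GET_MODE_SIZE is 16, so MASK is 15 = 0xf and the test
   requires the low four address bits to be zero.  */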
2118 /* FORNOW: use the same mask to test all potentially unaligned
2119 references in the loop. The vectorizer currently supports
2120 a single vector size, see the reference to
2121 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
2122 vectorization factor is computed. */
2123 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
2124 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
2125 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2126 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
2127 DR_STMT (dr));
2131 /* Versioning requires at least one misaligned data reference. */
2132 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2133 do_versioning = false;
2134 else if (!do_versioning)
2135 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2138 if (do_versioning)
2140 vec<gimple *> may_misalign_stmts
2141 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2142 gimple *stmt;
2144 /* It can now be assumed that the data references in the statements
2145 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2146 of the loop being vectorized. */
2147 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
2149 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2150 dr = STMT_VINFO_DATA_REF (stmt_info);
2151 SET_DR_MISALIGNMENT (dr, 0);
2152 if (dump_enabled_p ())
2153 dump_printf_loc (MSG_NOTE, vect_location,
2154 "Alignment of access forced using versioning.\n");
2157 if (dump_enabled_p ())
2158 dump_printf_loc (MSG_NOTE, vect_location,
2159 "Versioning for alignment will be applied.\n");
2161 /* Peeling and versioning can't be done together at this time. */
2162 gcc_assert (! (do_peeling && do_versioning));
2164 stat = vect_verify_datarefs_alignment (loop_vinfo);
2165 gcc_assert (stat);
2166 return stat;
2169 /* This point is reached if neither peeling nor versioning is being done. */
2170 gcc_assert (! (do_peeling || do_versioning));
2172 stat = vect_verify_datarefs_alignment (loop_vinfo);
2173 return stat;
2177 /* Function vect_find_same_alignment_drs.
2179 Update group and alignment relations according to the chosen
2180 vectorization factor. */
2182 static void
2183 vect_find_same_alignment_drs (struct data_dependence_relation *ddr)
2185 struct data_reference *dra = DDR_A (ddr);
2186 struct data_reference *drb = DDR_B (ddr);
2187 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2188 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2190 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
2191 return;
2193 if (dra == drb)
2194 return;
2196 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)
2197 || !operand_equal_p (DR_OFFSET (dra), DR_OFFSET (drb), 0)
2198 || !operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2199 return;
2201 /* Two references with distance zero have the same alignment. */
2202 offset_int diff = (wi::to_offset (DR_INIT (dra))
2203 - wi::to_offset (DR_INIT (drb)));
2204 if (diff != 0)
2206 /* Get the wider of the two alignments. */
2207 unsigned int align_a = TYPE_ALIGN_UNIT (STMT_VINFO_VECTYPE (stmtinfo_a));
2208 unsigned int align_b = TYPE_ALIGN_UNIT (STMT_VINFO_VECTYPE (stmtinfo_b));
2209 unsigned int max_align = MAX (align_a, align_b);
2211 /* Require the gap to be a multiple of the larger vector alignment. */
2212 if (!wi::multiple_of_p (diff, max_align, SIGNED))
2213 return;
2216 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2217 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
2218 if (dump_enabled_p ())
2220 dump_printf_loc (MSG_NOTE, vect_location,
2221 "accesses have the same alignment: ");
2222 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2223 dump_printf (MSG_NOTE, " and ");
2224 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2225 dump_printf (MSG_NOTE, "\n");
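/* Worked example for the check above (illustrative numbers only): two int
   loads from the same base with DR_INIT 0 and 32 and V4SI vector types have
   a gap of 32 bytes, a multiple of the 16-byte vector alignment, so they
   are recorded as having the same alignment.  A gap of 20 bytes would not
   be a multiple, and the pair would be skipped.  */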
2230 /* Function vect_analyze_data_refs_alignment
2232 Analyze the alignment of the data-references in the loop.
2233 Return FALSE if a data reference is found that cannot be vectorized. */
2235 bool
2236 vect_analyze_data_refs_alignment (loop_vec_info vinfo)
2238 if (dump_enabled_p ())
2239 dump_printf_loc (MSG_NOTE, vect_location,
2240 "=== vect_analyze_data_refs_alignment ===\n");
2242 /* Mark groups of data references with same alignment using
2243 data dependence information. */
2244 vec<ddr_p> ddrs = vinfo->ddrs;
2245 struct data_dependence_relation *ddr;
2246 unsigned int i;
2248 FOR_EACH_VEC_ELT (ddrs, i, ddr)
2249 vect_find_same_alignment_drs (ddr);
2251 vec<data_reference_p> datarefs = vinfo->datarefs;
2252 struct data_reference *dr;
2254 vect_record_base_alignments (vinfo);
2255 FOR_EACH_VEC_ELT (datarefs, i, dr)
2257 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
2258 if (STMT_VINFO_VECTORIZABLE (stmt_info)
2259 && !vect_compute_data_ref_alignment (dr))
2261 /* Strided accesses perform only component accesses, misalignment
2262 information is irrelevant for them. */
2263 if (STMT_VINFO_STRIDED_P (stmt_info)
2264 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2265 continue;
2267 if (dump_enabled_p ())
2268 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2269 "not vectorized: can't calculate alignment "
2270 "for data ref.\n");
2272 return false;
2276 return true;
2280 /* Analyze alignment of DRs of stmts in NODE. */
2282 static bool
2283 vect_slp_analyze_and_verify_node_alignment (slp_tree node)
2285 /* We vectorize from the first scalar stmt in the node unless
2286 the node is permuted in which case we start from the first
2287 element in the group. */
2288 gimple *first_stmt = SLP_TREE_SCALAR_STMTS (node)[0];
2289 data_reference_p first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2290 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2291 first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_stmt));
2293 data_reference_p dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2294 if (! vect_compute_data_ref_alignment (dr)
2295 /* For creating the data-ref pointer we need alignment of the
2296 first element anyway. */
2297 || (dr != first_dr
2298 && ! vect_compute_data_ref_alignment (first_dr))
2299 || ! verify_data_ref_alignment (dr))
2301 if (dump_enabled_p ())
2302 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2303 "not vectorized: bad data alignment in basic "
2304 "block.\n");
2305 return false;
2308 return true;
2311 /* Function vect_slp_analyze_and_verify_instance_alignment
2313 Analyze the alignment of the data-references in the SLP instance.
2314 Return FALSE if a data reference is found that cannot be vectorized. */
2316 bool
2317 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
2319 if (dump_enabled_p ())
2320 dump_printf_loc (MSG_NOTE, vect_location,
2321 "=== vect_slp_analyze_and_verify_instance_alignment ===\n");
2323 slp_tree node;
2324 unsigned i;
2325 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2326 if (! vect_slp_analyze_and_verify_node_alignment (node))
2327 return false;
2329 node = SLP_INSTANCE_TREE (instance);
2330 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]))
2331 && ! vect_slp_analyze_and_verify_node_alignment
2332 (SLP_INSTANCE_TREE (instance)))
2333 return false;
2335 return true;
2339 /* Analyze groups of accesses: check that DR belongs to a group of
2340 accesses of legal size, step, etc. Detect gaps, single element
2341 interleaving, and other special cases. Set grouped access info.
2342 Collect groups of strided stores for further use in SLP analysis.
2343 Worker for vect_analyze_group_access. */
2345 static bool
2346 vect_analyze_group_access_1 (struct data_reference *dr)
2348 tree step = DR_STEP (dr);
2349 tree scalar_type = TREE_TYPE (DR_REF (dr));
2350 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2351 gimple *stmt = DR_STMT (dr);
2352 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2353 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2354 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2355 HOST_WIDE_INT dr_step = -1;
2356 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2357 bool slp_impossible = false;
2359 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2360 size of the interleaving group (including gaps). */
2361 if (tree_fits_shwi_p (step))
2363 dr_step = tree_to_shwi (step);
2364 /* Check that STEP is a multiple of type size. Otherwise there is
2365 a non-element-sized gap at the end of the group which we
2366 cannot represent in GROUP_GAP or GROUP_SIZE.
2367 ??? As we can handle non-constant step fine here we should
2368 simply remove uses of GROUP_GAP between the last and first
2369 element and instead rely on DR_STEP. GROUP_SIZE then would
2370 simply not include that gap. */
2371 if ((dr_step % type_size) != 0)
2373 if (dump_enabled_p ())
2375 dump_printf_loc (MSG_NOTE, vect_location,
2376 "Step ");
2377 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2378 dump_printf (MSG_NOTE,
2379 " is not a multiple of the element size for ");
2380 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2381 dump_printf (MSG_NOTE, "\n");
2383 return false;
2385 groupsize = absu_hwi (dr_step) / type_size;
2387 else
2388 groupsize = 0;
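/* For instance (numbers invented for illustration): a load of a 4-byte
   element whose DR_STEP is 16 bytes gives GROUPSIZE = 16 / 4 = 4, i.e. an
   interleaving group of four elements per scalar iteration, gaps included.
   A non-constant step leaves GROUPSIZE at 0 and the size is derived from
   the member count further below.  */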
2390 /* A non-consecutive access is possible only if it is part of an interleaving group. */
2391 if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2393 /* Check if this DR is part of an interleaving group, and is a single
2394 element of the group that is accessed in the loop. */
2396 /* Gaps are supported only for loads. STEP must be a multiple of the type
2397 size. The size of the group must be a power of 2. */
2398 if (DR_IS_READ (dr)
2399 && (dr_step % type_size) == 0
2400 && groupsize > 0
2401 && pow2p_hwi (groupsize))
2403 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2404 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2405 GROUP_GAP (stmt_info) = groupsize - 1;
2406 if (dump_enabled_p ())
2408 dump_printf_loc (MSG_NOTE, vect_location,
2409 "Detected single element interleaving ");
2410 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2411 dump_printf (MSG_NOTE, " step ");
2412 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2413 dump_printf (MSG_NOTE, "\n");
2416 return true;
2419 if (dump_enabled_p ())
2421 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2422 "not consecutive access ");
2423 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2426 if (bb_vinfo)
2428 /* Mark the statement as unvectorizable. */
2429 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2430 return true;
2433 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2434 STMT_VINFO_STRIDED_P (stmt_info) = true;
2435 return true;
2438 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2440 /* First stmt in the interleaving chain. Check the chain. */
2441 gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2442 struct data_reference *data_ref = dr;
2443 unsigned int count = 1;
2444 tree prev_init = DR_INIT (data_ref);
2445 gimple *prev = stmt;
2446 HOST_WIDE_INT diff, gaps = 0;
2448 while (next)
2450 /* Skip same data-refs. If two or more stmts share a
2451 data-ref (supported only for loads), we vectorize only the first
2452 stmt, and the rest get their vectorized loads from the first
2453 one. */
2454 if (!tree_int_cst_compare (DR_INIT (data_ref),
2455 DR_INIT (STMT_VINFO_DATA_REF (
2456 vinfo_for_stmt (next)))))
2458 if (DR_IS_WRITE (data_ref))
2460 if (dump_enabled_p ())
2461 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2462 "Two store stmts share the same dr.\n");
2463 return false;
2466 if (dump_enabled_p ())
2467 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2468 "Two or more load stmts share the same dr.\n");
2470 /* For loads, use the same data-ref load. */
2471 GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2473 prev = next;
2474 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2475 continue;
2478 prev = next;
2479 data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2481 /* All group members have the same STEP by construction. */
2482 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2484 /* Check that the distance between two accesses is equal to the type
2485 size. Otherwise, we have gaps. */
2486 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2487 - TREE_INT_CST_LOW (prev_init)) / type_size;
2488 if (diff != 1)
2490 /* FORNOW: SLP of accesses with gaps is not supported. */
2491 slp_impossible = true;
2492 if (DR_IS_WRITE (data_ref))
2494 if (dump_enabled_p ())
2495 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2496 "interleaved store with gaps\n");
2497 return false;
2500 gaps += diff - 1;
2503 last_accessed_element += diff;
2505 /* Store the gap from the previous member of the group. If there is no
2506 gap in the access, GROUP_GAP is always 1. */
2507 GROUP_GAP (vinfo_for_stmt (next)) = diff;
2509 prev_init = DR_INIT (data_ref);
2510 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2511 /* Count the number of data-refs in the chain. */
2512 count++;
2515 if (groupsize == 0)
2516 groupsize = count + gaps;
2518 /* This could be UINT_MAX but as we are generating code in a very
2519 inefficient way we have to cap earlier. See PR78699 for example. */
2520 if (groupsize > 4096)
2522 if (dump_enabled_p ())
2523 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2524 "group is too large\n");
2525 return false;
2528 /* Check that the size of the interleaving is equal to count for stores,
2529 i.e., that there are no gaps. */
2530 if (groupsize != count
2531 && !DR_IS_READ (dr))
2533 if (dump_enabled_p ())
2534 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2535 "interleaved store with gaps\n");
2536 return false;
2539 /* If there is a gap after the last load in the group it is the
2540 difference between the groupsize and the last accessed
2541 element.
2542 When there is no gap, this difference should be 0. */
2543 GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - last_accessed_element;
2545 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
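/* Worked example of the bookkeeping above (illustrative only): int loads of
   a[4*i] and a[4*i+2] have a 4-byte element type, DR_STEP 16 and hence
   GROUPSIZE 4.  Walking the chain gives DIFF = (8 - 0) / 4 = 2, so
   GROUP_GAP of the second member is 2, LAST_ACCESSED_ELEMENT becomes 3 and
   COUNT is 2.  The trailing GROUP_GAP of the first statement is then
   4 - 3 = 1, the single unused element after the group; since this is a
   load, the gaps do not make the group invalid.  */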
2546 if (dump_enabled_p ())
2548 dump_printf_loc (MSG_NOTE, vect_location,
2549 "Detected interleaving ");
2550 if (DR_IS_READ (dr))
2551 dump_printf (MSG_NOTE, "load ");
2552 else
2553 dump_printf (MSG_NOTE, "store ");
2554 dump_printf (MSG_NOTE, "of size %u starting with ",
2555 (unsigned)groupsize);
2556 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
2557 if (GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
2558 dump_printf_loc (MSG_NOTE, vect_location,
2559 "There is a gap of %u elements after the group\n",
2560 GROUP_GAP (vinfo_for_stmt (stmt)));
2563 /* SLP: create an SLP data structure for every interleaving group of
2564 stores for further analysis in vect_analyze_slp. */
2565 if (DR_IS_WRITE (dr) && !slp_impossible)
2567 if (loop_vinfo)
2568 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2569 if (bb_vinfo)
2570 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2574 return true;
2577 /* Analyze groups of accesses: check that DR belongs to a group of
2578 accesses of legal size, step, etc. Detect gaps, single element
2579 interleaving, and other special cases. Set grouped access info.
2580 Collect groups of strided stores for further use in SLP analysis. */
2582 static bool
2583 vect_analyze_group_access (struct data_reference *dr)
2585 if (!vect_analyze_group_access_1 (dr))
2587 /* Dissolve the group if present. */
2588 gimple *next;
2589 gimple *stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dr)));
2590 while (stmt)
2592 stmt_vec_info vinfo = vinfo_for_stmt (stmt);
2593 next = GROUP_NEXT_ELEMENT (vinfo);
2594 GROUP_FIRST_ELEMENT (vinfo) = NULL;
2595 GROUP_NEXT_ELEMENT (vinfo) = NULL;
2596 stmt = next;
2598 return false;
2600 return true;
2603 /* Analyze the access pattern of the data-reference DR.
2604 In case of non-consecutive accesses call vect_analyze_group_access() to
2605 analyze groups of accesses. */
2607 static bool
2608 vect_analyze_data_ref_access (struct data_reference *dr)
2610 tree step = DR_STEP (dr);
2611 tree scalar_type = TREE_TYPE (DR_REF (dr));
2612 gimple *stmt = DR_STMT (dr);
2613 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2614 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2615 struct loop *loop = NULL;
2617 if (loop_vinfo)
2618 loop = LOOP_VINFO_LOOP (loop_vinfo);
2620 if (loop_vinfo && !step)
2622 if (dump_enabled_p ())
2623 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2624 "bad data-ref access in loop\n");
2625 return false;
2628 /* Allow loads with zero step in inner-loop vectorization. */
2629 if (loop_vinfo && integer_zerop (step))
2631 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2632 if (!nested_in_vect_loop_p (loop, stmt))
2633 return DR_IS_READ (dr);
2634 /* Allow references with zero step for outer loops marked
2635 with pragma omp simd only - it guarantees absence of
2636 loop-carried dependencies between inner loop iterations. */
2637 if (!loop->force_vectorize)
2639 if (dump_enabled_p ())
2640 dump_printf_loc (MSG_NOTE, vect_location,
2641 "zero step in inner loop of nest\n");
2642 return false;
2646 if (loop && nested_in_vect_loop_p (loop, stmt))
2648 /* Interleaved accesses are not yet supported within outer-loop
2649 vectorization for references in the inner-loop. */
2650 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2652 /* For the rest of the analysis we use the outer-loop step. */
2653 step = STMT_VINFO_DR_STEP (stmt_info);
2654 if (integer_zerop (step))
2656 if (dump_enabled_p ())
2657 dump_printf_loc (MSG_NOTE, vect_location,
2658 "zero step in outer loop.\n");
2659 return DR_IS_READ (dr);
2663 /* Consecutive? */
2664 if (TREE_CODE (step) == INTEGER_CST)
2666 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2667 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2668 || (dr_step < 0
2669 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2671 /* Mark that it is not interleaving. */
2672 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2673 return true;
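/* E.g. (illustrative): for "int *p; ... p[i]" DR_STEP is 4, equal to
   TYPE_SIZE_UNIT of int, so the access is consecutive and not part of an
   interleaving group; for "p[2*i]" the step is 8 and we fall through to the
   group analysis below.  */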
2677 if (loop && nested_in_vect_loop_p (loop, stmt))
2679 if (dump_enabled_p ())
2680 dump_printf_loc (MSG_NOTE, vect_location,
2681 "grouped access in outer loop.\n");
2682 return false;
2686 /* Assume this is a DR handled by the non-constant strided load case. */
2687 if (TREE_CODE (step) != INTEGER_CST)
2688 return (STMT_VINFO_STRIDED_P (stmt_info)
2689 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2690 || vect_analyze_group_access (dr)));
2692 /* Not consecutive access - check if it's a part of interleaving group. */
2693 return vect_analyze_group_access (dr);
2696 /* Comparison function to sort data-references DRA and DRB into chunks
2697 suitable for grouping. */
2699 static int
2700 dr_group_sort_cmp (const void *dra_, const void *drb_)
2702 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2703 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2704 int cmp;
2706 /* Stabilize sort. */
2707 if (dra == drb)
2708 return 0;
2710 /* DRs in different loops never belong to the same group. */
2711 loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father;
2712 loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father;
2713 if (loopa != loopb)
2714 return loopa->num < loopb->num ? -1 : 1;
2716 /* Ordering of DRs according to base. */
2717 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0))
2719 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2720 DR_BASE_ADDRESS (drb));
2721 if (cmp != 0)
2722 return cmp;
2725 /* And according to DR_OFFSET. */
2726 if (!dr_equal_offsets_p (dra, drb))
2728 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2729 if (cmp != 0)
2730 return cmp;
2733 /* Put reads before writes. */
2734 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2735 return DR_IS_READ (dra) ? -1 : 1;
2737 /* Then sort after access size. */
2738 if (!operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2739 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))), 0))
2741 cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2742 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2743 if (cmp != 0)
2744 return cmp;
2747 /* And after step. */
2748 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2750 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2751 if (cmp != 0)
2752 return cmp;
2755 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
2756 cmp = tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb));
2757 if (cmp == 0)
2758 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2759 return cmp;
2762 /* Function vect_analyze_data_ref_accesses.
2764 Analyze the access pattern of all the data references in the loop.
2766 FORNOW: the only access pattern that is considered vectorizable is a
2767 simple step 1 (consecutive) access.
2769 FORNOW: handle only arrays and pointer accesses. */
2771 bool
2772 vect_analyze_data_ref_accesses (vec_info *vinfo)
2774 unsigned int i;
2775 vec<data_reference_p> datarefs = vinfo->datarefs;
2776 struct data_reference *dr;
2778 if (dump_enabled_p ())
2779 dump_printf_loc (MSG_NOTE, vect_location,
2780 "=== vect_analyze_data_ref_accesses ===\n");
2782 if (datarefs.is_empty ())
2783 return true;
2785 /* Sort the array of datarefs to make building the interleaving chains
2786 linear. Don't modify the original vector's order; it is needed for
2787 determining what dependencies are reversed. */
2788 vec<data_reference_p> datarefs_copy = datarefs.copy ();
2789 datarefs_copy.qsort (dr_group_sort_cmp);
2791 /* Build the interleaving chains. */
2792 for (i = 0; i < datarefs_copy.length () - 1;)
2794 data_reference_p dra = datarefs_copy[i];
2795 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2796 stmt_vec_info lastinfo = NULL;
2797 if (! STMT_VINFO_VECTORIZABLE (stmtinfo_a))
2799 ++i;
2800 continue;
2802 for (i = i + 1; i < datarefs_copy.length (); ++i)
2804 data_reference_p drb = datarefs_copy[i];
2805 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2806 if (! STMT_VINFO_VECTORIZABLE (stmtinfo_b))
2807 break;
2809 /* ??? Imperfect sorting (non-compatible types, non-modulo
2810 accesses, same accesses) can lead to a group being artificially
2811 split here as we don't just skip over those. If it really
2812 matters we can push those to a worklist and re-iterate
2813 over them. Then we can just skip ahead to the next DR here. */
2815 /* DRs in different loops should not be put into the same
2816 interleaving group. */
2817 if (gimple_bb (DR_STMT (dra))->loop_father
2818 != gimple_bb (DR_STMT (drb))->loop_father)
2819 break;
2821 /* Check that the data-refs have same first location (except init)
2822 and they are both either store or load (not load and store,
2823 not masked loads or stores). */
2824 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2825 || !operand_equal_p (DR_BASE_ADDRESS (dra),
2826 DR_BASE_ADDRESS (drb), 0)
2827 || !dr_equal_offsets_p (dra, drb)
2828 || !gimple_assign_single_p (DR_STMT (dra))
2829 || !gimple_assign_single_p (DR_STMT (drb)))
2830 break;
2832 /* Check that the data-refs have the same constant size. */
2833 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2834 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2835 if (!tree_fits_uhwi_p (sza)
2836 || !tree_fits_uhwi_p (szb)
2837 || !tree_int_cst_equal (sza, szb))
2838 break;
2840 /* Check that the data-refs have the same step. */
2841 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2842 break;
2844 /* Do not place the same access in the interleaving chain twice. */
2845 if (tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb)) == 0)
2846 break;
2848 /* Check the types are compatible.
2849 ??? We don't distinguish this during sorting. */
2850 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2851 TREE_TYPE (DR_REF (drb))))
2852 break;
2854 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
2855 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2856 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2857 gcc_assert (init_a <= init_b);
2859 /* If init_b == init_a + the size of the type * k, we have an
2860 interleaving, and DRA is accessed before DRB. */
2861 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
2862 if (type_size_a == 0
2863 || (init_b - init_a) % type_size_a != 0)
2864 break;
2866 /* If we have a store, the accesses are adjacent. This splits
2867 groups into chunks we support (we don't support vectorization
2868 of stores with gaps). */
2869 if (!DR_IS_READ (dra)
2870 && (init_b - (HOST_WIDE_INT) TREE_INT_CST_LOW
2871 (DR_INIT (datarefs_copy[i-1]))
2872 != type_size_a))
2873 break;
2875 /* If the step (when constant and nonzero) is not greater than the
2876 difference between the data-refs' inits, split the group here so
2877 that groups keep a suitable size. */
2878 if (tree_fits_shwi_p (DR_STEP (dra)))
2880 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
2881 if (step != 0 && step <= (init_b - init_a))
2882 break;
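/* Example of a pair that survives all of the checks above (made-up code,
   for illustration): the stores "a[2*i] = x; a[2*i+1] = y;" share base,
   offset and an 8-byte step, have 4-byte elements, and their DR_INITs 0 and
   4 differ by exactly one element, so they are linked into a single
   interleaving chain below.  */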
2885 if (dump_enabled_p ())
2887 dump_printf_loc (MSG_NOTE, vect_location,
2888 "Detected interleaving ");
2889 if (DR_IS_READ (dra))
2890 dump_printf (MSG_NOTE, "load ");
2891 else
2892 dump_printf (MSG_NOTE, "store ");
2893 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2894 dump_printf (MSG_NOTE, " and ");
2895 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2896 dump_printf (MSG_NOTE, "\n");
2899 /* Link the found element into the group list. */
2900 if (!GROUP_FIRST_ELEMENT (stmtinfo_a))
2902 GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
2903 lastinfo = stmtinfo_a;
2905 GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
2906 GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
2907 lastinfo = stmtinfo_b;
2911 FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
2912 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
2913 && !vect_analyze_data_ref_access (dr))
2915 if (dump_enabled_p ())
2916 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2917 "not vectorized: complicated access pattern.\n");
2919 if (is_a <bb_vec_info> (vinfo))
2921 /* Mark the statement as not vectorizable. */
2922 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2923 continue;
2925 else
2927 datarefs_copy.release ();
2928 return false;
2932 datarefs_copy.release ();
2933 return true;
2936 /* Function vect_vfa_segment_size.
2938 Create an expression that computes the size of the segment
2939 that will be accessed by a data reference. The function takes into
2940 account that realignment loads may access one more vector.
2942 Input:
2943 DR: The data reference.
2944 LENGTH_FACTOR: segment length to consider.
2946 Return an expression whose value is the size of segment which will be
2947 accessed by DR. */
2949 static tree
2950 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
2952 tree segment_length;
2954 if (integer_zerop (DR_STEP (dr)))
2955 segment_length = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
2956 else
2957 segment_length = size_binop (MULT_EXPR,
2958 fold_convert (sizetype, DR_STEP (dr)),
2959 fold_convert (sizetype, length_factor));
2961 if (vect_supportable_dr_alignment (dr, false)
2962 == dr_explicit_realign_optimized)
2964 tree vector_size = TYPE_SIZE_UNIT
2965 (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))));
2967 segment_length = size_binop (PLUS_EXPR, segment_length, vector_size);
2969 return segment_length;
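/* For example (illustrative numbers): a data reference with DR_STEP 4 and
   LENGTH_FACTOR equal to a vectorization factor of 8 gets a segment length
   of 32 bytes; if it uses the optimized explicit-realignment scheme with
   V4SI vectors, another 16 bytes are added.  */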
2972 /* Function vect_no_alias_p.
2974 Given data references A and B with equal base and offset, the alias
2975 relation can be decided at compilation time. Return TRUE if they do
2976 not alias each other; return FALSE otherwise. SEGMENT_LENGTH_A
2977 and SEGMENT_LENGTH_B are the memory lengths accessed by A and B
2978 respectively. */
2980 static bool
2981 vect_no_alias_p (struct data_reference *a, struct data_reference *b,
2982 tree segment_length_a, tree segment_length_b)
2984 gcc_assert (TREE_CODE (DR_INIT (a)) == INTEGER_CST
2985 && TREE_CODE (DR_INIT (b)) == INTEGER_CST);
2986 if (tree_int_cst_equal (DR_INIT (a), DR_INIT (b)))
2987 return false;
2989 tree seg_a_min = DR_INIT (a);
2990 tree seg_a_max = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_a_min),
2991 seg_a_min, segment_length_a);
2992 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
2993 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
2994 [a, a+12) */
2995 if (tree_int_cst_compare (DR_STEP (a), size_zero_node) < 0)
2997 tree unit_size = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (a)));
2998 seg_a_min = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_a_max),
2999 seg_a_max, unit_size);
3000 seg_a_max = fold_build2 (PLUS_EXPR, TREE_TYPE (DR_INIT (a)),
3001 DR_INIT (a), unit_size);
3003 tree seg_b_min = DR_INIT (b);
3004 tree seg_b_max = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_b_min),
3005 seg_b_min, segment_length_b);
3006 if (tree_int_cst_compare (DR_STEP (b), size_zero_node) < 0)
3008 tree unit_size = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (b)));
3009 seg_b_min = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_b_max),
3010 seg_b_max, unit_size);
3011 seg_b_max = fold_build2 (PLUS_EXPR, TREE_TYPE (DR_INIT (b)),
3012 DR_INIT (b), unit_size);
3015 if (tree_int_cst_le (seg_a_max, seg_b_min)
3016 || tree_int_cst_le (seg_b_max, seg_a_min))
3017 return true;
3019 return false;
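/* A self-contained sketch (not used anywhere in the vectorizer; the name is
   invented for illustration) of the interval test vect_no_alias_p performs
   above for the positive-step case, using plain integers instead of trees:
   two half-open byte ranges [min, min + len) cannot alias iff one of them
   ends before the other starts.  */

static inline bool
vect_sketch_segments_independent_p (HOST_WIDE_INT a_min, HOST_WIDE_INT a_len,
				    HOST_WIDE_INT b_min, HOST_WIDE_INT b_len)
{
  /* Disjoint iff A ends at or before B starts, or vice versa.  */
  return a_min + a_len <= b_min || b_min + b_len <= a_min;
}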
3022 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3023 in DDR is >= VF. */
3025 static bool
3026 dependence_distance_ge_vf (data_dependence_relation *ddr,
3027 unsigned int loop_depth, unsigned HOST_WIDE_INT vf)
3029 if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3030 || DDR_NUM_DIST_VECTS (ddr) == 0)
3031 return false;
3033 /* If the dependence is exact, we should have limited the VF instead. */
3034 gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3036 unsigned int i;
3037 lambda_vector dist_v;
3038 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3040 HOST_WIDE_INT dist = dist_v[loop_depth];
3041 if (dist != 0
3042 && !(dist > 0 && DDR_REVERSED_P (ddr))
3043 && (unsigned HOST_WIDE_INT) abs_hwi (dist) < vf)
3044 return false;
3047 if (dump_enabled_p ())
3049 dump_printf_loc (MSG_NOTE, vect_location,
3050 "dependence distance between ");
3051 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
3052 dump_printf (MSG_NOTE, " and ");
3053 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
3054 dump_printf (MSG_NOTE, " is >= VF\n");
3057 return true;
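/* Concrete instance (illustrative): with a single distance vector whose
   entry at LOOP_DEPTH is 8 and a vectorization factor of 4, every nonzero
   distance is at least VF, so the runtime alias check for this DDR can be
   dropped by the caller below.  */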
3060 /* Function vect_prune_runtime_alias_test_list.
3062 Prune a list of ddrs to be tested at run-time by versioning for alias.
3063 Merge several alias checks into one if possible.
3064 Return FALSE if the resulting list of ddrs is longer than allowed by
3065 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
3067 bool
3068 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3070 typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3071 hash_set <tree_pair_hash> compared_objects;
3073 vec<ddr_p> may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3074 vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3075 = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3076 vec<vec_object_pair> &check_unequal_addrs
3077 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3078 int vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3079 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3081 ddr_p ddr;
3082 unsigned int i;
3083 tree length_factor;
3085 if (dump_enabled_p ())
3086 dump_printf_loc (MSG_NOTE, vect_location,
3087 "=== vect_prune_runtime_alias_test_list ===\n");
3089 if (may_alias_ddrs.is_empty ())
3090 return true;
3092 comp_alias_ddrs.create (may_alias_ddrs.length ());
3094 unsigned int loop_depth
3095 = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3096 LOOP_VINFO_LOOP_NEST (loop_vinfo));
3098 /* First, we collect all data ref pairs for aliasing checks. */
3099 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3101 int comp_res;
3102 struct data_reference *dr_a, *dr_b;
3103 gimple *dr_group_first_a, *dr_group_first_b;
3104 tree segment_length_a, segment_length_b;
3105 gimple *stmt_a, *stmt_b;
3107 /* Ignore the alias if the VF we chose ended up being no greater
3108 than the dependence distance. */
3109 if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3110 continue;
3112 if (DDR_OBJECT_A (ddr))
3114 vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3115 if (!compared_objects.add (new_pair))
3117 if (dump_enabled_p ())
3119 dump_printf_loc (MSG_NOTE, vect_location, "checking that ");
3120 dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.first);
3121 dump_printf (MSG_NOTE, " and ");
3122 dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.second);
3123 dump_printf (MSG_NOTE, " have different addresses\n");
3125 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3127 continue;
3130 dr_a = DDR_A (ddr);
3131 stmt_a = DR_STMT (DDR_A (ddr));
3132 dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
3133 if (dr_group_first_a)
3135 stmt_a = dr_group_first_a;
3136 dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
3139 dr_b = DDR_B (ddr);
3140 stmt_b = DR_STMT (DDR_B (ddr));
3141 dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
3142 if (dr_group_first_b)
3144 stmt_b = dr_group_first_b;
3145 dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
3148 if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
3149 length_factor = scalar_loop_iters;
3150 else
3151 length_factor = size_int (vect_factor);
3152 segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
3153 segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
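/* Illustrative numbers: if both DRs step by 4 bytes and the vectorization
   factor is 8, LENGTH_FACTOR is 8 and each segment is 32 bytes long; if the
   steps differ, the scalar iteration count is used instead, so the runtime
   check covers the full distance walked by either reference.  */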
3155 comp_res = data_ref_compare_tree (DR_BASE_ADDRESS (dr_a),
3156 DR_BASE_ADDRESS (dr_b));
3157 if (comp_res == 0)
3158 comp_res = data_ref_compare_tree (DR_OFFSET (dr_a),
3159 DR_OFFSET (dr_b));
3161 /* Alias is known at compilation time. */
3162 if (comp_res == 0
3163 && TREE_CODE (DR_STEP (dr_a)) == INTEGER_CST
3164 && TREE_CODE (DR_STEP (dr_b)) == INTEGER_CST
3165 && TREE_CODE (segment_length_a) == INTEGER_CST
3166 && TREE_CODE (segment_length_b) == INTEGER_CST)
3168 if (vect_no_alias_p (dr_a, dr_b, segment_length_a, segment_length_b))
3169 continue;
3171 if (dump_enabled_p ())
3172 dump_printf_loc (MSG_NOTE, vect_location,
3173 "not vectorized: compilation time alias.\n");
3175 return false;
3178 dr_with_seg_len_pair_t dr_with_seg_len_pair
3179 (dr_with_seg_len (dr_a, segment_length_a),
3180 dr_with_seg_len (dr_b, segment_length_b));
3182 /* Canonicalize pairs by sorting the two DR members. */
3183 if (comp_res > 0)
3184 std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
3186 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3189 prune_runtime_alias_test_list (&comp_alias_ddrs,
3190 (unsigned HOST_WIDE_INT) vect_factor);
3192 unsigned int count = (comp_alias_ddrs.length ()
3193 + check_unequal_addrs.length ());
3194 dump_printf_loc (MSG_NOTE, vect_location,
3195 "improved number of alias checks from %d to %d\n",
3196 may_alias_ddrs.length (), count);
3197 if ((int) count > PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
3199 if (dump_enabled_p ())
3200 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3201 "number of versioning for alias "
3202 "run-time tests exceeds %d "
3203 "(--param vect-max-version-for-alias-checks)\n",
3204 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
3205 return false;
3208 return true;
3211 /* Return true if a non-affine read or write in STMT is suitable for a
3212 gather load or scatter store. Describe the operation in *INFO if so. */
3214 bool
3215 vect_check_gather_scatter (gimple *stmt, loop_vec_info loop_vinfo,
3216 gather_scatter_info *info)
3218 HOST_WIDE_INT scale = 1, pbitpos, pbitsize;
3219 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3220 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3221 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3222 tree offtype = NULL_TREE;
3223 tree decl, base, off;
3224 machine_mode pmode;
3225 int punsignedp, reversep, pvolatilep = 0;
3227 base = DR_REF (dr);
3228 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3229 see if we can use the def stmt of the address. */
3230 if (is_gimple_call (stmt)
3231 && gimple_call_internal_p (stmt)
3232 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
3233 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
3234 && TREE_CODE (base) == MEM_REF
3235 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3236 && integer_zerop (TREE_OPERAND (base, 1))
3237 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3239 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3240 if (is_gimple_assign (def_stmt)
3241 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3242 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3245 /* The gather and scatter builtins need an address of the form
3246 loop_invariant + vector * {1, 2, 4, 8}
3247 or
3248 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3249 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3250 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3251 multiplications and additions in it. To get a vector, we need
3252 a single SSA_NAME that will be defined in the loop and will
3253 contain everything that is not loop invariant and that can be
3254 vectorized. The following code attempts to find such a preexisting
3255 SSA_NAME OFF and put the loop invariants into a tree BASE
3256 that can be gimplified before the loop. */
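/* As a source-level illustration (not taken from any testcase): for a loop
   body like

     x[i] = y[idx[i]];

   the load from y is non-affine; the analysis below is expected to end up
   with BASE = &y plus any constant offset, OFF = the SSA_NAME holding
   idx[i] (possibly sign-extended) and SCALE = sizeof (*y), which is the
   shape the gather builtins can consume.  */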
3257 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3258 &punsignedp, &reversep, &pvolatilep);
3259 gcc_assert (base && (pbitpos % BITS_PER_UNIT) == 0 && !reversep);
3261 if (TREE_CODE (base) == MEM_REF)
3263 if (!integer_zerop (TREE_OPERAND (base, 1)))
3265 if (off == NULL_TREE)
3267 offset_int moff = mem_ref_offset (base);
3268 off = wide_int_to_tree (sizetype, moff);
3270 else
3271 off = size_binop (PLUS_EXPR, off,
3272 fold_convert (sizetype, TREE_OPERAND (base, 1)));
3274 base = TREE_OPERAND (base, 0);
3276 else
3277 base = build_fold_addr_expr (base);
3279 if (off == NULL_TREE)
3280 off = size_zero_node;
3282 /* If base is not loop invariant, then if off is 0 we start with just
3283 the constant offset in the loop invariant BASE and continue with base
3284 as OFF; otherwise we give up.
3285 We could handle that case by gimplifying the addition of base + off
3286 into some SSA_NAME and using that as off, but for now we punt. */
3287 if (!expr_invariant_in_loop_p (loop, base))
3289 if (!integer_zerop (off))
3290 return false;
3291 off = base;
3292 base = size_int (pbitpos / BITS_PER_UNIT);
3294 /* Otherwise put base + constant offset into the loop invariant BASE
3295 and continue with OFF. */
3296 else
3298 base = fold_convert (sizetype, base);
3299 base = size_binop (PLUS_EXPR, base, size_int (pbitpos / BITS_PER_UNIT));
3302 /* OFF at this point may be either a SSA_NAME or some tree expression
3303 from get_inner_reference. Try to peel off loop invariants from it
3304 into BASE as long as possible. */
3305 STRIP_NOPS (off);
3306 while (offtype == NULL_TREE)
3308 enum tree_code code;
3309 tree op0, op1, add = NULL_TREE;
3311 if (TREE_CODE (off) == SSA_NAME)
3313 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3315 if (expr_invariant_in_loop_p (loop, off))
3316 return false;
3318 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3319 break;
3321 op0 = gimple_assign_rhs1 (def_stmt);
3322 code = gimple_assign_rhs_code (def_stmt);
3323 op1 = gimple_assign_rhs2 (def_stmt);
3325 else
3327 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3328 return false;
3329 code = TREE_CODE (off);
3330 extract_ops_from_tree (off, &code, &op0, &op1);
3332 switch (code)
3334 case POINTER_PLUS_EXPR:
3335 case PLUS_EXPR:
3336 if (expr_invariant_in_loop_p (loop, op0))
3338 add = op0;
3339 off = op1;
3340 do_add:
3341 add = fold_convert (sizetype, add);
3342 if (scale != 1)
3343 add = size_binop (MULT_EXPR, add, size_int (scale));
3344 base = size_binop (PLUS_EXPR, base, add);
3345 continue;
3347 if (expr_invariant_in_loop_p (loop, op1))
3349 add = op1;
3350 off = op0;
3351 goto do_add;
3353 break;
3354 case MINUS_EXPR:
3355 if (expr_invariant_in_loop_p (loop, op1))
3357 add = fold_convert (sizetype, op1);
3358 add = size_binop (MINUS_EXPR, size_zero_node, add);
3359 off = op0;
3360 goto do_add;
3362 break;
3363 case MULT_EXPR:
3364 if (scale == 1 && tree_fits_shwi_p (op1))
3366 scale = tree_to_shwi (op1);
3367 off = op0;
3368 continue;
3370 break;
3371 case SSA_NAME:
3372 off = op0;
3373 continue;
3374 CASE_CONVERT:
3375 if (!POINTER_TYPE_P (TREE_TYPE (op0))
3376 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3377 break;
3378 if (TYPE_PRECISION (TREE_TYPE (op0))
3379 == TYPE_PRECISION (TREE_TYPE (off)))
3381 off = op0;
3382 continue;
3384 if (TYPE_PRECISION (TREE_TYPE (op0))
3385 < TYPE_PRECISION (TREE_TYPE (off)))
3387 off = op0;
3388 offtype = TREE_TYPE (off);
3389 STRIP_NOPS (off);
3390 continue;
3392 break;
3393 default:
3394 break;
3396 break;
3399 /* If at the end OFF still isn't a SSA_NAME or isn't
3400 defined in the loop, punt. */
3401 if (TREE_CODE (off) != SSA_NAME
3402 || expr_invariant_in_loop_p (loop, off))
3403 return false;
3405 if (offtype == NULL_TREE)
3406 offtype = TREE_TYPE (off);
3408 if (DR_IS_READ (dr))
3409 decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info),
3410 offtype, scale);
3411 else
3412 decl = targetm.vectorize.builtin_scatter (STMT_VINFO_VECTYPE (stmt_info),
3413 offtype, scale);
3415 if (decl == NULL_TREE)
3416 return false;
3418 info->decl = decl;
3419 info->base = base;
3420 info->offset = off;
3421 info->offset_dt = vect_unknown_def_type;
3422 info->offset_vectype = NULL_TREE;
3423 info->scale = scale;
3424 return true;
3427 /* Function vect_analyze_data_refs.
3429 Find all the data references in the loop or basic block.
3431 The general structure of the analysis of data refs in the vectorizer is as
3432 follows:
3433 1- vect_analyze_data_refs(loop/bb): call
3434 compute_data_dependences_for_loop/bb to find and analyze all data-refs
3435 in the loop/bb and their dependences.
3436 2- vect_analyze_dependences(): apply dependence testing using ddrs.
3437 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
3438 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
3442 bool
3443 vect_analyze_data_refs (vec_info *vinfo, int *min_vf)
3445 struct loop *loop = NULL;
3446 unsigned int i;
3447 struct data_reference *dr;
3448 tree scalar_type;
3450 if (dump_enabled_p ())
3451 dump_printf_loc (MSG_NOTE, vect_location,
3452 "=== vect_analyze_data_refs ===\n");
3454 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3455 loop = LOOP_VINFO_LOOP (loop_vinfo);
3457 /* Go through the data-refs, check that the analysis succeeded. Update
3458 pointer from stmt_vec_info struct to DR and vectype. */
3460 vec<data_reference_p> datarefs = vinfo->datarefs;
3461 FOR_EACH_VEC_ELT (datarefs, i, dr)
3463 gimple *stmt;
3464 stmt_vec_info stmt_info;
3465 tree base, offset, init;
3466 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
3467 bool simd_lane_access = false;
3468 int vf;
3470 again:
3471 if (!dr || !DR_REF (dr))
3473 if (dump_enabled_p ())
3474 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3475 "not vectorized: unhandled data-ref\n");
3476 return false;
3479 stmt = DR_STMT (dr);
3480 stmt_info = vinfo_for_stmt (stmt);
3482 /* Discard clobbers from the dataref vector. We will remove
3483 clobber stmts during vectorization. */
3484 if (gimple_clobber_p (stmt))
3486 free_data_ref (dr);
3487 if (i == datarefs.length () - 1)
3489 datarefs.pop ();
3490 break;
3492 datarefs.ordered_remove (i);
3493 dr = datarefs[i];
3494 goto again;
3497 /* Check that analysis of the data-ref succeeded. */
3498 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
3499 || !DR_STEP (dr))
3501 bool maybe_gather
3502 = DR_IS_READ (dr)
3503 && !TREE_THIS_VOLATILE (DR_REF (dr))
3504 && targetm.vectorize.builtin_gather != NULL;
3505 bool maybe_scatter
3506 = DR_IS_WRITE (dr)
3507 && !TREE_THIS_VOLATILE (DR_REF (dr))
3508 && targetm.vectorize.builtin_scatter != NULL;
3509 bool maybe_simd_lane_access
3510 = is_a <loop_vec_info> (vinfo) && loop->simduid;
3512 /* If the target supports vector gather loads or scatter stores, or if
3513 this might be a SIMD lane access, see whether they can be used. */
3514 if (is_a <loop_vec_info> (vinfo)
3515 && (maybe_gather || maybe_scatter || maybe_simd_lane_access)
3516 && !nested_in_vect_loop_p (loop, stmt))
3518 struct data_reference *newdr
3519 = create_data_ref (NULL, loop_containing_stmt (stmt),
3520 DR_REF (dr), stmt, !maybe_scatter,
3521 DR_IS_CONDITIONAL_IN_STMT (dr));
3522 gcc_assert (newdr != NULL && DR_REF (newdr));
3523 if (DR_BASE_ADDRESS (newdr)
3524 && DR_OFFSET (newdr)
3525 && DR_INIT (newdr)
3526 && DR_STEP (newdr)
3527 && integer_zerop (DR_STEP (newdr)))
3529 if (maybe_simd_lane_access)
3531 tree off = DR_OFFSET (newdr);
3532 STRIP_NOPS (off);
3533 if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
3534 && TREE_CODE (off) == MULT_EXPR
3535 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
3537 tree step = TREE_OPERAND (off, 1);
3538 off = TREE_OPERAND (off, 0);
3539 STRIP_NOPS (off);
3540 if (CONVERT_EXPR_P (off)
3541 && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
3542 0)))
3543 < TYPE_PRECISION (TREE_TYPE (off)))
3544 off = TREE_OPERAND (off, 0);
3545 if (TREE_CODE (off) == SSA_NAME)
3547 gimple *def = SSA_NAME_DEF_STMT (off);
3548 tree reft = TREE_TYPE (DR_REF (newdr));
3549 if (is_gimple_call (def)
3550 && gimple_call_internal_p (def)
3551 && (gimple_call_internal_fn (def)
3552 == IFN_GOMP_SIMD_LANE))
3554 tree arg = gimple_call_arg (def, 0);
3555 gcc_assert (TREE_CODE (arg) == SSA_NAME);
3556 arg = SSA_NAME_VAR (arg);
3557 if (arg == loop->simduid
3558 /* For now. */
3559 && tree_int_cst_equal
3560 (TYPE_SIZE_UNIT (reft),
3561 step))
3563 DR_OFFSET (newdr) = ssize_int (0);
3564 DR_STEP (newdr) = step;
3565 DR_OFFSET_ALIGNMENT (newdr)
3566 = BIGGEST_ALIGNMENT;
3567 DR_STEP_ALIGNMENT (newdr)
3568 = highest_pow2_factor (step);
3569 dr = newdr;
3570 simd_lane_access = true;
3576 if (!simd_lane_access && (maybe_gather || maybe_scatter))
3578 dr = newdr;
3579 if (maybe_gather)
3580 gatherscatter = GATHER;
3581 else
3582 gatherscatter = SCATTER;
3585 if (gatherscatter == SG_NONE && !simd_lane_access)
3586 free_data_ref (newdr);
3589 if (gatherscatter == SG_NONE && !simd_lane_access)
3591 if (dump_enabled_p ())
3593 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3594 "not vectorized: data ref analysis "
3595 "failed ");
3596 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3599 if (is_a <bb_vec_info> (vinfo))
3600 break;
3602 return false;
3606 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
3608 if (dump_enabled_p ())
3609 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3610 "not vectorized: base addr of dr is a "
3611 "constant\n");
3613 if (is_a <bb_vec_info> (vinfo))
3614 break;
3616 if (gatherscatter != SG_NONE || simd_lane_access)
3617 free_data_ref (dr);
3618 return false;
3621 if (TREE_THIS_VOLATILE (DR_REF (dr)))
3623 if (dump_enabled_p ())
3625 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3626 "not vectorized: volatile type ");
3627 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3630 if (is_a <bb_vec_info> (vinfo))
3631 break;
3633 return false;
3636 if (stmt_can_throw_internal (stmt))
3638 if (dump_enabled_p ())
3640 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3641 "not vectorized: statement can throw an "
3642 "exception ");
3643 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3646 if (is_a <bb_vec_info> (vinfo))
3647 break;
3649 if (gatherscatter != SG_NONE || simd_lane_access)
3650 free_data_ref (dr);
3651 return false;
3654 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
3655 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
3657 if (dump_enabled_p ())
3659 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3660 "not vectorized: statement is bitfield "
3661 "access ");
3662 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3665 if (is_a <bb_vec_info> (vinfo))
3666 break;
3668 if (gatherscatter != SG_NONE || simd_lane_access)
3669 free_data_ref (dr);
3670 return false;
3673 base = unshare_expr (DR_BASE_ADDRESS (dr));
3674 offset = unshare_expr (DR_OFFSET (dr));
3675 init = unshare_expr (DR_INIT (dr));
3677 if (is_gimple_call (stmt)
3678 && (!gimple_call_internal_p (stmt)
3679 || (gimple_call_internal_fn (stmt) != IFN_MASK_LOAD
3680 && gimple_call_internal_fn (stmt) != IFN_MASK_STORE)))
3682 if (dump_enabled_p ())
3684 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3685 "not vectorized: dr in a call ");
3686 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3689 if (is_a <bb_vec_info> (vinfo))
3690 break;
3692 if (gatherscatter != SG_NONE || simd_lane_access)
3693 free_data_ref (dr);
3694 return false;
3697 /* Update DR field in stmt_vec_info struct. */
3699 /* If the dataref is in an inner-loop of the loop that is considered for
3700 vectorization, we also want to analyze the access relative to
3701 the outer-loop (DR contains information only relative to the
3702 inner-most enclosing loop). We do that by building a reference to the
3703 first location accessed by the inner-loop, and analyze it relative to
3704 the outer-loop. */
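/* Illustration added in editing (not from the GCC sources), with 'in'
   pointing to short:

       for (i = 0; i < N; i++)        <-- LOOP being vectorized
         for (j = 0; j < M; j++)      <-- inner loop containing the dataref
           s += in[i + j];

   For in[i + j] the innermost analysis gives BASE = in, OFFSET = i * 2
   and INIT = 0, so the first location the inner loop touches is
   *(in + i*2), i.e. in[i].  That reference is invariant in the inner
   loop and advances by 2 bytes per outer iteration, which is what the
   analysis below records in the STMT_VINFO_DR_* fields.  */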
3705 if (loop && nested_in_vect_loop_p (loop, stmt))
3707 /* Build a reference to the first location accessed by the
3708 inner loop: *(BASE + INIT + OFFSET). By construction,
3709 this address must be invariant in the inner loop, so we
3710 can consider it as being used in the outer loop. */
3711 tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
3712 init, offset);
3713 tree init_addr = fold_build_pointer_plus (base, init_offset);
3714 tree init_ref = build_fold_indirect_ref (init_addr);
3716 if (dump_enabled_p ())
3718 dump_printf_loc (MSG_NOTE, vect_location,
3719 "analyze in outer loop: ");
3720 dump_generic_expr (MSG_NOTE, TDF_SLIM, init_ref);
3721 dump_printf (MSG_NOTE, "\n");
3724 if (!dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
3725 init_ref, loop))
3726 /* dr_analyze_innermost already explained the failure. */
3727 return false;
3729 if (dump_enabled_p ())
3731 dump_printf_loc (MSG_NOTE, vect_location,
3732 "\touter base_address: ");
3733 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3734 STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3735 dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
3736 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3737 STMT_VINFO_DR_OFFSET (stmt_info));
3738 dump_printf (MSG_NOTE,
3739 "\n\touter constant offset from base address: ");
3740 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3741 STMT_VINFO_DR_INIT (stmt_info));
3742 dump_printf (MSG_NOTE, "\n\touter step: ");
3743 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3744 STMT_VINFO_DR_STEP (stmt_info));
3745 dump_printf (MSG_NOTE, "\n\touter base alignment: %d\n",
3746 STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info));
3747 dump_printf (MSG_NOTE, "\n\touter base misalignment: %d\n",
3748 STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info));
3749 dump_printf (MSG_NOTE, "\n\touter offset alignment: %d\n",
3750 STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info));
3751 dump_printf (MSG_NOTE, "\n\touter step alignment: %d\n",
3752 STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
3756 if (STMT_VINFO_DATA_REF (stmt_info))
3758 if (dump_enabled_p ())
3760 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3761 "not vectorized: more than one data ref "
3762 "in stmt: ");
3763 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3766 if (is_a <bb_vec_info> (vinfo))
3767 break;
3769 if (gatherscatter != SG_NONE || simd_lane_access)
3770 free_data_ref (dr);
3771 return false;
3774 STMT_VINFO_DATA_REF (stmt_info) = dr;
3775 if (simd_lane_access)
3777 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
3778 free_data_ref (datarefs[i]);
3779 datarefs[i] = dr;
3782 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == ADDR_EXPR
3783 && VAR_P (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0))
3784 && DECL_NONALIASED (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0)))
3786 if (dump_enabled_p ())
3788 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3789 "not vectorized: base object not addressable "
3790 "for stmt: ");
3791 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3793 if (is_a <bb_vec_info> (vinfo))
3795 /* In BB vectorization the ref can still participate
3796 in dependence analysis, we just can't vectorize it. */
3797 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
3798 continue;
3800 return false;
3803 /* Set vectype for STMT. */
3804 scalar_type = TREE_TYPE (DR_REF (dr));
3805 STMT_VINFO_VECTYPE (stmt_info)
3806 = get_vectype_for_scalar_type (scalar_type);
3807 if (!STMT_VINFO_VECTYPE (stmt_info))
3809 if (dump_enabled_p ())
3811 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3812 "not vectorized: no vectype for stmt: ");
3813 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3814 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
3815 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
3816 scalar_type);
3817 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3820 if (is_a <bb_vec_info> (vinfo))
3822 /* No vector type is fine, the ref can still participate
3823 in dependence analysis, we just can't vectorize it. */
3824 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
3825 continue;
3828 if (gatherscatter != SG_NONE || simd_lane_access)
3830 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3831 if (gatherscatter != SG_NONE)
3832 free_data_ref (dr);
3834 return false;
3836 else
3838 if (dump_enabled_p ())
3840 dump_printf_loc (MSG_NOTE, vect_location,
3841 "got vectype for stmt: ");
3842 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3843 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3844 STMT_VINFO_VECTYPE (stmt_info));
3845 dump_printf (MSG_NOTE, "\n");
3849 /* Adjust the minimal vectorization factor according to the
3850 vector type. */
3851 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
3852 if (vf > *min_vf)
3853 *min_vf = vf;
3855 if (gatherscatter != SG_NONE)
3857 gather_scatter_info gs_info;
3858 if (!vect_check_gather_scatter (stmt, as_a <loop_vec_info> (vinfo),
3859 &gs_info)
3860 || !get_vectype_for_scalar_type (TREE_TYPE (gs_info.offset)))
3862 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3863 free_data_ref (dr);
3864 if (dump_enabled_p ())
3866 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3867 (gatherscatter == GATHER) ?
3868 "not vectorized: not suitable for gather "
3869 "load " :
3870 "not vectorized: not suitable for scatter "
3871 "store ");
3872 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3874 return false;
3877 free_data_ref (datarefs[i]);
3878 datarefs[i] = dr;
3879 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
3882 else if (is_a <loop_vec_info> (vinfo)
3883 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
3885 if (nested_in_vect_loop_p (loop, stmt))
3887 if (dump_enabled_p ())
3889 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3890 "not vectorized: not suitable for strided "
3891 "load ");
3892 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3894 return false;
3896 STMT_VINFO_STRIDED_P (stmt_info) = true;
3900 /* If we stopped analysis at the first dataref we could not analyze
3901 when trying to vectorize a basic-block, mark the rest of the datarefs
3902 as not vectorizable and truncate the vector of datarefs. That
3903 avoids spending useless time analyzing their dependences. */
3904 if (i != datarefs.length ())
3906 gcc_assert (is_a <bb_vec_info> (vinfo));
3907 for (unsigned j = i; j < datarefs.length (); ++j)
3909 data_reference_p dr = datarefs[j];
3910 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
3911 free_data_ref (dr);
3913 datarefs.truncate (i);
3916 return true;
3920 /* Function vect_get_new_vect_var.
3922 Returns a name for a new variable. The current naming scheme prepends
3923 the prefix "vect_", "vectp_", "stmp_" or "mask_" (depending on the value
3924 of VAR_KIND) to NAME if provided; otherwise the prefix alone names the
3925 vectorizer-generated variable. */
3927 tree
3928 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
3930 const char *prefix;
3931 tree new_vect_var;
3933 switch (var_kind)
3935 case vect_simple_var:
3936 prefix = "vect";
3937 break;
3938 case vect_scalar_var:
3939 prefix = "stmp";
3940 break;
3941 case vect_mask_var:
3942 prefix = "mask";
3943 break;
3944 case vect_pointer_var:
3945 prefix = "vectp";
3946 break;
3947 default:
3948 gcc_unreachable ();
3951 if (name)
3953 char* tmp = concat (prefix, "_", name, NULL);
3954 new_vect_var = create_tmp_reg (type, tmp);
3955 free (tmp);
3957 else
3958 new_vect_var = create_tmp_reg (type, prefix);
3960 return new_vect_var;
3963 /* Like vect_get_new_vect_var but return an SSA name. */
3965 tree
3966 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
3968 const char *prefix;
3969 tree new_vect_var;
3971 switch (var_kind)
3973 case vect_simple_var:
3974 prefix = "vect";
3975 break;
3976 case vect_scalar_var:
3977 prefix = "stmp";
3978 break;
3979 case vect_pointer_var:
3980 prefix = "vectp";
3981 break;
3982 default:
3983 gcc_unreachable ();
3986 if (name)
3988 char* tmp = concat (prefix, "_", name, NULL);
3989 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
3990 free (tmp);
3992 else
3993 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
3995 return new_vect_var;
3998 /* Duplicate ptr info and set alignment/misalignment on NAME from DR. */
4000 static void
4001 vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr,
4002 stmt_vec_info stmt_info)
4004 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr));
4005 unsigned int align = TYPE_ALIGN_UNIT (STMT_VINFO_VECTYPE (stmt_info));
4006 int misalign = DR_MISALIGNMENT (dr);
4007 if (misalign == DR_MISALIGNMENT_UNKNOWN)
4008 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4009 else
4010 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name), align, misalign);
4013 /* Function vect_create_addr_base_for_vector_ref.
4015 Create an expression that computes the address of the first memory location
4016 that will be accessed for a data reference.
4018 Input:
4019 STMT: The statement containing the data reference.
4020 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4021 OFFSET: Optional. If supplied, it is added to the initial address.
4022 LOOP: Specify relative to which loop-nest the address should be computed.
4023 For example, when the dataref is in an inner-loop nested in an
4024 outer-loop that is now being vectorized, LOOP can be either the
4025 outer-loop, or the inner-loop. The first memory location accessed
4026 by the following dataref ('in' points to short):
4028 for (i=0; i<N; i++)
4029 for (j=0; j<M; j++)
4030 s += in[i+j]
4032 is as follows:
4033 if LOOP=i_loop: &in (relative to i_loop)
4034 if LOOP=j_loop: &in+i*2B (relative to j_loop)
4035 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
4036 initial address. Unlike OFFSET, which is the number of elements to
4037 be added, BYTE_OFFSET is measured in bytes.
4039 Output:
4040 1. Return an SSA_NAME whose value is the address of the memory location of
4041 the first vector of the data reference.
4042 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4043 these statement(s) which define the returned SSA_NAME.
4045 FORNOW: We are only handling array accesses with step 1. */
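/* A minimal sketch (not from the GCC sources) of the address arithmetic
   this function emits as gimple for the loop case, written as plain C.
   'base', 'offset' and 'init' stand for the innermost_loop_behavior
   fields used below; elem_offset and byte_offset play the roles of the
   OFFSET and BYTE_OFFSET parameters; the function name is illustrative.  */

static char *
example_addr_base (char *base, long offset, long init,
                   long elem_offset, long byte_offset, long elem_size)
{
  /* base_offset = offset + init, as in the size_binop PLUS_EXPR below.  */
  long base_off = offset + init;

  /* OFFSET is counted in elements, so it is scaled by the element size
     (the MULT_EXPR on 'step' below)...  */
  base_off += elem_offset * elem_size;

  /* ...while BYTE_OFFSET is added as-is.  */
  base_off += byte_offset;

  /* addr_base = base + base_offset (fold_build_pointer_plus).  */
  return base + base_off;
}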
4047 tree
4048 vect_create_addr_base_for_vector_ref (gimple *stmt,
4049 gimple_seq *new_stmt_list,
4050 tree offset,
4051 tree byte_offset)
4053 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4054 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4055 const char *base_name;
4056 tree addr_base;
4057 tree dest;
4058 gimple_seq seq = NULL;
4059 tree vect_ptr_type;
4060 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
4061 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4062 innermost_loop_behavior *drb = vect_dr_behavior (dr);
4064 tree data_ref_base = unshare_expr (drb->base_address);
4065 tree base_offset = unshare_expr (drb->offset);
4066 tree init = unshare_expr (drb->init);
4068 if (loop_vinfo)
4069 base_name = get_name (data_ref_base);
4070 else
4072 base_offset = ssize_int (0);
4073 init = ssize_int (0);
4074 base_name = get_name (DR_REF (dr));
4077 /* Create base_offset */
4078 base_offset = size_binop (PLUS_EXPR,
4079 fold_convert (sizetype, base_offset),
4080 fold_convert (sizetype, init));
4082 if (offset)
4084 offset = fold_build2 (MULT_EXPR, sizetype,
4085 fold_convert (sizetype, offset), step);
4086 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4087 base_offset, offset);
4089 if (byte_offset)
4091 byte_offset = fold_convert (sizetype, byte_offset);
4092 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4093 base_offset, byte_offset);
4096 /* base + base_offset */
4097 if (loop_vinfo)
4098 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4099 else
4101 addr_base = build1 (ADDR_EXPR,
4102 build_pointer_type (TREE_TYPE (DR_REF (dr))),
4103 unshare_expr (DR_REF (dr)));
4106 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4107 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4108 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4109 gimple_seq_add_seq (new_stmt_list, seq);
4111 if (DR_PTR_INFO (dr)
4112 && TREE_CODE (addr_base) == SSA_NAME
4113 && !SSA_NAME_PTR_INFO (addr_base))
4115 vect_duplicate_ssa_name_ptr_info (addr_base, dr, stmt_info);
4116 if (offset || byte_offset)
4117 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4120 if (dump_enabled_p ())
4122 dump_printf_loc (MSG_NOTE, vect_location, "created ");
4123 dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
4124 dump_printf (MSG_NOTE, "\n");
4127 return addr_base;
4131 /* Function vect_create_data_ref_ptr.
4133 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4134 location accessed in the loop by STMT, along with the def-use update
4135 chain to appropriately advance the pointer through the loop iterations.
4136 Also set aliasing information for the pointer. This pointer is used by
4137 the callers to this function to create a memory reference expression for
4138 vector load/store access.
4140 Input:
4141 1. STMT: a stmt that references memory. Expected to be of the form
4142 GIMPLE_ASSIGN <name, data-ref> or
4143 GIMPLE_ASSIGN <data-ref, name>.
4144 2. AGGR_TYPE: the type of the reference, which should be either a vector
4145 or an array.
4146 3. AT_LOOP: the loop where the vector memref is to be created.
4147 4. OFFSET (optional): an offset to be added to the initial address accessed
4148 by the data-ref in STMT.
4149 5. BSI: location where the new stmts are to be placed if there is no loop
4150 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4151 pointing to the initial address.
4152 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4153 to the initial address accessed by the data-ref in STMT. This is
4154 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4155 in bytes.
4157 Output:
4158 1. Declare a new ptr to vector_type, and have it point to the base of the
4159 data reference (initial address accessed by the data reference).
4160 For example, for a vector of type V8HI, the following code is generated:
4162 v8hi *ap;
4163 ap = (v8hi *)initial_address;
4165 if OFFSET is not supplied:
4166 initial_address = &a[init];
4167 if OFFSET is supplied:
4168 initial_address = &a[init + OFFSET];
4169 if BYTE_OFFSET is supplied:
4170 initial_address = &a[init] + BYTE_OFFSET;
4172 Return the initial_address in INITIAL_ADDRESS.
4174 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4175 update the pointer in each iteration of the loop.
4177 Return the increment stmt that updates the pointer in PTR_INCR.
4179 3. Set INV_P to true if the access pattern of the data reference in the
4180 vectorized loop is invariant. Set it to false otherwise.
4182 4. Return the pointer. */
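/* A minimal sketch (not from the GCC sources) of the def-use cycle this
   function sets up, shown as the C a vectorized copy loop would
   correspond to.  'v8hi' follows the example above; example_copy is a
   hypothetical caller, and alignment and tail handling are ignored.  */

typedef short v8hi __attribute__ ((vector_size (16)));

static void
example_copy (short *dst, const short *src, int n)
{
  /* (2) the initial addresses, computed once in the loop preheader.  */
  v8hi *sp = (v8hi *) src;
  v8hi *dp = (v8hi *) dst;

  /* (3)/(4) each pointer is advanced by TYPE_SIZE_UNIT (aggr_type),
     i.e. one vector per iteration, like the IV created below.  */
  for (int i = 0; i < n / 8; i++)
    {
      *dp = *sp;
      sp++;     /* vp2 = vp1 + step  */
      dp++;
    }
}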
4184 tree
4185 vect_create_data_ref_ptr (gimple *stmt, tree aggr_type, struct loop *at_loop,
4186 tree offset, tree *initial_address,
4187 gimple_stmt_iterator *gsi, gimple **ptr_incr,
4188 bool only_init, bool *inv_p, tree byte_offset)
4190 const char *base_name;
4191 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4192 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4193 struct loop *loop = NULL;
4194 bool nested_in_vect_loop = false;
4195 struct loop *containing_loop = NULL;
4196 tree aggr_ptr_type;
4197 tree aggr_ptr;
4198 tree new_temp;
4199 gimple_seq new_stmt_list = NULL;
4200 edge pe = NULL;
4201 basic_block new_bb;
4202 tree aggr_ptr_init;
4203 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4204 tree aptr;
4205 gimple_stmt_iterator incr_gsi;
4206 bool insert_after;
4207 tree indx_before_incr, indx_after_incr;
4208 gimple *incr;
4209 tree step;
4210 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4212 gcc_assert (TREE_CODE (aggr_type) == ARRAY_TYPE
4213 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4215 if (loop_vinfo)
4217 loop = LOOP_VINFO_LOOP (loop_vinfo);
4218 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4219 containing_loop = (gimple_bb (stmt))->loop_father;
4220 pe = loop_preheader_edge (loop);
4222 else
4224 gcc_assert (bb_vinfo);
4225 only_init = true;
4226 *ptr_incr = NULL;
4229 /* Check the step (evolution) of the load in LOOP, and record
4230 whether it's invariant. */
4231 step = vect_dr_behavior (dr)->step;
4232 if (integer_zerop (step))
4233 *inv_p = true;
4234 else
4235 *inv_p = false;
4237 /* Create an expression for the first address accessed by this load
4238 in LOOP. */
4239 base_name = get_name (DR_BASE_ADDRESS (dr));
4241 if (dump_enabled_p ())
4243 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4244 dump_printf_loc (MSG_NOTE, vect_location,
4245 "create %s-pointer variable to type: ",
4246 get_tree_code_name (TREE_CODE (aggr_type)));
4247 dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4248 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4249 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4250 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4251 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4252 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4253 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4254 else
4255 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4256 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4257 dump_printf (MSG_NOTE, "\n");
4260 /* (1) Create the new aggregate-pointer variable.
4261 Vector and array types inherit the alias set of their component
4262 type by default, so we need to use a ref-all pointer if the alias
4263 set of the original data reference does not conflict with that of
4264 the created aggregate reference (e.g. because it is not addressable). */
4265 bool need_ref_all = false;
4266 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4267 get_alias_set (DR_REF (dr))))
4268 need_ref_all = true;
4269 /* Likewise for any of the data references in the stmt group. */
4270 else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
4272 gimple *orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
4275 stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
4276 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4277 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4278 get_alias_set (DR_REF (sdr))))
4280 need_ref_all = true;
4281 break;
4283 orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);
4285 while (orig_stmt);
4287 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4288 need_ref_all);
4289 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4292 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4293 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4294 def-use update cycles for the pointer: one relative to the outer-loop
4295 (LOOP), which is what steps (3) and (4) below do. The other is relative
4296 to the inner-loop (which is the inner-most loop containing the dataref),
4297 and this is done by step (5) below.
4299 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4300 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4301 redundant. Steps (3),(4) create the following:
4303 vp0 = &base_addr;
4304 LOOP: vp1 = phi(vp0,vp2)
4307 vp2 = vp1 + step
4308 goto LOOP
4310 If there is an inner-loop nested in loop, then step (5) will also be
4311 applied, and an additional update in the inner-loop will be created:
4313 vp0 = &base_addr;
4314 LOOP: vp1 = phi(vp0,vp2)
4316 inner: vp3 = phi(vp1,vp4)
4317 vp4 = vp3 + inner_step
4318 if () goto inner
4320 vp2 = vp1 + step
4321 if () goto LOOP */
4323 /* (2) Calculate the initial address of the aggregate-pointer, and set
4324 the aggregate-pointer to point to it before the loop. */
4326 /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader. */
4328 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
4329 offset, byte_offset);
4330 if (new_stmt_list)
4332 if (pe)
4334 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4335 gcc_assert (!new_bb);
4337 else
4338 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4341 *initial_address = new_temp;
4342 aggr_ptr_init = new_temp;
4344 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4345 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4346 inner-loop nested in LOOP (during outer-loop vectorization). */
4348 /* No update in loop is required. */
4349 if (only_init && (!loop_vinfo || at_loop == loop))
4350 aptr = aggr_ptr_init;
4351 else
4353 /* The step of the aggregate pointer is the type size. */
4354 tree iv_step = TYPE_SIZE_UNIT (aggr_type);
4355 /* One exception to the above is when the scalar step of the load in
4356 LOOP is zero. In this case the step here is also zero. */
4357 if (*inv_p)
4358 iv_step = size_zero_node;
4359 else if (tree_int_cst_sgn (step) == -1)
4360 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4362 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4364 create_iv (aggr_ptr_init,
4365 fold_convert (aggr_ptr_type, iv_step),
4366 aggr_ptr, loop, &incr_gsi, insert_after,
4367 &indx_before_incr, &indx_after_incr);
4368 incr = gsi_stmt (incr_gsi);
4369 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4371 /* Copy the points-to information if it exists. */
4372 if (DR_PTR_INFO (dr))
4374 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info);
4375 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info);
4377 if (ptr_incr)
4378 *ptr_incr = incr;
4380 aptr = indx_before_incr;
4383 if (!nested_in_vect_loop || only_init)
4384 return aptr;
4387 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4388 nested in LOOP, if it exists. */
4390 gcc_assert (nested_in_vect_loop);
4391 if (!only_init)
4393 standard_iv_increment_position (containing_loop, &incr_gsi,
4394 &insert_after);
4395 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4396 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4397 &indx_after_incr);
4398 incr = gsi_stmt (incr_gsi);
4399 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4401 /* Copy the points-to information if it exists. */
4402 if (DR_PTR_INFO (dr))
4404 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info);
4405 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info);
4407 if (ptr_incr)
4408 *ptr_incr = incr;
4410 return indx_before_incr;
4412 else
4413 gcc_unreachable ();
4417 /* Function bump_vector_ptr
4419 Increment a pointer (to a vector type) by vector-size. If requested,
4420 i.e. if PTR-INCR is given, then also connect the new increment stmt
4421 to the existing def-use update-chain of the pointer, by modifying
4422 the PTR_INCR as illustrated below:
4424 The pointer def-use update-chain before this function:
4425 DATAREF_PTR = phi (p_0, p_2)
4426 ....
4427 PTR_INCR: p_2 = DATAREF_PTR + step
4429 The pointer def-use update-chain after this function:
4430 DATAREF_PTR = phi (p_0, p_2)
4431 ....
4432 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4433 ....
4434 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4436 Input:
4437 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4438 in the loop.
4439 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4440 the loop. The increment amount across iterations is expected
4441 to be vector_size.
4442 BSI - location where the new update stmt is to be placed.
4443 STMT - the original scalar memory-access stmt that is being vectorized.
4444 BUMP - optional. The offset by which to bump the pointer. If not given,
4445 the offset is assumed to be vector_size.
4447 Output: Return NEW_DATAREF_PTR as illustrated above.
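/* A scalar-code rendering (not from the GCC sources) of the before/after
   def-use chains in the comment above: the original increment PTR_INCR
   is rewired to start from the bumped pointer.  Names and the use of
   byte offsets are illustrative.  */

static short *
example_bump (short *dataref_ptr, long bump, long step)
{
  /* NEW_DATAREF_PTR = DATAREF_PTR + BUMP  */
  short *new_dataref_ptr = (short *) ((char *) dataref_ptr + bump);

  /* ... the additional vector access would use new_dataref_ptr here ...  */

  /* PTR_INCR: p_2 = NEW_DATAREF_PTR + step  */
  return (short *) ((char *) new_dataref_ptr + step);
}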
4451 tree
4452 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4453 gimple *stmt, tree bump)
4455 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4456 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4457 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4458 tree update = TYPE_SIZE_UNIT (vectype);
4459 gassign *incr_stmt;
4460 ssa_op_iter iter;
4461 use_operand_p use_p;
4462 tree new_dataref_ptr;
4464 if (bump)
4465 update = bump;
4467 if (TREE_CODE (dataref_ptr) == SSA_NAME)
4468 new_dataref_ptr = copy_ssa_name (dataref_ptr);
4469 else
4470 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4471 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4472 dataref_ptr, update);
4473 vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4475 /* Copy the points-to information if it exists. */
4476 if (DR_PTR_INFO (dr))
4478 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4479 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4482 if (!ptr_incr)
4483 return new_dataref_ptr;
4485 /* Update the vector-pointer's cross-iteration increment. */
4486 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4488 tree use = USE_FROM_PTR (use_p);
4490 if (use == dataref_ptr)
4491 SET_USE (use_p, new_dataref_ptr);
4492 else
4493 gcc_assert (tree_int_cst_compare (use, update) == 0);
4496 return new_dataref_ptr;
4500 /* Function vect_create_destination_var.
4502 Create a new temporary of type VECTYPE. */
4504 tree
4505 vect_create_destination_var (tree scalar_dest, tree vectype)
4507 tree vec_dest;
4508 const char *name;
4509 char *new_name;
4510 tree type;
4511 enum vect_var_kind kind;
4513 kind = vectype
4514 ? VECTOR_BOOLEAN_TYPE_P (vectype)
4515 ? vect_mask_var
4516 : vect_simple_var
4517 : vect_scalar_var;
4518 type = vectype ? vectype : TREE_TYPE (scalar_dest);
4520 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
4522 name = get_name (scalar_dest);
4523 if (name)
4524 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
4525 else
4526 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
4527 vec_dest = vect_get_new_vect_var (type, kind, new_name);
4528 free (new_name);
4530 return vec_dest;
4533 /* Function vect_grouped_store_supported.
4535 Returns TRUE if interleave high and interleave low permutations
4536 are supported, and FALSE otherwise. */
4538 bool
4539 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
4541 machine_mode mode = TYPE_MODE (vectype);
4543 /* vect_permute_store_chain requires the group size to be equal to 3 or
4544 be a power of two. */
4545 if (count != 3 && exact_log2 (count) == -1)
4547 if (dump_enabled_p ())
4548 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4549 "the size of the group of accesses"
4550 " is not a power of 2 or not equal to 3\n");
4551 return false;
4554 /* Check that the permutation is supported. */
4555 if (VECTOR_MODE_P (mode))
4557 unsigned int i, nelt = GET_MODE_NUNITS (mode);
4558 auto_vec_perm_indices sel (nelt);
4559 sel.quick_grow (nelt);
4561 if (count == 3)
4563 unsigned int j0 = 0, j1 = 0, j2 = 0;
4564 unsigned int i, j;
4566 for (j = 0; j < 3; j++)
4568 int nelt0 = ((3 - j) * nelt) % 3;
4569 int nelt1 = ((3 - j) * nelt + 1) % 3;
4570 int nelt2 = ((3 - j) * nelt + 2) % 3;
4571 for (i = 0; i < nelt; i++)
4573 if (3 * i + nelt0 < nelt)
4574 sel[3 * i + nelt0] = j0++;
4575 if (3 * i + nelt1 < nelt)
4576 sel[3 * i + nelt1] = nelt + j1++;
4577 if (3 * i + nelt2 < nelt)
4578 sel[3 * i + nelt2] = 0;
4580 if (!can_vec_perm_p (mode, false, &sel))
4582 if (dump_enabled_p ())
4583 dump_printf (MSG_MISSED_OPTIMIZATION,
4584 "permutation op not supported by target.\n");
4585 return false;
4588 for (i = 0; i < nelt; i++)
4590 if (3 * i + nelt0 < nelt)
4591 sel[3 * i + nelt0] = 3 * i + nelt0;
4592 if (3 * i + nelt1 < nelt)
4593 sel[3 * i + nelt1] = 3 * i + nelt1;
4594 if (3 * i + nelt2 < nelt)
4595 sel[3 * i + nelt2] = nelt + j2++;
4597 if (!can_vec_perm_p (mode, false, &sel))
4599 if (dump_enabled_p ())
4600 dump_printf (MSG_MISSED_OPTIMIZATION,
4601 "permutation op not supported by target.\n");
4602 return false;
4605 return true;
4607 else
4609 /* If length is not equal to 3 then only power of 2 is supported. */
4610 gcc_assert (pow2p_hwi (count));
4612 for (i = 0; i < nelt / 2; i++)
4614 sel[i * 2] = i;
4615 sel[i * 2 + 1] = i + nelt;
4617 if (can_vec_perm_p (mode, false, &sel))
4619 for (i = 0; i < nelt; i++)
4620 sel[i] += nelt / 2;
4621 if (can_vec_perm_p (mode, false, &sel))
4622 return true;
4627 if (dump_enabled_p ())
4628 dump_printf (MSG_MISSED_OPTIMIZATION,
4629 "permutation op not supported by target.\n");
4630 return false;
4634 /* Return TRUE if vec_store_lanes is available for COUNT vectors of
4635 type VECTYPE. */
4637 bool
4638 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
4640 return vect_lanes_optab_supported_p ("vec_store_lanes",
4641 vec_store_lanes_optab,
4642 vectype, count);
4646 /* Function vect_permute_store_chain.
4648 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4649 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
4650 the data correctly for the stores. Return the final references for stores
4651 in RESULT_CHAIN.
4653 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4654 The input is 4 vectors each containing 8 elements. We assign a number to
4655 each element, the input sequence is:
4657 1st vec: 0 1 2 3 4 5 6 7
4658 2nd vec: 8 9 10 11 12 13 14 15
4659 3rd vec: 16 17 18 19 20 21 22 23
4660 4th vec: 24 25 26 27 28 29 30 31
4662 The output sequence should be:
4664 1st vec: 0 8 16 24 1 9 17 25
4665 2nd vec: 2 10 18 26 3 11 19 27
4666 3rd vec: 4 12 20 28 5 13 21 29
4667 4th vec: 6 14 22 30 7 15 23 31
4669 i.e., we interleave the contents of the four vectors in their order.
4671 We use interleave_high/low instructions to create such output. The input of
4672 each interleave_high/low operation is two vectors:
4673 1st vec 2nd vec
4674 0 1 2 3 4 5 6 7
4675 the even elements of the result vector are obtained left-to-right from the
4676 high/low elements of the first vector. The odd elements of the result are
4677 obtained left-to-right from the high/low elements of the second vector.
4678 The output of interleave_high will be: 0 4 1 5
4679 and of interleave_low: 2 6 3 7
4682 The permutation is done in log LENGTH stages. In each stage interleave_high
4683 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4684 where the first argument is taken from the first half of DR_CHAIN and the
4685 second argument from its second half.
4686 In our example,
4688 I1: interleave_high (1st vec, 3rd vec)
4689 I2: interleave_low (1st vec, 3rd vec)
4690 I3: interleave_high (2nd vec, 4th vec)
4691 I4: interleave_low (2nd vec, 4th vec)
4693 The output for the first stage is:
4695 I1: 0 16 1 17 2 18 3 19
4696 I2: 4 20 5 21 6 22 7 23
4697 I3: 8 24 9 25 10 26 11 27
4698 I4: 12 28 13 29 14 30 15 31
4700 The output of the second stage, i.e. the final result is:
4702 I1: 0 8 16 24 1 9 17 25
4703 I2: 2 10 18 26 3 11 19 27
4704 I3: 4 12 20 28 5 13 21 29
4705 I4: 6 14 22 30 7 15 23 31. */
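/* A minimal sketch (not from the GCC sources): the two selection vectors
   built below for the power-of-two case, so the interleave_high/low
   pattern in the example above can be checked by hand.  The function
   name is illustrative; NELT corresponds to TYPE_VECTOR_SUBPARTS.  */

static void
example_interleave_masks (unsigned int nelt, unsigned int *high,
                          unsigned int *low)
{
  for (unsigned int i = 0; i < nelt / 2; i++)
    {
      high[i * 2] = i;              /* element i of the first vector   */
      high[i * 2 + 1] = i + nelt;   /* element i of the second vector  */
    }
  /* interleave_low selects the upper halves instead.  */
  for (unsigned int i = 0; i < nelt; i++)
    low[i] = high[i] + nelt / 2;
  /* For nelt == 8:
       high = { 0, 8, 1, 9, 2, 10, 3, 11 }
       low  = { 4, 12, 5, 13, 6, 14, 7, 15 }  */
}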
4707 void
4708 vect_permute_store_chain (vec<tree> dr_chain,
4709 unsigned int length,
4710 gimple *stmt,
4711 gimple_stmt_iterator *gsi,
4712 vec<tree> *result_chain)
4714 tree vect1, vect2, high, low;
4715 gimple *perm_stmt;
4716 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4717 tree perm_mask_low, perm_mask_high;
4718 tree data_ref;
4719 tree perm3_mask_low, perm3_mask_high;
4720 unsigned int i, n, log_length = exact_log2 (length);
4721 unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype);
4723 auto_vec_perm_indices sel (nelt);
4724 sel.quick_grow (nelt);
4726 result_chain->quick_grow (length);
4727 memcpy (result_chain->address (), dr_chain.address (),
4728 length * sizeof (tree));
4730 if (length == 3)
4732 unsigned int j0 = 0, j1 = 0, j2 = 0;
4734 for (j = 0; j < 3; j++)
4736 int nelt0 = ((3 - j) * nelt) % 3;
4737 int nelt1 = ((3 - j) * nelt + 1) % 3;
4738 int nelt2 = ((3 - j) * nelt + 2) % 3;
4740 for (i = 0; i < nelt; i++)
4742 if (3 * i + nelt0 < nelt)
4743 sel[3 * i + nelt0] = j0++;
4744 if (3 * i + nelt1 < nelt)
4745 sel[3 * i + nelt1] = nelt + j1++;
4746 if (3 * i + nelt2 < nelt)
4747 sel[3 * i + nelt2] = 0;
4749 perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
4751 for (i = 0; i < nelt; i++)
4753 if (3 * i + nelt0 < nelt)
4754 sel[3 * i + nelt0] = 3 * i + nelt0;
4755 if (3 * i + nelt1 < nelt)
4756 sel[3 * i + nelt1] = 3 * i + nelt1;
4757 if (3 * i + nelt2 < nelt)
4758 sel[3 * i + nelt2] = nelt + j2++;
4760 perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
4762 vect1 = dr_chain[0];
4763 vect2 = dr_chain[1];
4765 /* Create interleaving stmt:
4766 low = VEC_PERM_EXPR <vect1, vect2,
4767 {j, nelt, *, j + 1, nelt + j + 1, *,
4768 j + 2, nelt + j + 2, *, ...}> */
4769 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
4770 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4771 vect2, perm3_mask_low);
4772 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4774 vect1 = data_ref;
4775 vect2 = dr_chain[2];
4776 /* Create interleaving stmt:
4777 low = VEC_PERM_EXPR <vect1, vect2,
4778 {0, 1, nelt + j, 3, 4, nelt + j + 1,
4779 6, 7, nelt + j + 2, ...}> */
4780 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
4781 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4782 vect2, perm3_mask_high);
4783 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4784 (*result_chain)[j] = data_ref;
4787 else
4789 /* If length is not equal to 3 then only power of 2 is supported. */
4790 gcc_assert (pow2p_hwi (length));
4792 for (i = 0, n = nelt / 2; i < n; i++)
4794 sel[i * 2] = i;
4795 sel[i * 2 + 1] = i + nelt;
4797 perm_mask_high = vect_gen_perm_mask_checked (vectype, sel);
4799 for (i = 0; i < nelt; i++)
4800 sel[i] += nelt / 2;
4801 perm_mask_low = vect_gen_perm_mask_checked (vectype, sel);
4803 for (i = 0, n = log_length; i < n; i++)
4805 for (j = 0; j < length/2; j++)
4807 vect1 = dr_chain[j];
4808 vect2 = dr_chain[j+length/2];
4810 /* Create interleaving stmt:
4811 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
4812 ...}> */
4813 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
4814 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
4815 vect2, perm_mask_high);
4816 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4817 (*result_chain)[2*j] = high;
4819 /* Create interleaving stmt:
4820 low = VEC_PERM_EXPR <vect1, vect2,
4821 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
4822 ...}> */
4823 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
4824 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
4825 vect2, perm_mask_low);
4826 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4827 (*result_chain)[2*j+1] = low;
4829 memcpy (dr_chain.address (), result_chain->address (),
4830 length * sizeof (tree));
4835 /* Function vect_setup_realignment
4837 This function is called when vectorizing an unaligned load using
4838 the dr_explicit_realign[_optimized] scheme.
4839 This function generates the following code at the loop prolog:
4841 p = initial_addr;
4842 x msq_init = *(floor(p)); # prolog load
4843 realignment_token = call target_builtin;
4844 loop:
4845 x msq = phi (msq_init, ---)
4847 The stmts marked with x are generated only for the case of
4848 dr_explicit_realign_optimized.
4850 The code above sets up a new (vector) pointer, pointing to the first
4851 location accessed by STMT, and a "floor-aligned" load using that pointer.
4852 It also generates code to compute the "realignment-token" (if the relevant
4853 target hook was defined), and creates a phi-node at the loop-header bb
4854 whose arguments are the result of the prolog-load (created by this
4855 function) and the result of a load that takes place in the loop (to be
4856 created by the caller to this function).
4858 For the case of dr_explicit_realign_optimized:
4859 The caller to this function uses the phi-result (msq) to create the
4860 realignment code inside the loop, and sets up the missing phi argument,
4861 as follows:
4862 loop:
4863 msq = phi (msq_init, lsq)
4864 lsq = *(floor(p')); # load in loop
4865 result = realign_load (msq, lsq, realignment_token);
4867 For the case of dr_explicit_realign:
4868 loop:
4869 msq = *(floor(p)); # load in loop
4870 p' = p + (VS-1);
4871 lsq = *(floor(p')); # load in loop
4872 result = realign_load (msq, lsq, realignment_token);
4874 Input:
4875 STMT - (scalar) load stmt to be vectorized. This load accesses
4876 a memory location that may be unaligned.
4877 BSI - place where new code is to be inserted.
4878 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
4879 is used.
4881 Output:
4882 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
4883 target hook, if defined.
4884 Return value - the result of the loop-header phi node. */
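/* A scalar C model (not from the GCC sources) of the dr_explicit_realign
   scheme described above, using byte copies in place of the target's
   realign_load; VECSZ, VFLOOR and the function name are illustrative
   assumptions.  */

#define VECSZ 16UL   /* bytes per vector  */
#define VFLOOR(q) ((const unsigned char *) ((unsigned long) (q) & ~(VECSZ - 1)))

static void
example_realign_load (const unsigned char *p, unsigned char out[16])
{
  unsigned long misalign = (unsigned long) p & (VECSZ - 1);
  const unsigned char *msq = VFLOOR (p);              /* msq = *(floor (p))          */
  const unsigned char *lsq = VFLOOR (p + VECSZ - 1);  /* lsq = *(floor (p + VS - 1)) */

  /* result = realign_load (msq, lsq, token): the last VECSZ - misalign
     bytes of msq followed by the first misalign bytes of lsq, which
     together are exactly p[0 .. VECSZ-1].  */
  for (unsigned long i = 0; i < VECSZ; i++)
    out[i] = i < VECSZ - misalign ? msq[misalign + i]
                                  : lsq[i - (VECSZ - misalign)];
}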
4886 tree
4887 vect_setup_realignment (gimple *stmt, gimple_stmt_iterator *gsi,
4888 tree *realignment_token,
4889 enum dr_alignment_support alignment_support_scheme,
4890 tree init_addr,
4891 struct loop **at_loop)
4893 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4894 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4895 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4896 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4897 struct loop *loop = NULL;
4898 edge pe = NULL;
4899 tree scalar_dest = gimple_assign_lhs (stmt);
4900 tree vec_dest;
4901 gimple *inc;
4902 tree ptr;
4903 tree data_ref;
4904 basic_block new_bb;
4905 tree msq_init = NULL_TREE;
4906 tree new_temp;
4907 gphi *phi_stmt;
4908 tree msq = NULL_TREE;
4909 gimple_seq stmts = NULL;
4910 bool inv_p;
4911 bool compute_in_loop = false;
4912 bool nested_in_vect_loop = false;
4913 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
4914 struct loop *loop_for_initial_load = NULL;
4916 if (loop_vinfo)
4918 loop = LOOP_VINFO_LOOP (loop_vinfo);
4919 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4922 gcc_assert (alignment_support_scheme == dr_explicit_realign
4923 || alignment_support_scheme == dr_explicit_realign_optimized);
4925 /* We need to generate three things:
4926 1. the misalignment computation
4927 2. the extra vector load (for the optimized realignment scheme).
4928 3. the phi node for the two vectors from which the realignment is
4929 done (for the optimized realignment scheme). */
4931 /* 1. Determine where to generate the misalignment computation.
4933 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
4934 calculation will be generated by this function, outside the loop (in the
4935 preheader). Otherwise, INIT_ADDR had already been computed for us by the
4936 caller, inside the loop.
4938 Background: If the misalignment remains fixed throughout the iterations of
4939 the loop, then both realignment schemes are applicable, and also the
4940 misalignment computation can be done outside LOOP. This is because we are
4941 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
4942 are a multiple of VS (the Vector Size), and therefore the misalignment in
4943 different vectorized LOOP iterations is always the same.
4944 The problem arises only if the memory access is in an inner-loop nested
4945 inside LOOP, which is now being vectorized using outer-loop vectorization.
4946 This is the only case when the misalignment of the memory access may not
4947 remain fixed throughout the iterations of the inner-loop (as explained in
4948 detail in vect_supportable_dr_alignment). In this case, not only is the
4949 optimized realignment scheme not applicable, but also the misalignment
4950 computation (and generation of the realignment token that is passed to
4951 REALIGN_LOAD) have to be done inside the loop.
4953 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
4954 or not, which in turn determines if the misalignment is computed inside
4955 the inner-loop, or outside LOOP. */
4957 if (init_addr != NULL_TREE || !loop_vinfo)
4959 compute_in_loop = true;
4960 gcc_assert (alignment_support_scheme == dr_explicit_realign);
4964 /* 2. Determine where to generate the extra vector load.
4966 For the optimized realignment scheme, instead of generating two vector
4967 loads in each iteration, we generate a single extra vector load in the
4968 preheader of the loop, and in each iteration reuse the result of the
4969 vector load from the previous iteration. In case the memory access is in
4970 an inner-loop nested inside LOOP, which is now being vectorized using
4971 outer-loop vectorization, we need to determine whether this initial vector
4972 load should be generated at the preheader of the inner-loop, or can be
4973 generated at the preheader of LOOP. If the memory access has no evolution
4974 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
4975 to be generated inside LOOP (in the preheader of the inner-loop). */
4977 if (nested_in_vect_loop)
4979 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
4980 bool invariant_in_outerloop =
4981 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
4982 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
4984 else
4985 loop_for_initial_load = loop;
4986 if (at_loop)
4987 *at_loop = loop_for_initial_load;
4989 if (loop_for_initial_load)
4990 pe = loop_preheader_edge (loop_for_initial_load);
4992 /* 3. For the case of the optimized realignment, create the first vector
4993 load at the loop preheader. */
4995 if (alignment_support_scheme == dr_explicit_realign_optimized)
4997 /* Create msq_init = *(floor(p1)) in the loop preheader */
4998 gassign *new_stmt;
5000 gcc_assert (!compute_in_loop);
5001 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5002 ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
5003 NULL_TREE, &init_addr, NULL, &inc,
5004 true, &inv_p);
5005 if (TREE_CODE (ptr) == SSA_NAME)
5006 new_temp = copy_ssa_name (ptr);
5007 else
5008 new_temp = make_ssa_name (TREE_TYPE (ptr));
5009 new_stmt = gimple_build_assign
5010 (new_temp, BIT_AND_EXPR, ptr,
5011 build_int_cst (TREE_TYPE (ptr),
5012 -(HOST_WIDE_INT)TYPE_ALIGN_UNIT (vectype)));
5013 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5014 gcc_assert (!new_bb);
5015 data_ref
5016 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5017 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5018 new_stmt = gimple_build_assign (vec_dest, data_ref);
5019 new_temp = make_ssa_name (vec_dest, new_stmt);
5020 gimple_assign_set_lhs (new_stmt, new_temp);
5021 if (pe)
5023 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5024 gcc_assert (!new_bb);
5026 else
5027 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5029 msq_init = gimple_assign_lhs (new_stmt);
5032 /* 4. Create realignment token using a target builtin, if available.
5033 It is done either inside the containing loop, or before LOOP (as
5034 determined above). */
5036 if (targetm.vectorize.builtin_mask_for_load)
5038 gcall *new_stmt;
5039 tree builtin_decl;
5041 /* Compute INIT_ADDR - the initial address accessed by this memref. */
5042 if (!init_addr)
5044 /* Generate the INIT_ADDR computation outside LOOP. */
5045 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
5046 NULL_TREE);
5047 if (loop)
5049 pe = loop_preheader_edge (loop);
5050 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5051 gcc_assert (!new_bb);
5053 else
5054 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5057 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5058 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5059 vec_dest =
5060 vect_create_destination_var (scalar_dest,
5061 gimple_call_return_type (new_stmt));
5062 new_temp = make_ssa_name (vec_dest, new_stmt);
5063 gimple_call_set_lhs (new_stmt, new_temp);
5065 if (compute_in_loop)
5066 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5067 else
5069 /* Generate the misalignment computation outside LOOP. */
5070 pe = loop_preheader_edge (loop);
5071 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5072 gcc_assert (!new_bb);
5075 *realignment_token = gimple_call_lhs (new_stmt);
5077 /* The result of the CALL_EXPR to this builtin is determined from
5078 the value of the parameter and no global variables are touched
5079 which makes the builtin a "const" function. Requiring the
5080 builtin to have the "const" attribute makes it unnecessary
5081 to call mark_call_clobbered. */
5082 gcc_assert (TREE_READONLY (builtin_decl));
5085 if (alignment_support_scheme == dr_explicit_realign)
5086 return msq;
5088 gcc_assert (!compute_in_loop);
5089 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5092 /* 5. Create msq = phi <msq_init, lsq> in loop */
5094 pe = loop_preheader_edge (containing_loop);
5095 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5096 msq = make_ssa_name (vec_dest);
5097 phi_stmt = create_phi_node (msq, containing_loop->header);
5098 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5100 return msq;
5104 /* Function vect_grouped_load_supported.
5106 COUNT is the size of the load group (the number of statements plus the
5107 number of gaps). SINGLE_ELEMENT_P is true if there is actually
5108 only one statement, with a gap of COUNT - 1.
5110 Returns true if a suitable permute exists. */
5112 bool
5113 vect_grouped_load_supported (tree vectype, bool single_element_p,
5114 unsigned HOST_WIDE_INT count)
5116 machine_mode mode = TYPE_MODE (vectype);
5118 /* If this is single-element interleaving with an element distance
5119 that leaves unused vector loads around, punt - we would at least create
5120 very sub-optimal code in that case (and blow up memory,
5121 see PR65518). */
5122 if (single_element_p && count > TYPE_VECTOR_SUBPARTS (vectype))
5124 if (dump_enabled_p ())
5125 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5126 "single-element interleaving not supported "
5127 "for non-adjacent vector loads\n");
5128 return false;
5131 /* vect_permute_load_chain requires the group size to be equal to 3 or
5132 be a power of two. */
5133 if (count != 3 && exact_log2 (count) == -1)
5135 if (dump_enabled_p ())
5136 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5137 "the size of the group of accesses"
5138 " is not a power of 2 or not equal to 3\n");
5139 return false;
5142 /* Check that the permutation is supported. */
5143 if (VECTOR_MODE_P (mode))
5145 unsigned int i, j, nelt = GET_MODE_NUNITS (mode);
5146 auto_vec_perm_indices sel (nelt);
5147 sel.quick_grow (nelt);
5149 if (count == 3)
5151 unsigned int k;
5152 for (k = 0; k < 3; k++)
5154 for (i = 0; i < nelt; i++)
5155 if (3 * i + k < 2 * nelt)
5156 sel[i] = 3 * i + k;
5157 else
5158 sel[i] = 0;
5159 if (!can_vec_perm_p (mode, false, &sel))
5161 if (dump_enabled_p ())
5162 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5163 "shuffle of 3 loads is not supported by"
5164 " target\n");
5165 return false;
5167 for (i = 0, j = 0; i < nelt; i++)
5168 if (3 * i + k < 2 * nelt)
5169 sel[i] = i;
5170 else
5171 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5172 if (!can_vec_perm_p (mode, false, &sel))
5174 if (dump_enabled_p ())
5175 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5176 "shuffle of 3 loads is not supported by"
5177 " target\n");
5178 return false;
5181 return true;
5183 else
5185 /* If length is not equal to 3 then only power of 2 is supported. */
5186 gcc_assert (pow2p_hwi (count));
5187 for (i = 0; i < nelt; i++)
5188 sel[i] = i * 2;
5189 if (can_vec_perm_p (mode, false, &sel))
5191 for (i = 0; i < nelt; i++)
5192 sel[i] = i * 2 + 1;
5193 if (can_vec_perm_p (mode, false, &sel))
5194 return true;
5199 if (dump_enabled_p ())
5200 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5201 "extract even/odd not supported by target\n");
5202 return false;
5205 /* Return TRUE if vec_load_lanes is available for COUNT vectors of
5206 type VECTYPE. */
5208 bool
5209 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
5211 return vect_lanes_optab_supported_p ("vec_load_lanes",
5212 vec_load_lanes_optab,
5213 vectype, count);
5216 /* Function vect_permute_load_chain.
5218 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5219 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5220 the input data correctly. Return the final references for loads in
5221 RESULT_CHAIN.
5223 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5224 The input is 4 vectors each containing 8 elements. We assign a number to each
5225 element, the input sequence is:
5227 1st vec: 0 1 2 3 4 5 6 7
5228 2nd vec: 8 9 10 11 12 13 14 15
5229 3rd vec: 16 17 18 19 20 21 22 23
5230 4th vec: 24 25 26 27 28 29 30 31
5232 The output sequence should be:
5234 1st vec: 0 4 8 12 16 20 24 28
5235 2nd vec: 1 5 9 13 17 21 25 29
5236 3rd vec: 2 6 10 14 18 22 26 30
5237 4th vec: 3 7 11 15 19 23 27 31
5239 i.e., the first output vector should contain the first elements of each
5240 interleaving group, etc.
5242 We use extract_even/odd instructions to create such output. The input of
5243 each extract_even/odd operation is two vectors
5244 1st vec 2nd vec
5245 0 1 2 3 4 5 6 7
5247 and the output is the vector of extracted even/odd elements. The output of
5248 extract_even will be: 0 2 4 6
5249 and of extract_odd: 1 3 5 7
5252 The permutation is done in log LENGTH stages. In each stage extract_even
5253 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5254 their order. In our example,
5256 E1: extract_even (1st vec, 2nd vec)
5257 E2: extract_odd (1st vec, 2nd vec)
5258 E3: extract_even (3rd vec, 4th vec)
5259 E4: extract_odd (3rd vec, 4th vec)
5261 The output for the first stage will be:
5263 E1: 0 2 4 6 8 10 12 14
5264 E2: 1 3 5 7 9 11 13 15
5265 E3: 16 18 20 22 24 26 28 30
5266 E4: 17 19 21 23 25 27 29 31
5268 In order to proceed and create the correct sequence for the next stage (or
5269 for the correct output, if the second stage is the last one, as in our
5270 example), we first put the output of extract_even operation and then the
5271 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5272 The input for the second stage is:
5274 1st vec (E1): 0 2 4 6 8 10 12 14
5275 2nd vec (E3): 16 18 20 22 24 26 28 30
5276 3rd vec (E2): 1 3 5 7 9 11 13 15
5277 4th vec (E4): 17 19 21 23 25 27 29 31
5279 The output of the second stage:
5281 E1: 0 4 8 12 16 20 24 28
5282 E2: 2 6 10 14 18 22 26 30
5283 E3: 1 5 9 13 17 21 25 29
5284 E4: 3 7 11 15 19 23 27 31
5286 And RESULT_CHAIN after reordering:
5288 1st vec (E1): 0 4 8 12 16 20 24 28
5289 2nd vec (E3): 1 5 9 13 17 21 25 29
5290 3rd vec (E2): 2 6 10 14 18 22 26 30
5291 4th vec (E4): 3 7 11 15 19 23 27 31. */
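/* A minimal sketch (not from the GCC sources): the two selection vectors
   built below for the power-of-two case.  extract_even takes elements
   0, 2, 4, ... of the concatenation of two input vectors, extract_odd
   takes 1, 3, 5, ...; the function name is illustrative.  */

static void
example_extract_masks (unsigned int nelt, unsigned int *even,
                       unsigned int *odd)
{
  for (unsigned int i = 0; i < nelt; i++)
    {
      even[i] = i * 2;       /* {0, 2, 4, ..., 2*nelt - 2}  */
      odd[i] = i * 2 + 1;    /* {1, 3, 5, ..., 2*nelt - 1}  */
    }
  /* Applying these masks log2 (LENGTH) times, storing the even results
     before the odd ones, produces the reordering worked through in the
     comment above.  */
}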
5293 static void
5294 vect_permute_load_chain (vec<tree> dr_chain,
5295 unsigned int length,
5296 gimple *stmt,
5297 gimple_stmt_iterator *gsi,
5298 vec<tree> *result_chain)
5300 tree data_ref, first_vect, second_vect;
5301 tree perm_mask_even, perm_mask_odd;
5302 tree perm3_mask_low, perm3_mask_high;
5303 gimple *perm_stmt;
5304 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5305 unsigned int i, j, log_length = exact_log2 (length);
5306 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5308 auto_vec_perm_indices sel (nelt);
5309 sel.quick_grow (nelt);
5311 result_chain->quick_grow (length);
5312 memcpy (result_chain->address (), dr_chain.address (),
5313 length * sizeof (tree));
5315 if (length == 3)
5317 unsigned int k;
5319 for (k = 0; k < 3; k++)
5321 for (i = 0; i < nelt; i++)
5322 if (3 * i + k < 2 * nelt)
5323 sel[i] = 3 * i + k;
5324 else
5325 sel[i] = 0;
5326 perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
5328 for (i = 0, j = 0; i < nelt; i++)
5329 if (3 * i + k < 2 * nelt)
5330 sel[i] = i;
5331 else
5332 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5334 perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
5336 first_vect = dr_chain[0];
5337 second_vect = dr_chain[1];
5339 /* Create interleaving stmt (low part of):
5340 low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5341 ...}> */
5342 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5343 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5344 second_vect, perm3_mask_low);
5345 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5347 /* Create interleaving stmt (high part of):
5348 high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5349 ...}> */
5350 first_vect = data_ref;
5351 second_vect = dr_chain[2];
5352 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5353 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5354 second_vect, perm3_mask_high);
5355 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5356 (*result_chain)[k] = data_ref;
5359 else
5361 /* If length is not equal to 3 then only power of 2 is supported. */
5362 gcc_assert (pow2p_hwi (length));
5364 for (i = 0; i < nelt; ++i)
5365 sel[i] = i * 2;
5366 perm_mask_even = vect_gen_perm_mask_checked (vectype, sel);
5368 for (i = 0; i < nelt; ++i)
5369 sel[i] = i * 2 + 1;
5370 perm_mask_odd = vect_gen_perm_mask_checked (vectype, sel);
5372 for (i = 0; i < log_length; i++)
5374 for (j = 0; j < length; j += 2)
5376 first_vect = dr_chain[j];
5377 second_vect = dr_chain[j+1];
5379 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5380 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5381 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5382 first_vect, second_vect,
5383 perm_mask_even);
5384 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5385 (*result_chain)[j/2] = data_ref;
5387 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5388 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5389 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5390 first_vect, second_vect,
5391 perm_mask_odd);
5392 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5393 (*result_chain)[j/2+length/2] = data_ref;
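	      /* Each pass stores the even-permuted vectors in the first half
		 of RESULT_CHAIN and the odd-permuted vectors in the second
		 half; the copy below feeds them back in as the input of the
		 next pass.  */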
5395 memcpy (dr_chain.address (), result_chain->address (),
5396 length * sizeof (tree));
5401 /* Function vect_shift_permute_load_chain.
5403 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
5404 sequence of stmts to reorder the input data accordingly.
5405 Return the final references for loads in RESULT_CHAIN.
5406    Return true if it succeeds, false otherwise.
5408 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5409 The input is 3 vectors each containing 8 elements. We assign a
5410 number to each element, the input sequence is:
5412 1st vec: 0 1 2 3 4 5 6 7
5413 2nd vec: 8 9 10 11 12 13 14 15
5414 3rd vec: 16 17 18 19 20 21 22 23
5416 The output sequence should be:
5418 1st vec: 0 3 6 9 12 15 18 21
5419 2nd vec: 1 4 7 10 13 16 19 22
5420 3rd vec: 2 5 8 11 14 17 20 23
5422 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
5424    First we shuffle all 3 vectors to get the correct element order:
5426 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
5427 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
5428 3rd vec: (16 19 22) (17 20 23) (18 21)
5430    Next we concatenate and shift the vectors 3 times:
5432 1st step:
5433 shift right by 6 the concatenation of:
5434 "1st vec" and "2nd vec"
5435 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
5436 "2nd vec" and "3rd vec"
5437 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
5438 "3rd vec" and "1st vec"
5439 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
5440 | New vectors |
5442 So that now new vectors are:
5444 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
5445 2nd vec: (10 13) (16 19 22) (17 20 23)
5446 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
5448 2nd step:
5449 shift right by 5 the concatenation of:
5450 "1st vec" and "3rd vec"
5451 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
5452 "2nd vec" and "1st vec"
5453 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
5454 "3rd vec" and "2nd vec"
5455 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
5456 | New vectors |
5458 So that now new vectors are:
5460 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
5461 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
5462 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
5464 3rd step:
5465 shift right by 5 the concatenation of:
5466 "1st vec" and "1st vec"
5467 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
5468 shift right by 3 the concatenation of:
5469 "2nd vec" and "2nd vec"
5470 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
5471 | New vectors |
5473 So that now all vectors are READY:
5474 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
5475 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
5476 3rd vec: ( 1 4 7) (10 13) (16 19 22)
5478    This algorithm is faster than the one in vect_permute_load_chain if:
5479    1. "shift of a concatenation" is faster than general permutation.
5480 This is usually so.
5481 2. The TARGET machine can't execute vector instructions in parallel.
5482       This is because each step of the algorithm depends on the previous one.
5483 The algorithm in vect_permute_load_chain is much more parallel.
5485 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
5488 static bool
5489 vect_shift_permute_load_chain (vec<tree> dr_chain,
5490 unsigned int length,
5491 gimple *stmt,
5492 gimple_stmt_iterator *gsi,
5493 vec<tree> *result_chain)
5495 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
5496 tree perm2_mask1, perm2_mask2, perm3_mask;
5497 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
5498 gimple *perm_stmt;
5500 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5501 unsigned int i;
5502 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5503 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5504 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5506 auto_vec_perm_indices sel (nelt);
5507 sel.quick_grow (nelt);
5509 result_chain->quick_grow (length);
5510 memcpy (result_chain->address (), dr_chain.address (),
5511 length * sizeof (tree));
5513 if (pow2p_hwi (length) && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 4)
5515 unsigned int j, log_length = exact_log2 (length);
5516 for (i = 0; i < nelt / 2; ++i)
5517 sel[i] = i * 2;
5518 for (i = 0; i < nelt / 2; ++i)
5519 sel[nelt / 2 + i] = i * 2 + 1;
5520 if (!can_vec_perm_p (TYPE_MODE (vectype), false, &sel))
5522 if (dump_enabled_p ())
5523 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5524 "shuffle of 2 fields structure is not \
5525 supported by target\n");
5526 return false;
5528 perm2_mask1 = vect_gen_perm_mask_checked (vectype, sel);
5530 for (i = 0; i < nelt / 2; ++i)
5531 sel[i] = i * 2 + 1;
5532 for (i = 0; i < nelt / 2; ++i)
5533 sel[nelt / 2 + i] = i * 2;
5534 if (!can_vec_perm_p (TYPE_MODE (vectype), false, &sel))
5536 if (dump_enabled_p ())
5537 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5538 "shuffle of 2 fields structure is not \
5539 supported by target\n");
5540 return false;
5542 perm2_mask2 = vect_gen_perm_mask_checked (vectype, sel);
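      /* With NELT == 8, PERM2_MASK1 = {0, 2, 4, 6, 1, 3, 5, 7} and
	 PERM2_MASK2 = {1, 3, 5, 7, 0, 2, 4, 6}: applied to a vector and
	 itself they gather its even elements into one half and its odd
	 elements into the other.  */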
5544 /* Generating permutation constant to shift all elements.
5545 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
5546 for (i = 0; i < nelt; i++)
5547 sel[i] = nelt / 2 + i;
5548 if (!can_vec_perm_p (TYPE_MODE (vectype), false, &sel))
5550 if (dump_enabled_p ())
5551 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5552 "shift permutation is not supported by target\n");
5553 return false;
5555 shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5557       /* Generating permutation constant to select a vector from the 2 inputs.
5558 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
5559 for (i = 0; i < nelt / 2; i++)
5560 sel[i] = i;
5561 for (i = nelt / 2; i < nelt; i++)
5562 sel[i] = nelt + i;
5563 if (!can_vec_perm_p (TYPE_MODE (vectype), false, &sel))
5565 if (dump_enabled_p ())
5566 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5567 "select is not supported by target\n");
5568 return false;
5570 select_mask = vect_gen_perm_mask_checked (vectype, sel);
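      /* In the loop below VECT[0] holds the even elements of FIRST_VECT in
	 its low half and the odd ones in its high half, while VECT[1] holds
	 the odd elements of SECOND_VECT in its low half and the even ones in
	 its high half.  SHIFT1_MASK then collects all the odd elements of the
	 pair and SELECT_MASK all the even ones, so the even parts end up in
	 the first half of RESULT_CHAIN and the odd parts in the second half.  */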
5572 for (i = 0; i < log_length; i++)
5574 for (j = 0; j < length; j += 2)
5576 first_vect = dr_chain[j];
5577 second_vect = dr_chain[j + 1];
5579 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5580 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5581 first_vect, first_vect,
5582 perm2_mask1);
5583 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5584 vect[0] = data_ref;
5586 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5587 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5588 second_vect, second_vect,
5589 perm2_mask2);
5590 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5591 vect[1] = data_ref;
5593 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
5594 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5595 vect[0], vect[1], shift1_mask);
5596 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5597 (*result_chain)[j/2 + length/2] = data_ref;
5599 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
5600 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5601 vect[0], vect[1], select_mask);
5602 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5603 (*result_chain)[j/2] = data_ref;
5605 memcpy (dr_chain.address (), result_chain->address (),
5606 length * sizeof (tree));
5608 return true;
5610 if (length == 3 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 2)
5612 unsigned int k = 0, l = 0;
5614       /* Generating permutation constant to get all elements in the right order.
5615 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
5616 for (i = 0; i < nelt; i++)
5618 if (3 * k + (l % 3) >= nelt)
5620 k = 0;
5621 l += (3 - (nelt % 3));
5623 sel[i] = 3 * k + (l % 3);
5624 k++;
5626 if (!can_vec_perm_p (TYPE_MODE (vectype), false, &sel))
5628 if (dump_enabled_p ())
5629 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5630 "shuffle of 3 fields structure is not \
5631 supported by target\n");
5632 return false;
5634 perm3_mask = vect_gen_perm_mask_checked (vectype, sel);
5636 /* Generating permutation constant to shift all elements.
5637 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
5638 for (i = 0; i < nelt; i++)
5639 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
5640 if (!can_vec_perm_p (TYPE_MODE (vectype), false, &sel))
5642 if (dump_enabled_p ())
5643 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5644 "shift permutation is not supported by target\n");
5645 return false;
5647 shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5649 /* Generating permutation constant to shift all elements.
5650 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5651 for (i = 0; i < nelt; i++)
5652 sel[i] = 2 * (nelt / 3) + 1 + i;
5653 if (!can_vec_perm_p (TYPE_MODE (vectype), false, &sel))
5655 if (dump_enabled_p ())
5656 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5657 "shift permutation is not supported by target\n");
5658 return false;
5660 shift2_mask = vect_gen_perm_mask_checked (vectype, sel);
5662 /* Generating permutation constant to shift all elements.
5663 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
5664 for (i = 0; i < nelt; i++)
5665 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
5666 if (!can_vec_perm_p (TYPE_MODE (vectype), false, &sel))
5668 if (dump_enabled_p ())
5669 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5670 "shift permutation is not supported by target\n");
5671 return false;
5673 shift3_mask = vect_gen_perm_mask_checked (vectype, sel);
5675 /* Generating permutation constant to shift all elements.
5676 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5677 for (i = 0; i < nelt; i++)
5678 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
5679 if (!can_vec_perm_p (TYPE_MODE (vectype), false, &sel))
5681 if (dump_enabled_p ())
5682 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5683 "shift permutation is not supported by target\n");
5684 return false;
5686 shift4_mask = vect_gen_perm_mask_checked (vectype, sel);
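      /* The code below follows the steps described in the function comment:
	 first shuffle each input vector with PERM3_MASK, then perform the
	 first round of concatenate-and-shift with SHIFT1_MASK, the second
	 round with SHIFT2_MASK, and finally the two single-vector shifts
	 with SHIFT3_MASK and SHIFT4_MASK before storing the results into
	 RESULT_CHAIN.  */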
5688 for (k = 0; k < 3; k++)
5690 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
5691 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5692 dr_chain[k], dr_chain[k],
5693 perm3_mask);
5694 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5695 vect[k] = data_ref;
5698 for (k = 0; k < 3; k++)
5700 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
5701 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5702 vect[k % 3], vect[(k + 1) % 3],
5703 shift1_mask);
5704 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5705 vect_shift[k] = data_ref;
5708 for (k = 0; k < 3; k++)
5710 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
5711 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5712 vect_shift[(4 - k) % 3],
5713 vect_shift[(3 - k) % 3],
5714 shift2_mask);
5715 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5716 vect[k] = data_ref;
5719 (*result_chain)[3 - (nelt % 3)] = vect[2];
5721 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
5722 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
5723 vect[0], shift3_mask);
5724 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5725 (*result_chain)[nelt % 3] = data_ref;
5727 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
5728 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
5729 vect[1], shift4_mask);
5730 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5731 (*result_chain)[0] = data_ref;
5732 return true;
5734 return false;
5737 /* Function vect_transform_grouped_load.
5739 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
5740    to perform their permutation and attach the resulting vectorized statements
5741    to the scalar statements.
5744 void
5745 vect_transform_grouped_load (gimple *stmt, vec<tree> dr_chain, int size,
5746 gimple_stmt_iterator *gsi)
5748 machine_mode mode;
5749 vec<tree> result_chain = vNULL;
5751 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
5752      RESULT_CHAIN is the output of vect_permute_load_chain; it contains permuted
5753      vectors that are ready for vector computation.  */
5754 result_chain.create (size);
5756    /* If the reassociation width for the vector type is 2 or greater, the target
5757       machine can execute 2 or more vector instructions in parallel.  Otherwise
5758       try to get the chain for the load group using vect_shift_permute_load_chain.  */
5759 mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
5760 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
5761 || pow2p_hwi (size)
5762 || !vect_shift_permute_load_chain (dr_chain, size, stmt,
5763 gsi, &result_chain))
5764 vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
5765 vect_record_grouped_load_vectors (stmt, result_chain);
5766 result_chain.release ();
5769 /* RESULT_CHAIN contains the output of a group of grouped loads that were
5770 generated as part of the vectorization of STMT. Assign the statement
5771 for each vector to the associated scalar statement. */
5773 void
5774 vect_record_grouped_load_vectors (gimple *stmt, vec<tree> result_chain)
5776 gimple *first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
5777 gimple *next_stmt, *new_stmt;
5778 unsigned int i, gap_count;
5779 tree tmp_data_ref;
5781 /* Put a permuted data-ref in the VECTORIZED_STMT field.
5782      Since we scan the chain starting from its first node, their order
5783      corresponds to the order of data-refs in RESULT_CHAIN.  */
5784 next_stmt = first_stmt;
5785 gap_count = 1;
5786 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
5788 if (!next_stmt)
5789 break;
5791        /* Skip the gaps.  Loads created for the gaps will be removed by the dead
5792           code elimination pass later.  No need to check for the first stmt in
5793 the group, since it always exists.
5794 GROUP_GAP is the number of steps in elements from the previous
5795 access (if there is no gap GROUP_GAP is 1). We skip loads that
5796 correspond to the gaps. */
5797 if (next_stmt != first_stmt
5798 && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
5800 gap_count++;
5801 continue;
5804 while (next_stmt)
5806 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
5807 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
5808 copies, and we put the new vector statement in the first available
5809 RELATED_STMT. */
5810 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
5811 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
5812 else
5814 if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5816 gimple *prev_stmt =
5817 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
5818 gimple *rel_stmt =
5819 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
5820 while (rel_stmt)
5822 prev_stmt = rel_stmt;
5823 rel_stmt =
5824 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
5827 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
5828 new_stmt;
5832 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
5833 gap_count = 1;
5834 /* If NEXT_STMT accesses the same DR as the previous statement,
5835 put the same TMP_DATA_REF as its vectorized statement; otherwise
5836 get the next data-ref from RESULT_CHAIN. */
5837 if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5838 break;
5843  /* Function vect_can_force_dr_alignment_p.
5845     Returns whether the alignment of DECL can be forced to an ALIGNMENT-bit
5846     boundary.  */
5848 bool
5849 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
5851 if (!VAR_P (decl))
5852 return false;
5854 if (decl_in_symtab_p (decl)
5855 && !symtab_node::get (decl)->can_increase_alignment_p ())
5856 return false;
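  /* Static variables are limited by the maximum alignment GCC supports for
     objects in the output file, automatic variables by the maximum supported
     stack alignment.  */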
5858 if (TREE_STATIC (decl))
5859 return (alignment <= MAX_OFILE_ALIGNMENT);
5860 else
5861 return (alignment <= MAX_STACK_ALIGNMENT);
5865 /* Return whether the data reference DR is supported with respect to its
5866 alignment.
5867 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
5868     if it is aligned, i.e., check if it is possible to vectorize it with different
5869 alignment. */
5871 enum dr_alignment_support
5872 vect_supportable_dr_alignment (struct data_reference *dr,
5873 bool check_aligned_accesses)
5875 gimple *stmt = DR_STMT (dr);
5876 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5877 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5878 machine_mode mode = TYPE_MODE (vectype);
5879 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5880 struct loop *vect_loop = NULL;
5881 bool nested_in_vect_loop = false;
5883 if (aligned_access_p (dr) && !check_aligned_accesses)
5884 return dr_aligned;
5886 /* For now assume all conditional loads/stores support unaligned
5887 access without any special code. */
5888 if (is_gimple_call (stmt)
5889 && gimple_call_internal_p (stmt)
5890 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
5891 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
5892 return dr_unaligned_supported;
5894 if (loop_vinfo)
5896 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
5897 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
5900 /* Possibly unaligned access. */
5902 /* We can choose between using the implicit realignment scheme (generating
5903 a misaligned_move stmt) and the explicit realignment scheme (generating
5904 aligned loads with a REALIGN_LOAD). There are two variants to the
5905 explicit realignment scheme: optimized, and unoptimized.
5906 We can optimize the realignment only if the step between consecutive
5907 vector loads is equal to the vector size. Since the vector memory
5908 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
5909 is guaranteed that the misalignment amount remains the same throughout the
5910 execution of the vectorized loop. Therefore, we can create the
5911 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
5912 at the loop preheader.
5914 However, in the case of outer-loop vectorization, when vectorizing a
5915 memory access in the inner-loop nested within the LOOP that is now being
5916 vectorized, while it is guaranteed that the misalignment of the
5917 vectorized memory access will remain the same in different outer-loop
5918     iterations, it is *not* guaranteed that it will remain the same throughout
5919 the execution of the inner-loop. This is because the inner-loop advances
5920 with the original scalar step (and not in steps of VS). If the inner-loop
5921 step happens to be a multiple of VS, then the misalignment remains fixed
5922 and we can use the optimized realignment scheme. For example:
5924 for (i=0; i<N; i++)
5925 for (j=0; j<M; j++)
5926 s += a[i+j];
5928 When vectorizing the i-loop in the above example, the step between
5929 consecutive vector loads is 1, and so the misalignment does not remain
5930 fixed across the execution of the inner-loop, and the realignment cannot
5931 be optimized (as illustrated in the following pseudo vectorized loop):
5933 for (i=0; i<N; i+=4)
5934 for (j=0; j<M; j++){
5935 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
5936 // when j is {0,1,2,3,4,5,6,7,...} respectively.
5937 // (assuming that we start from an aligned address).
5940 We therefore have to use the unoptimized realignment scheme:
5942 for (i=0; i<N; i+=4)
5943 for (j=k; j<M; j+=4)
5944 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
5945 // that the misalignment of the initial address is
5946 // 0).
5948 The loop can then be vectorized as follows:
5950 for (k=0; k<4; k++){
5951 rt = get_realignment_token (&vp[k]);
5952 for (i=0; i<N; i+=4){
5953 v1 = vp[i+k];
5954 for (j=k; j<M; j+=4){
5955 v2 = vp[i+j+VS-1];
5956 va = REALIGN_LOAD <v1,v2,rt>;
5957 vs += va;
5958 v1 = v2;
5961 } */
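  /* Only loads can use the explicit realignment schemes below, and only when
     the target provides vec_realign_load (and, if it defines one, a working
     builtin_mask_for_load hook); all other misaligned accesses are checked
     against the target's support_vector_misalignment hook.  */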
5963 if (DR_IS_READ (dr))
5965 bool is_packed = false;
5966 tree type = (TREE_TYPE (DR_REF (dr)));
5968 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
5969 && (!targetm.vectorize.builtin_mask_for_load
5970 || targetm.vectorize.builtin_mask_for_load ()))
5972 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5974 /* If we are doing SLP then the accesses need not have the
5975 same alignment, instead it depends on the SLP group size. */
5976 if (loop_vinfo
5977 && STMT_SLP_TYPE (stmt_info)
5978 && (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5979 * GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)))
5980 % TYPE_VECTOR_SUBPARTS (vectype) != 0))
5982 else if (!loop_vinfo
5983 || (nested_in_vect_loop
5984 && (TREE_INT_CST_LOW (DR_STEP (dr))
5985 != GET_MODE_SIZE (TYPE_MODE (vectype)))))
5986 return dr_explicit_realign;
5987 else
5988 return dr_explicit_realign_optimized;
5990 if (!known_alignment_for_access_p (dr))
5991 is_packed = not_size_aligned (DR_REF (dr));
5993 if (targetm.vectorize.support_vector_misalignment
5994 (mode, type, DR_MISALIGNMENT (dr), is_packed))
5995 /* Can't software pipeline the loads, but can at least do them. */
5996 return dr_unaligned_supported;
5998 else
6000 bool is_packed = false;
6001 tree type = (TREE_TYPE (DR_REF (dr)));
6003 if (!known_alignment_for_access_p (dr))
6004 is_packed = not_size_aligned (DR_REF (dr));
6006 if (targetm.vectorize.support_vector_misalignment
6007 (mode, type, DR_MISALIGNMENT (dr), is_packed))
6008 return dr_unaligned_supported;
6011 /* Unsupported. */
6012 return dr_unaligned_unsupported;