1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2017 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "memmodel.h"
32 #include "tm_p.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "cgraph.h"
36 #include "dumpfile.h"
37 #include "alias.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "tree-eh.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop.h"
47 #include "cfgloop.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
50 #include "expr.h"
51 #include "builtins.h"
52 #include "params.h"
53 #include "tree-cfg.h"
54 #include "tree-hash-traits.h"
56 /* Return true if load- or store-lanes optab OPTAB is implemented for
57 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
59 static bool
60 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
61 tree vectype, unsigned HOST_WIDE_INT count)
63 machine_mode mode;
64 scalar_int_mode array_mode;
65 bool limit_p;
67 mode = TYPE_MODE (vectype);
68 limit_p = !targetm.array_mode_supported_p (mode, count);
69 if (!int_mode_for_size (count * GET_MODE_BITSIZE (mode),
70 limit_p).exists (&array_mode))
72 if (dump_enabled_p ())
73 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
74 "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC "]\n",
75 GET_MODE_NAME (mode), count);
76 return false;
79 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
81 if (dump_enabled_p ())
82 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
83 "cannot use %s<%s><%s>\n", name,
84 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
85 return false;
88 if (dump_enabled_p ())
89 dump_printf_loc (MSG_NOTE, vect_location,
90 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
91 GET_MODE_NAME (mode));
93 return true;
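/* A worked example for the function above (the numbers are illustrative
   and not tied to any particular target): for VECTYPE with mode V4SI,
   GET_MODE_BITSIZE (V4SImode) is 128, so with COUNT == 2 the code asks
   int_mode_for_size for a 256-bit integer mode to serve as the array
   mode and then requires a conversion handler for OPTAB between that
   array mode and V4SI; on success the dump would report something like
   "can use vec_load_lanes<OI><V4SI>".  */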
97 /* Return the smallest scalar part of STMT.
98 This is used to determine the vectype of the stmt. We generally set the
99 vectype according to the type of the result (lhs). For stmts whose
100 result-type is different than the type of the arguments (e.g., demotion,
101 promotion), vectype will be reset appropriately (later). Note that we have
102 to visit the smallest datatype in this function, because that determines the
103 VF. If the smallest datatype in the loop is present only as the rhs of a
104 promotion operation - we'd miss it.
105 Such a case, where a variable of this datatype does not appear in the lhs
106 anywhere in the loop, can only occur if it's an invariant: e.g.:
107 'int_x = (int) short_inv', which we'd expect to have been optimized away by
108 invariant motion. However, we cannot rely on invariant motion to always
109 take invariants out of the loop, and so in the case of promotion we also
110 have to check the rhs.
111 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
112 types. */
114 tree
115 vect_get_smallest_scalar_type (gimple *stmt, HOST_WIDE_INT *lhs_size_unit,
116 HOST_WIDE_INT *rhs_size_unit)
118 tree scalar_type = gimple_expr_type (stmt);
119 HOST_WIDE_INT lhs, rhs;
121 /* During the analysis phase, this function is called on arbitrary
122 statements that might not have scalar results. */
123 if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
124 return scalar_type;
126 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
128 if (is_gimple_assign (stmt)
129 && (gimple_assign_cast_p (stmt)
130 || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
131 || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
132 || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
134 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
136 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
137 if (rhs < lhs)
138 scalar_type = rhs_type;
141 *lhs_size_unit = lhs;
142 *rhs_size_unit = rhs;
143 return scalar_type;
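/* A sketch of the case described before vect_get_smallest_scalar_type
   (hypothetical loop, for illustration only):

     short *s; int *d;
     for (i = 0; i < n; i++)
       d[i] = (int) s[i];

   The smallest scalar type here is 'short', but it appears only as the
   rhs of the widening conversion, which is why the rhs type of casts,
   WIDEN_MULT_EXPR, WIDEN_LSHIFT_EXPR and FLOAT_EXPR is inspected too.  */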
147 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
148 tested at run-time. Return TRUE if DDR was successfully inserted.
149 Return false if versioning is not supported. */
151 static bool
152 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
154 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
156 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
157 return false;
159 if (!runtime_alias_check_p (ddr, loop,
160 optimize_loop_nest_for_speed_p (loop)))
161 return false;
163 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
164 return true;
168 /* A subroutine of vect_analyze_data_ref_dependence. Handle
169 DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
170 distances. These distances are conservatively correct but they don't
171 reflect a guaranteed dependence.
173 Return true if this function does all the work necessary to avoid
174 an alias or false if the caller should use the dependence distances
175 to limit the vectorization factor in the usual way. LOOP_DEPTH is
176 the depth of the loop described by LOOP_VINFO and the other arguments
177 are as for vect_analyze_data_ref_dependence. */
179 static bool
180 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
181 loop_vec_info loop_vinfo,
182 int loop_depth, int *max_vf)
184 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
185 lambda_vector dist_v;
186 unsigned int i;
187 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
189 int dist = dist_v[loop_depth];
190 if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
192 /* If the user asserted safelen >= DIST consecutive iterations
193 can be executed concurrently, assume independence.
195 ??? An alternative would be to add the alias check even
196 in this case, and vectorize the fallback loop with the
197 maximum VF set to safelen. However, if the user has
198 explicitly given a length, it's less likely that that
199 would be a win. */
200 if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
202 if (loop->safelen < *max_vf)
203 *max_vf = loop->safelen;
204 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
205 continue;
208 /* For dependence distances of 2 or more, we have the option
209 of limiting VF or checking for an alias at runtime.
210 Prefer to check at runtime if we can, to avoid limiting
211 the VF unnecessarily when the bases are in fact independent.
213 Note that the alias checks will be removed if the VF ends up
214 being small enough. */
215 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
218 return true;
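/* An illustrative use of the safelen test above (hypothetical source):
   with '#pragma omp simd safelen(8)' the user asserts that up to 8
   consecutive iterations may execute concurrently, so a conservative
   dependence distance of, say, 4 (<= 8) can be ignored, while *MAX_VF
   is clamped to 8 so that no more than 8 iterations are ever run
   together.  */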
222 /* Function vect_analyze_data_ref_dependence.
224 Return TRUE if there (might) exist a dependence between a memory-reference
225 DRA and a memory-reference DRB. When versioning for alias may check a
226 dependence at run-time, return FALSE. Adjust *MAX_VF according to
227 the data dependence. */
229 static bool
230 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
231 loop_vec_info loop_vinfo, int *max_vf)
233 unsigned int i;
234 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
235 struct data_reference *dra = DDR_A (ddr);
236 struct data_reference *drb = DDR_B (ddr);
237 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
238 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
239 lambda_vector dist_v;
240 unsigned int loop_depth;
242 /* In loop analysis all data references should be vectorizable. */
243 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
244 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
245 gcc_unreachable ();
247 /* Independent data accesses. */
248 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
249 return false;
251 if (dra == drb
252 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
253 return false;
255 /* We do not have to consider dependences between accesses that belong
256 to the same group. */
257 if (GROUP_FIRST_ELEMENT (stmtinfo_a)
258 && GROUP_FIRST_ELEMENT (stmtinfo_a) == GROUP_FIRST_ELEMENT (stmtinfo_b))
259 return false;
261 /* Even if we have an anti-dependence then, as the vectorized loop covers at
262 least two scalar iterations, there is always also a true dependence.
263 As the vectorizer does not re-order loads and stores we can ignore
264 the anti-dependence if TBAA can disambiguate both DRs similar to the
265 case with known negative distance anti-dependences (positive
266 distance anti-dependences would violate TBAA constraints). */
267 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
268 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
269 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
270 get_alias_set (DR_REF (drb))))
271 return false;
273 /* Unknown data dependence. */
274 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
276 /* If user asserted safelen consecutive iterations can be
277 executed concurrently, assume independence. */
278 if (loop->safelen >= 2)
280 if (loop->safelen < *max_vf)
281 *max_vf = loop->safelen;
282 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
283 return false;
286 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
287 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
289 if (dump_enabled_p ())
291 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
292 "versioning for alias not supported for: "
293 "can't determine dependence between ");
294 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
295 DR_REF (dra));
296 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
297 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
298 DR_REF (drb));
299 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
301 return true;
304 if (dump_enabled_p ())
306 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
307 "versioning for alias required: "
308 "can't determine dependence between ");
309 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
310 DR_REF (dra));
311 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
312 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
313 DR_REF (drb));
314 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
317 /* Add to list of ddrs that need to be tested at run-time. */
318 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
321 /* Known data dependence. */
322 if (DDR_NUM_DIST_VECTS (ddr) == 0)
324 /* If user asserted safelen consecutive iterations can be
325 executed concurrently, assume independence. */
326 if (loop->safelen >= 2)
328 if (loop->safelen < *max_vf)
329 *max_vf = loop->safelen;
330 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
331 return false;
334 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
335 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
337 if (dump_enabled_p ())
339 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
340 "versioning for alias not supported for: "
341 "bad dist vector for ");
342 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
343 DR_REF (dra));
344 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
345 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
346 DR_REF (drb));
347 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
349 return true;
352 if (dump_enabled_p ())
354 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
355 "versioning for alias required: "
356 "bad dist vector for ");
357 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
358 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
359 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
360 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
362 /* Add to list of ddrs that need to be tested at run-time. */
363 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
366 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
368 if (DDR_COULD_BE_INDEPENDENT_P (ddr)
369 && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
370 loop_depth, max_vf))
371 return false;
373 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
375 int dist = dist_v[loop_depth];
377 if (dump_enabled_p ())
378 dump_printf_loc (MSG_NOTE, vect_location,
379 "dependence distance = %d.\n", dist);
381 if (dist == 0)
383 if (dump_enabled_p ())
385 dump_printf_loc (MSG_NOTE, vect_location,
386 "dependence distance == 0 between ");
387 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
388 dump_printf (MSG_NOTE, " and ");
389 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
390 dump_printf (MSG_NOTE, "\n");
393 /* When we perform grouped accesses and perform implicit CSE
394 by detecting equal accesses and doing disambiguation with
395 runtime alias tests like for
396 .. = a[i];
397 .. = a[i+1];
398 a[i] = ..;
399 a[i+1] = ..;
400 *p = ..;
401 .. = a[i];
402 .. = a[i+1];
403 where we will end up loading { a[i], a[i+1] } once, make
404 sure that inserting group loads before the first load and
405 stores after the last store will do the right thing.
406 Similar for groups like
407 a[i] = ...;
408 ... = a[i];
409 a[i+1] = ...;
410 where loads from the group interleave with the store. */
411 if (STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
412 || STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
414 gimple *earlier_stmt;
415 earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
416 if (DR_IS_WRITE
417 (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
419 if (dump_enabled_p ())
420 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
421 "READ_WRITE dependence in interleaving."
422 "\n");
423 return true;
427 continue;
430 if (dist > 0 && DDR_REVERSED_P (ddr))
432 /* If DDR_REVERSED_P the order of the data-refs in DDR was
433 reversed (to make distance vector positive), and the actual
434 distance is negative. */
435 if (dump_enabled_p ())
436 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
437 "dependence distance negative.\n");
438 /* Record a negative dependence distance to later limit the
439 amount of stmt copying / unrolling we can perform.
440 Only need to handle read-after-write dependence. */
441 if (DR_IS_READ (drb)
442 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
443 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
444 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
445 continue;
448 if (abs (dist) >= 2
449 && abs (dist) < *max_vf)
451 /* The dependence distance requires reduction of the maximal
452 vectorization factor. */
453 *max_vf = abs (dist);
454 if (dump_enabled_p ())
455 dump_printf_loc (MSG_NOTE, vect_location,
456 "adjusting maximal vectorization factor to %i\n",
457 *max_vf);
460 if (abs (dist) >= *max_vf)
462 /* Dependence distance does not create dependence, as far as
463 vectorization is concerned, in this case. */
464 if (dump_enabled_p ())
465 dump_printf_loc (MSG_NOTE, vect_location,
466 "dependence distance >= VF.\n");
467 continue;
470 if (dump_enabled_p ())
472 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
473 "not vectorized, possible dependence "
474 "between data-refs ");
475 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
476 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
477 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
478 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
481 return true;
484 return false;
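/* A worked example of the distance handling above (hypothetical loop):

     for (i = 0; i < n; i++)
       a[i + 3] = a[i] + 1;

   gives a read-after-write dependence with distance 3, so the code above
   clamps *MAX_VF to 3: any vector of at most 3 elements only reads
   values that were written by earlier vector iterations.  */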
487 /* Function vect_analyze_data_ref_dependences.
489 Examine all the data references in the loop, and make sure there do not
490 exist any data dependences between them. Set *MAX_VF according to
491 the maximum vectorization factor the data dependences allow. */
493 bool
494 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo, int *max_vf)
496 unsigned int i;
497 struct data_dependence_relation *ddr;
499 if (dump_enabled_p ())
500 dump_printf_loc (MSG_NOTE, vect_location,
501 "=== vect_analyze_data_ref_dependences ===\n");
503 LOOP_VINFO_DDRS (loop_vinfo)
504 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
505 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
506 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
507 /* We need read-read dependences to compute STMT_VINFO_SAME_ALIGN_REFS. */
508 if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
509 &LOOP_VINFO_DDRS (loop_vinfo),
510 LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
511 return false;
513 /* For epilogues we either have no aliases or alias versioning
514 was applied to the original loop. Therefore we may just get max_vf
515 using the VF of the original loop. */
516 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
517 *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
518 else
519 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
520 if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
521 return false;
523 return true;
527 /* Function vect_slp_analyze_data_ref_dependence.
529 Return TRUE if there (might) exist a dependence between a memory-reference
530 DRA and a memory-reference DRB. Unlike the loop analysis above, no
531 run-time alias versioning is attempted here and there is no MAX_VF to
532 adjust. */
534 static bool
535 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
537 struct data_reference *dra = DDR_A (ddr);
538 struct data_reference *drb = DDR_B (ddr);
540 /* We need to check dependences of statements marked as unvectorizable
541 as well; they can still prohibit vectorization. */
543 /* Independent data accesses. */
544 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
545 return false;
547 if (dra == drb)
548 return false;
550 /* Read-read is OK. */
551 if (DR_IS_READ (dra) && DR_IS_READ (drb))
552 return false;
554 /* If dra and drb are part of the same interleaving chain consider
555 them independent. */
556 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
557 && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
558 == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
559 return false;
561 /* Unknown data dependence. */
562 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
564 if (dump_enabled_p ())
566 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
567 "can't determine dependence between ");
568 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
569 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
570 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
571 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
574 else if (dump_enabled_p ())
576 dump_printf_loc (MSG_NOTE, vect_location,
577 "determined dependence between ");
578 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
579 dump_printf (MSG_NOTE, " and ");
580 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
581 dump_printf (MSG_NOTE, "\n");
584 return true;
588 /* Analyze dependences involved in the transform of SLP NODE. STORES
589 contain the vector of scalar stores of this instance if we are
590 disambiguating the loads. */
592 static bool
593 vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
594 vec<gimple *> stores, gimple *last_store)
596 /* This walks over all stmts involved in the SLP load/store done
597 in NODE verifying we can sink them up to the last stmt in the
598 group. */
599 gimple *last_access = vect_find_last_scalar_stmt_in_slp (node);
600 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
602 gimple *access = SLP_TREE_SCALAR_STMTS (node)[k];
603 if (access == last_access)
604 continue;
605 data_reference *dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (access));
606 for (gimple_stmt_iterator gsi = gsi_for_stmt (access);
607 gsi_stmt (gsi) != last_access; gsi_next (&gsi))
609 gimple *stmt = gsi_stmt (gsi);
610 if (! gimple_vuse (stmt)
611 || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
612 continue;
614 /* If we couldn't record a (single) data reference for this
615 stmt we have to give up. */
616 /* ??? Here and below if dependence analysis fails we can resort
617 to the alias oracle which can handle more kinds of stmts. */
618 data_reference *dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
619 if (!dr_b)
620 return false;
622 bool dependent = false;
623 /* If we run into a store of this same instance (we've just
624 marked those) then delay dependence checking until we run
625 into the last store because this is where it will have
626 been sunk to (and we verify if we can do that as well). */
627 if (gimple_visited_p (stmt))
629 if (stmt != last_store)
630 continue;
631 unsigned i;
632 gimple *store;
633 FOR_EACH_VEC_ELT (stores, i, store)
635 data_reference *store_dr
636 = STMT_VINFO_DATA_REF (vinfo_for_stmt (store));
637 ddr_p ddr = initialize_data_dependence_relation
638 (dr_a, store_dr, vNULL);
639 dependent = vect_slp_analyze_data_ref_dependence (ddr);
640 free_dependence_relation (ddr);
641 if (dependent)
642 break;
645 else
647 ddr_p ddr = initialize_data_dependence_relation (dr_a,
648 dr_b, vNULL);
649 dependent = vect_slp_analyze_data_ref_dependence (ddr);
650 free_dependence_relation (ddr);
652 if (dependent)
653 return false;
656 return true;
660 /* Function vect_slp_analyze_instance_dependence.
662 Examine all the data references in the SLP instance, and make sure there
663 do not exist any data dependences between them that would prevent the
664 instance from being vectorized. */
666 bool
667 vect_slp_analyze_instance_dependence (slp_instance instance)
669 if (dump_enabled_p ())
670 dump_printf_loc (MSG_NOTE, vect_location,
671 "=== vect_slp_analyze_instance_dependence ===\n");
673 /* The stores of this instance are at the root of the SLP tree. */
674 slp_tree store = SLP_INSTANCE_TREE (instance);
675 if (! STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (store)[0])))
676 store = NULL;
678 /* Verify we can sink stores to the vectorized stmt insert location. */
679 gimple *last_store = NULL;
680 if (store)
682 if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
683 return false;
685 /* Mark stores in this instance and remember the last one. */
686 last_store = vect_find_last_scalar_stmt_in_slp (store);
687 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
688 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], true);
691 bool res = true;
693 /* Verify we can sink loads to the vectorized stmt insert location,
694 special-casing stores of this instance. */
695 slp_tree load;
696 unsigned int i;
697 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
698 if (! vect_slp_analyze_node_dependences (instance, load,
699 store
700 ? SLP_TREE_SCALAR_STMTS (store)
701 : vNULL, last_store))
703 res = false;
704 break;
707 /* Unset the visited flag. */
708 if (store)
709 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
710 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], false);
712 return res;
715 /* Record in VINFO the base alignment guarantee given by DRB. STMT is
716 the statement that contains DRB, which is useful for recording in the
717 dump file. */
719 static void
720 vect_record_base_alignment (vec_info *vinfo, gimple *stmt,
721 innermost_loop_behavior *drb)
723 bool existed;
724 innermost_loop_behavior *&entry
725 = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
726 if (!existed || entry->base_alignment < drb->base_alignment)
728 entry = drb;
729 if (dump_enabled_p ())
731 dump_printf_loc (MSG_NOTE, vect_location,
732 "recording new base alignment for ");
733 dump_generic_expr (MSG_NOTE, TDF_SLIM, drb->base_address);
734 dump_printf (MSG_NOTE, "\n");
735 dump_printf_loc (MSG_NOTE, vect_location,
736 " alignment: %d\n", drb->base_alignment);
737 dump_printf_loc (MSG_NOTE, vect_location,
738 " misalignment: %d\n", drb->base_misalignment);
739 dump_printf_loc (MSG_NOTE, vect_location,
740 " based on: ");
741 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
746 /* If the region we're going to vectorize is reached, all unconditional
747 data references occur at least once. We can therefore pool the base
748 alignment guarantees from each unconditional reference. Do this by
749 going through all the data references in VINFO and checking whether
750 the containing statement makes the reference unconditionally. If so,
751 record the alignment of the base address in VINFO so that it can be
752 used for all other references with the same base. */
754 void
755 vect_record_base_alignments (vec_info *vinfo)
757 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
758 struct loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
759 data_reference *dr;
760 unsigned int i;
761 FOR_EACH_VEC_ELT (vinfo->datarefs, i, dr)
762 if (!DR_IS_CONDITIONAL_IN_STMT (dr))
764 gimple *stmt = DR_STMT (dr);
765 vect_record_base_alignment (vinfo, stmt, &DR_INNERMOST (dr));
767 /* If DR is nested in the loop that is being vectorized, we can also
768 record the alignment of the base wrt the outer loop. */
769 if (loop && nested_in_vect_loop_p (loop, stmt))
771 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
772 vect_record_base_alignment
773 (vinfo, stmt, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
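/* Illustration of the pooling above (hypothetical references): if the
   region contains an unconditional load from base P whose innermost
   behavior guarantees a 32-byte aligned base, that guarantee is keyed on
   P in VINFO->base_alignments and can later be reused when computing the
   misalignment of another reference with the same base, e.g. one that is
   only executed conditionally and therefore could not have been used to
   establish the guarantee itself.  */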
778 /* Return the target alignment for the vectorized form of DR. */
780 static unsigned int
781 vect_calculate_target_alignment (struct data_reference *dr)
783 gimple *stmt = DR_STMT (dr);
784 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
785 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
786 return targetm.vectorize.preferred_vector_alignment (vectype);
789 /* Function vect_compute_data_ref_alignment
791 Compute the misalignment of the data reference DR.
793 Output:
794 1. If during the misalignment computation it is found that the data reference
795 cannot be vectorized then false is returned.
796 2. DR_MISALIGNMENT (DR) is defined.
798 FOR NOW: No analysis is actually performed. Misalignment is calculated
799 only for trivial cases. TODO. */
801 bool
802 vect_compute_data_ref_alignment (struct data_reference *dr)
804 gimple *stmt = DR_STMT (dr);
805 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
806 vec_base_alignments *base_alignments = &stmt_info->vinfo->base_alignments;
807 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
808 struct loop *loop = NULL;
809 tree ref = DR_REF (dr);
810 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
812 if (dump_enabled_p ())
813 dump_printf_loc (MSG_NOTE, vect_location,
814 "vect_compute_data_ref_alignment:\n");
816 if (loop_vinfo)
817 loop = LOOP_VINFO_LOOP (loop_vinfo);
819 /* Initialize misalignment to unknown. */
820 SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
822 innermost_loop_behavior *drb = vect_dr_behavior (dr);
823 bool step_preserves_misalignment_p;
825 unsigned HOST_WIDE_INT vector_alignment
826 = vect_calculate_target_alignment (dr) / BITS_PER_UNIT;
827 DR_TARGET_ALIGNMENT (dr) = vector_alignment;
829 /* No step for BB vectorization. */
830 if (!loop)
832 gcc_assert (integer_zerop (drb->step));
833 step_preserves_misalignment_p = true;
836 /* In case the dataref is in an inner-loop of the loop that is being
837 vectorized (LOOP), we use the base and misalignment information
838 relative to the outer-loop (LOOP). This is ok only if the misalignment
839 stays the same throughout the execution of the inner-loop, which is why
840 we have to check that the stride of the dataref in the inner-loop is
841 a multiple of the vector alignment. */
842 else if (nested_in_vect_loop_p (loop, stmt))
844 step_preserves_misalignment_p
845 = (DR_STEP_ALIGNMENT (dr) % vector_alignment) == 0;
847 if (dump_enabled_p ())
849 if (step_preserves_misalignment_p)
850 dump_printf_loc (MSG_NOTE, vect_location,
851 "inner step divides the vector alignment.\n");
852 else
853 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
854 "inner step doesn't divide the vector"
855 " alignment.\n");
859 /* Similarly we can only use base and misalignment information relative to
860 an innermost loop if the misalignment stays the same throughout the
861 execution of the loop. As above, this is the case if the stride of
862 the dataref is a multiple of the alignment. */
863 else
865 unsigned vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
866 step_preserves_misalignment_p
867 = ((DR_STEP_ALIGNMENT (dr) * vf) % vector_alignment) == 0;
869 if (!step_preserves_misalignment_p && dump_enabled_p ())
870 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
871 "step doesn't divide the vector alignment.\n");
874 unsigned int base_alignment = drb->base_alignment;
875 unsigned int base_misalignment = drb->base_misalignment;
877 /* Calculate the maximum of the pooled base address alignment and the
878 alignment that we can compute for DR itself. */
879 innermost_loop_behavior **entry = base_alignments->get (drb->base_address);
880 if (entry && base_alignment < (*entry)->base_alignment)
882 base_alignment = (*entry)->base_alignment;
883 base_misalignment = (*entry)->base_misalignment;
886 if (drb->offset_alignment < vector_alignment
887 || !step_preserves_misalignment_p
888 /* We need to know whether the step wrt the vectorized loop is
889 negative when computing the starting misalignment below. */
890 || TREE_CODE (drb->step) != INTEGER_CST)
892 if (dump_enabled_p ())
894 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
895 "Unknown alignment for access: ");
896 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
897 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
899 return true;
902 if (base_alignment < vector_alignment)
904 tree base = drb->base_address;
905 if (TREE_CODE (base) == ADDR_EXPR)
906 base = TREE_OPERAND (base, 0);
907 if (!vect_can_force_dr_alignment_p (base,
908 vector_alignment * BITS_PER_UNIT))
910 if (dump_enabled_p ())
912 dump_printf_loc (MSG_NOTE, vect_location,
913 "can't force alignment of ref: ");
914 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
915 dump_printf (MSG_NOTE, "\n");
917 return true;
920 if (DECL_USER_ALIGN (base))
922 if (dump_enabled_p ())
924 dump_printf_loc (MSG_NOTE, vect_location,
925 "not forcing alignment of user-aligned "
926 "variable: ");
927 dump_generic_expr (MSG_NOTE, TDF_SLIM, base);
928 dump_printf (MSG_NOTE, "\n");
930 return true;
933 /* Force the alignment of the decl.
934 NOTE: This is the only change to the code we make during
935 the analysis phase, before deciding to vectorize the loop. */
936 if (dump_enabled_p ())
938 dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
939 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
940 dump_printf (MSG_NOTE, "\n");
943 DR_VECT_AUX (dr)->base_decl = base;
944 DR_VECT_AUX (dr)->base_misaligned = true;
945 base_misalignment = 0;
947 poly_int64 misalignment
948 = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();
950 /* If this is a backward running DR then first access in the larger
951 vectype actually is N-1 elements before the address in the DR.
952 Adjust misalign accordingly. */
953 if (tree_int_cst_sgn (drb->step) < 0)
954 /* PLUS because STEP is negative. */
955 misalignment += ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
956 * TREE_INT_CST_LOW (drb->step));
958 unsigned int const_misalignment;
959 if (!known_misalignment (misalignment, vector_alignment,
960 &const_misalignment))
962 if (dump_enabled_p ())
964 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
965 "Non-constant misalignment for access: ");
966 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
967 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
969 return true;
972 SET_DR_MISALIGNMENT (dr, const_misalignment);
974 if (dump_enabled_p ())
976 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
977 "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
978 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
979 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
982 return true;
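/* A small worked example for the computation above (illustrative numbers):
   with a 16-byte target alignment, a base known to be 16-byte aligned
   (base_misalignment 0), DRB->init equal to 20 and a positive step, the
   recorded misalignment is 20 mod 16 = 4 bytes. For a backward running
   V4SI access with step -4, the first element of the vector lies
   3 * 4 = 12 bytes before the DR address, and the misalignment is
   adjusted accordingly before taking the remainder.  */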
985 /* Function vect_update_misalignment_for_peel.
986 Sets DR's misalignment
987 - to 0 if it has the same alignment as DR_PEEL,
988 - to the misalignment computed using NPEEL if DR's misalignment is known,
989 - to -1 (unknown) otherwise.
991 DR - the data reference whose misalignment is to be adjusted.
992 DR_PEEL - the data reference whose misalignment is being made
993 zero in the vector loop by the peel.
994 NPEEL - the number of iterations in the peel loop if the misalignment
995 of DR_PEEL is known at compile time. */
997 static void
998 vect_update_misalignment_for_peel (struct data_reference *dr,
999 struct data_reference *dr_peel, int npeel)
1001 unsigned int i;
1002 vec<dr_p> same_aligned_drs;
1003 struct data_reference *current_dr;
1004 int dr_size = vect_get_scalar_dr_size (dr);
1005 int dr_peel_size = vect_get_scalar_dr_size (dr_peel);
1006 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
1007 stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));
1009 /* For interleaved data accesses the step in the loop must be multiplied by
1010 the size of the interleaving group. */
1011 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1012 dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
1013 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
1014 dr_peel_size *= GROUP_SIZE (peel_stmt_info);
1016 /* It can be assumed that the data refs with the same alignment as dr_peel
1017 are aligned in the vector loop. */
1018 same_aligned_drs
1019 = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
1020 FOR_EACH_VEC_ELT (same_aligned_drs, i, current_dr)
1022 if (current_dr != dr)
1023 continue;
1024 gcc_assert (!known_alignment_for_access_p (dr)
1025 || !known_alignment_for_access_p (dr_peel)
1026 || (DR_MISALIGNMENT (dr) / dr_size
1027 == DR_MISALIGNMENT (dr_peel) / dr_peel_size));
1028 SET_DR_MISALIGNMENT (dr, 0);
1029 return;
1032 if (known_alignment_for_access_p (dr)
1033 && known_alignment_for_access_p (dr_peel))
1035 bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
1036 int misal = DR_MISALIGNMENT (dr);
1037 misal += negative ? -npeel * dr_size : npeel * dr_size;
1038 misal &= DR_TARGET_ALIGNMENT (dr) - 1;
1039 SET_DR_MISALIGNMENT (dr, misal);
1040 return;
1043 if (dump_enabled_p ())
1044 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1045 "to unknown (-1).\n");
1046 SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
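/* A worked example for the known-alignment case above (illustrative
   numbers): for DR with scalar size 4, current misalignment 12, target
   alignment 16 and a positive step, peeling NPEEL == 3 iterations moves
   the first access by 3 * 4 = 12 bytes, so the new misalignment becomes
   (12 + 12) & 15 == 8.  */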
1050 /* Function verify_data_ref_alignment
1052 Return TRUE if DR can be handled with respect to alignment. */
1054 static bool
1055 verify_data_ref_alignment (data_reference_p dr)
1057 enum dr_alignment_support supportable_dr_alignment
1058 = vect_supportable_dr_alignment (dr, false);
1059 if (!supportable_dr_alignment)
1061 if (dump_enabled_p ())
1063 if (DR_IS_READ (dr))
1064 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1065 "not vectorized: unsupported unaligned load.");
1066 else
1067 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1068 "not vectorized: unsupported unaligned "
1069 "store.");
1071 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
1072 DR_REF (dr));
1073 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1075 return false;
1078 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
1079 dump_printf_loc (MSG_NOTE, vect_location,
1080 "Vectorizing an unaligned access.\n");
1082 return true;
1085 /* Function vect_verify_datarefs_alignment
1087 Return TRUE if all data references in the loop can be
1088 handled with respect to alignment. */
1090 bool
1091 vect_verify_datarefs_alignment (loop_vec_info vinfo)
1093 vec<data_reference_p> datarefs = vinfo->datarefs;
1094 struct data_reference *dr;
1095 unsigned int i;
1097 FOR_EACH_VEC_ELT (datarefs, i, dr)
1099 gimple *stmt = DR_STMT (dr);
1100 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1102 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1103 continue;
1105 /* For interleaving, only the alignment of the first access matters. */
1106 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1107 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1108 continue;
1110 /* Strided accesses perform only component accesses, alignment is
1111 irrelevant for them. */
1112 if (STMT_VINFO_STRIDED_P (stmt_info)
1113 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1114 continue;
1116 if (! verify_data_ref_alignment (dr))
1117 return false;
1120 return true;
1123 /* Given a memory reference EXP return whether its alignment is less
1124 than its size. */
1126 static bool
1127 not_size_aligned (tree exp)
1129 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1130 return true;
1132 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1133 > get_object_alignment (exp));
1136 /* Function vector_alignment_reachable_p
1138 Return true if vector alignment for DR is reachable by peeling
1139 a few loop iterations. Return false otherwise. */
1141 static bool
1142 vector_alignment_reachable_p (struct data_reference *dr)
1144 gimple *stmt = DR_STMT (dr);
1145 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1146 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1148 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1150 /* For interleaved access we peel only if the number of iterations in
1151 the prolog loop ({VF - misalignment}) is a multiple of the
1152 number of interleaved accesses. */
1153 int elem_size, mis_in_elements;
1154 int nelements = TYPE_VECTOR_SUBPARTS (vectype);
1156 /* FORNOW: handle only known alignment. */
1157 if (!known_alignment_for_access_p (dr))
1158 return false;
1160 elem_size = GET_MODE_SIZE (TYPE_MODE (vectype)) / nelements;
1161 mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
1163 if ((nelements - mis_in_elements) % GROUP_SIZE (stmt_info))
1164 return false;
1167 /* If misalignment is known at compile time then allow peeling
1168 only if natural alignment is reachable through peeling. */
1169 if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
1171 HOST_WIDE_INT elmsize =
1172 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1173 if (dump_enabled_p ())
1175 dump_printf_loc (MSG_NOTE, vect_location,
1176 "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1177 dump_printf (MSG_NOTE,
1178 ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
1180 if (DR_MISALIGNMENT (dr) % elmsize)
1182 if (dump_enabled_p ())
1183 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1184 "data size does not divide the misalignment.\n");
1185 return false;
1189 if (!known_alignment_for_access_p (dr))
1191 tree type = TREE_TYPE (DR_REF (dr));
1192 bool is_packed = not_size_aligned (DR_REF (dr));
1193 if (dump_enabled_p ())
1194 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1195 "Unknown misalignment, %snaturally aligned\n",
1196 is_packed ? "not " : "");
1197 return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1200 return true;
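/* A worked example for the grouped-access check above (illustrative
   numbers): for a V4SI vector (4 elements of 4 bytes) and a known
   misalignment of 8 bytes, mis_in_elements is 2, so 4 - 2 = 2 prologue
   iterations would be peeled. That is fine for an interleaving group of
   size 2, but for a group of size 3 the remainder 2 % 3 != 0 makes the
   alignment unreachable by peeling.  */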
1204 /* Calculate the cost of the memory access represented by DR. */
1206 static void
1207 vect_get_data_access_cost (struct data_reference *dr,
1208 unsigned int *inside_cost,
1209 unsigned int *outside_cost,
1210 stmt_vector_for_cost *body_cost_vec)
1212 gimple *stmt = DR_STMT (dr);
1213 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1214 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1215 int ncopies;
1217 if (PURE_SLP_STMT (stmt_info))
1218 ncopies = 1;
1219 else
1220 ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1222 if (DR_IS_READ (dr))
1223 vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1224 NULL, body_cost_vec, false);
1225 else
1226 vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
1228 if (dump_enabled_p ())
1229 dump_printf_loc (MSG_NOTE, vect_location,
1230 "vect_get_data_access_cost: inside_cost = %d, "
1231 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1235 typedef struct _vect_peel_info
1237 struct data_reference *dr;
1238 int npeel;
1239 unsigned int count;
1240 } *vect_peel_info;
1242 typedef struct _vect_peel_extended_info
1244 struct _vect_peel_info peel_info;
1245 unsigned int inside_cost;
1246 unsigned int outside_cost;
1247 } *vect_peel_extended_info;
1250 /* Peeling hashtable helpers. */
1252 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1254 static inline hashval_t hash (const _vect_peel_info *);
1255 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1258 inline hashval_t
1259 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1261 return (hashval_t) peel_info->npeel;
1264 inline bool
1265 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1267 return (a->npeel == b->npeel);
1271 /* Insert DR into peeling hash table with NPEEL as key. */
1273 static void
1274 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1275 loop_vec_info loop_vinfo, struct data_reference *dr,
1276 int npeel)
1278 struct _vect_peel_info elem, *slot;
1279 _vect_peel_info **new_slot;
1280 bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1282 elem.npeel = npeel;
1283 slot = peeling_htab->find (&elem);
1284 if (slot)
1285 slot->count++;
1286 else
1288 slot = XNEW (struct _vect_peel_info);
1289 slot->npeel = npeel;
1290 slot->dr = dr;
1291 slot->count = 1;
1292 new_slot = peeling_htab->find_slot (slot, INSERT);
1293 *new_slot = slot;
1296 if (!supportable_dr_alignment
1297 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1298 slot->count += VECT_MAX_COST;
1302 /* Traverse peeling hash table to find peeling option that aligns maximum
1303 number of data accesses. */
1306 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1307 _vect_peel_extended_info *max)
1309 vect_peel_info elem = *slot;
1311 if (elem->count > max->peel_info.count
1312 || (elem->count == max->peel_info.count
1313 && max->peel_info.npeel > elem->npeel))
1315 max->peel_info.npeel = elem->npeel;
1316 max->peel_info.count = elem->count;
1317 max->peel_info.dr = elem->dr;
1320 return 1;
1323 /* Get the costs of peeling NPEEL iterations checking data access costs
1324 for all data refs. If UNKNOWN_MISALIGNMENT is true, we assume DR0's
1325 misalignment will be zero after peeling. */
1327 static void
1328 vect_get_peeling_costs_all_drs (vec<data_reference_p> datarefs,
1329 struct data_reference *dr0,
1330 unsigned int *inside_cost,
1331 unsigned int *outside_cost,
1332 stmt_vector_for_cost *body_cost_vec,
1333 unsigned int npeel,
1334 bool unknown_misalignment)
1336 unsigned i;
1337 data_reference *dr;
1339 FOR_EACH_VEC_ELT (datarefs, i, dr)
1341 gimple *stmt = DR_STMT (dr);
1342 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1343 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1344 continue;
1346 /* For interleaving, only the alignment of the first access
1347 matters. */
1348 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1349 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1350 continue;
1352 /* Strided accesses perform only component accesses, alignment is
1353 irrelevant for them. */
1354 if (STMT_VINFO_STRIDED_P (stmt_info)
1355 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1356 continue;
1358 int save_misalignment;
1359 save_misalignment = DR_MISALIGNMENT (dr);
1360 if (npeel == 0)
1362 else if (unknown_misalignment && dr == dr0)
1363 SET_DR_MISALIGNMENT (dr, 0);
1364 else
1365 vect_update_misalignment_for_peel (dr, dr0, npeel);
1366 vect_get_data_access_cost (dr, inside_cost, outside_cost,
1367 body_cost_vec);
1368 SET_DR_MISALIGNMENT (dr, save_misalignment);
1372 /* Traverse peeling hash table and calculate cost for each peeling option.
1373 Find the one with the lowest cost. */
1376 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1377 _vect_peel_extended_info *min)
1379 vect_peel_info elem = *slot;
1380 int dummy;
1381 unsigned int inside_cost = 0, outside_cost = 0;
1382 gimple *stmt = DR_STMT (elem->dr);
1383 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1384 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1385 stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1386 epilogue_cost_vec;
1388 prologue_cost_vec.create (2);
1389 body_cost_vec.create (2);
1390 epilogue_cost_vec.create (2);
1392 vect_get_peeling_costs_all_drs (LOOP_VINFO_DATAREFS (loop_vinfo),
1393 elem->dr, &inside_cost, &outside_cost,
1394 &body_cost_vec, elem->npeel, false);
1396 body_cost_vec.release ();
1398 outside_cost += vect_get_known_peeling_cost
1399 (loop_vinfo, elem->npeel, &dummy,
1400 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1401 &prologue_cost_vec, &epilogue_cost_vec);
1403 /* Prologue and epilogue costs are added to the target model later.
1404 These costs depend only on the scalar iteration cost, the
1405 number of peeling iterations finally chosen, and the number of
1406 misaligned statements. So discard the information found here. */
1407 prologue_cost_vec.release ();
1408 epilogue_cost_vec.release ();
1410 if (inside_cost < min->inside_cost
1411 || (inside_cost == min->inside_cost
1412 && outside_cost < min->outside_cost))
1414 min->inside_cost = inside_cost;
1415 min->outside_cost = outside_cost;
1416 min->peel_info.dr = elem->dr;
1417 min->peel_info.npeel = elem->npeel;
1418 min->peel_info.count = elem->count;
1421 return 1;
1425 /* Choose best peeling option by traversing peeling hash table and either
1426 choosing an option with the lowest cost (if cost model is enabled) or the
1427 option that aligns as many accesses as possible. */
1429 static struct _vect_peel_extended_info
1430 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1431 loop_vec_info loop_vinfo)
1433 struct _vect_peel_extended_info res;
1435 res.peel_info.dr = NULL;
1437 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1439 res.inside_cost = INT_MAX;
1440 res.outside_cost = INT_MAX;
1441 peeling_htab->traverse <_vect_peel_extended_info *,
1442 vect_peeling_hash_get_lowest_cost> (&res);
1444 else
1446 res.peel_info.count = 0;
1447 peeling_htab->traverse <_vect_peel_extended_info *,
1448 vect_peeling_hash_get_most_frequent> (&res);
1449 res.inside_cost = 0;
1450 res.outside_cost = 0;
1453 return res;
1456 /* Return true if the new peeling NPEEL is supported. */
1458 static bool
1459 vect_peeling_supportable (loop_vec_info loop_vinfo, struct data_reference *dr0,
1460 unsigned npeel)
1462 unsigned i;
1463 struct data_reference *dr = NULL;
1464 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1465 gimple *stmt;
1466 stmt_vec_info stmt_info;
1467 enum dr_alignment_support supportable_dr_alignment;
1469 /* Ensure that all data refs can be vectorized after the peel. */
1470 FOR_EACH_VEC_ELT (datarefs, i, dr)
1472 int save_misalignment;
1474 if (dr == dr0)
1475 continue;
1477 stmt = DR_STMT (dr);
1478 stmt_info = vinfo_for_stmt (stmt);
1479 /* For interleaving, only the alignment of the first access
1480 matters. */
1481 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1482 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1483 continue;
1485 /* Strided accesses perform only component accesses, alignment is
1486 irrelevant for them. */
1487 if (STMT_VINFO_STRIDED_P (stmt_info)
1488 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1489 continue;
1491 save_misalignment = DR_MISALIGNMENT (dr);
1492 vect_update_misalignment_for_peel (dr, dr0, npeel);
1493 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1494 SET_DR_MISALIGNMENT (dr, save_misalignment);
1496 if (!supportable_dr_alignment)
1497 return false;
1500 return true;
1503 /* Function vect_enhance_data_refs_alignment
1505 This pass will use loop versioning and loop peeling in order to enhance
1506 the alignment of data references in the loop.
1508 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1509 original loop is to be vectorized. Any other loops that are created by
1510 the transformations performed in this pass are not supposed to be
1511 vectorized. This restriction will be relaxed.
1513 This pass will require a cost model to guide it whether to apply peeling
1514 or versioning or a combination of the two. For example, the scheme that
1515 intel uses when given a loop with several memory accesses, is as follows:
1516 choose one memory access ('p') which alignment you want to force by doing
1517 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1518 other accesses are not necessarily aligned, or (2) use loop versioning to
1519 generate one loop in which all accesses are aligned, and another loop in
1520 which only 'p' is necessarily aligned.
1522 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1523 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1524 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1526 Devising a cost model is the most critical aspect of this work. It will
1527 guide us on which access to peel for, whether to use loop versioning, how
1528 many versions to create, etc. The cost model will probably consist of
1529 generic considerations as well as target specific considerations (on
1530 powerpc for example, misaligned stores are more painful than misaligned
1531 loads).
1533 Here are the general steps involved in alignment enhancements:
1535 -- original loop, before alignment analysis:
1536 for (i=0; i<N; i++){
1537 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1538 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1541 -- After vect_compute_data_refs_alignment:
1542 for (i=0; i<N; i++){
1543 x = q[i]; # DR_MISALIGNMENT(q) = 3
1544 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1547 -- Possibility 1: we do loop versioning:
1548 if (p is aligned) {
1549 for (i=0; i<N; i++){ # loop 1A
1550 x = q[i]; # DR_MISALIGNMENT(q) = 3
1551 p[i] = y; # DR_MISALIGNMENT(p) = 0
1554 else {
1555 for (i=0; i<N; i++){ # loop 1B
1556 x = q[i]; # DR_MISALIGNMENT(q) = 3
1557 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1561 -- Possibility 2: we do loop peeling:
1562 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1563 x = q[i];
1564 p[i] = y;
1566 for (i = 3; i < N; i++){ # loop 2A
1567 x = q[i]; # DR_MISALIGNMENT(q) = 0
1568 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1571 -- Possibility 3: combination of loop peeling and versioning:
1572 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1573 x = q[i];
1574 p[i] = y;
1576 if (p is aligned) {
1577 for (i = 3; i<N; i++){ # loop 3A
1578 x = q[i]; # DR_MISALIGNMENT(q) = 0
1579 p[i] = y; # DR_MISALIGNMENT(p) = 0
1582 else {
1583 for (i = 3; i<N; i++){ # loop 3B
1584 x = q[i]; # DR_MISALIGNMENT(q) = 0
1585 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1589 These loops are later passed to loop_transform to be vectorized. The
1590 vectorizer will use the alignment information to guide the transformation
1591 (whether to generate regular loads/stores, or with special handling for
1592 misalignment). */
1594 bool
1595 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1597 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1598 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1599 enum dr_alignment_support supportable_dr_alignment;
1600 struct data_reference *dr0 = NULL, *first_store = NULL;
1601 struct data_reference *dr;
1602 unsigned int i, j;
1603 bool do_peeling = false;
1604 bool do_versioning = false;
1605 bool stat;
1606 gimple *stmt;
1607 stmt_vec_info stmt_info;
1608 unsigned int npeel = 0;
1609 bool one_misalignment_known = false;
1610 bool one_misalignment_unknown = false;
1611 bool one_dr_unsupportable = false;
1612 struct data_reference *unsupportable_dr = NULL;
1613 unsigned int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1614 unsigned possible_npeel_number = 1;
1615 tree vectype;
1616 unsigned int nelements, mis, same_align_drs_max = 0;
1617 hash_table<peel_info_hasher> peeling_htab (1);
1619 if (dump_enabled_p ())
1620 dump_printf_loc (MSG_NOTE, vect_location,
1621 "=== vect_enhance_data_refs_alignment ===\n");
1623 /* Reset data so we can safely be called multiple times. */
1624 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1625 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1627 /* While cost model enhancements are expected in the future, the high level
1628 view of the code at this time is as follows:
1630 A) If there is a misaligned access then see if peeling to align
1631 this access can make all data references satisfy
1632 vect_supportable_dr_alignment. If so, update data structures
1633 as needed and return true.
1635 B) If peeling wasn't possible and there is a data reference with an
1636 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1637 then see if loop versioning checks can be used to make all data
1638 references satisfy vect_supportable_dr_alignment. If so, update
1639 data structures as needed and return true.
1641 C) If neither peeling nor versioning were successful then return false if
1642 any data reference does not satisfy vect_supportable_dr_alignment.
1644 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1646 Note, Possibility 3 above (which is peeling and versioning together) is not
1647 being done at this time. */
1649 /* (1) Peeling to force alignment. */
1651 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1652 Considerations:
1653 + How many accesses will become aligned due to the peeling
1654 - How many accesses will become unaligned due to the peeling,
1655 and the cost of misaligned accesses.
1656 - The cost of peeling (the extra runtime checks, the increase
1657 in code size). */
1659 FOR_EACH_VEC_ELT (datarefs, i, dr)
1661 stmt = DR_STMT (dr);
1662 stmt_info = vinfo_for_stmt (stmt);
1664 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1665 continue;
1667 /* For interleaving, only the alignment of the first access
1668 matters. */
1669 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1670 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1671 continue;
1673 /* For invariant accesses there is nothing to enhance. */
1674 if (integer_zerop (DR_STEP (dr)))
1675 continue;
1677 /* Strided accesses perform only component accesses, alignment is
1678 irrelevant for them. */
1679 if (STMT_VINFO_STRIDED_P (stmt_info)
1680 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1681 continue;
1683 supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1684 do_peeling = vector_alignment_reachable_p (dr);
1685 if (do_peeling)
1687 if (known_alignment_for_access_p (dr))
1689 unsigned int npeel_tmp = 0;
1690 bool negative = tree_int_cst_compare (DR_STEP (dr),
1691 size_zero_node) < 0;
1693 vectype = STMT_VINFO_VECTYPE (stmt_info);
1694 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1695 unsigned int target_align = DR_TARGET_ALIGNMENT (dr);
1696 unsigned int dr_size = vect_get_scalar_dr_size (dr);
1697 mis = (negative ? DR_MISALIGNMENT (dr) : -DR_MISALIGNMENT (dr));
1698 if (DR_MISALIGNMENT (dr) != 0)
1699 npeel_tmp = (mis & (target_align - 1)) / dr_size;
1701 /* For multiple types, it is possible that the bigger type access
1702 will have more than one peeling option. E.g., a loop with two
1703 types: one of size (vector size / 4), and the other one of
1704 size (vector size / 8). The vectorization factor will be 8. If both
1705 accesses are misaligned by 3, the first one needs one scalar
1706 iteration to be aligned, and the second one needs 5. But the
1707 first one will be aligned also by peeling 5 scalar
1708 iterations, and in that case both accesses will be aligned.
1709 Hence, except for the immediate peeling amount, we also want
1710 to try to add full vector size, while we don't exceed
1711 vectorization factor.
1712 We do this automatically for cost model, since we calculate
1713 cost for every peeling option. */
1714 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1716 if (STMT_SLP_TYPE (stmt_info))
1717 possible_npeel_number
1718 = (vf * GROUP_SIZE (stmt_info)) / nelements;
1719 else
1720 possible_npeel_number = vf / nelements;
1722 /* NPEEL_TMP is 0 when there is no misalignment, but also
1723 allow peeling NELEMENTS. */
1724 if (DR_MISALIGNMENT (dr) == 0)
1725 possible_npeel_number++;
1728 /* Save info about DR in the hash table. Also include peeling
1729 amounts according to the explanation above. */
1730 for (j = 0; j < possible_npeel_number; j++)
1732 vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1733 dr, npeel_tmp);
1734 npeel_tmp += target_align / dr_size;
1737 one_misalignment_known = true;
1739 else
1741 /* If we don't know any misalignment values, we prefer
1742 peeling for the data-ref that has the maximum number of data-refs
1743 with the same alignment, unless the target prefers to align
1744 stores over loads. */
1745 unsigned same_align_drs
1746 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1747 if (!dr0
1748 || same_align_drs_max < same_align_drs)
1750 same_align_drs_max = same_align_drs;
1751 dr0 = dr;
1753 /* For data-refs with the same number of related
1754 accesses prefer the one where the misalign
1755 computation will be invariant in the outermost loop. */
1756 else if (same_align_drs_max == same_align_drs)
1758 struct loop *ivloop0, *ivloop;
1759 ivloop0 = outermost_invariant_loop_for_expr
1760 (loop, DR_BASE_ADDRESS (dr0));
1761 ivloop = outermost_invariant_loop_for_expr
1762 (loop, DR_BASE_ADDRESS (dr));
1763 if ((ivloop && !ivloop0)
1764 || (ivloop && ivloop0
1765 && flow_loop_nested_p (ivloop, ivloop0)))
1766 dr0 = dr;
1769 one_misalignment_unknown = true;
1771 /* Check for data refs with unsupportable alignment that
1772 can be peeled. */
1773 if (!supportable_dr_alignment)
1775 one_dr_unsupportable = true;
1776 unsupportable_dr = dr;
1779 if (!first_store && DR_IS_WRITE (dr))
1780 first_store = dr;
1783 else
1785 if (!aligned_access_p (dr))
1787 if (dump_enabled_p ())
1788 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1789 "vector alignment may not be reachable\n");
1790 break;
1795 /* Check if we can possibly peel the loop. */
1796 if (!vect_can_advance_ivs_p (loop_vinfo)
1797 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1798 || loop->inner)
1799 do_peeling = false;
1801 struct _vect_peel_extended_info peel_for_known_alignment;
1802 struct _vect_peel_extended_info peel_for_unknown_alignment;
1803 struct _vect_peel_extended_info best_peel;
1805 peel_for_unknown_alignment.inside_cost = INT_MAX;
1806 peel_for_unknown_alignment.outside_cost = INT_MAX;
1807 peel_for_unknown_alignment.peel_info.count = 0;
1809 if (do_peeling
1810 && one_misalignment_unknown)
1812 /* Check whether the target prefers aligning stores over loads, i.e.,
1813 whether misaligned stores are more expensive than misaligned loads
1814 (taking drs with the same alignment into account). */
1815 unsigned int load_inside_cost = 0;
1816 unsigned int load_outside_cost = 0;
1817 unsigned int store_inside_cost = 0;
1818 unsigned int store_outside_cost = 0;
1820 stmt_vector_for_cost dummy;
1821 dummy.create (2);
1822 vect_get_peeling_costs_all_drs (datarefs, dr0,
1823 &load_inside_cost,
1824 &load_outside_cost,
1825 &dummy, vf / 2, true);
1826 dummy.release ();
1828 if (first_store)
1830 dummy.create (2);
1831 vect_get_peeling_costs_all_drs (datarefs, first_store,
1832 &store_inside_cost,
1833 &store_outside_cost,
1834 &dummy, vf / 2, true);
1835 dummy.release ();
1837 else
1839 store_inside_cost = INT_MAX;
1840 store_outside_cost = INT_MAX;
1843 if (load_inside_cost > store_inside_cost
1844 || (load_inside_cost == store_inside_cost
1845 && load_outside_cost > store_outside_cost))
1847 dr0 = first_store;
1848 peel_for_unknown_alignment.inside_cost = store_inside_cost;
1849 peel_for_unknown_alignment.outside_cost = store_outside_cost;
1851 else
1853 peel_for_unknown_alignment.inside_cost = load_inside_cost;
1854 peel_for_unknown_alignment.outside_cost = load_outside_cost;
1857 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1858 prologue_cost_vec.create (2);
1859 epilogue_cost_vec.create (2);
1861 int dummy2;
1862 peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
1863 (loop_vinfo, vf / 2, &dummy2,
1864 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1865 &prologue_cost_vec, &epilogue_cost_vec);
1867 prologue_cost_vec.release ();
1868 epilogue_cost_vec.release ();
1870 peel_for_unknown_alignment.peel_info.count = 1
1871 + STMT_VINFO_SAME_ALIGN_REFS
1872 (vinfo_for_stmt (DR_STMT (dr0))).length ();
1875 peel_for_unknown_alignment.peel_info.npeel = 0;
1876 peel_for_unknown_alignment.peel_info.dr = dr0;
1878 best_peel = peel_for_unknown_alignment;
1880 peel_for_known_alignment.inside_cost = INT_MAX;
1881 peel_for_known_alignment.outside_cost = INT_MAX;
1882 peel_for_known_alignment.peel_info.count = 0;
1883 peel_for_known_alignment.peel_info.dr = NULL;
1885 if (do_peeling && one_misalignment_known)
1887 /* Peeling is possible, but no data access strictly requires alignment
1888 in order to be supported. So we try to choose the best possible
1889 peeling from the hash table. */
1890 peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
1891 (&peeling_htab, loop_vinfo);
1894 /* Compare costs of peeling for known and unknown alignment. */
1895 if (peel_for_known_alignment.peel_info.dr != NULL
1896 && peel_for_unknown_alignment.inside_cost
1897 >= peel_for_known_alignment.inside_cost)
1899 best_peel = peel_for_known_alignment;
1901 /* If the best peeling for known alignment has NPEEL == 0, perform no
1902 peeling at all except if there is an unsupportable dr that we can
1903 align. */
1904 if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
1905 do_peeling = false;
1908 /* If there is an unsupportable data ref, prefer this over all choices so far
1909 since we'd have to discard a chosen peeling except when it accidentally
1910 aligned the unsupportable data ref. */
1911 if (one_dr_unsupportable)
1912 dr0 = unsupportable_dr;
1913 else if (do_peeling)
1915 /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
1916 TODO: Use nopeel_outside_cost or get rid of it? */
1917 unsigned nopeel_inside_cost = 0;
1918 unsigned nopeel_outside_cost = 0;
1920 stmt_vector_for_cost dummy;
1921 dummy.create (2);
1922 vect_get_peeling_costs_all_drs (datarefs, NULL, &nopeel_inside_cost,
1923 &nopeel_outside_cost, &dummy, 0, false);
1924 dummy.release ();
1926 /* Add epilogue costs. As we do not peel for alignment here, no prologue
1927 costs will be recorded. */
1928 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1929 prologue_cost_vec.create (2);
1930 epilogue_cost_vec.create (2);
1932 int dummy2;
1933 nopeel_outside_cost += vect_get_known_peeling_cost
1934 (loop_vinfo, 0, &dummy2,
1935 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1936 &prologue_cost_vec, &epilogue_cost_vec);
1938 prologue_cost_vec.release ();
1939 epilogue_cost_vec.release ();
1941 npeel = best_peel.peel_info.npeel;
1942 dr0 = best_peel.peel_info.dr;
1944 /* If not peeling at all is no more expensive than the best peeling we
1945 have found so far, don't perform any peeling. */
1946 if (nopeel_inside_cost <= best_peel.inside_cost)
1947 do_peeling = false;
1950 if (do_peeling)
1952 stmt = DR_STMT (dr0);
1953 stmt_info = vinfo_for_stmt (stmt);
1954 vectype = STMT_VINFO_VECTYPE (stmt_info);
1956 if (known_alignment_for_access_p (dr0))
1958 bool negative = tree_int_cst_compare (DR_STEP (dr0),
1959 size_zero_node) < 0;
1960 if (!npeel)
1962 /* Since it's known at compile time, compute the number of
1963 iterations in the peeled loop (the peeling factor) for use in
1964 updating DR_MISALIGNMENT values. The peeling factor is the
1965 vectorization factor minus the misalignment as an element
1966 count. */
1967 mis = negative ? DR_MISALIGNMENT (dr0) : -DR_MISALIGNMENT (dr0);
1968 unsigned int target_align = DR_TARGET_ALIGNMENT (dr0);
1969 npeel = ((mis & (target_align - 1))
1970 / vect_get_scalar_dr_size (dr0));
1973 /* For interleaved data access every iteration accesses all the
1974 members of the group, therefore we divide the number of iterations
1975 by the group size. */
1976 stmt_info = vinfo_for_stmt (DR_STMT (dr0));
1977 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1978 npeel /= GROUP_SIZE (stmt_info);
1980 if (dump_enabled_p ())
1981 dump_printf_loc (MSG_NOTE, vect_location,
1982 "Try peeling by %d\n", npeel);
1985 /* Ensure that all datarefs can be vectorized after the peel. */
1986 if (!vect_peeling_supportable (loop_vinfo, dr0, npeel))
1987 do_peeling = false;
1989 /* Check if all datarefs are supportable and log. */
1990 if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
1992 stat = vect_verify_datarefs_alignment (loop_vinfo);
1993 if (!stat)
1994 do_peeling = false;
1995 else
1996 return stat;
1999 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
2000 if (do_peeling)
2002 unsigned max_allowed_peel
2003 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
2004 if (max_allowed_peel != (unsigned)-1)
2006 unsigned max_peel = npeel;
2007 if (max_peel == 0)
2009 unsigned int target_align = DR_TARGET_ALIGNMENT (dr0);
2010 max_peel = target_align / vect_get_scalar_dr_size (dr0) - 1;
2012 if (max_peel > max_allowed_peel)
2014 do_peeling = false;
2015 if (dump_enabled_p ())
2016 dump_printf_loc (MSG_NOTE, vect_location,
2017 "Disable peeling, max peels reached: %d\n", max_peel);
2022 /* Cost model #2 - if peeling may result in a remaining loop not
2023 iterating enough to be vectorized then do not peel. */
2024 if (do_peeling
2025 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2027 unsigned max_peel
2028 = npeel == 0 ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1 : npeel;
2029 if (LOOP_VINFO_INT_NITERS (loop_vinfo)
2030 < LOOP_VINFO_VECT_FACTOR (loop_vinfo) + max_peel)
2031 do_peeling = false;
2034 if (do_peeling)
2036 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2037 If the misalignment of DR_i is identical to that of dr0 then set
2038 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
2039 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2040 by the peeling factor times the element size of DR_i (MOD the
2041 vectorization factor times the size). Otherwise, the
2042 misalignment of DR_i must be set to unknown. */
2043 FOR_EACH_VEC_ELT (datarefs, i, dr)
2044 if (dr != dr0)
2046 /* Strided accesses perform only component accesses, alignment
2047 is irrelevant for them. */
2048 stmt_info = vinfo_for_stmt (DR_STMT (dr));
2049 if (STMT_VINFO_STRIDED_P (stmt_info)
2050 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2051 continue;
2053 vect_update_misalignment_for_peel (dr, dr0, npeel);
2056 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
2057 if (npeel)
2058 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2059 else
2060 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2061 = DR_MISALIGNMENT (dr0);
2062 SET_DR_MISALIGNMENT (dr0, 0);
2063 if (dump_enabled_p ())
2065 dump_printf_loc (MSG_NOTE, vect_location,
2066 "Alignment of access forced using peeling.\n");
2067 dump_printf_loc (MSG_NOTE, vect_location,
2068 "Peeling for alignment will be applied.\n");
2071 /* The inside-loop cost will be accounted for in vectorizable_load
2072 and vectorizable_store correctly with adjusted alignments.
2073 Drop the body_cst_vec on the floor here. */
2074 stat = vect_verify_datarefs_alignment (loop_vinfo);
2075 gcc_assert (stat);
2076 return stat;
2080 /* (2) Versioning to force alignment. */
2082 /* Try versioning if:
2083 1) the loop is optimized for speed,
2084 2) there is at least one unsupported misaligned data ref with an unknown
2085 misalignment, and
2086 3) all misaligned data refs with a known misalignment are supported, and
2087 4) the number of runtime alignment checks is within reason. */
2089 do_versioning =
2090 optimize_loop_nest_for_speed_p (loop)
2091 && (!loop->inner); /* FORNOW */
2093 if (do_versioning)
2095 FOR_EACH_VEC_ELT (datarefs, i, dr)
2097 stmt = DR_STMT (dr);
2098 stmt_info = vinfo_for_stmt (stmt);
2100 /* For interleaving, only the alignment of the first access
2101 matters. */
2102 if (aligned_access_p (dr)
2103 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2104 && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
2105 continue;
2107 if (STMT_VINFO_STRIDED_P (stmt_info))
2109 /* Strided loads perform only component accesses, alignment is
2110 irrelevant for them. */
2111 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
2112 continue;
2113 do_versioning = false;
2114 break;
2117 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
2119 if (!supportable_dr_alignment)
2121 gimple *stmt;
2122 int mask;
2123 tree vectype;
2125 if (known_alignment_for_access_p (dr)
2126 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2127 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
2129 do_versioning = false;
2130 break;
2133 stmt = DR_STMT (dr);
2134 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
2135 gcc_assert (vectype);
2137 /* The rightmost bits of an aligned address must be zeros.
2138 Construct the mask needed for this test. For example,
2139 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2140 mask must be 15 = 0xf. */
2141 mask = GET_MODE_SIZE (TYPE_MODE (vectype)) - 1;
2143 /* FORNOW: use the same mask to test all potentially unaligned
2144 references in the loop. The vectorizer currently supports
2145 a single vector size, see the reference to
2146 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
2147 vectorization factor is computed. */
2148 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
2149 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
2150 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2151 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
2152 DR_STMT (dr));
2156 /* Versioning requires at least one misaligned data reference. */
2157 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2158 do_versioning = false;
2159 else if (!do_versioning)
2160 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2163 if (do_versioning)
2165 vec<gimple *> may_misalign_stmts
2166 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2167 gimple *stmt;
2169 /* It can now be assumed that the data references in the statements
2170 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2171 of the loop being vectorized. */
2172 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
2174 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2175 dr = STMT_VINFO_DATA_REF (stmt_info);
2176 SET_DR_MISALIGNMENT (dr, 0);
2177 if (dump_enabled_p ())
2178 dump_printf_loc (MSG_NOTE, vect_location,
2179 "Alignment of access forced using versioning.\n");
2182 if (dump_enabled_p ())
2183 dump_printf_loc (MSG_NOTE, vect_location,
2184 "Versioning for alignment will be applied.\n");
2186 /* Peeling and versioning can't be done together at this time. */
2187 gcc_assert (! (do_peeling && do_versioning));
2189 stat = vect_verify_datarefs_alignment (loop_vinfo);
2190 gcc_assert (stat);
2191 return stat;
2194 /* This point is reached if neither peeling nor versioning is being done. */
2195 gcc_assert (! (do_peeling || do_versioning));
2197 stat = vect_verify_datarefs_alignment (loop_vinfo);
2198 return stat;
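/* Illustrative sketch only, not used by the vectorizer: the helper below
   works through the peeling-candidate arithmetic used above with plain
   integers.  The name and parameters are hypothetical; it assumes a
   positive step and a power-of-two TARGET_ALIGN_BYTES, as
   DR_TARGET_ALIGNMENT always is.  With a 32-byte target alignment,
   4-byte elements and a misalignment of 12 bytes the candidates are
   5, 13, 21, ... scalar iterations, mirroring how the loop above steps
   NPEEL_TMP by target_align / dr_size for each hash table entry.  */

static unsigned int
example_peel_candidate (unsigned int misalign_bytes,
			unsigned int target_align_bytes,
			unsigned int elem_size_bytes,
			unsigned int which)
{
  /* Scalar iterations needed to reach the next aligned boundary.  */
  unsigned int first = ((0U - misalign_bytes) & (target_align_bytes - 1))
		       / elem_size_bytes;
  /* Each further candidate peels one more full vector's worth of
     elements, so accesses of different element sizes can still end up
     aligned by the same peel count.  */
  return first + which * (target_align_bytes / elem_size_bytes);
}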
2202 /* Function vect_find_same_alignment_drs.
2204 Update group and alignment relations according to the chosen
2205 vectorization factor. */
2207 static void
2208 vect_find_same_alignment_drs (struct data_dependence_relation *ddr)
2210 struct data_reference *dra = DDR_A (ddr);
2211 struct data_reference *drb = DDR_B (ddr);
2212 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2213 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2215 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
2216 return;
2218 if (dra == drb)
2219 return;
2221 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)
2222 || !operand_equal_p (DR_OFFSET (dra), DR_OFFSET (drb), 0)
2223 || !operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2224 return;
2226 /* Two references with distance zero have the same alignment. */
2227 offset_int diff = (wi::to_offset (DR_INIT (dra))
2228 - wi::to_offset (DR_INIT (drb)));
2229 if (diff != 0)
2231 /* Get the wider of the two alignments. */
2232 unsigned int align_a = (vect_calculate_target_alignment (dra)
2233 / BITS_PER_UNIT);
2234 unsigned int align_b = (vect_calculate_target_alignment (drb)
2235 / BITS_PER_UNIT);
2236 unsigned int max_align = MAX (align_a, align_b);
2238 /* Require the gap to be a multiple of the larger vector alignment. */
2239 if (!wi::multiple_of_p (diff, max_align, SIGNED))
2240 return;
2243 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2244 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
2245 if (dump_enabled_p ())
2247 dump_printf_loc (MSG_NOTE, vect_location,
2248 "accesses have the same alignment: ");
2249 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2250 dump_printf (MSG_NOTE, " and ");
2251 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2252 dump_printf (MSG_NOTE, "\n");
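/* A minimal sketch of the distance test above, with plain integers
   standing in for the DR_INIT values and the byte target alignments.
   The helper name is hypothetical and the function is not used by the
   vectorizer: two references whose base, offset and step compare equal
   get the same misalignment whenever their constant-offset distance is
   a multiple of the wider of the two target alignments.  */

static bool
example_same_alignment_p (long long init_a, long long init_b,
			  unsigned int target_align_a,
			  unsigned int target_align_b)
{
  long long diff = init_a - init_b;
  /* Distance zero trivially gives the same alignment.  */
  if (diff == 0)
    return true;
  /* Otherwise require the gap to be a multiple of the larger
     alignment, just like the wi::multiple_of_p check above.  */
  unsigned int max_align = (target_align_a > target_align_b
			    ? target_align_a : target_align_b);
  return diff % (long long) max_align == 0;
}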
2257 /* Function vect_analyze_data_refs_alignment
2259 Analyze the alignment of the data-references in the loop.
2260 Return FALSE if a data reference is found that cannot be vectorized. */
2262 bool
2263 vect_analyze_data_refs_alignment (loop_vec_info vinfo)
2265 if (dump_enabled_p ())
2266 dump_printf_loc (MSG_NOTE, vect_location,
2267 "=== vect_analyze_data_refs_alignment ===\n");
2269 /* Mark groups of data references with same alignment using
2270 data dependence information. */
2271 vec<ddr_p> ddrs = vinfo->ddrs;
2272 struct data_dependence_relation *ddr;
2273 unsigned int i;
2275 FOR_EACH_VEC_ELT (ddrs, i, ddr)
2276 vect_find_same_alignment_drs (ddr);
2278 vec<data_reference_p> datarefs = vinfo->datarefs;
2279 struct data_reference *dr;
2281 vect_record_base_alignments (vinfo);
2282 FOR_EACH_VEC_ELT (datarefs, i, dr)
2284 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
2285 if (STMT_VINFO_VECTORIZABLE (stmt_info)
2286 && !vect_compute_data_ref_alignment (dr))
2288 /* Strided accesses perform only component accesses, misalignment
2289 information is irrelevant for them. */
2290 if (STMT_VINFO_STRIDED_P (stmt_info)
2291 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2292 continue;
2294 if (dump_enabled_p ())
2295 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2296 "not vectorized: can't calculate alignment "
2297 "for data ref.\n");
2299 return false;
2303 return true;
2307 /* Analyze alignment of DRs of stmts in NODE. */
2309 static bool
2310 vect_slp_analyze_and_verify_node_alignment (slp_tree node)
2312 /* We vectorize from the first scalar stmt in the node unless
2313 the node is permuted in which case we start from the first
2314 element in the group. */
2315 gimple *first_stmt = SLP_TREE_SCALAR_STMTS (node)[0];
2316 data_reference_p first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2317 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2318 first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_stmt));
2320 data_reference_p dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2321 if (! vect_compute_data_ref_alignment (dr)
2322 /* For creating the data-ref pointer we need alignment of the
2323 first element anyway. */
2324 || (dr != first_dr
2325 && ! vect_compute_data_ref_alignment (first_dr))
2326 || ! verify_data_ref_alignment (dr))
2328 if (dump_enabled_p ())
2329 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2330 "not vectorized: bad data alignment in basic "
2331 "block.\n");
2332 return false;
2335 return true;
2338 /* Function vect_slp_analyze_and_verify_instance_alignment
2340 Analyze the alignment of the data-references in the SLP instance.
2341 Return FALSE if a data reference is found that cannot be vectorized. */
2343 bool
2344 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
2346 if (dump_enabled_p ())
2347 dump_printf_loc (MSG_NOTE, vect_location,
2348 "=== vect_slp_analyze_and_verify_instance_alignment ===\n");
2350 slp_tree node;
2351 unsigned i;
2352 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2353 if (! vect_slp_analyze_and_verify_node_alignment (node))
2354 return false;
2356 node = SLP_INSTANCE_TREE (instance);
2357 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]))
2358 && ! vect_slp_analyze_and_verify_node_alignment
2359 (SLP_INSTANCE_TREE (instance)))
2360 return false;
2362 return true;
2366 /* Analyze groups of accesses: check that DR belongs to a group of
2367 accesses of legal size, step, etc. Detect gaps, single element
2368 interleaving, and other special cases. Set grouped access info.
2369 Collect groups of strided stores for further use in SLP analysis.
2370 Worker for vect_analyze_group_access. */
2372 static bool
2373 vect_analyze_group_access_1 (struct data_reference *dr)
2375 tree step = DR_STEP (dr);
2376 tree scalar_type = TREE_TYPE (DR_REF (dr));
2377 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2378 gimple *stmt = DR_STMT (dr);
2379 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2380 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2381 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2382 HOST_WIDE_INT dr_step = -1;
2383 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2384 bool slp_impossible = false;
2386 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2387 size of the interleaving group (including gaps). */
2388 if (tree_fits_shwi_p (step))
2390 dr_step = tree_to_shwi (step);
2391 /* Check that STEP is a multiple of type size. Otherwise there is
2392 a non-element-sized gap at the end of the group which we
2393 cannot represent in GROUP_GAP or GROUP_SIZE.
2394 ??? As we can handle non-constant step fine here we should
2395 simply remove uses of GROUP_GAP between the last and first
2396 element and instead rely on DR_STEP. GROUP_SIZE then would
2397 simply not include that gap. */
2398 if ((dr_step % type_size) != 0)
2400 if (dump_enabled_p ())
2402 dump_printf_loc (MSG_NOTE, vect_location,
2403 "Step ");
2404 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2405 dump_printf (MSG_NOTE,
2406 " is not a multiple of the element size for ");
2407 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2408 dump_printf (MSG_NOTE, "\n");
2410 return false;
2412 groupsize = absu_hwi (dr_step) / type_size;
2414 else
2415 groupsize = 0;
2417 /* A non-consecutive access is possible only if it is part of an interleaving group. */
2418 if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2420 /* Check if this DR is part of an interleaving group, and is a single
2421 element of the group that is accessed in the loop. */
2423 /* Gaps are supported only for loads. STEP must be a multiple of the type
2424 size. The size of the group must be a power of 2. */
2425 if (DR_IS_READ (dr)
2426 && (dr_step % type_size) == 0
2427 && groupsize > 0
2428 && pow2p_hwi (groupsize))
2430 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2431 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2432 GROUP_GAP (stmt_info) = groupsize - 1;
2433 if (dump_enabled_p ())
2435 dump_printf_loc (MSG_NOTE, vect_location,
2436 "Detected single element interleaving ");
2437 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2438 dump_printf (MSG_NOTE, " step ");
2439 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2440 dump_printf (MSG_NOTE, "\n");
2443 return true;
2446 if (dump_enabled_p ())
2448 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2449 "not consecutive access ");
2450 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2453 if (bb_vinfo)
2455 /* Mark the statement as unvectorizable. */
2456 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2457 return true;
2460 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2461 STMT_VINFO_STRIDED_P (stmt_info) = true;
2462 return true;
2465 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2467 /* First stmt in the interleaving chain. Check the chain. */
2468 gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2469 struct data_reference *data_ref = dr;
2470 unsigned int count = 1;
2471 tree prev_init = DR_INIT (data_ref);
2472 gimple *prev = stmt;
2473 HOST_WIDE_INT diff, gaps = 0;
2475 while (next)
2477 /* Skip same data-refs. In case two or more stmts share a
2478 data-ref (supported only for loads), we vectorize only the first
2479 stmt, and the rest get their vectorized loads from the first
2480 one. */
2481 if (!tree_int_cst_compare (DR_INIT (data_ref),
2482 DR_INIT (STMT_VINFO_DATA_REF (
2483 vinfo_for_stmt (next)))))
2485 if (DR_IS_WRITE (data_ref))
2487 if (dump_enabled_p ())
2488 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2489 "Two store stmts share the same dr.\n");
2490 return false;
2493 if (dump_enabled_p ())
2494 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2495 "Two or more load stmts share the same dr.\n");
2497 /* For load use the same data-ref load. */
2498 GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2500 prev = next;
2501 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2502 continue;
2505 prev = next;
2506 data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2508 /* All group members have the same STEP by construction. */
2509 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2511 /* Check that the distance between two accesses is equal to the type
2512 size. Otherwise, we have gaps. */
2513 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2514 - TREE_INT_CST_LOW (prev_init)) / type_size;
2515 if (diff != 1)
2517 /* FORNOW: SLP of accesses with gaps is not supported. */
2518 slp_impossible = true;
2519 if (DR_IS_WRITE (data_ref))
2521 if (dump_enabled_p ())
2522 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2523 "interleaved store with gaps\n");
2524 return false;
2527 gaps += diff - 1;
2530 last_accessed_element += diff;
2532 /* Store the gap from the previous member of the group. If there is no
2533 gap in the access, GROUP_GAP is always 1. */
2534 GROUP_GAP (vinfo_for_stmt (next)) = diff;
2536 prev_init = DR_INIT (data_ref);
2537 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2538 /* Count the number of data-refs in the chain. */
2539 count++;
2542 if (groupsize == 0)
2543 groupsize = count + gaps;
2545 /* This could be UINT_MAX but as we are generating code in a very
2546 inefficient way we have to cap earlier. See PR78699 for example. */
2547 if (groupsize > 4096)
2549 if (dump_enabled_p ())
2550 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2551 "group is too large\n");
2552 return false;
2555 /* Check that the size of the interleaving is equal to count for stores,
2556 i.e., that there are no gaps. */
2557 if (groupsize != count
2558 && !DR_IS_READ (dr))
2560 if (dump_enabled_p ())
2561 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2562 "interleaved store with gaps\n");
2563 return false;
2566 /* If there is a gap after the last load in the group it is the
2567 difference between the groupsize and the last accessed
2568 element.
2569 When there is no gap, this difference should be 0. */
2570 GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - last_accessed_element;
2572 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2573 if (dump_enabled_p ())
2575 dump_printf_loc (MSG_NOTE, vect_location,
2576 "Detected interleaving ");
2577 if (DR_IS_READ (dr))
2578 dump_printf (MSG_NOTE, "load ");
2579 else
2580 dump_printf (MSG_NOTE, "store ");
2581 dump_printf (MSG_NOTE, "of size %u starting with ",
2582 (unsigned)groupsize);
2583 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
2584 if (GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
2585 dump_printf_loc (MSG_NOTE, vect_location,
2586 "There is a gap of %u elements after the group\n",
2587 GROUP_GAP (vinfo_for_stmt (stmt)));
2590 /* SLP: create an SLP data structure for every interleaving group of
2591 stores for further analysis in vect_analyze_slp. */
2592 if (DR_IS_WRITE (dr) && !slp_impossible)
2594 if (loop_vinfo)
2595 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2596 if (bb_vinfo)
2597 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2601 return true;
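/* Illustrative sketch, not used by the vectorizer, of the bookkeeping
   performed above for a group with a known constant step.  OFFSETS
   holds the byte offsets (DR_INITs) of the N members in increasing
   order with no duplicates; GAPS[k] for k > 0 receives the element
   distance of member k from member k - 1 (1 when there is no gap) and
   GAPS[0] the gap between the last access and the end of the group.
   The helper name and the array-based interface are hypothetical; the
   real code walks the GROUP_NEXT_ELEMENT chain instead.  */

static long long
example_group_layout (const long long *offsets, unsigned int n,
		      long long step_bytes, long long type_size,
		      long long *gaps)
{
  /* GROUPSIZE is the step counted in elements, including gaps.  */
  long long groupsize = (step_bytes < 0 ? -step_bytes : step_bytes)
			/ type_size;
  long long last_accessed = 1;
  for (unsigned int k = 1; k < n; k++)
    {
      long long diff = (offsets[k] - offsets[k - 1]) / type_size;
      gaps[k] = diff;
      last_accessed += diff;
    }
  gaps[0] = groupsize - last_accessed;
  return groupsize;
}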
2604 /* Analyze groups of accesses: check that DR belongs to a group of
2605 accesses of legal size, step, etc. Detect gaps, single element
2606 interleaving, and other special cases. Set grouped access info.
2607 Collect groups of strided stores for further use in SLP analysis. */
2609 static bool
2610 vect_analyze_group_access (struct data_reference *dr)
2612 if (!vect_analyze_group_access_1 (dr))
2614 /* Dissolve the group if present. */
2615 gimple *next;
2616 gimple *stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dr)));
2617 while (stmt)
2619 stmt_vec_info vinfo = vinfo_for_stmt (stmt);
2620 next = GROUP_NEXT_ELEMENT (vinfo);
2621 GROUP_FIRST_ELEMENT (vinfo) = NULL;
2622 GROUP_NEXT_ELEMENT (vinfo) = NULL;
2623 stmt = next;
2625 return false;
2627 return true;
2630 /* Analyze the access pattern of the data-reference DR.
2631 In case of non-consecutive accesses call vect_analyze_group_access() to
2632 analyze groups of accesses. */
2634 static bool
2635 vect_analyze_data_ref_access (struct data_reference *dr)
2637 tree step = DR_STEP (dr);
2638 tree scalar_type = TREE_TYPE (DR_REF (dr));
2639 gimple *stmt = DR_STMT (dr);
2640 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2641 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2642 struct loop *loop = NULL;
2644 if (loop_vinfo)
2645 loop = LOOP_VINFO_LOOP (loop_vinfo);
2647 if (loop_vinfo && !step)
2649 if (dump_enabled_p ())
2650 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2651 "bad data-ref access in loop\n");
2652 return false;
2655 /* Allow loads with zero step in inner-loop vectorization. */
2656 if (loop_vinfo && integer_zerop (step))
2658 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2659 if (!nested_in_vect_loop_p (loop, stmt))
2660 return DR_IS_READ (dr);
2661 /* Allow references with zero step for outer loops marked
2662 with pragma omp simd only - it guarantees absence of
2663 loop-carried dependencies between inner loop iterations. */
2664 if (!loop->force_vectorize)
2666 if (dump_enabled_p ())
2667 dump_printf_loc (MSG_NOTE, vect_location,
2668 "zero step in inner loop of nest\n");
2669 return false;
2673 if (loop && nested_in_vect_loop_p (loop, stmt))
2675 /* Interleaved accesses are not yet supported within outer-loop
2676 vectorization for references in the inner-loop. */
2677 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2679 /* For the rest of the analysis we use the outer-loop step. */
2680 step = STMT_VINFO_DR_STEP (stmt_info);
2681 if (integer_zerop (step))
2683 if (dump_enabled_p ())
2684 dump_printf_loc (MSG_NOTE, vect_location,
2685 "zero step in outer loop.\n");
2686 return DR_IS_READ (dr);
2690 /* Consecutive? */
2691 if (TREE_CODE (step) == INTEGER_CST)
2693 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2694 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2695 || (dr_step < 0
2696 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2698 /* Mark that it is not interleaving. */
2699 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2700 return true;
2704 if (loop && nested_in_vect_loop_p (loop, stmt))
2706 if (dump_enabled_p ())
2707 dump_printf_loc (MSG_NOTE, vect_location,
2708 "grouped access in outer loop.\n");
2709 return false;
2713 /* Assume this is a DR handled by the non-constant strided load case. */
2714 if (TREE_CODE (step) != INTEGER_CST)
2715 return (STMT_VINFO_STRIDED_P (stmt_info)
2716 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2717 || vect_analyze_group_access (dr)));
2719 /* Not a consecutive access - check if it's part of an interleaving group. */
2720 return vect_analyze_group_access (dr);
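/* A condensed sketch of the classification above for the common case of
   a known constant step (the hypothetical helper below ignores outer
   loops, zero-step loads and non-constant steps, which the real code
   handles separately): an access is consecutive exactly when its step
   equals the element size or its negation; anything else is a candidate
   for interleaving or strided handling.  */

static bool
example_consecutive_access_p (long long step_bytes, long long type_size)
{
  return step_bytes == type_size || step_bytes == -type_size;
}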
2723 /* Compare two data-references DRA and DRB so that sorting puts them
2724 into chunks suitable for forming interleaving groups. */
2726 static int
2727 dr_group_sort_cmp (const void *dra_, const void *drb_)
2729 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2730 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2731 int cmp;
2733 /* Stabilize sort. */
2734 if (dra == drb)
2735 return 0;
2737 /* DRs in different loops never belong to the same group. */
2738 loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father;
2739 loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father;
2740 if (loopa != loopb)
2741 return loopa->num < loopb->num ? -1 : 1;
2743 /* Ordering of DRs according to base. */
2744 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2745 DR_BASE_ADDRESS (drb));
2746 if (cmp != 0)
2747 return cmp;
2749 /* And according to DR_OFFSET. */
2750 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2751 if (cmp != 0)
2752 return cmp;
2754 /* Put reads before writes. */
2755 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2756 return DR_IS_READ (dra) ? -1 : 1;
2758 /* Then sort after access size. */
2759 cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2760 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2761 if (cmp != 0)
2762 return cmp;
2764 /* And after step. */
2765 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2766 if (cmp != 0)
2767 return cmp;
2769 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
2770 cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
2771 if (cmp == 0)
2772 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2773 return cmp;
2776 /* Function vect_analyze_data_ref_accesses.
2778 Analyze the access pattern of all the data references in the loop.
2780 FORNOW: the only access pattern that is considered vectorizable is a
2781 simple step 1 (consecutive) access.
2783 FORNOW: handle only arrays and pointer accesses. */
2785 bool
2786 vect_analyze_data_ref_accesses (vec_info *vinfo)
2788 unsigned int i;
2789 vec<data_reference_p> datarefs = vinfo->datarefs;
2790 struct data_reference *dr;
2792 if (dump_enabled_p ())
2793 dump_printf_loc (MSG_NOTE, vect_location,
2794 "=== vect_analyze_data_ref_accesses ===\n");
2796 if (datarefs.is_empty ())
2797 return true;
2799 /* Sort the array of datarefs to make building the interleaving chains
2800 linear. Don't modify the original vector's order; it is needed for
2801 determining what dependencies are reversed. */
2802 vec<data_reference_p> datarefs_copy = datarefs.copy ();
2803 datarefs_copy.qsort (dr_group_sort_cmp);
2805 /* Build the interleaving chains. */
2806 for (i = 0; i < datarefs_copy.length () - 1;)
2808 data_reference_p dra = datarefs_copy[i];
2809 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2810 stmt_vec_info lastinfo = NULL;
2811 if (! STMT_VINFO_VECTORIZABLE (stmtinfo_a))
2813 ++i;
2814 continue;
2816 for (i = i + 1; i < datarefs_copy.length (); ++i)
2818 data_reference_p drb = datarefs_copy[i];
2819 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2820 if (! STMT_VINFO_VECTORIZABLE (stmtinfo_b))
2821 break;
2823 /* ??? Imperfect sorting (non-compatible types, non-modulo
2824 accesses, same accesses) can lead to a group being artificially
2825 split here, as we don't just skip over those. If it really
2826 matters we can push those to a worklist and re-iterate
2827 over them. Then we can just skip ahead to the next DR here. */
2829 /* DRs in a different loop should not be put into the same
2830 interleaving group. */
2831 if (gimple_bb (DR_STMT (dra))->loop_father
2832 != gimple_bb (DR_STMT (drb))->loop_father)
2833 break;
2835 /* Check that the data-refs have same first location (except init)
2836 and they are both either store or load (not load and store,
2837 not masked loads or stores). */
2838 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2839 || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2840 DR_BASE_ADDRESS (drb)) != 0
2841 || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
2842 || !gimple_assign_single_p (DR_STMT (dra))
2843 || !gimple_assign_single_p (DR_STMT (drb)))
2844 break;
2846 /* Check that the data-refs have the same constant size. */
2847 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2848 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2849 if (!tree_fits_uhwi_p (sza)
2850 || !tree_fits_uhwi_p (szb)
2851 || !tree_int_cst_equal (sza, szb))
2852 break;
2854 /* Check that the data-refs have the same step. */
2855 if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
2856 break;
2858 /* Check the types are compatible.
2859 ??? We don't distinguish this during sorting. */
2860 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2861 TREE_TYPE (DR_REF (drb))))
2862 break;
2864 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
2865 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2866 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2867 HOST_WIDE_INT init_prev
2868 = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]));
2869 gcc_assert (init_a <= init_b
2870 && init_a <= init_prev
2871 && init_prev <= init_b);
2873 /* Do not place the same access in the interleaving chain twice. */
2874 if (init_b == init_prev)
2876 gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]))
2877 < gimple_uid (DR_STMT (drb)));
2878 /* ??? For now we simply "drop" the later reference which is
2879 otherwise the same rather than finishing off this group.
2880 In the end we'd want to re-process duplicates forming
2881 multiple groups from the refs, likely by just collecting
2882 all candidates (including duplicates and split points
2883 below) in a vector and then processing them together. */
2884 continue;
2887 /* If init_b == init_a + the size of the type * k, we have an
2888 interleaving, and DRA is accessed before DRB. */
2889 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
2890 if (type_size_a == 0
2891 || (init_b - init_a) % type_size_a != 0)
2892 break;
2894 /* If we have a store, the accesses are adjacent. This splits
2895 groups into chunks we support (we don't support vectorization
2896 of stores with gaps). */
2897 if (!DR_IS_READ (dra) && init_b - init_prev != type_size_a)
2898 break;
2900 /* If the constant, nonzero step is not larger than the difference
2901 between the data-refs' inits, DRB already falls into the next
2902 iteration of DRA; split groups here into suitable sizes. */
2903 if (tree_fits_shwi_p (DR_STEP (dra)))
2905 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
2906 if (step != 0 && step <= (init_b - init_a))
2907 break;
2910 if (dump_enabled_p ())
2912 dump_printf_loc (MSG_NOTE, vect_location,
2913 "Detected interleaving ");
2914 if (DR_IS_READ (dra))
2915 dump_printf (MSG_NOTE, "load ");
2916 else
2917 dump_printf (MSG_NOTE, "store ");
2918 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2919 dump_printf (MSG_NOTE, " and ");
2920 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2921 dump_printf (MSG_NOTE, "\n");
2924 /* Link the found element into the group list. */
2925 if (!GROUP_FIRST_ELEMENT (stmtinfo_a))
2927 GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
2928 lastinfo = stmtinfo_a;
2930 GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
2931 GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
2932 lastinfo = stmtinfo_b;
2936 FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
2937 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
2938 && !vect_analyze_data_ref_access (dr))
2940 if (dump_enabled_p ())
2941 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2942 "not vectorized: complicated access pattern.\n");
2944 if (is_a <bb_vec_info> (vinfo))
2946 /* Mark the statement as not vectorizable. */
2947 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2948 continue;
2950 else
2952 datarefs_copy.release ();
2953 return false;
2957 datarefs_copy.release ();
2958 return true;
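/* A schematic sketch of the per-pair test above using plain integers for
   the relevant DR fields.  The helper is hypothetical and assumes that
   base, offset, step, element size and type compatibility have already
   been checked as in the loop above, and that the duplicate case
   INIT_B == INIT_PREV has been filtered out.  INIT_A is the init of the
   group's first member, INIT_PREV that of the member added last, INIT_B
   the candidate's init; STEP_BYTES is the common constant step (0 when
   unknown) and IS_READ says whether this is a load group.  */

static bool
example_can_extend_group_p (long long init_a, long long init_prev,
			    long long init_b, long long type_size,
			    long long step_bytes, bool is_read)
{
  /* The distance from the group start must be a whole number of
     elements for this to be an interleaving.  */
  if (type_size == 0 || (init_b - init_a) % type_size != 0)
    return false;
  /* Store groups must stay gap-free, i.e. adjacent accesses.  */
  if (!is_read && init_b - init_prev != type_size)
    return false;
  /* Once the distance reaches the step, the candidate belongs to a
     later iteration of the first access and the group is split.  */
  if (step_bytes != 0 && step_bytes <= init_b - init_a)
    return false;
  return true;
}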
2961 /* Function vect_vfa_segment_size.
2963 Create an expression that computes the size of the segment
2964 that will be accessed for a data reference. The function takes into
2965 account that realignment loads may access one more vector.
2967 Input:
2968 DR: The data reference.
2969 LENGTH_FACTOR: segment length to consider.
2971 Return an expression whose value is the size of the segment which will
2972 be accessed by DR. */
2974 static tree
2975 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
2977 tree segment_length;
2979 if (integer_zerop (DR_STEP (dr)))
2980 segment_length = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
2981 else
2982 segment_length = size_binop (MULT_EXPR,
2983 fold_convert (sizetype, DR_STEP (dr)),
2984 fold_convert (sizetype, length_factor));
2986 if (vect_supportable_dr_alignment (dr, false)
2987 == dr_explicit_realign_optimized)
2989 tree vector_size = TYPE_SIZE_UNIT
2990 (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))));
2992 segment_length = size_binop (PLUS_EXPR, segment_length, vector_size);
2994 return segment_length;
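/* For reference, the quantity built above expressed with plain integers
   (a hypothetical sketch; the real function returns a tree expression
   and is also used with symbolic LENGTH_FACTORs): the segment is the
   step times the number of scalar iterations considered, a single
   element for an invariant access, plus one extra vector when the
   optimized realignment scheme may read one vector past the end.  A
   non-negative step is assumed for simplicity; negative steps are
   compensated for in vect_no_alias_p below.  */

static long long
example_segment_size (long long step_bytes, long long length_factor,
		      long long elem_size, long long vector_size,
		      bool realign_optimized_p)
{
  long long seg = (step_bytes == 0
		   ? elem_size
		   : step_bytes * length_factor);
  if (realign_optimized_p)
    seg += vector_size;
  return seg;
}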
2997 /* Function vect_no_alias_p.
2999 Given data references A and B with equal base and offset, for which
3000 the alias relation can therefore be decided at compilation time,
3001 return TRUE if they do not alias each other and FALSE otherwise.
3002 SEGMENT_LENGTH_A and SEGMENT_LENGTH_B are the memory lengths accessed
3003 by A and B respectively. */
3005 static bool
3006 vect_no_alias_p (struct data_reference *a, struct data_reference *b,
3007 tree segment_length_a, tree segment_length_b)
3009 gcc_assert (TREE_CODE (DR_INIT (a)) == INTEGER_CST
3010 && TREE_CODE (DR_INIT (b)) == INTEGER_CST);
3011 if (tree_int_cst_equal (DR_INIT (a), DR_INIT (b)))
3012 return false;
3014 tree seg_a_min = DR_INIT (a);
3015 tree seg_a_max = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_a_min),
3016 seg_a_min, segment_length_a);
3017 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3018 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3019 [a, a+12) */
3020 if (tree_int_cst_compare (DR_STEP (a), size_zero_node) < 0)
3022 tree unit_size = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (a)));
3023 seg_a_min = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_a_max),
3024 seg_a_max, unit_size);
3025 seg_a_max = fold_build2 (PLUS_EXPR, TREE_TYPE (DR_INIT (a)),
3026 DR_INIT (a), unit_size);
3028 tree seg_b_min = DR_INIT (b);
3029 tree seg_b_max = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_b_min),
3030 seg_b_min, segment_length_b);
3031 if (tree_int_cst_compare (DR_STEP (b), size_zero_node) < 0)
3033 tree unit_size = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (b)));
3034 seg_b_min = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_b_max),
3035 seg_b_max, unit_size);
3036 seg_b_max = fold_build2 (PLUS_EXPR, TREE_TYPE (DR_INIT (b)),
3037 DR_INIT (b), unit_size);
3040 if (tree_int_cst_le (seg_a_max, seg_b_min)
3041 || tree_int_cst_le (seg_b_max, seg_a_min))
3042 return true;
3044 return false;
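/* A self-contained conceptual sketch of the disambiguation above, with
   signed byte offsets relative to the common base standing in for the
   DR_INIT and segment-length trees (the helper and its flat-integer
   interface are hypothetical).  Each reference covers a half-open
   interval starting at its init; for a negative step the segment length
   is negative and the interval is shifted up by one element, as in the
   int a[3] example in the comment above.  The references cannot alias
   when the two intervals do not overlap.  */

static bool
example_no_alias_p (long long init_a, long long seg_len_a,
		    long long step_a, long long elem_size_a,
		    long long init_b, long long seg_len_b,
		    long long step_b, long long elem_size_b)
{
  /* Identical starting offsets always alias.  */
  if (init_a == init_b)
    return false;

  long long min_a = init_a, max_a = init_a + seg_len_a;
  if (step_a < 0)
    {
      /* The access walks downwards: [init + seg_len + size, init + size).  */
      min_a = max_a + elem_size_a;
      max_a = init_a + elem_size_a;
    }

  long long min_b = init_b, max_b = init_b + seg_len_b;
  if (step_b < 0)
    {
      min_b = max_b + elem_size_b;
      max_b = init_b + elem_size_b;
    }

  /* Disjoint intervals cannot alias.  */
  return max_a <= min_b || max_b <= min_a;
}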
3047 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3048 in DDR is >= VF. */
3050 static bool
3051 dependence_distance_ge_vf (data_dependence_relation *ddr,
3052 unsigned int loop_depth, unsigned HOST_WIDE_INT vf)
3054 if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3055 || DDR_NUM_DIST_VECTS (ddr) == 0)
3056 return false;
3058 /* If the dependence is exact, we should have limited the VF instead. */
3059 gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3061 unsigned int i;
3062 lambda_vector dist_v;
3063 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3065 HOST_WIDE_INT dist = dist_v[loop_depth];
3066 if (dist != 0
3067 && !(dist > 0 && DDR_REVERSED_P (ddr))
3068 && (unsigned HOST_WIDE_INT) abs_hwi (dist) < vf)
3069 return false;
3072 if (dump_enabled_p ())
3074 dump_printf_loc (MSG_NOTE, vect_location,
3075 "dependence distance between ");
3076 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
3077 dump_printf (MSG_NOTE, " and ");
3078 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
3079 dump_printf (MSG_NOTE, " is >= VF\n");
3082 return true;
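/* A stripped-down sketch of the distance test above; the helper below is
   hypothetical and takes the distance-vector entries for the relevant
   loop depth as a plain array, with DDR_REVERSED_P reduced to one flag
   per entry.  It returns true when every constraining nonzero distance
   is at least VF, so vectorizing with that factor cannot violate the
   dependence.  */

static bool
example_distances_ge_vf_p (const long long *dist, const bool *reversed,
			   unsigned int n, unsigned long long vf)
{
  for (unsigned int i = 0; i < n; i++)
    {
      long long d = dist[i];
      if (d == 0)
	continue;
      /* A positive distance on a reversed relation does not constrain
	 us, mirroring the DDR_REVERSED_P check above.  */
      if (d > 0 && reversed[i])
	continue;
      unsigned long long ad
	= d < 0 ? 0ULL - (unsigned long long) d : (unsigned long long) d;
      if (ad < vf)
	return false;
    }
  return true;
}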
3085 /* Function vect_prune_runtime_alias_test_list.
3087 Prune a list of ddrs to be tested at run-time by versioning for alias.
3088 Merge several alias checks into one if possible.
3089 Return FALSE if the resulting list of ddrs is longer than allowed by
3090 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
3092 bool
3093 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3095 typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3096 hash_set <tree_pair_hash> compared_objects;
3098 vec<ddr_p> may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3099 vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3100 = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3101 vec<vec_object_pair> &check_unequal_addrs
3102 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3103 int vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3104 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3106 ddr_p ddr;
3107 unsigned int i;
3108 tree length_factor;
3110 if (dump_enabled_p ())
3111 dump_printf_loc (MSG_NOTE, vect_location,
3112 "=== vect_prune_runtime_alias_test_list ===\n");
3114 if (may_alias_ddrs.is_empty ())
3115 return true;
3117 comp_alias_ddrs.create (may_alias_ddrs.length ());
3119 unsigned int loop_depth
3120 = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3121 LOOP_VINFO_LOOP_NEST (loop_vinfo));
3123 /* First, we collect all data ref pairs for aliasing checks. */
3124 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3126 int comp_res;
3127 struct data_reference *dr_a, *dr_b;
3128 gimple *dr_group_first_a, *dr_group_first_b;
3129 tree segment_length_a, segment_length_b;
3130 gimple *stmt_a, *stmt_b;
3132 /* Ignore the alias if the VF we chose ended up being no greater
3133 than the dependence distance. */
3134 if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3135 continue;
3137 if (DDR_OBJECT_A (ddr))
3139 vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3140 if (!compared_objects.add (new_pair))
3142 if (dump_enabled_p ())
3144 dump_printf_loc (MSG_NOTE, vect_location, "checking that ");
3145 dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.first);
3146 dump_printf (MSG_NOTE, " and ");
3147 dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.second);
3148 dump_printf (MSG_NOTE, " have different addresses\n");
3150 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3152 continue;
3155 dr_a = DDR_A (ddr);
3156 stmt_a = DR_STMT (DDR_A (ddr));
3157 dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
3158 if (dr_group_first_a)
3160 stmt_a = dr_group_first_a;
3161 dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
3164 dr_b = DDR_B (ddr);
3165 stmt_b = DR_STMT (DDR_B (ddr));
3166 dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
3167 if (dr_group_first_b)
3169 stmt_b = dr_group_first_b;
3170 dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
3173 if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
3174 length_factor = scalar_loop_iters;
3175 else
3176 length_factor = size_int (vect_factor);
3177 segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
3178 segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
3180 comp_res = data_ref_compare_tree (DR_BASE_ADDRESS (dr_a),
3181 DR_BASE_ADDRESS (dr_b));
3182 if (comp_res == 0)
3183 comp_res = data_ref_compare_tree (DR_OFFSET (dr_a),
3184 DR_OFFSET (dr_b));
3186 /* Alias is known at compilation time. */
3187 if (comp_res == 0
3188 && TREE_CODE (DR_STEP (dr_a)) == INTEGER_CST
3189 && TREE_CODE (DR_STEP (dr_b)) == INTEGER_CST
3190 && TREE_CODE (segment_length_a) == INTEGER_CST
3191 && TREE_CODE (segment_length_b) == INTEGER_CST)
3193 if (vect_no_alias_p (dr_a, dr_b, segment_length_a, segment_length_b))
3194 continue;
3196 if (dump_enabled_p ())
3197 dump_printf_loc (MSG_NOTE, vect_location,
3198 "not vectorized: compilation time alias.\n");
3200 return false;
3203 dr_with_seg_len_pair_t dr_with_seg_len_pair
3204 (dr_with_seg_len (dr_a, segment_length_a),
3205 dr_with_seg_len (dr_b, segment_length_b));
3207 /* Canonicalize pairs by sorting the two DR members. */
3208 if (comp_res > 0)
3209 std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
3211 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3214 prune_runtime_alias_test_list (&comp_alias_ddrs,
3215 (unsigned HOST_WIDE_INT) vect_factor);
3217 unsigned int count = (comp_alias_ddrs.length ()
3218 + check_unequal_addrs.length ());
3219 dump_printf_loc (MSG_NOTE, vect_location,
3220 "improved number of alias checks from %d to %d\n",
3221 may_alias_ddrs.length (), count);
3222 if ((int) count > PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
3224 if (dump_enabled_p ())
3225 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3226 "number of versioning for alias "
3227 "run-time tests exceeds %d "
3228 "(--param vect-max-version-for-alias-checks)\n",
3229 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
3230 return false;
3233 return true;
3236 /* Return true if a non-affine read or write in STMT is suitable for a
3237 gather load or scatter store. Describe the operation in *INFO if so. */
3239 bool
3240 vect_check_gather_scatter (gimple *stmt, loop_vec_info loop_vinfo,
3241 gather_scatter_info *info)
3243 HOST_WIDE_INT scale = 1;
3244 poly_int64 pbitpos, pbitsize;
3245 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3246 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3247 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3248 tree offtype = NULL_TREE;
3249 tree decl, base, off;
3250 machine_mode pmode;
3251 int punsignedp, reversep, pvolatilep = 0;
3253 base = DR_REF (dr);
3254 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3255 see if we can use the def stmt of the address. */
3256 if (is_gimple_call (stmt)
3257 && gimple_call_internal_p (stmt)
3258 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
3259 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
3260 && TREE_CODE (base) == MEM_REF
3261 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3262 && integer_zerop (TREE_OPERAND (base, 1))
3263 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3265 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3266 if (is_gimple_assign (def_stmt)
3267 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3268 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3271 /* The gather and scatter builtins need an address of the form
3272 loop_invariant + vector * {1, 2, 4, 8}
3273 or
3274 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3275 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3276 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3277 multiplications and additions in it. To get a vector, we need
3278 a single SSA_NAME that will be defined in the loop and will
3279 contain everything that is not loop invariant and that can be
3280 vectorized. The following code attempts to find such a preexisting
3281 SSA_NAME OFF and put the loop invariants into a tree BASE
3282 that can be gimplified before the loop. */
3283 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3284 &punsignedp, &reversep, &pvolatilep);
3285 gcc_assert (base && !reversep);
3286 poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
3288 if (TREE_CODE (base) == MEM_REF)
3290 if (!integer_zerop (TREE_OPERAND (base, 1)))
3292 if (off == NULL_TREE)
3293 off = wide_int_to_tree (sizetype, mem_ref_offset (base));
3294 else
3295 off = size_binop (PLUS_EXPR, off,
3296 fold_convert (sizetype, TREE_OPERAND (base, 1)));
3298 base = TREE_OPERAND (base, 0);
3300 else
3301 base = build_fold_addr_expr (base);
3303 if (off == NULL_TREE)
3304 off = size_zero_node;
3306 /* If BASE is not loop invariant and OFF is 0, then we start with just
3307 the constant offset in the loop-invariant BASE and continue with BASE
3308 as OFF; if OFF is nonzero, give up.
3309 We could handle that case by gimplifying the addition of base + off
3310 into some SSA_NAME and using that as OFF, but for now punt. */
3311 if (!expr_invariant_in_loop_p (loop, base))
3313 if (!integer_zerop (off))
3314 return false;
3315 off = base;
3316 base = size_int (pbytepos);
3318 /* Otherwise put base + constant offset into the loop invariant BASE
3319 and continue with OFF. */
3320 else
3322 base = fold_convert (sizetype, base);
3323 base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
3326 /* OFF at this point may be either a SSA_NAME or some tree expression
3327 from get_inner_reference. Try to peel off loop invariants from it
3328 into BASE as long as possible. */
3329 STRIP_NOPS (off);
3330 while (offtype == NULL_TREE)
3332 enum tree_code code;
3333 tree op0, op1, add = NULL_TREE;
3335 if (TREE_CODE (off) == SSA_NAME)
3337 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3339 if (expr_invariant_in_loop_p (loop, off))
3340 return false;
3342 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3343 break;
3345 op0 = gimple_assign_rhs1 (def_stmt);
3346 code = gimple_assign_rhs_code (def_stmt);
3347 op1 = gimple_assign_rhs2 (def_stmt);
3349 else
3351 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3352 return false;
3353 code = TREE_CODE (off);
3354 extract_ops_from_tree (off, &code, &op0, &op1);
3356 switch (code)
3358 case POINTER_PLUS_EXPR:
3359 case PLUS_EXPR:
3360 if (expr_invariant_in_loop_p (loop, op0))
3362 add = op0;
3363 off = op1;
3364 do_add:
3365 add = fold_convert (sizetype, add);
3366 if (scale != 1)
3367 add = size_binop (MULT_EXPR, add, size_int (scale));
3368 base = size_binop (PLUS_EXPR, base, add);
3369 continue;
3371 if (expr_invariant_in_loop_p (loop, op1))
3373 add = op1;
3374 off = op0;
3375 goto do_add;
3377 break;
3378 case MINUS_EXPR:
3379 if (expr_invariant_in_loop_p (loop, op1))
3381 add = fold_convert (sizetype, op1);
3382 add = size_binop (MINUS_EXPR, size_zero_node, add);
3383 off = op0;
3384 goto do_add;
3386 break;
3387 case MULT_EXPR:
3388 if (scale == 1 && tree_fits_shwi_p (op1))
3390 scale = tree_to_shwi (op1);
3391 off = op0;
3392 continue;
3394 break;
3395 case SSA_NAME:
3396 off = op0;
3397 continue;
3398 CASE_CONVERT:
3399 if (!POINTER_TYPE_P (TREE_TYPE (op0))
3400 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3401 break;
3402 if (TYPE_PRECISION (TREE_TYPE (op0))
3403 == TYPE_PRECISION (TREE_TYPE (off)))
3405 off = op0;
3406 continue;
3408 if (TYPE_PRECISION (TREE_TYPE (op0))
3409 < TYPE_PRECISION (TREE_TYPE (off)))
3411 off = op0;
3412 offtype = TREE_TYPE (off);
3413 STRIP_NOPS (off);
3414 continue;
3416 break;
3417 default:
3418 break;
3420 break;
3423 /* If at the end OFF still isn't a SSA_NAME or isn't
3424 defined in the loop, punt. */
3425 if (TREE_CODE (off) != SSA_NAME
3426 || expr_invariant_in_loop_p (loop, off))
3427 return false;
3429 if (offtype == NULL_TREE)
3430 offtype = TREE_TYPE (off);
3432 if (DR_IS_READ (dr))
3433 decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info),
3434 offtype, scale);
3435 else
3436 decl = targetm.vectorize.builtin_scatter (STMT_VINFO_VECTYPE (stmt_info),
3437 offtype, scale);
3439 if (decl == NULL_TREE)
3440 return false;
3442 info->decl = decl;
3443 info->base = base;
3444 info->offset = off;
3445 info->offset_dt = vect_unknown_def_type;
3446 info->offset_vectype = NULL_TREE;
3447 info->scale = scale;
3448 return true;
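/* For readers unfamiliar with the addressing form matched above, this is
   what one lane of a gather ends up computing once BASE, OFF and SCALE
   have been decomposed (a hypothetical scalar sketch; the real code
   instead records the operands in *INFO and later emits the target's
   gather/scatter builtin on whole vectors):
     address = loop_invariant_base + offset_lane * {1, 2, 4, 8}.  */

static unsigned long long
example_gather_lane_address (unsigned long long base,
			     long long lane_offset, long long scale)
{
  /* SCALE is one of 1, 2, 4 or 8; LANE_OFFSET is the (possibly
     sign-extended) element of the offset vector for this lane.  */
  return base + (unsigned long long) (lane_offset * scale);
}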
3451 /* Function vect_analyze_data_refs.
3453 Find all the data references in the loop or basic block.
3455 The general structure of the analysis of data refs in the vectorizer is as
3456 follows:
3457 1- vect_analyze_data_refs(loop/bb): call
3458 compute_data_dependences_for_loop/bb to find and analyze all data-refs
3459 in the loop/bb and their dependences.
3460 2- vect_analyze_dependences(): apply dependence testing using ddrs.
3461 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
3462 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
3466 bool
3467 vect_analyze_data_refs (vec_info *vinfo, int *min_vf)
3469 struct loop *loop = NULL;
3470 unsigned int i;
3471 struct data_reference *dr;
3472 tree scalar_type;
3474 if (dump_enabled_p ())
3475 dump_printf_loc (MSG_NOTE, vect_location,
3476 "=== vect_analyze_data_refs ===\n");
3478 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3479 loop = LOOP_VINFO_LOOP (loop_vinfo);
3481 /* Go through the data-refs, check that the analysis succeeded. Update
3482 pointer from stmt_vec_info struct to DR and vectype. */
3484 vec<data_reference_p> datarefs = vinfo->datarefs;
3485 FOR_EACH_VEC_ELT (datarefs, i, dr)
3487 gimple *stmt;
3488 stmt_vec_info stmt_info;
3489 tree base, offset, init;
3490 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
3491 bool simd_lane_access = false;
3492 int vf;
3494 again:
3495 if (!dr || !DR_REF (dr))
3497 if (dump_enabled_p ())
3498 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3499 "not vectorized: unhandled data-ref\n");
3500 return false;
3503 stmt = DR_STMT (dr);
3504 stmt_info = vinfo_for_stmt (stmt);
3506 /* Discard clobbers from the dataref vector. We will remove
3507 clobber stmts during vectorization. */
3508 if (gimple_clobber_p (stmt))
3510 free_data_ref (dr);
3511 if (i == datarefs.length () - 1)
3513 datarefs.pop ();
3514 break;
3516 datarefs.ordered_remove (i);
3517 dr = datarefs[i];
3518 goto again;
3521 /* Check that analysis of the data-ref succeeded. */
3522 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
3523 || !DR_STEP (dr))
3525 bool maybe_gather
3526 = DR_IS_READ (dr)
3527 && !TREE_THIS_VOLATILE (DR_REF (dr))
3528 && targetm.vectorize.builtin_gather != NULL;
3529 bool maybe_scatter
3530 = DR_IS_WRITE (dr)
3531 && !TREE_THIS_VOLATILE (DR_REF (dr))
3532 && targetm.vectorize.builtin_scatter != NULL;
3533 bool maybe_simd_lane_access
3534 = is_a <loop_vec_info> (vinfo) && loop->simduid;
3536 /* If the target supports vector gather loads or scatter stores, or if
3537 this might be a SIMD lane access, see if it can be handled that way. */
3538 if (is_a <loop_vec_info> (vinfo)
3539 && (maybe_gather || maybe_scatter || maybe_simd_lane_access)
3540 && !nested_in_vect_loop_p (loop, stmt))
3542 struct data_reference *newdr
3543 = create_data_ref (NULL, loop_containing_stmt (stmt),
3544 DR_REF (dr), stmt, !maybe_scatter,
3545 DR_IS_CONDITIONAL_IN_STMT (dr));
3546 gcc_assert (newdr != NULL && DR_REF (newdr));
3547 if (DR_BASE_ADDRESS (newdr)
3548 && DR_OFFSET (newdr)
3549 && DR_INIT (newdr)
3550 && DR_STEP (newdr)
3551 && integer_zerop (DR_STEP (newdr)))
3553 if (maybe_simd_lane_access)
3555 tree off = DR_OFFSET (newdr);
3556 STRIP_NOPS (off);
3557 if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
3558 && TREE_CODE (off) == MULT_EXPR
3559 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
3561 tree step = TREE_OPERAND (off, 1);
3562 off = TREE_OPERAND (off, 0);
3563 STRIP_NOPS (off);
3564 if (CONVERT_EXPR_P (off)
3565 && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
3566 0)))
3567 < TYPE_PRECISION (TREE_TYPE (off)))
3568 off = TREE_OPERAND (off, 0);
3569 if (TREE_CODE (off) == SSA_NAME)
3571 gimple *def = SSA_NAME_DEF_STMT (off);
3572 tree reft = TREE_TYPE (DR_REF (newdr));
3573 if (is_gimple_call (def)
3574 && gimple_call_internal_p (def)
3575 && (gimple_call_internal_fn (def)
3576 == IFN_GOMP_SIMD_LANE))
3578 tree arg = gimple_call_arg (def, 0);
3579 gcc_assert (TREE_CODE (arg) == SSA_NAME);
3580 arg = SSA_NAME_VAR (arg);
3581 if (arg == loop->simduid
3582 /* For now. */
3583 && tree_int_cst_equal
3584 (TYPE_SIZE_UNIT (reft),
3585 step))
3587 DR_OFFSET (newdr) = ssize_int (0);
3588 DR_STEP (newdr) = step;
3589 DR_OFFSET_ALIGNMENT (newdr)
3590 = BIGGEST_ALIGNMENT;
3591 DR_STEP_ALIGNMENT (newdr)
3592 = highest_pow2_factor (step);
3593 dr = newdr;
3594 simd_lane_access = true;
3600 if (!simd_lane_access && (maybe_gather || maybe_scatter))
3602 dr = newdr;
3603 if (maybe_gather)
3604 gatherscatter = GATHER;
3605 else
3606 gatherscatter = SCATTER;
3609 if (gatherscatter == SG_NONE && !simd_lane_access)
3610 free_data_ref (newdr);
3613 if (gatherscatter == SG_NONE && !simd_lane_access)
3615 if (dump_enabled_p ())
3617 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3618 "not vectorized: data ref analysis "
3619 "failed ");
3620 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3623 if (is_a <bb_vec_info> (vinfo))
3624 break;
3626 return false;
3630 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
3632 if (dump_enabled_p ())
3633 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3634 "not vectorized: base addr of dr is a "
3635 "constant\n");
3637 if (is_a <bb_vec_info> (vinfo))
3638 break;
3640 if (gatherscatter != SG_NONE || simd_lane_access)
3641 free_data_ref (dr);
3642 return false;
3645 if (TREE_THIS_VOLATILE (DR_REF (dr)))
3647 if (dump_enabled_p ())
3649 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3650 "not vectorized: volatile type ");
3651 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3654 if (is_a <bb_vec_info> (vinfo))
3655 break;
3657 return false;
3660 if (stmt_can_throw_internal (stmt))
3662 if (dump_enabled_p ())
3664 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3665 "not vectorized: statement can throw an "
3666 "exception ");
3667 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3670 if (is_a <bb_vec_info> (vinfo))
3671 break;
3673 if (gatherscatter != SG_NONE || simd_lane_access)
3674 free_data_ref (dr);
3675 return false;
3678 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
3679 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
3681 if (dump_enabled_p ())
3683 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3684 "not vectorized: statement is bitfield "
3685 "access ");
3686 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3689 if (is_a <bb_vec_info> (vinfo))
3690 break;
3692 if (gatherscatter != SG_NONE || simd_lane_access)
3693 free_data_ref (dr);
3694 return false;
3697 base = unshare_expr (DR_BASE_ADDRESS (dr));
3698 offset = unshare_expr (DR_OFFSET (dr));
3699 init = unshare_expr (DR_INIT (dr));
3701 if (is_gimple_call (stmt)
3702 && (!gimple_call_internal_p (stmt)
3703 || (gimple_call_internal_fn (stmt) != IFN_MASK_LOAD
3704 && gimple_call_internal_fn (stmt) != IFN_MASK_STORE)))
3706 if (dump_enabled_p ())
3708 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3709 "not vectorized: dr in a call ");
3710 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3713 if (is_a <bb_vec_info> (vinfo))
3714 break;
3716 if (gatherscatter != SG_NONE || simd_lane_access)
3717 free_data_ref (dr);
3718 return false;
3721 /* Update DR field in stmt_vec_info struct. */
3723 /* If the dataref is in an inner-loop of the loop that is considered for
3724 vectorization, we also want to analyze the access relative to
3725 the outer-loop (DR contains information only relative to the
3726 inner-most enclosing loop). We do that by building a reference to the
3727 first location accessed by the inner-loop, and analyze it relative to
3728 the outer-loop. */
3729 if (loop && nested_in_vect_loop_p (loop, stmt))
3731 /* Build a reference to the first location accessed by the
3732 inner loop: *(BASE + INIT + OFFSET). By construction,
3733 this address must be invariant in the inner loop, so we
3734 can consider it as being used in the outer loop. */
3735 tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
3736 init, offset);
3737 tree init_addr = fold_build_pointer_plus (base, init_offset);
3738 tree init_ref = build_fold_indirect_ref (init_addr);
3740 if (dump_enabled_p ())
3742 dump_printf_loc (MSG_NOTE, vect_location,
3743 "analyze in outer loop: ");
3744 dump_generic_expr (MSG_NOTE, TDF_SLIM, init_ref);
3745 dump_printf (MSG_NOTE, "\n");
3748 if (!dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
3749 init_ref, loop))
3750 /* dr_analyze_innermost already explained the failure. */
3751 return false;
3753 if (dump_enabled_p ())
3755 dump_printf_loc (MSG_NOTE, vect_location,
3756 "\touter base_address: ");
3757 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3758 STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3759 dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
3760 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3761 STMT_VINFO_DR_OFFSET (stmt_info));
3762 dump_printf (MSG_NOTE,
3763 "\n\touter constant offset from base address: ");
3764 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3765 STMT_VINFO_DR_INIT (stmt_info));
3766 dump_printf (MSG_NOTE, "\n\touter step: ");
3767 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3768 STMT_VINFO_DR_STEP (stmt_info));
3769 dump_printf (MSG_NOTE, "\n\touter base alignment: %d\n",
3770 STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info));
3771 dump_printf (MSG_NOTE, "\n\touter base misalignment: %d\n",
3772 STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info));
3773 dump_printf (MSG_NOTE, "\n\touter offset alignment: %d\n",
3774 STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info));
3775 dump_printf (MSG_NOTE, "\n\touter step alignment: %d\n",
3776 STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
3780 if (STMT_VINFO_DATA_REF (stmt_info))
3782 if (dump_enabled_p ())
3784 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3785 "not vectorized: more than one data ref "
3786 "in stmt: ");
3787 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3790 if (is_a <bb_vec_info> (vinfo))
3791 break;
3793 if (gatherscatter != SG_NONE || simd_lane_access)
3794 free_data_ref (dr);
3795 return false;
3798 STMT_VINFO_DATA_REF (stmt_info) = dr;
3799 if (simd_lane_access)
3801 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
3802 free_data_ref (datarefs[i]);
3803 datarefs[i] = dr;
3806 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == ADDR_EXPR
3807 && VAR_P (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0))
3808 && DECL_NONALIASED (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0)))
3810 if (dump_enabled_p ())
3812 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3813 "not vectorized: base object not addressable "
3814 "for stmt: ");
3815 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3817 if (is_a <bb_vec_info> (vinfo))
3819 /* In BB vectorization the ref can still participate
3820 in dependence analysis; we just can't vectorize it. */
3821 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
3822 continue;
3824 return false;
3827 /* Set vectype for STMT. */
3828 scalar_type = TREE_TYPE (DR_REF (dr));
3829 STMT_VINFO_VECTYPE (stmt_info)
3830 = get_vectype_for_scalar_type (scalar_type);
3831 if (!STMT_VINFO_VECTYPE (stmt_info))
3833 if (dump_enabled_p ())
3835 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3836 "not vectorized: no vectype for stmt: ");
3837 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3838 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
3839 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
3840 scalar_type);
3841 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3844 if (is_a <bb_vec_info> (vinfo))
3846 /* No vector type is fine; the ref can still participate
3847 in dependence analysis, we just can't vectorize it. */
3848 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
3849 continue;
3852 if (gatherscatter != SG_NONE || simd_lane_access)
3854 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3855 if (gatherscatter != SG_NONE)
3856 free_data_ref (dr);
3858 return false;
3860 else
3862 if (dump_enabled_p ())
3864 dump_printf_loc (MSG_NOTE, vect_location,
3865 "got vectype for stmt: ");
3866 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3867 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3868 STMT_VINFO_VECTYPE (stmt_info));
3869 dump_printf (MSG_NOTE, "\n");
3873 /* Adjust the minimal vectorization factor according to the
3874 vector type. */
3875 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
3876 if (vf > *min_vf)
3877 *min_vf = vf;
3879 if (gatherscatter != SG_NONE)
3881 gather_scatter_info gs_info;
3882 if (!vect_check_gather_scatter (stmt, as_a <loop_vec_info> (vinfo),
3883 &gs_info)
3884 || !get_vectype_for_scalar_type (TREE_TYPE (gs_info.offset)))
3886 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3887 free_data_ref (dr);
3888 if (dump_enabled_p ())
3890 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3891 (gatherscatter == GATHER) ?
3892 "not vectorized: not suitable for gather "
3893 "load " :
3894 "not vectorized: not suitable for scatter "
3895 "store ");
3896 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3898 return false;
3901 free_data_ref (datarefs[i]);
3902 datarefs[i] = dr;
3903 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
3906 else if (is_a <loop_vec_info> (vinfo)
3907 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
3909 if (nested_in_vect_loop_p (loop, stmt))
3911 if (dump_enabled_p ())
3913 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3914 "not vectorized: not suitable for strided "
3915 "load ");
3916 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3918 return false;
3920 STMT_VINFO_STRIDED_P (stmt_info) = true;
3924 /* If, when trying to vectorize a basic-block, we stopped analysis at the
3925 first dataref we could not analyze, mark the rest of the datarefs as
3926 not vectorizable and truncate the vector of datarefs. That avoids
3927 spending useless time analyzing their dependences. */
3928 if (i != datarefs.length ())
3930 gcc_assert (is_a <bb_vec_info> (vinfo));
3931 for (unsigned j = i; j < datarefs.length (); ++j)
3933 data_reference_p dr = datarefs[j];
3934 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
3935 free_data_ref (dr);
3937 datarefs.truncate (i);
3940 return true;
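/* Illustrative note (assumed example, not from the original sources): the
   simd_lane_access case recognized above matches the arrays of per-lane
   copies that the OpenMP lowering creates for privatized variables in a
   "#pragma omp simd" loop, roughly

     _5 = GOMP_SIMD_LANE (simduid.0);
     D.1234[_5] = ...;

   where DR_OFFSET of the reference is _5 multiplied by the element size.
   The code above rewrites such a data-ref to have a zero offset and a step
   equal to the element size, so it vectorizes as an ordinary contiguous
   access.  */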
3944 /* Function vect_get_new_vect_var.
3946 Returns a name for a new variable. The current naming scheme appends the
3947 prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
3948 the name of vectorizer generated variables, and appends that to NAME if
3949 provided. */
3951 tree
3952 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
3954 const char *prefix;
3955 tree new_vect_var;
3957 switch (var_kind)
3959 case vect_simple_var:
3960 prefix = "vect";
3961 break;
3962 case vect_scalar_var:
3963 prefix = "stmp";
3964 break;
3965 case vect_mask_var:
3966 prefix = "mask";
3967 break;
3968 case vect_pointer_var:
3969 prefix = "vectp";
3970 break;
3971 default:
3972 gcc_unreachable ();
3975 if (name)
3977 char* tmp = concat (prefix, "_", name, NULL);
3978 new_vect_var = create_tmp_reg (type, tmp);
3979 free (tmp);
3981 else
3982 new_vect_var = create_tmp_reg (type, prefix);
3984 return new_vect_var;
3987 /* Like vect_get_new_vect_var but return an SSA name. */
3989 tree
3990 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
3992 const char *prefix;
3993 tree new_vect_var;
3995 switch (var_kind)
3997 case vect_simple_var:
3998 prefix = "vect";
3999 break;
4000 case vect_scalar_var:
4001 prefix = "stmp";
4002 break;
4003 case vect_pointer_var:
4004 prefix = "vectp";
4005 break;
4006 default:
4007 gcc_unreachable ();
4010 if (name)
4012 char* tmp = concat (prefix, "_", name, NULL);
4013 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4014 free (tmp);
4016 else
4017 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4019 return new_vect_var;
4022 /* Duplicate ptr info and set alignment/misalignment on NAME from DR. */
4024 static void
4025 vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr)
4027 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr));
4028 int misalign = DR_MISALIGNMENT (dr);
4029 if (misalign == DR_MISALIGNMENT_UNKNOWN)
4030 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4031 else
4032 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name),
4033 DR_TARGET_ALIGNMENT (dr), misalign);
4036 /* Function vect_create_addr_base_for_vector_ref.
4038 Create an expression that computes the address of the first memory location
4039 that will be accessed for a data reference.
4041 Input:
4042 STMT: The statement containing the data reference.
4043 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4044 OFFSET: Optional. If supplied, it is added to the initial address.
4045 LOOP: Specify relative to which loop-nest the address should be computed.
4046 For example, when the dataref is in an inner-loop nested in an
4047 outer-loop that is now being vectorized, LOOP can be either the
4048 outer-loop, or the inner-loop. The first memory location accessed
4049 by the following dataref ('in' points to short):
4051 for (i=0; i<N; i++)
4052 for (j=0; j<M; j++)
4053 s += in[i+j]
4055 is as follows:
4056 if LOOP=i_loop: &in (relative to i_loop)
4057 if LOOP=j_loop: &in+i*2B (relative to j_loop)
4058 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
4059 initial address. Unlike OFFSET, which is the number of elements to
4060 be added, BYTE_OFFSET is measured in bytes.
4062 Output:
4063 1. Return an SSA_NAME whose value is the address of the memory location of
4064 the first vector of the data reference.
4065 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4066 these statement(s) which define the returned SSA_NAME.
4068 FORNOW: We are only handling array accesses with step 1. */
4070 tree
4071 vect_create_addr_base_for_vector_ref (gimple *stmt,
4072 gimple_seq *new_stmt_list,
4073 tree offset,
4074 tree byte_offset)
4076 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4077 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4078 const char *base_name;
4079 tree addr_base;
4080 tree dest;
4081 gimple_seq seq = NULL;
4082 tree vect_ptr_type;
4083 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
4084 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4085 innermost_loop_behavior *drb = vect_dr_behavior (dr);
4087 tree data_ref_base = unshare_expr (drb->base_address);
4088 tree base_offset = unshare_expr (drb->offset);
4089 tree init = unshare_expr (drb->init);
4091 if (loop_vinfo)
4092 base_name = get_name (data_ref_base);
4093 else
4095 base_offset = ssize_int (0);
4096 init = ssize_int (0);
4097 base_name = get_name (DR_REF (dr));
4100 /* Create base_offset */
4101 base_offset = size_binop (PLUS_EXPR,
4102 fold_convert (sizetype, base_offset),
4103 fold_convert (sizetype, init));
4105 if (offset)
4107 offset = fold_build2 (MULT_EXPR, sizetype,
4108 fold_convert (sizetype, offset), step);
4109 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4110 base_offset, offset);
4112 if (byte_offset)
4114 byte_offset = fold_convert (sizetype, byte_offset);
4115 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4116 base_offset, byte_offset);
4119 /* base + base_offset */
4120 if (loop_vinfo)
4121 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4122 else
4124 addr_base = build1 (ADDR_EXPR,
4125 build_pointer_type (TREE_TYPE (DR_REF (dr))),
4126 unshare_expr (DR_REF (dr)));
4129 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4130 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4131 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4132 gimple_seq_add_seq (new_stmt_list, seq);
4134 if (DR_PTR_INFO (dr)
4135 && TREE_CODE (addr_base) == SSA_NAME
4136 && !SSA_NAME_PTR_INFO (addr_base))
4138 vect_duplicate_ssa_name_ptr_info (addr_base, dr);
4139 if (offset || byte_offset)
4140 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4143 if (dump_enabled_p ())
4145 dump_printf_loc (MSG_NOTE, vect_location, "created ");
4146 dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
4147 dump_printf (MSG_NOTE, "\n");
4150 return addr_base;
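/* Illustrative note (hypothetical names): for a reference a[i] with
   "int a[N]", the innermost behavior typically has base_address == &a and
   offset == init == 0, while the element size is 4.  A call such as

     addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
                                                  some_offset, NULL_TREE);

   then builds "&a + (sizetype) some_offset * 4", forces it into a fresh
   SSA name whose base carries the "vectp" prefix, and appends the defining
   statements to STMTS for the caller to insert (e.g. on the loop preheader
   edge).  */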
4154 /* Function vect_create_data_ref_ptr.
4156 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4157 location accessed in the loop by STMT, along with the def-use update
4158 chain to appropriately advance the pointer through the loop iterations.
4159 Also set aliasing information for the pointer. This pointer is used by
4160 the callers to this function to create a memory reference expression for
4161 vector load/store access.
4163 Input:
4164 1. STMT: a stmt that references memory. Expected to be of the form
4165 GIMPLE_ASSIGN <name, data-ref> or
4166 GIMPLE_ASSIGN <data-ref, name>.
4167 2. AGGR_TYPE: the type of the reference, which should be either a vector
4168 or an array.
4169 3. AT_LOOP: the loop where the vector memref is to be created.
4170 4. OFFSET (optional): an offset to be added to the initial address accessed
4171 by the data-ref in STMT.
4172 5. BSI: location where the new stmts are to be placed if there is no loop
4173 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4174 pointing to the initial address.
4175 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4176 to the initial address accessed by the data-ref in STMT. This is
4177 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4178 in bytes.
4180 Output:
4181 1. Declare a new ptr to vector_type, and have it point to the base of the
4182 data reference (initial address accessed by the data reference).
4183 For example, for vector of type V8HI, the following code is generated:
4185 v8hi *ap;
4186 ap = (v8hi *)initial_address;
4188 if OFFSET is not supplied:
4189 initial_address = &a[init];
4190 if OFFSET is supplied:
4191 initial_address = &a[init + OFFSET];
4192 if BYTE_OFFSET is supplied:
4193 initial_address = &a[init] + BYTE_OFFSET;
4195 Return the initial_address in INITIAL_ADDRESS.
4197 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4198 update the pointer in each iteration of the loop.
4200 Return the increment stmt that updates the pointer in PTR_INCR.
4202 3. Set INV_P to true if the access pattern of the data reference in the
4203 vectorized loop is invariant. Set it to false otherwise.
4205 4. Return the pointer. */
4207 tree
4208 vect_create_data_ref_ptr (gimple *stmt, tree aggr_type, struct loop *at_loop,
4209 tree offset, tree *initial_address,
4210 gimple_stmt_iterator *gsi, gimple **ptr_incr,
4211 bool only_init, bool *inv_p, tree byte_offset)
4213 const char *base_name;
4214 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4215 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4216 struct loop *loop = NULL;
4217 bool nested_in_vect_loop = false;
4218 struct loop *containing_loop = NULL;
4219 tree aggr_ptr_type;
4220 tree aggr_ptr;
4221 tree new_temp;
4222 gimple_seq new_stmt_list = NULL;
4223 edge pe = NULL;
4224 basic_block new_bb;
4225 tree aggr_ptr_init;
4226 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4227 tree aptr;
4228 gimple_stmt_iterator incr_gsi;
4229 bool insert_after;
4230 tree indx_before_incr, indx_after_incr;
4231 gimple *incr;
4232 tree step;
4233 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4235 gcc_assert (TREE_CODE (aggr_type) == ARRAY_TYPE
4236 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4238 if (loop_vinfo)
4240 loop = LOOP_VINFO_LOOP (loop_vinfo);
4241 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4242 containing_loop = (gimple_bb (stmt))->loop_father;
4243 pe = loop_preheader_edge (loop);
4245 else
4247 gcc_assert (bb_vinfo);
4248 only_init = true;
4249 *ptr_incr = NULL;
4252 /* Check the step (evolution) of the load in LOOP, and record
4253 whether it's invariant. */
4254 step = vect_dr_behavior (dr)->step;
4255 if (integer_zerop (step))
4256 *inv_p = true;
4257 else
4258 *inv_p = false;
4260 /* Create an expression for the first address accessed by this load
4261 in LOOP. */
4262 base_name = get_name (DR_BASE_ADDRESS (dr));
4264 if (dump_enabled_p ())
4266 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4267 dump_printf_loc (MSG_NOTE, vect_location,
4268 "create %s-pointer variable to type: ",
4269 get_tree_code_name (TREE_CODE (aggr_type)));
4270 dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4271 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4272 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4273 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4274 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4275 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4276 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4277 else
4278 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4279 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4280 dump_printf (MSG_NOTE, "\n");
4283 /* (1) Create the new aggregate-pointer variable.
4284 Vector and array types inherit the alias set of their component
4285 type by default so we need to use a ref-all pointer if the data
4286 reference does not conflict with the created aggregated data
4287 reference because it is not addressable. */
4288 bool need_ref_all = false;
4289 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4290 get_alias_set (DR_REF (dr))))
4291 need_ref_all = true;
4292 /* Likewise for any of the data references in the stmt group. */
4293 else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
4295 gimple *orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
4298 stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
4299 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4300 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4301 get_alias_set (DR_REF (sdr))))
4303 need_ref_all = true;
4304 break;
4306 orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);
4308 while (orig_stmt);
4310 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4311 need_ref_all);
4312 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4315 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4316 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4317 def-use update cycles for the pointer: one relative to the outer-loop
4318 (LOOP), which is what steps (3) and (4) below do. The other is relative
4319 to the inner-loop (which is the inner-most loop containing the dataref),
4320 and this is done by step (5) below.
4322 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4323 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4324 redundant. Steps (3),(4) create the following:
4326 vp0 = &base_addr;
4327 LOOP: vp1 = phi(vp0,vp2)
4330 vp2 = vp1 + step
4331 goto LOOP
4333 If there is an inner-loop nested in loop, then step (5) will also be
4334 applied, and an additional update in the inner-loop will be created:
4336 vp0 = &base_addr;
4337 LOOP: vp1 = phi(vp0,vp2)
4339 inner: vp3 = phi(vp1,vp4)
4340 vp4 = vp3 + inner_step
4341 if () goto inner
4343 vp2 = vp1 + step
4344 if () goto LOOP */
4346 /* (2) Calculate the initial address of the aggregate-pointer, and set
4347 the aggregate-pointer to point to it before the loop. */
4349 /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader. */
4351 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
4352 offset, byte_offset);
4353 if (new_stmt_list)
4355 if (pe)
4357 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4358 gcc_assert (!new_bb);
4360 else
4361 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4364 *initial_address = new_temp;
4365 aggr_ptr_init = new_temp;
4367 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4368 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4369 inner-loop nested in LOOP (during outer-loop vectorization). */
4371 /* No update in loop is required. */
4372 if (only_init && (!loop_vinfo || at_loop == loop))
4373 aptr = aggr_ptr_init;
4374 else
4376 /* The step of the aggregate pointer is the type size. */
4377 tree iv_step = TYPE_SIZE_UNIT (aggr_type);
4378 /* One exception to the above is when the scalar step of the load in
4379 LOOP is zero. In this case the step here is also zero. */
4380 if (*inv_p)
4381 iv_step = size_zero_node;
4382 else if (tree_int_cst_sgn (step) == -1)
4383 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4385 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4387 create_iv (aggr_ptr_init,
4388 fold_convert (aggr_ptr_type, iv_step),
4389 aggr_ptr, loop, &incr_gsi, insert_after,
4390 &indx_before_incr, &indx_after_incr);
4391 incr = gsi_stmt (incr_gsi);
4392 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4394 /* Copy the points-to information if it exists. */
4395 if (DR_PTR_INFO (dr))
4397 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr);
4398 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr);
4400 if (ptr_incr)
4401 *ptr_incr = incr;
4403 aptr = indx_before_incr;
4406 if (!nested_in_vect_loop || only_init)
4407 return aptr;
4410 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4411 nested in LOOP, if it exists. */
4413 gcc_assert (nested_in_vect_loop);
4414 if (!only_init)
4416 standard_iv_increment_position (containing_loop, &incr_gsi,
4417 &insert_after);
4418 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4419 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4420 &indx_after_incr);
4421 incr = gsi_stmt (incr_gsi);
4422 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4424 /* Copy the points-to information if it exists. */
4425 if (DR_PTR_INFO (dr))
4427 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr);
4428 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr);
4430 if (ptr_incr)
4431 *ptr_incr = incr;
4433 return indx_before_incr;
4435 else
4436 gcc_unreachable ();
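/* Illustrative note (assumed SSA names): when vectorizing a contiguous
   access a[i] with V4SI vectors, the pointer created by this function
   leads to GIMPLE along the lines of

     # preheader
     vectp_a.5 = &a;                       <-- step (2), initial address
     # loop
     vectp_a.6 = PHI <vectp_a.5, vectp_a.7>
     ... = MEM[vectp_a.6];                 <-- use created by the caller
     vectp_a.7 = vectp_a.6 + 16;           <-- steps (3)/(4), bump by VS

   i.e. the single def-use cycle shown in the comment above; the
   nested-in-LOOP case adds a second cycle in the inner loop (step (5)).  */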
4440 /* Function bump_vector_ptr
4442 Increment a pointer (to a vector type) by vector-size. If requested,
4443 i.e. if PTR-INCR is given, then also connect the new increment stmt
4444 to the existing def-use update-chain of the pointer, by modifying
4445 the PTR_INCR as illustrated below:
4447 The pointer def-use update-chain before this function:
4448 DATAREF_PTR = phi (p_0, p_2)
4449 ....
4450 PTR_INCR: p_2 = DATAREF_PTR + step
4452 The pointer def-use update-chain after this function:
4453 DATAREF_PTR = phi (p_0, p_2)
4454 ....
4455 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4456 ....
4457 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4459 Input:
4460 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4461 in the loop.
4462 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4463 the loop. The increment amount across iterations is expected
4464 to be vector_size.
4465 BSI - location where the new update stmt is to be placed.
4466 STMT - the original scalar memory-access stmt that is being vectorized.
4467 BUMP - optional. The offset by which to bump the pointer. If not given,
4468 the offset is assumed to be vector_size.
4470 Output: Return NEW_DATAREF_PTR as illustrated above.
4474 tree
4475 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4476 gimple *stmt, tree bump)
4478 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4479 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4480 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4481 tree update = TYPE_SIZE_UNIT (vectype);
4482 gassign *incr_stmt;
4483 ssa_op_iter iter;
4484 use_operand_p use_p;
4485 tree new_dataref_ptr;
4487 if (bump)
4488 update = bump;
4490 if (TREE_CODE (dataref_ptr) == SSA_NAME)
4491 new_dataref_ptr = copy_ssa_name (dataref_ptr);
4492 else
4493 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4494 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4495 dataref_ptr, update);
4496 vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4498 /* Copy the points-to information if it exists. */
4499 if (DR_PTR_INFO (dr))
4501 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4502 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4505 if (!ptr_incr)
4506 return new_dataref_ptr;
4508 /* Update the vector-pointer's cross-iteration increment. */
4509 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4511 tree use = USE_FROM_PTR (use_p);
4513 if (use == dataref_ptr)
4514 SET_USE (use_p, new_dataref_ptr);
4515 else
4516 gcc_assert (tree_int_cst_compare (use, update) == 0);
4519 return new_dataref_ptr;
4523 /* Function vect_create_destination_var.
4525 Create a new temporary of type VECTYPE. */
4527 tree
4528 vect_create_destination_var (tree scalar_dest, tree vectype)
4530 tree vec_dest;
4531 const char *name;
4532 char *new_name;
4533 tree type;
4534 enum vect_var_kind kind;
4536 kind = vectype
4537 ? VECTOR_BOOLEAN_TYPE_P (vectype)
4538 ? vect_mask_var
4539 : vect_simple_var
4540 : vect_scalar_var;
4541 type = vectype ? vectype : TREE_TYPE (scalar_dest);
4543 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
4545 name = get_name (scalar_dest);
4546 if (name)
4547 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
4548 else
4549 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
4550 vec_dest = vect_get_new_vect_var (type, kind, new_name);
4551 free (new_name);
4553 return vec_dest;
4556 /* Function vect_grouped_store_supported.
4558 Returns TRUE if interleave high and interleave low permutations
4559 are supported, and FALSE otherwise. */
4561 bool
4562 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
4564 machine_mode mode = TYPE_MODE (vectype);
4566 /* vect_permute_store_chain requires the group size to be equal to 3 or
4567 be a power of two. */
4568 if (count != 3 && exact_log2 (count) == -1)
4570 if (dump_enabled_p ())
4571 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4572 "the size of the group of accesses"
4573 " is not a power of 2 or not eqaul to 3\n");
4574 return false;
4577 /* Check that the permutation is supported. */
4578 if (VECTOR_MODE_P (mode))
4580 unsigned int i, nelt = GET_MODE_NUNITS (mode);
4581 auto_vec_perm_indices sel (nelt);
4582 sel.quick_grow (nelt);
4584 if (count == 3)
4586 unsigned int j0 = 0, j1 = 0, j2 = 0;
4587 unsigned int i, j;
4589 for (j = 0; j < 3; j++)
4591 int nelt0 = ((3 - j) * nelt) % 3;
4592 int nelt1 = ((3 - j) * nelt + 1) % 3;
4593 int nelt2 = ((3 - j) * nelt + 2) % 3;
4594 for (i = 0; i < nelt; i++)
4596 if (3 * i + nelt0 < nelt)
4597 sel[3 * i + nelt0] = j0++;
4598 if (3 * i + nelt1 < nelt)
4599 sel[3 * i + nelt1] = nelt + j1++;
4600 if (3 * i + nelt2 < nelt)
4601 sel[3 * i + nelt2] = 0;
4603 if (!can_vec_perm_p (mode, false, &sel))
4605 if (dump_enabled_p ())
4606 dump_printf (MSG_MISSED_OPTIMIZATION,
4607 "permutaion op not supported by target.\n");
4608 return false;
4611 for (i = 0; i < nelt; i++)
4613 if (3 * i + nelt0 < nelt)
4614 sel[3 * i + nelt0] = 3 * i + nelt0;
4615 if (3 * i + nelt1 < nelt)
4616 sel[3 * i + nelt1] = 3 * i + nelt1;
4617 if (3 * i + nelt2 < nelt)
4618 sel[3 * i + nelt2] = nelt + j2++;
4620 if (!can_vec_perm_p (mode, false, &sel))
4622 if (dump_enabled_p ())
4623 dump_printf (MSG_MISSED_OPTIMIZATION,
4624 "permutaion op not supported by target.\n");
4625 return false;
4628 return true;
4630 else
4632 /* If length is not equal to 3 then only power of 2 is supported. */
4633 gcc_assert (pow2p_hwi (count));
4635 for (i = 0; i < nelt / 2; i++)
4637 sel[i * 2] = i;
4638 sel[i * 2 + 1] = i + nelt;
4640 if (can_vec_perm_p (mode, false, &sel))
4642 for (i = 0; i < nelt; i++)
4643 sel[i] += nelt / 2;
4644 if (can_vec_perm_p (mode, false, &sel))
4645 return true;
4650 if (dump_enabled_p ())
4651 dump_printf (MSG_MISSED_OPTIMIZATION,
4652 "permutaion op not supported by target.\n");
4653 return false;
4657 /* Return TRUE if vec_store_lanes is available for COUNT vectors of
4658 type VECTYPE. */
4660 bool
4661 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
4663 return vect_lanes_optab_supported_p ("vec_store_lanes",
4664 vec_store_lanes_optab,
4665 vectype, count);
4669 /* Function vect_permute_store_chain.
4671 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4672 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
4673 the data correctly for the stores. Return the final references for stores
4674 in RESULT_CHAIN.
4676 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4677 The input is 4 vectors each containing 8 elements. We assign a number to
4678 each element, the input sequence is:
4680 1st vec: 0 1 2 3 4 5 6 7
4681 2nd vec: 8 9 10 11 12 13 14 15
4682 3rd vec: 16 17 18 19 20 21 22 23
4683 4th vec: 24 25 26 27 28 29 30 31
4685 The output sequence should be:
4687 1st vec: 0 8 16 24 1 9 17 25
4688 2nd vec: 2 10 18 26 3 11 19 27
4689 3rd vec: 4 12 20 28 5 13 21 29
4690 4th vec: 6 14 22 30 7 15 23 31
4692 i.e., we interleave the contents of the four vectors in their order.
4694 We use interleave_high/low instructions to create such output. The input of
4695 each interleave_high/low operation is two vectors:
4696 1st vec 2nd vec
4697 0 1 2 3 4 5 6 7
4698 the even elements of the result vector are obtained left-to-right from the
4699 high/low elements of the first vector. The odd elements of the result are
4700 obtained left-to-right from the high/low elements of the second vector.
4701 The output of interleave_high will be: 0 4 1 5
4702 and of interleave_low: 2 6 3 7
4705 The permutation is done in log LENGTH stages. In each stage interleave_high
4706 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4707 where the first argument is taken from the first half of DR_CHAIN and the
4708 second argument from its second half.
4709 In our example,
4711 I1: interleave_high (1st vec, 3rd vec)
4712 I2: interleave_low (1st vec, 3rd vec)
4713 I3: interleave_high (2nd vec, 4th vec)
4714 I4: interleave_low (2nd vec, 4th vec)
4716 The output for the first stage is:
4718 I1: 0 16 1 17 2 18 3 19
4719 I2: 4 20 5 21 6 22 7 23
4720 I3: 8 24 9 25 10 26 11 27
4721 I4: 12 28 13 29 14 30 15 31
4723 The output of the second stage, i.e. the final result is:
4725 I1: 0 8 16 24 1 9 17 25
4726 I2: 2 10 18 26 3 11 19 27
4727 I3: 4 12 20 28 5 13 21 29
4728 I4: 6 14 22 30 7 15 23 31. */
4730 void
4731 vect_permute_store_chain (vec<tree> dr_chain,
4732 unsigned int length,
4733 gimple *stmt,
4734 gimple_stmt_iterator *gsi,
4735 vec<tree> *result_chain)
4737 tree vect1, vect2, high, low;
4738 gimple *perm_stmt;
4739 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4740 tree perm_mask_low, perm_mask_high;
4741 tree data_ref;
4742 tree perm3_mask_low, perm3_mask_high;
4743 unsigned int i, n, log_length = exact_log2 (length);
4744 unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype);
4746 auto_vec_perm_indices sel (nelt);
4747 sel.quick_grow (nelt);
4749 result_chain->quick_grow (length);
4750 memcpy (result_chain->address (), dr_chain.address (),
4751 length * sizeof (tree));
4753 if (length == 3)
4755 unsigned int j0 = 0, j1 = 0, j2 = 0;
4757 for (j = 0; j < 3; j++)
4759 int nelt0 = ((3 - j) * nelt) % 3;
4760 int nelt1 = ((3 - j) * nelt + 1) % 3;
4761 int nelt2 = ((3 - j) * nelt + 2) % 3;
4763 for (i = 0; i < nelt; i++)
4765 if (3 * i + nelt0 < nelt)
4766 sel[3 * i + nelt0] = j0++;
4767 if (3 * i + nelt1 < nelt)
4768 sel[3 * i + nelt1] = nelt + j1++;
4769 if (3 * i + nelt2 < nelt)
4770 sel[3 * i + nelt2] = 0;
4772 perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
4774 for (i = 0; i < nelt; i++)
4776 if (3 * i + nelt0 < nelt)
4777 sel[3 * i + nelt0] = 3 * i + nelt0;
4778 if (3 * i + nelt1 < nelt)
4779 sel[3 * i + nelt1] = 3 * i + nelt1;
4780 if (3 * i + nelt2 < nelt)
4781 sel[3 * i + nelt2] = nelt + j2++;
4783 perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
4785 vect1 = dr_chain[0];
4786 vect2 = dr_chain[1];
4788 /* Create interleaving stmt:
4789 low = VEC_PERM_EXPR <vect1, vect2,
4790 {j, nelt, *, j + 1, nelt + j + 1, *,
4791 j + 2, nelt + j + 2, *, ...}> */
4792 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
4793 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4794 vect2, perm3_mask_low);
4795 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4797 vect1 = data_ref;
4798 vect2 = dr_chain[2];
4799 /* Create interleaving stmt:
4800 low = VEC_PERM_EXPR <vect1, vect2,
4801 {0, 1, nelt + j, 3, 4, nelt + j + 1,
4802 6, 7, nelt + j + 2, ...}> */
4803 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
4804 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4805 vect2, perm3_mask_high);
4806 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4807 (*result_chain)[j] = data_ref;
4810 else
4812 /* If length is not equal to 3 then only power of 2 is supported. */
4813 gcc_assert (pow2p_hwi (length));
4815 for (i = 0, n = nelt / 2; i < n; i++)
4817 sel[i * 2] = i;
4818 sel[i * 2 + 1] = i + nelt;
4820 perm_mask_high = vect_gen_perm_mask_checked (vectype, sel);
4822 for (i = 0; i < nelt; i++)
4823 sel[i] += nelt / 2;
4824 perm_mask_low = vect_gen_perm_mask_checked (vectype, sel);
4826 for (i = 0, n = log_length; i < n; i++)
4828 for (j = 0; j < length/2; j++)
4830 vect1 = dr_chain[j];
4831 vect2 = dr_chain[j+length/2];
4833 /* Create interleaving stmt:
4834 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
4835 ...}> */
4836 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
4837 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
4838 vect2, perm_mask_high);
4839 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4840 (*result_chain)[2*j] = high;
4842 /* Create interleaving stmt:
4843 low = VEC_PERM_EXPR <vect1, vect2,
4844 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
4845 ...}> */
4846 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
4847 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
4848 vect2, perm_mask_low);
4849 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4850 (*result_chain)[2*j+1] = low;
4852 memcpy (dr_chain.address (), result_chain->address (),
4853 length * sizeof (tree));
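/* Worked example (illustrative, V8HI vectors, LENGTH == 2): with nelt == 8
   the masks built above are

     perm_mask_high = { 0, 8, 1, 9, 2, 10, 3, 11 }
     perm_mask_low  = { 4, 12, 5, 13, 6, 14, 7, 15 }

   and the single stage emits

     vect_inter_high_1 = VEC_PERM_EXPR <vect1, vect2, perm_mask_high>;
     vect_inter_low_2  = VEC_PERM_EXPR <vect1, vect2, perm_mask_low>;

   interleaving the two input vectors element by element, exactly as in the
   numbered example in the function comment (SSA names are made up).  */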
4858 /* Function vect_setup_realignment
4860 This function is called when vectorizing an unaligned load using
4861 the dr_explicit_realign[_optimized] scheme.
4862 This function generates the following code at the loop prolog:
4864 p = initial_addr;
4865 x msq_init = *(floor(p)); # prolog load
4866 realignment_token = call target_builtin;
4867 loop:
4868 x msq = phi (msq_init, ---)
4870 The stmts marked with x are generated only for the case of
4871 dr_explicit_realign_optimized.
4873 The code above sets up a new (vector) pointer, pointing to the first
4874 location accessed by STMT, and a "floor-aligned" load using that pointer.
4875 It also generates code to compute the "realignment-token" (if the relevant
4876 target hook was defined), and creates a phi-node at the loop-header bb
4877 whose arguments are the result of the prolog-load (created by this
4878 function) and the result of a load that takes place in the loop (to be
4879 created by the caller to this function).
4881 For the case of dr_explicit_realign_optimized:
4882 The caller to this function uses the phi-result (msq) to create the
4883 realignment code inside the loop, and sets up the missing phi argument,
4884 as follows:
4885 loop:
4886 msq = phi (msq_init, lsq)
4887 lsq = *(floor(p')); # load in loop
4888 result = realign_load (msq, lsq, realignment_token);
4890 For the case of dr_explicit_realign:
4891 loop:
4892 msq = *(floor(p)); # load in loop
4893 p' = p + (VS-1);
4894 lsq = *(floor(p')); # load in loop
4895 result = realign_load (msq, lsq, realignment_token);
4897 Input:
4898 STMT - (scalar) load stmt to be vectorized. This load accesses
4899 a memory location that may be unaligned.
4900 BSI - place where new code is to be inserted.
4901 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
4902 is used.
4904 Output:
4905 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
4906 target hook, if defined.
4907 Return value - the result of the loop-header phi node. */
4909 tree
4910 vect_setup_realignment (gimple *stmt, gimple_stmt_iterator *gsi,
4911 tree *realignment_token,
4912 enum dr_alignment_support alignment_support_scheme,
4913 tree init_addr,
4914 struct loop **at_loop)
4916 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4917 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4918 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4919 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4920 struct loop *loop = NULL;
4921 edge pe = NULL;
4922 tree scalar_dest = gimple_assign_lhs (stmt);
4923 tree vec_dest;
4924 gimple *inc;
4925 tree ptr;
4926 tree data_ref;
4927 basic_block new_bb;
4928 tree msq_init = NULL_TREE;
4929 tree new_temp;
4930 gphi *phi_stmt;
4931 tree msq = NULL_TREE;
4932 gimple_seq stmts = NULL;
4933 bool inv_p;
4934 bool compute_in_loop = false;
4935 bool nested_in_vect_loop = false;
4936 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
4937 struct loop *loop_for_initial_load = NULL;
4939 if (loop_vinfo)
4941 loop = LOOP_VINFO_LOOP (loop_vinfo);
4942 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4945 gcc_assert (alignment_support_scheme == dr_explicit_realign
4946 || alignment_support_scheme == dr_explicit_realign_optimized);
4948 /* We need to generate three things:
4949 1. the misalignment computation
4950 2. the extra vector load (for the optimized realignment scheme).
4951 3. the phi node for the two vectors from which the realignment is
4952 done (for the optimized realignment scheme). */
4954 /* 1. Determine where to generate the misalignment computation.
4956 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
4957 calculation will be generated by this function, outside the loop (in the
4958 preheader). Otherwise, INIT_ADDR had already been computed for us by the
4959 caller, inside the loop.
4961 Background: If the misalignment remains fixed throughout the iterations of
4962 the loop, then both realignment schemes are applicable, and also the
4963 misalignment computation can be done outside LOOP. This is because we are
4964 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
4965 are a multiple of VS (the Vector Size), and therefore the misalignment in
4966 different vectorized LOOP iterations is always the same.
4967 The problem arises only if the memory access is in an inner-loop nested
4968 inside LOOP, which is now being vectorized using outer-loop vectorization.
4969 This is the only case when the misalignment of the memory access may not
4970 remain fixed throughout the iterations of the inner-loop (as explained in
4971 detail in vect_supportable_dr_alignment). In this case, not only is the
4972 optimized realignment scheme not applicable, but also the misalignment
4973 computation (and generation of the realignment token that is passed to
4974 REALIGN_LOAD) have to be done inside the loop.
4976 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
4977 or not, which in turn determines if the misalignment is computed inside
4978 the inner-loop, or outside LOOP. */
4980 if (init_addr != NULL_TREE || !loop_vinfo)
4982 compute_in_loop = true;
4983 gcc_assert (alignment_support_scheme == dr_explicit_realign);
4987 /* 2. Determine where to generate the extra vector load.
4989 For the optimized realignment scheme, instead of generating two vector
4990 loads in each iteration, we generate a single extra vector load in the
4991 preheader of the loop, and in each iteration reuse the result of the
4992 vector load from the previous iteration. In case the memory access is in
4993 an inner-loop nested inside LOOP, which is now being vectorized using
4994 outer-loop vectorization, we need to determine whether this initial vector
4995 load should be generated at the preheader of the inner-loop, or can be
4996 generated at the preheader of LOOP. If the memory access has no evolution
4997 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
4998 to be generated inside LOOP (in the preheader of the inner-loop). */
5000 if (nested_in_vect_loop)
5002 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5003 bool invariant_in_outerloop =
5004 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5005 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5007 else
5008 loop_for_initial_load = loop;
5009 if (at_loop)
5010 *at_loop = loop_for_initial_load;
5012 if (loop_for_initial_load)
5013 pe = loop_preheader_edge (loop_for_initial_load);
5015 /* 3. For the case of the optimized realignment, create the first vector
5016 load at the loop preheader. */
5018 if (alignment_support_scheme == dr_explicit_realign_optimized)
5020 /* Create msq_init = *(floor(p1)) in the loop preheader */
5021 gassign *new_stmt;
5023 gcc_assert (!compute_in_loop);
5024 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5025 ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
5026 NULL_TREE, &init_addr, NULL, &inc,
5027 true, &inv_p);
5028 if (TREE_CODE (ptr) == SSA_NAME)
5029 new_temp = copy_ssa_name (ptr);
5030 else
5031 new_temp = make_ssa_name (TREE_TYPE (ptr));
5032 unsigned int align = DR_TARGET_ALIGNMENT (dr);
5033 new_stmt = gimple_build_assign
5034 (new_temp, BIT_AND_EXPR, ptr,
5035 build_int_cst (TREE_TYPE (ptr), -(HOST_WIDE_INT) align));
5036 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5037 gcc_assert (!new_bb);
5038 data_ref
5039 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5040 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5041 new_stmt = gimple_build_assign (vec_dest, data_ref);
5042 new_temp = make_ssa_name (vec_dest, new_stmt);
5043 gimple_assign_set_lhs (new_stmt, new_temp);
5044 if (pe)
5046 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5047 gcc_assert (!new_bb);
5049 else
5050 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5052 msq_init = gimple_assign_lhs (new_stmt);
5055 /* 4. Create realignment token using a target builtin, if available.
5056 It is done either inside the containing loop, or before LOOP (as
5057 determined above). */
5059 if (targetm.vectorize.builtin_mask_for_load)
5061 gcall *new_stmt;
5062 tree builtin_decl;
5064 /* Compute INIT_ADDR - the initial address accessed by this memref. */
5065 if (!init_addr)
5067 /* Generate the INIT_ADDR computation outside LOOP. */
5068 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
5069 NULL_TREE);
5070 if (loop)
5072 pe = loop_preheader_edge (loop);
5073 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5074 gcc_assert (!new_bb);
5076 else
5077 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5080 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5081 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5082 vec_dest =
5083 vect_create_destination_var (scalar_dest,
5084 gimple_call_return_type (new_stmt));
5085 new_temp = make_ssa_name (vec_dest, new_stmt);
5086 gimple_call_set_lhs (new_stmt, new_temp);
5088 if (compute_in_loop)
5089 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5090 else
5092 /* Generate the misalignment computation outside LOOP. */
5093 pe = loop_preheader_edge (loop);
5094 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5095 gcc_assert (!new_bb);
5098 *realignment_token = gimple_call_lhs (new_stmt);
5100 /* The result of the CALL_EXPR to this builtin is determined from
5101 the value of the parameter and no global variables are touched
5102 which makes the builtin a "const" function. Requiring the
5103 builtin to have the "const" attribute makes it unnecessary
5104 to call mark_call_clobbered. */
5105 gcc_assert (TREE_READONLY (builtin_decl));
5108 if (alignment_support_scheme == dr_explicit_realign)
5109 return msq;
5111 gcc_assert (!compute_in_loop);
5112 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5115 /* 5. Create msq = phi <msq_init, lsq> in loop */
5117 pe = loop_preheader_edge (containing_loop);
5118 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5119 msq = make_ssa_name (vec_dest);
5120 phi_stmt = create_phi_node (msq, containing_loop->header);
5121 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5123 return msq;
5127 /* Function vect_grouped_load_supported.
5129 COUNT is the size of the load group (the number of statements plus the
5130 number of gaps). SINGLE_ELEMENT_P is true if there is actually
5131 only one statement, with a gap of COUNT - 1.
5133 Returns true if a suitable permute exists. */
5135 bool
5136 vect_grouped_load_supported (tree vectype, bool single_element_p,
5137 unsigned HOST_WIDE_INT count)
5139 machine_mode mode = TYPE_MODE (vectype);
5141 /* If this is single-element interleaving with an element distance
5142 that leaves unused vector loads around, punt - we at least create
5143 very sub-optimal code in that case (and blow up memory,
5144 see PR65518). */
5145 if (single_element_p && count > TYPE_VECTOR_SUBPARTS (vectype))
5147 if (dump_enabled_p ())
5148 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5149 "single-element interleaving not supported "
5150 "for not adjacent vector loads\n");
5151 return false;
5154 /* vect_permute_load_chain requires the group size to be equal to 3 or
5155 be a power of two. */
5156 if (count != 3 && exact_log2 (count) == -1)
5158 if (dump_enabled_p ())
5159 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5160 "the size of the group of accesses"
5161 " is not a power of 2 or not equal to 3\n");
5162 return false;
5165 /* Check that the permutation is supported. */
5166 if (VECTOR_MODE_P (mode))
5168 unsigned int i, j, nelt = GET_MODE_NUNITS (mode);
5169 auto_vec_perm_indices sel (nelt);
5170 sel.quick_grow (nelt);
5172 if (count == 3)
5174 unsigned int k;
5175 for (k = 0; k < 3; k++)
5177 for (i = 0; i < nelt; i++)
5178 if (3 * i + k < 2 * nelt)
5179 sel[i] = 3 * i + k;
5180 else
5181 sel[i] = 0;
5182 if (!can_vec_perm_p (mode, false, &sel))
5184 if (dump_enabled_p ())
5185 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5186 "shuffle of 3 loads is not supported by"
5187 " target\n");
5188 return false;
5190 for (i = 0, j = 0; i < nelt; i++)
5191 if (3 * i + k < 2 * nelt)
5192 sel[i] = i;
5193 else
5194 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5195 if (!can_vec_perm_p (mode, false, &sel))
5197 if (dump_enabled_p ())
5198 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5199 "shuffle of 3 loads is not supported by"
5200 " target\n");
5201 return false;
5204 return true;
5206 else
5208 /* If length is not equal to 3 then only power of 2 is supported. */
5209 gcc_assert (pow2p_hwi (count));
5210 for (i = 0; i < nelt; i++)
5211 sel[i] = i * 2;
5212 if (can_vec_perm_p (mode, false, &sel))
5214 for (i = 0; i < nelt; i++)
5215 sel[i] = i * 2 + 1;
5216 if (can_vec_perm_p (mode, false, &sel))
5217 return true;
5222 if (dump_enabled_p ())
5223 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5224 "extract even/odd not supported by target\n");
5225 return false;
5228 /* Return TRUE if vec_load_lanes is available for COUNT vectors of
5229 type VECTYPE. */
5231 bool
5232 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
5234 return vect_lanes_optab_supported_p ("vec_load_lanes",
5235 vec_load_lanes_optab,
5236 vectype, count);
5239 /* Function vect_permute_load_chain.
5241 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5242 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5243 the input data correctly. Return the final references for loads in
5244 RESULT_CHAIN.
5246 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5247 The input is 4 vectors each containing 8 elements. We assign a number to each
5248 element, the input sequence is:
5250 1st vec: 0 1 2 3 4 5 6 7
5251 2nd vec: 8 9 10 11 12 13 14 15
5252 3rd vec: 16 17 18 19 20 21 22 23
5253 4th vec: 24 25 26 27 28 29 30 31
5255 The output sequence should be:
5257 1st vec: 0 4 8 12 16 20 24 28
5258 2nd vec: 1 5 9 13 17 21 25 29
5259 3rd vec: 2 6 10 14 18 22 26 30
5260 4th vec: 3 7 11 15 19 23 27 31
5262 i.e., the first output vector should contain the first elements of each
5263 interleaving group, etc.
5265 We use extract_even/odd instructions to create such output. The input of
5266 each extract_even/odd operation is two vectors
5267 1st vec 2nd vec
5268 0 1 2 3 4 5 6 7
5270 and the output is the vector of extracted even/odd elements. The output of
5271 extract_even will be: 0 2 4 6
5272 and of extract_odd: 1 3 5 7
5275 The permutation is done in log LENGTH stages. In each stage extract_even
5276 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5277 their order. In our example,
5279 E1: extract_even (1st vec, 2nd vec)
5280 E2: extract_odd (1st vec, 2nd vec)
5281 E3: extract_even (3rd vec, 4th vec)
5282 E4: extract_odd (3rd vec, 4th vec)
5284 The output for the first stage will be:
5286 E1: 0 2 4 6 8 10 12 14
5287 E2: 1 3 5 7 9 11 13 15
5288 E3: 16 18 20 22 24 26 28 30
5289 E4: 17 19 21 23 25 27 29 31
5291 In order to proceed and create the correct sequence for the next stage (or
5292 for the correct output, if the second stage is the last one, as in our
5293 example), we first put the output of extract_even operation and then the
5294 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5295 The input for the second stage is:
5297 1st vec (E1): 0 2 4 6 8 10 12 14
5298 2nd vec (E3): 16 18 20 22 24 26 28 30
5299 3rd vec (E2): 1 3 5 7 9 11 13 15
5300 4th vec (E4): 17 19 21 23 25 27 29 31
5302 The output of the second stage:
5304 E1: 0 4 8 12 16 20 24 28
5305 E2: 2 6 10 14 18 22 26 30
5306 E3: 1 5 9 13 17 21 25 29
5307 E4: 3 7 11 15 19 23 27 31
5309 And RESULT_CHAIN after reordering:
5311 1st vec (E1): 0 4 8 12 16 20 24 28
5312 2nd vec (E3): 1 5 9 13 17 21 25 29
5313 3rd vec (E2): 2 6 10 14 18 22 26 30
5314 4th vec (E4): 3 7 11 15 19 23 27 31. */
5316 static void
5317 vect_permute_load_chain (vec<tree> dr_chain,
5318 unsigned int length,
5319 gimple *stmt,
5320 gimple_stmt_iterator *gsi,
5321 vec<tree> *result_chain)
5323 tree data_ref, first_vect, second_vect;
5324 tree perm_mask_even, perm_mask_odd;
5325 tree perm3_mask_low, perm3_mask_high;
5326 gimple *perm_stmt;
5327 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5328 unsigned int i, j, log_length = exact_log2 (length);
5329 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5331 auto_vec_perm_indices sel (nelt);
5332 sel.quick_grow (nelt);
5334 result_chain->quick_grow (length);
5335 memcpy (result_chain->address (), dr_chain.address (),
5336 length * sizeof (tree));
5338 if (length == 3)
5340 unsigned int k;
5342 for (k = 0; k < 3; k++)
5344 for (i = 0; i < nelt; i++)
5345 if (3 * i + k < 2 * nelt)
5346 sel[i] = 3 * i + k;
5347 else
5348 sel[i] = 0;
5349 perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
5351 for (i = 0, j = 0; i < nelt; i++)
5352 if (3 * i + k < 2 * nelt)
5353 sel[i] = i;
5354 else
5355 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5357 perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
5359 first_vect = dr_chain[0];
5360 second_vect = dr_chain[1];
5362 /* Create interleaving stmt (low part of):
5363 low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5364 ...}> */
5365 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5366 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5367 second_vect, perm3_mask_low);
5368 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5370 /* Create interleaving stmt (high part of):
5371 high = VEC_PERM_EXPR <low, dr_chain[2], perm3_mask_high>, keeping the
5372 elements already selected and filling the rest from the third vector. */
5373 first_vect = data_ref;
5374 second_vect = dr_chain[2];
5375 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5376 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5377 second_vect, perm3_mask_high);
5378 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5379 (*result_chain)[k] = data_ref;
5382 else
5384 /* If length is not equal to 3, then only a power-of-2 length is supported. */
5385 gcc_assert (pow2p_hwi (length));
5387 for (i = 0; i < nelt; ++i)
5388 sel[i] = i * 2;
5389 perm_mask_even = vect_gen_perm_mask_checked (vectype, sel);
5391 for (i = 0; i < nelt; ++i)
5392 sel[i] = i * 2 + 1;
5393 perm_mask_odd = vect_gen_perm_mask_checked (vectype, sel);
5395 for (i = 0; i < log_length; i++)
5397 for (j = 0; j < length; j += 2)
5399 first_vect = dr_chain[j];
5400 second_vect = dr_chain[j+1];
5402 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5403 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5404 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5405 first_vect, second_vect,
5406 perm_mask_even);
5407 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5408 (*result_chain)[j/2] = data_ref;
5410 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5411 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5412 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5413 first_vect, second_vect,
5414 perm_mask_odd);
5415 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5416 (*result_chain)[j/2+length/2] = data_ref;
5418 memcpy (dr_chain.address (), result_chain->address (),
5419 length * sizeof (tree));
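/* A rough scalar model of the power-of-two path above, kept purely as an
   illustration of the technique; the type vec_t, the helpers and the
   constant NELT below are hypothetical and do not exist in GCC (assume
   <string.h> for memcpy and that NELT is even).  After log2 (LENGTH)
   passes, IN[k] holds the k-th interleaving group, matching the
   RESULT_CHAIN layout documented before vect_permute_load_chain.

     typedef struct { int e[NELT]; } vec_t;

     // Even elements of the concatenation of A and B.
     static vec_t
     extract_even (vec_t a, vec_t b)
     {
       vec_t r;
       for (int i = 0; i < NELT / 2; i++)
         {
           r.e[i] = a.e[2 * i];
           r.e[NELT / 2 + i] = b.e[2 * i];
         }
       return r;
     }

     // Odd elements of the concatenation of A and B.
     static vec_t
     extract_odd (vec_t a, vec_t b)
     {
       vec_t r;
       for (int i = 0; i < NELT / 2; i++)
         {
           r.e[i] = a.e[2 * i + 1];
           r.e[NELT / 2 + i] = b.e[2 * i + 1];
         }
       return r;
     }

     void
     model_permute_load_chain (vec_t *in, vec_t *out, unsigned length)
     {
       for (unsigned stage = 0; (1u << stage) < length; stage++)
         {
           for (unsigned j = 0; j < length; j += 2)
             {
               out[j / 2] = extract_even (in[j], in[j + 1]);
               out[j / 2 + length / 2] = extract_odd (in[j], in[j + 1]);
             }
           memcpy (in, out, length * sizeof (vec_t));
         }
     }
 */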
5424 /* Function vect_shift_permute_load_chain.
5426 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
5427 sequence of stmts to reorder the input data accordingly.
5428 Return the final references for loads in RESULT_CHAIN.
5429 Return true if successful, false otherwise.
5431 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5432 The input is 3 vectors each containing 8 elements. We assign a
5433 number to each element, the input sequence is:
5435 1st vec: 0 1 2 3 4 5 6 7
5436 2nd vec: 8 9 10 11 12 13 14 15
5437 3rd vec: 16 17 18 19 20 21 22 23
5439 The output sequence should be:
5441 1st vec: 0 3 6 9 12 15 18 21
5442 2nd vec: 1 4 7 10 13 16 19 22
5443 3rd vec: 2 5 8 11 14 17 20 23
5445 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
5447 First we shuffle all 3 vectors to get correct elements order:
5449 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
5450 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
5451 3rd vec: (16 19 22) (17 20 23) (18 21)
5453 Next we unite and shift vector 3 times:
5455 1st step:
5456 shift right by 6 the concatenation of:
5457 "1st vec" and "2nd vec"
5458 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
5459 "2nd vec" and "3rd vec"
5460 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
5461 "3rd vec" and "1st vec"
5462 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
5463 | New vectors |
5465 So that now new vectors are:
5467 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
5468 2nd vec: (10 13) (16 19 22) (17 20 23)
5469 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
5471 2nd step:
5472 shift right by 5 the concatenation of:
5473 "1st vec" and "3rd vec"
5474 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
5475 "2nd vec" and "1st vec"
5476 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
5477 "3rd vec" and "2nd vec"
5478 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
5479 | New vectors |
5481 So that now new vectors are:
5483 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
5484 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
5485 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
5487 3rd step:
5488 shift right by 5 the concatenation of:
5489 "1st vec" and "1st vec"
5490 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
5491 shift right by 3 the concatenation of:
5492 "2nd vec" and "2nd vec"
5493 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
5494 | New vectors |
5496 So that now all vectors are READY:
5497 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
5498 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
5499 3rd vec: ( 1 4 7) (10 13) (16 19 22)
5501 This algorithm is faster than the one in vect_permute_load_chain if:
5502 1. "shift of a concatenation" is faster than general permutation.
5503 This is usually so.
5504 2. The TARGET machine can't execute vector instructions in parallel.
5505 This is because each step of the algorithm depends on the previous one.
5506 The algorithm in vect_permute_load_chain is much more parallel.
5508 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
5511 static bool
5512 vect_shift_permute_load_chain (vec<tree> dr_chain,
5513 unsigned int length,
5514 gimple *stmt,
5515 gimple_stmt_iterator *gsi,
5516 vec<tree> *result_chain)
5518 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
5519 tree perm2_mask1, perm2_mask2, perm3_mask;
5520 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
5521 gimple *perm_stmt;
5523 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5524 unsigned int i;
5525 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5526 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5527 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5529 auto_vec_perm_indices sel (nelt);
5530 sel.quick_grow (nelt);
5532 result_chain->quick_grow (length);
5533 memcpy (result_chain->address (), dr_chain.address (),
5534 length * sizeof (tree));
5536 if (pow2p_hwi (length) && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 4)
5538 unsigned int j, log_length = exact_log2 (length);
5539 for (i = 0; i < nelt / 2; ++i)
5540 sel[i] = i * 2;
5541 for (i = 0; i < nelt / 2; ++i)
5542 sel[nelt / 2 + i] = i * 2 + 1;
5543 if (!can_vec_perm_p (TYPE_MODE (vectype), false, &sel))
5545 if (dump_enabled_p ())
5546 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5547 "shuffle of 2 fields structure is not \
5548 supported by target\n");
5549 return false;
5551 perm2_mask1 = vect_gen_perm_mask_checked (vectype, sel);
5553 for (i = 0; i < nelt / 2; ++i)
5554 sel[i] = i * 2 + 1;
5555 for (i = 0; i < nelt / 2; ++i)
5556 sel[nelt / 2 + i] = i * 2;
5557 if (!can_vec_perm_p (TYPE_MODE (vectype), false, &sel))
5559 if (dump_enabled_p ())
5560 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5561 "shuffle of 2 fields structure is not \
5562 supported by target\n");
5563 return false;
5565 perm2_mask2 = vect_gen_perm_mask_checked (vectype, sel);
5567 /* Generating permutation constant to shift all elements.
5568 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
5569 for (i = 0; i < nelt; i++)
5570 sel[i] = nelt / 2 + i;
5571 if (!can_vec_perm_p (TYPE_MODE (vectype), false, &sel))
5573 if (dump_enabled_p ())
5574 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5575 "shift permutation is not supported by target\n");
5576 return false;
5578 shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5580 /* Generating permutation constant to select a vector from the two.
5581 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
5582 for (i = 0; i < nelt / 2; i++)
5583 sel[i] = i;
5584 for (i = nelt / 2; i < nelt; i++)
5585 sel[i] = nelt + i;
5586 if (!can_vec_perm_p (TYPE_MODE (vectype), false, &sel))
5588 if (dump_enabled_p ())
5589 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5590 "select is not supported by target\n");
5591 return false;
5593 select_mask = vect_gen_perm_mask_checked (vectype, sel);
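/* Worked example (illustration only): for LENGTH == 2 and NELT == 8, with
   input vectors { 0 1 2 3 4 5 6 7 } and { 8 9 10 11 12 13 14 15 }, the
   loop below computes
     vect[0] = { 0 2 4 6 1 3 5 7 }            (perm2_mask1)
     vect[1] = { 9 11 13 15 8 10 12 14 }      (perm2_mask2)
     { 1 3 5 7 9 11 13 15 }                   (shift1_mask) -> (*result_chain)[1]
     { 0 2 4 6 8 10 12 14 }                   (select_mask) -> (*result_chain)[0]
   i.e. the even and the odd interleaving group.  */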
5595 for (i = 0; i < log_length; i++)
5597 for (j = 0; j < length; j += 2)
5599 first_vect = dr_chain[j];
5600 second_vect = dr_chain[j + 1];
5602 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5603 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5604 first_vect, first_vect,
5605 perm2_mask1);
5606 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5607 vect[0] = data_ref;
5609 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5610 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5611 second_vect, second_vect,
5612 perm2_mask2);
5613 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5614 vect[1] = data_ref;
5616 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
5617 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5618 vect[0], vect[1], shift1_mask);
5619 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5620 (*result_chain)[j/2 + length/2] = data_ref;
5622 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
5623 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5624 vect[0], vect[1], select_mask);
5625 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5626 (*result_chain)[j/2] = data_ref;
5628 memcpy (dr_chain.address (), result_chain->address (),
5629 length * sizeof (tree));
5631 return true;
5633 if (length == 3 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 2)
5635 unsigned int k = 0, l = 0;
5637 /* Generating permutation constant to get all elements in right order.
5638 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
5639 for (i = 0; i < nelt; i++)
5641 if (3 * k + (l % 3) >= nelt)
5643 k = 0;
5644 l += (3 - (nelt % 3));
5646 sel[i] = 3 * k + (l % 3);
5647 k++;
5649 if (!can_vec_perm_p (TYPE_MODE (vectype), false, &sel))
5651 if (dump_enabled_p ())
5652 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5653 "shuffle of 3 fields structure is not \
5654 supported by target\n");
5655 return false;
5657 perm3_mask = vect_gen_perm_mask_checked (vectype, sel);
5659 /* Generating permutation constant to shift all elements.
5660 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
5661 for (i = 0; i < nelt; i++)
5662 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
5663 if (!can_vec_perm_p (TYPE_MODE (vectype), false, &sel))
5665 if (dump_enabled_p ())
5666 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5667 "shift permutation is not supported by target\n");
5668 return false;
5670 shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5672 /* Generating permutation constant to shift all elements.
5673 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5674 for (i = 0; i < nelt; i++)
5675 sel[i] = 2 * (nelt / 3) + 1 + i;
5676 if (!can_vec_perm_p (TYPE_MODE (vectype), false, &sel))
5678 if (dump_enabled_p ())
5679 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5680 "shift permutation is not supported by target\n");
5681 return false;
5683 shift2_mask = vect_gen_perm_mask_checked (vectype, sel);
5685 /* Generating permutation constant to shift all elements.
5686 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
5687 for (i = 0; i < nelt; i++)
5688 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
5689 if (!can_vec_perm_p (TYPE_MODE (vectype), false, &sel))
5691 if (dump_enabled_p ())
5692 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5693 "shift permutation is not supported by target\n");
5694 return false;
5696 shift3_mask = vect_gen_perm_mask_checked (vectype, sel);
5698 /* Generating permutation constant to shift all elements.
5699 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5700 for (i = 0; i < nelt; i++)
5701 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
5702 if (!can_vec_perm_p (TYPE_MODE (vectype), false, &sel))
5704 if (dump_enabled_p ())
5705 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5706 "shift permutation is not supported by target\n");
5707 return false;
5709 shift4_mask = vect_gen_perm_mask_checked (vectype, sel);
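/* The loops below implement the steps documented before this function:
   PERM3_MASK performs the initial per-vector shuffle, SHIFT1_MASK the
   "1st step" shifts, SHIFT2_MASK the "2nd step" shifts, and
   SHIFT3_MASK/SHIFT4_MASK the final "3rd step" that produces the two
   remaining output vectors.  */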
5711 for (k = 0; k < 3; k++)
5713 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
5714 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5715 dr_chain[k], dr_chain[k],
5716 perm3_mask);
5717 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5718 vect[k] = data_ref;
5721 for (k = 0; k < 3; k++)
5723 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
5724 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5725 vect[k % 3], vect[(k + 1) % 3],
5726 shift1_mask);
5727 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5728 vect_shift[k] = data_ref;
5731 for (k = 0; k < 3; k++)
5733 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
5734 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5735 vect_shift[(4 - k) % 3],
5736 vect_shift[(3 - k) % 3],
5737 shift2_mask);
5738 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5739 vect[k] = data_ref;
5742 (*result_chain)[3 - (nelt % 3)] = vect[2];
5744 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
5745 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
5746 vect[0], shift3_mask);
5747 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5748 (*result_chain)[nelt % 3] = data_ref;
5750 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
5751 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
5752 vect[1], shift4_mask);
5753 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5754 (*result_chain)[0] = data_ref;
5755 return true;
5757 return false;
5760 /* Function vect_transform_grouped_load.
5762 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
5763 to perform their permutation and attach the resulting vectorized statements to
5764 the scalar statements.
5767 void
5768 vect_transform_grouped_load (gimple *stmt, vec<tree> dr_chain, int size,
5769 gimple_stmt_iterator *gsi)
5771 machine_mode mode;
5772 vec<tree> result_chain = vNULL;
5774 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
5775 RESULT_CHAIN is the output of vect_permute_load_chain; it contains the
5776 permuted vectors that are ready for vector computation. */
5777 result_chain.create (size);
5779 /* If the reassociation width for the vector type is 2 or greater, the target
5780 machine can execute 2 or more vector instructions in parallel, so use
5781 vect_permute_load_chain. Otherwise (for non-power-of-two group sizes) try vect_shift_permute_load_chain first. */
5782 mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
5783 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
5784 || pow2p_hwi (size)
5785 || !vect_shift_permute_load_chain (dr_chain, size, stmt,
5786 gsi, &result_chain))
5787 vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
5788 vect_record_grouped_load_vectors (stmt, result_chain);
5789 result_chain.release ();
5792 /* RESULT_CHAIN contains the output of a group of grouped loads that were
5793 generated as part of the vectorization of STMT. Assign the statement
5794 for each vector to the associated scalar statement. */
5796 void
5797 vect_record_grouped_load_vectors (gimple *stmt, vec<tree> result_chain)
5799 gimple *first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
5800 gimple *next_stmt, *new_stmt;
5801 unsigned int i, gap_count;
5802 tree tmp_data_ref;
5804 /* Put a permuted data-ref in the VECTORIZED_STMT field.
5805 Since we scan the chain starting from its first node, their order
5806 corresponds to the order of data-refs in RESULT_CHAIN. */
5807 next_stmt = first_stmt;
5808 gap_count = 1;
5809 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
5811 if (!next_stmt)
5812 break;
5814 /* Skip the gaps.  Loads created for the gaps will be removed by the dead
5815 code elimination pass later.  No need to check for the first stmt in
5816 the group, since it always exists.
5817 GROUP_GAP is the number of steps in elements from the previous
5818 access (if there is no gap GROUP_GAP is 1). We skip loads that
5819 correspond to the gaps. */
5820 if (next_stmt != first_stmt
5821 && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
5823 gap_count++;
5824 continue;
5827 while (next_stmt)
5829 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
5830 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
5831 copies, and we put the new vector statement in the first available
5832 RELATED_STMT. */
5833 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
5834 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
5835 else
5837 if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5839 gimple *prev_stmt =
5840 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
5841 gimple *rel_stmt =
5842 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
5843 while (rel_stmt)
5845 prev_stmt = rel_stmt;
5846 rel_stmt =
5847 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
5850 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
5851 new_stmt;
5855 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
5856 gap_count = 1;
5857 /* If NEXT_STMT accesses the same DR as the previous statement,
5858 put the same TMP_DATA_REF as its vectorized statement; otherwise
5859 get the next data-ref from RESULT_CHAIN. */
5860 if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5861 break;
5866 /* Function vect_can_force_dr_alignment_p.
5868 Returns whether the alignment of DECL can be forced to an ALIGNMENT-bit
5869 boundary. */
5871 bool
5872 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
5874 if (!VAR_P (decl))
5875 return false;
5877 if (decl_in_symtab_p (decl)
5878 && !symtab_node::get (decl)->can_increase_alignment_p ())
5879 return false;
5881 if (TREE_STATIC (decl))
5882 return (alignment <= MAX_OFILE_ALIGNMENT);
5883 else
5884 return (alignment <= MAX_STACK_ALIGNMENT);
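/* For illustration only (the exact limits are target-dependent):

     static double a[256];   // TREE_STATIC: alignment may be raised,
                             // up to MAX_OFILE_ALIGNMENT
     void f (void)
     {
       double b[256];        // automatic: limited by MAX_STACK_ALIGNMENT
     }

   whereas a variable whose alignment the symbol table says cannot be
   increased (e.g. one defined in another translation unit) is always
   rejected.  */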
5888 /* Return whether the data reference DR is supported with respect to its
5889 alignment.
5890 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even if
5891 it is aligned, i.e., check if it is possible to vectorize it with different
5892 alignment. */
5894 enum dr_alignment_support
5895 vect_supportable_dr_alignment (struct data_reference *dr,
5896 bool check_aligned_accesses)
5898 gimple *stmt = DR_STMT (dr);
5899 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5900 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5901 machine_mode mode = TYPE_MODE (vectype);
5902 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5903 struct loop *vect_loop = NULL;
5904 bool nested_in_vect_loop = false;
5906 if (aligned_access_p (dr) && !check_aligned_accesses)
5907 return dr_aligned;
5909 /* For now assume all conditional loads/stores support unaligned
5910 access without any special code. */
5911 if (is_gimple_call (stmt)
5912 && gimple_call_internal_p (stmt)
5913 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
5914 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
5915 return dr_unaligned_supported;
5917 if (loop_vinfo)
5919 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
5920 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
5923 /* Possibly unaligned access. */
5925 /* We can choose between using the implicit realignment scheme (generating
5926 a misaligned_move stmt) and the explicit realignment scheme (generating
5927 aligned loads with a REALIGN_LOAD). There are two variants to the
5928 explicit realignment scheme: optimized, and unoptimized.
5929 We can optimize the realignment only if the step between consecutive
5930 vector loads is equal to the vector size. Since the vector memory
5931 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
5932 is guaranteed that the misalignment amount remains the same throughout the
5933 execution of the vectorized loop. Therefore, we can create the
5934 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
5935 at the loop preheader.
5937 However, in the case of outer-loop vectorization, when vectorizing a
5938 memory access in the inner-loop nested within the LOOP that is now being
5939 vectorized, while it is guaranteed that the misalignment of the
5940 vectorized memory access will remain the same in different outer-loop
5941 iterations, it is *not* guaranteed that it will remain the same throughout
5942 the execution of the inner-loop. This is because the inner-loop advances
5943 with the original scalar step (and not in steps of VS). If the inner-loop
5944 step happens to be a multiple of VS, then the misalignment remains fixed
5945 and we can use the optimized realignment scheme. For example:
5947 for (i=0; i<N; i++)
5948 for (j=0; j<M; j++)
5949 s += a[i+j];
5951 When vectorizing the i-loop in the above example, the step between
5952 consecutive vector loads is 1, and so the misalignment does not remain
5953 fixed across the execution of the inner-loop, and the realignment cannot
5954 be optimized (as illustrated in the following pseudo vectorized loop):
5956 for (i=0; i<N; i+=4)
5957 for (j=0; j<M; j++){
5958 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
5959 // when j is {0,1,2,3,4,5,6,7,...} respectively.
5960 // (assuming that we start from an aligned address).
5963 We therefore have to use the unoptimized realignment scheme:
5965 for (i=0; i<N; i+=4)
5966 for (j=k; j<M; j+=4)
5967 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
5968 // that the misalignment of the initial address is
5969 // 0).
5971 The loop can then be vectorized as follows:
5973 for (k=0; k<4; k++){
5974 rt = get_realignment_token (&vp[k]);
5975 for (i=0; i<N; i+=4){
5976 v1 = vp[i+k];
5977 for (j=k; j<M; j+=4){
5978 v2 = vp[i+j+VS-1];
5979 va = REALIGN_LOAD <v1,v2,rt>;
5980 vs += va;
5981 v1 = v2;
5984 } */
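/* Informal summary of the overall decision in this function:

     access already aligned (and aligned accesses need no extra check)
       -> dr_aligned
     masked load/store (IFN_MASK_LOAD / IFN_MASK_STORE)
       -> dr_unaligned_supported
     read, target provides vec_realign_load (and a mask-for-load builtin
     if it defines one), and the SLP group layout does not rule it out
       -> dr_explicit_realign_optimized when the misalignment is invariant
          across the vectorized loop, dr_explicit_realign otherwise
     target supports misaligned vector accesses for this mode
       -> dr_unaligned_supported
     otherwise
       -> dr_unaligned_unsupported  */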
5986 if (DR_IS_READ (dr))
5988 bool is_packed = false;
5989 tree type = (TREE_TYPE (DR_REF (dr)));
5991 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
5992 && (!targetm.vectorize.builtin_mask_for_load
5993 || targetm.vectorize.builtin_mask_for_load ()))
5995 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5997 /* If we are doing SLP then the accesses need not have the same
5998 alignment; the required alignment instead depends on the SLP group size. */
5999 if (loop_vinfo
6000 && STMT_SLP_TYPE (stmt_info)
6001 && (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6002 * GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)))
6003 % TYPE_VECTOR_SUBPARTS (vectype) != 0))
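/* Nothing to do for this case: with such a group layout the misalignment
   can vary between vector accesses, so the realignment scheme cannot be
   used and we fall through to the plain misalignment check below.  */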
6005 else if (!loop_vinfo
6006 || (nested_in_vect_loop
6007 && (TREE_INT_CST_LOW (DR_STEP (dr))
6008 != GET_MODE_SIZE (TYPE_MODE (vectype)))))
6009 return dr_explicit_realign;
6010 else
6011 return dr_explicit_realign_optimized;
6013 if (!known_alignment_for_access_p (dr))
6014 is_packed = not_size_aligned (DR_REF (dr));
6016 if (targetm.vectorize.support_vector_misalignment
6017 (mode, type, DR_MISALIGNMENT (dr), is_packed))
6018 /* Can't software pipeline the loads, but can at least do them. */
6019 return dr_unaligned_supported;
6021 else
6023 bool is_packed = false;
6024 tree type = (TREE_TYPE (DR_REF (dr)));
6026 if (!known_alignment_for_access_p (dr))
6027 is_packed = not_size_aligned (DR_REF (dr));
6029 if (targetm.vectorize.support_vector_misalignment
6030 (mode, type, DR_MISALIGNMENT (dr), is_packed))
6031 return dr_unaligned_supported;
6034 /* Unsupported. */
6035 return dr_unaligned_unsupported;