Give the target more control over ARRAY_TYPE modes
[official-gcc.git] / gcc / tree-vect-data-refs.c
blob759c1e30edffaad705392d744ac0a926f0f95aa4
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2018 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "memmodel.h"
32 #include "tm_p.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "cgraph.h"
36 #include "dumpfile.h"
37 #include "alias.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "tree-eh.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop.h"
47 #include "cfgloop.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
50 #include "expr.h"
51 #include "builtins.h"
52 #include "params.h"
53 #include "tree-cfg.h"
54 #include "tree-hash-traits.h"
55 #include "vec-perm-indices.h"
57 /* Return true if load- or store-lanes optab OPTAB is implemented for
58 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
60 static bool
61 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
62 tree vectype, unsigned HOST_WIDE_INT count)
64 machine_mode mode, array_mode;
65 bool limit_p;
67 mode = TYPE_MODE (vectype);
68 if (!targetm.array_mode (mode, count).exists (&array_mode))
70 poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
71 limit_p = !targetm.array_mode_supported_p (mode, count);
72 if (!int_mode_for_size (bits, limit_p).exists (&array_mode))
74 if (dump_enabled_p ())
75 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
76 "no array mode for %s["
77 HOST_WIDE_INT_PRINT_DEC "]\n",
78 GET_MODE_NAME (mode), count);
79 return false;
83 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
85 if (dump_enabled_p ())
86 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
87 "cannot use %s<%s><%s>\n", name,
88 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
89 return false;
92 if (dump_enabled_p ())
93 dump_printf_loc (MSG_NOTE, vect_location,
94 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
95 GET_MODE_NAME (mode));
97 return true;
101 /* Return the smallest scalar part of STMT.
102 This is used to determine the vectype of the stmt. We generally set the
103 vectype according to the type of the result (lhs). For stmts whose
104 result-type is different than the type of the arguments (e.g., demotion,
105 promotion), vectype will be reset appropriately (later). Note that we have
106 to visit the smallest datatype in this function, because that determines the
107 VF. If the smallest datatype in the loop is present only as the rhs of a
108 promotion operation - we'd miss it.
109 Such a case, where a variable of this datatype does not appear in the lhs
110 anywhere in the loop, can only occur if it's an invariant: e.g.:
111 'int_x = (int) short_inv', which we'd expect to have been optimized away by
112 invariant motion. However, we cannot rely on invariant motion to always
113 take invariants out of the loop, and so in the case of promotion we also
114 have to check the rhs.
115 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
116 types. */
118 tree
119 vect_get_smallest_scalar_type (gimple *stmt, HOST_WIDE_INT *lhs_size_unit,
120 HOST_WIDE_INT *rhs_size_unit)
122 tree scalar_type = gimple_expr_type (stmt);
123 HOST_WIDE_INT lhs, rhs;
125 /* During the analysis phase, this function is called on arbitrary
126 statements that might not have scalar results. */
127 if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
128 return scalar_type;
130 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
132 if (is_gimple_assign (stmt)
133 && (gimple_assign_cast_p (stmt)
134 || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
135 || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
136 || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
138 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
140 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
141 if (rhs < lhs)
142 scalar_type = rhs_type;
145 *lhs_size_unit = lhs;
146 *rhs_size_unit = rhs;
147 return scalar_type;
151 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
152 tested at run-time. Return TRUE if DDR was successfully inserted.
153 Return false if versioning is not supported. */
155 static bool
156 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
158 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
160 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
161 return false;
163 if (!runtime_alias_check_p (ddr, loop,
164 optimize_loop_nest_for_speed_p (loop)))
165 return false;
167 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
168 return true;
172 /* A subroutine of vect_analyze_data_ref_dependence. Handle
173 DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
174 distances. These distances are conservatively correct but they don't
175 reflect a guaranteed dependence.
177 Return true if this function does all the work necessary to avoid
178 an alias or false if the caller should use the dependence distances
179 to limit the vectorization factor in the usual way. LOOP_DEPTH is
180 the depth of the loop described by LOOP_VINFO and the other arguments
181 are as for vect_analyze_data_ref_dependence. */
183 static bool
184 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
185 loop_vec_info loop_vinfo,
186 int loop_depth, unsigned int *max_vf)
188 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
189 lambda_vector dist_v;
190 unsigned int i;
191 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
193 int dist = dist_v[loop_depth];
194 if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
196 /* If the user asserted safelen >= DIST consecutive iterations
197 can be executed concurrently, assume independence.
199 ??? An alternative would be to add the alias check even
200 in this case, and vectorize the fallback loop with the
201 maximum VF set to safelen. However, if the user has
202 explicitly given a length, it's less likely that that
203 would be a win. */
204 if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
206 if ((unsigned int) loop->safelen < *max_vf)
207 *max_vf = loop->safelen;
208 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
209 continue;
212 /* For dependence distances of 2 or more, we have the option
213 of limiting VF or checking for an alias at runtime.
214 Prefer to check at runtime if we can, to avoid limiting
215 the VF unnecessarily when the bases are in fact independent.
217 Note that the alias checks will be removed if the VF ends up
218 being small enough. */
219 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
222 return true;
226 /* Function vect_analyze_data_ref_dependence.
228 Return TRUE if there (might) exist a dependence between a memory-reference
229 DRA and a memory-reference DRB. When versioning for alias may check a
230 dependence at run-time, return FALSE. Adjust *MAX_VF according to
231 the data dependence. */
233 static bool
234 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
235 loop_vec_info loop_vinfo,
236 unsigned int *max_vf)
238 unsigned int i;
239 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
240 struct data_reference *dra = DDR_A (ddr);
241 struct data_reference *drb = DDR_B (ddr);
242 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
243 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
244 lambda_vector dist_v;
245 unsigned int loop_depth;
247 /* In loop analysis all data references should be vectorizable. */
248 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
249 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
250 gcc_unreachable ();
252 /* Independent data accesses. */
253 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
254 return false;
256 if (dra == drb
257 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
258 return false;
260 /* We do not have to consider dependences between accesses that belong
261 to the same group. */
262 if (GROUP_FIRST_ELEMENT (stmtinfo_a)
263 && GROUP_FIRST_ELEMENT (stmtinfo_a) == GROUP_FIRST_ELEMENT (stmtinfo_b))
264 return false;
266 /* Even if we have an anti-dependence then, as the vectorized loop covers at
267 least two scalar iterations, there is always also a true dependence.
268 As the vectorizer does not re-order loads and stores we can ignore
269 the anti-dependence if TBAA can disambiguate both DRs similar to the
270 case with known negative distance anti-dependences (positive
271 distance anti-dependences would violate TBAA constraints). */
272 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
273 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
274 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
275 get_alias_set (DR_REF (drb))))
276 return false;
278 /* Unknown data dependence. */
279 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
281 /* If user asserted safelen consecutive iterations can be
282 executed concurrently, assume independence. */
283 if (loop->safelen >= 2)
285 if ((unsigned int) loop->safelen < *max_vf)
286 *max_vf = loop->safelen;
287 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
288 return false;
291 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
292 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
294 if (dump_enabled_p ())
296 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
297 "versioning for alias not supported for: "
298 "can't determine dependence between ");
299 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
300 DR_REF (dra));
301 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
302 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
303 DR_REF (drb));
304 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
306 return true;
309 if (dump_enabled_p ())
311 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
312 "versioning for alias required: "
313 "can't determine dependence between ");
314 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
315 DR_REF (dra));
316 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
317 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
318 DR_REF (drb));
319 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
322 /* Add to list of ddrs that need to be tested at run-time. */
323 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
326 /* Known data dependence. */
327 if (DDR_NUM_DIST_VECTS (ddr) == 0)
329 /* If user asserted safelen consecutive iterations can be
330 executed concurrently, assume independence. */
331 if (loop->safelen >= 2)
333 if ((unsigned int) loop->safelen < *max_vf)
334 *max_vf = loop->safelen;
335 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
336 return false;
339 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
340 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
342 if (dump_enabled_p ())
344 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
345 "versioning for alias not supported for: "
346 "bad dist vector for ");
347 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
348 DR_REF (dra));
349 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
350 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
351 DR_REF (drb));
352 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
354 return true;
357 if (dump_enabled_p ())
359 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
360 "versioning for alias required: "
361 "bad dist vector for ");
362 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
363 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
364 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
365 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
367 /* Add to list of ddrs that need to be tested at run-time. */
368 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
371 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
373 if (DDR_COULD_BE_INDEPENDENT_P (ddr)
374 && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
375 loop_depth, max_vf))
376 return false;
378 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
380 int dist = dist_v[loop_depth];
382 if (dump_enabled_p ())
383 dump_printf_loc (MSG_NOTE, vect_location,
384 "dependence distance = %d.\n", dist);
386 if (dist == 0)
388 if (dump_enabled_p ())
390 dump_printf_loc (MSG_NOTE, vect_location,
391 "dependence distance == 0 between ");
392 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
393 dump_printf (MSG_NOTE, " and ");
394 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
395 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
398 /* When we perform grouped accesses and perform implicit CSE
399 by detecting equal accesses and doing disambiguation with
400 runtime alias tests like for
401 .. = a[i];
402 .. = a[i+1];
403 a[i] = ..;
404 a[i+1] = ..;
405 *p = ..;
406 .. = a[i];
407 .. = a[i+1];
408 where we will end up loading { a[i], a[i+1] } once, make
409 sure that inserting group loads before the first load and
410 stores after the last store will do the right thing.
411 Similar for groups like
412 a[i] = ...;
413 ... = a[i];
414 a[i+1] = ...;
415 where loads from the group interleave with the store. */
416 if (STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
417 || STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
419 gimple *earlier_stmt;
420 earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
421 if (DR_IS_WRITE
422 (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
424 if (dump_enabled_p ())
425 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
426 "READ_WRITE dependence in interleaving."
427 "\n");
428 return true;
432 continue;
435 if (dist > 0 && DDR_REVERSED_P (ddr))
437 /* If DDR_REVERSED_P the order of the data-refs in DDR was
438 reversed (to make distance vector positive), and the actual
439 distance is negative. */
440 if (dump_enabled_p ())
441 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
442 "dependence distance negative.\n");
443 /* Record a negative dependence distance to later limit the
444 amount of stmt copying / unrolling we can perform.
445 Only need to handle read-after-write dependence. */
446 if (DR_IS_READ (drb)
447 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
448 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
449 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
450 continue;
453 unsigned int abs_dist = abs (dist);
454 if (abs_dist >= 2 && abs_dist < *max_vf)
456 /* The dependence distance requires reduction of the maximal
457 vectorization factor. */
458 *max_vf = abs (dist);
459 if (dump_enabled_p ())
460 dump_printf_loc (MSG_NOTE, vect_location,
461 "adjusting maximal vectorization factor to %i\n",
462 *max_vf);
465 if (abs_dist >= *max_vf)
467 /* Dependence distance does not create dependence, as far as
468 vectorization is concerned, in this case. */
469 if (dump_enabled_p ())
470 dump_printf_loc (MSG_NOTE, vect_location,
471 "dependence distance >= VF.\n");
472 continue;
475 if (dump_enabled_p ())
477 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
478 "not vectorized, possible dependence "
479 "between data-refs ");
480 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
481 dump_printf (MSG_NOTE, " and ");
482 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
483 dump_printf (MSG_NOTE, "\n");
486 return true;
489 return false;
492 /* Function vect_analyze_data_ref_dependences.
494 Examine all the data references in the loop, and make sure there do not
495 exist any data dependences between them. Set *MAX_VF according to
496 the maximum vectorization factor the data dependences allow. */
498 bool
499 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
500 unsigned int *max_vf)
502 unsigned int i;
503 struct data_dependence_relation *ddr;
505 if (dump_enabled_p ())
506 dump_printf_loc (MSG_NOTE, vect_location,
507 "=== vect_analyze_data_ref_dependences ===\n");
509 LOOP_VINFO_DDRS (loop_vinfo)
510 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
511 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
512 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
513 /* We need read-read dependences to compute STMT_VINFO_SAME_ALIGN_REFS. */
514 if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
515 &LOOP_VINFO_DDRS (loop_vinfo),
516 LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
517 return false;
519 /* For epilogues we either have no aliases or alias versioning
520 was applied to original loop. Therefore we may just get max_vf
521 using VF of original loop. */
522 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
523 *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
524 else
525 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
526 if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
527 return false;
529 return true;
533 /* Function vect_slp_analyze_data_ref_dependence.
535 Return TRUE if there (might) exist a dependence between a memory-reference
536 DRA and a memory-reference DRB. When versioning for alias may check a
537 dependence at run-time, return FALSE. Adjust *MAX_VF according to
538 the data dependence. */
540 static bool
541 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
543 struct data_reference *dra = DDR_A (ddr);
544 struct data_reference *drb = DDR_B (ddr);
546 /* We need to check dependences of statements marked as unvectorizable
547 as well, they still can prohibit vectorization. */
549 /* Independent data accesses. */
550 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
551 return false;
553 if (dra == drb)
554 return false;
556 /* Read-read is OK. */
557 if (DR_IS_READ (dra) && DR_IS_READ (drb))
558 return false;
560 /* If dra and drb are part of the same interleaving chain consider
561 them independent. */
562 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
563 && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
564 == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
565 return false;
567 /* Unknown data dependence. */
568 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
570 if (dump_enabled_p ())
572 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
573 "can't determine dependence between ");
574 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
575 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
576 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
577 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
580 else if (dump_enabled_p ())
582 dump_printf_loc (MSG_NOTE, vect_location,
583 "determined dependence between ");
584 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
585 dump_printf (MSG_NOTE, " and ");
586 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
587 dump_printf (MSG_NOTE, "\n");
590 return true;
594 /* Analyze dependences involved in the transform of SLP NODE. STORES
595 contain the vector of scalar stores of this instance if we are
596 disambiguating the loads. */
598 static bool
599 vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
600 vec<gimple *> stores, gimple *last_store)
602 /* This walks over all stmts involved in the SLP load/store done
603 in NODE verifying we can sink them up to the last stmt in the
604 group. */
605 gimple *last_access = vect_find_last_scalar_stmt_in_slp (node);
606 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
608 gimple *access = SLP_TREE_SCALAR_STMTS (node)[k];
609 if (access == last_access)
610 continue;
611 data_reference *dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (access));
612 for (gimple_stmt_iterator gsi = gsi_for_stmt (access);
613 gsi_stmt (gsi) != last_access; gsi_next (&gsi))
615 gimple *stmt = gsi_stmt (gsi);
616 if (! gimple_vuse (stmt)
617 || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
618 continue;
620 /* If we couldn't record a (single) data reference for this
621 stmt we have to give up. */
622 /* ??? Here and below if dependence analysis fails we can resort
623 to the alias oracle which can handle more kinds of stmts. */
624 data_reference *dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
625 if (!dr_b)
626 return false;
628 bool dependent = false;
629 /* If we run into a store of this same instance (we've just
630 marked those) then delay dependence checking until we run
631 into the last store because this is where it will have
632 been sunk to (and we verify if we can do that as well). */
633 if (gimple_visited_p (stmt))
635 if (stmt != last_store)
636 continue;
637 unsigned i;
638 gimple *store;
639 FOR_EACH_VEC_ELT (stores, i, store)
641 data_reference *store_dr
642 = STMT_VINFO_DATA_REF (vinfo_for_stmt (store));
643 ddr_p ddr = initialize_data_dependence_relation
644 (dr_a, store_dr, vNULL);
645 dependent = vect_slp_analyze_data_ref_dependence (ddr);
646 free_dependence_relation (ddr);
647 if (dependent)
648 break;
651 else
653 ddr_p ddr = initialize_data_dependence_relation (dr_a,
654 dr_b, vNULL);
655 dependent = vect_slp_analyze_data_ref_dependence (ddr);
656 free_dependence_relation (ddr);
658 if (dependent)
659 return false;
662 return true;
666 /* Function vect_analyze_data_ref_dependences.
668 Examine all the data references in the basic-block, and make sure there
669 do not exist any data dependences between them. Set *MAX_VF according to
670 the maximum vectorization factor the data dependences allow. */
672 bool
673 vect_slp_analyze_instance_dependence (slp_instance instance)
675 if (dump_enabled_p ())
676 dump_printf_loc (MSG_NOTE, vect_location,
677 "=== vect_slp_analyze_instance_dependence ===\n");
679 /* The stores of this instance are at the root of the SLP tree. */
680 slp_tree store = SLP_INSTANCE_TREE (instance);
681 if (! STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (store)[0])))
682 store = NULL;
684 /* Verify we can sink stores to the vectorized stmt insert location. */
685 gimple *last_store = NULL;
686 if (store)
688 if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
689 return false;
691 /* Mark stores in this instance and remember the last one. */
692 last_store = vect_find_last_scalar_stmt_in_slp (store);
693 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
694 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], true);
697 bool res = true;
699 /* Verify we can sink loads to the vectorized stmt insert location,
700 special-casing stores of this instance. */
701 slp_tree load;
702 unsigned int i;
703 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
704 if (! vect_slp_analyze_node_dependences (instance, load,
705 store
706 ? SLP_TREE_SCALAR_STMTS (store)
707 : vNULL, last_store))
709 res = false;
710 break;
713 /* Unset the visited flag. */
714 if (store)
715 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
716 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], false);
718 return res;
721 /* Record in VINFO the base alignment guarantee given by DRB. STMT is
722 the statement that contains DRB, which is useful for recording in the
723 dump file. */
725 static void
726 vect_record_base_alignment (vec_info *vinfo, gimple *stmt,
727 innermost_loop_behavior *drb)
729 bool existed;
730 innermost_loop_behavior *&entry
731 = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
732 if (!existed || entry->base_alignment < drb->base_alignment)
734 entry = drb;
735 if (dump_enabled_p ())
737 dump_printf_loc (MSG_NOTE, vect_location,
738 "recording new base alignment for ");
739 dump_generic_expr (MSG_NOTE, TDF_SLIM, drb->base_address);
740 dump_printf (MSG_NOTE, "\n");
741 dump_printf_loc (MSG_NOTE, vect_location,
742 " alignment: %d\n", drb->base_alignment);
743 dump_printf_loc (MSG_NOTE, vect_location,
744 " misalignment: %d\n", drb->base_misalignment);
745 dump_printf_loc (MSG_NOTE, vect_location,
746 " based on: ");
747 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
752 /* If the region we're going to vectorize is reached, all unconditional
753 data references occur at least once. We can therefore pool the base
754 alignment guarantees from each unconditional reference. Do this by
755 going through all the data references in VINFO and checking whether
756 the containing statement makes the reference unconditionally. If so,
757 record the alignment of the base address in VINFO so that it can be
758 used for all other references with the same base. */
760 void
761 vect_record_base_alignments (vec_info *vinfo)
763 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
764 struct loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
765 data_reference *dr;
766 unsigned int i;
767 FOR_EACH_VEC_ELT (vinfo->datarefs, i, dr)
768 if (!DR_IS_CONDITIONAL_IN_STMT (dr))
770 gimple *stmt = DR_STMT (dr);
771 vect_record_base_alignment (vinfo, stmt, &DR_INNERMOST (dr));
773 /* If DR is nested in the loop that is being vectorized, we can also
774 record the alignment of the base wrt the outer loop. */
775 if (loop && nested_in_vect_loop_p (loop, stmt))
777 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
778 vect_record_base_alignment
779 (vinfo, stmt, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
784 /* Return the target alignment for the vectorized form of DR. */
786 static unsigned int
787 vect_calculate_target_alignment (struct data_reference *dr)
789 gimple *stmt = DR_STMT (dr);
790 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
791 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
792 return targetm.vectorize.preferred_vector_alignment (vectype);
795 /* Function vect_compute_data_ref_alignment
797 Compute the misalignment of the data reference DR.
799 Output:
800 1. If during the misalignment computation it is found that the data reference
801 cannot be vectorized then false is returned.
802 2. DR_MISALIGNMENT (DR) is defined.
804 FOR NOW: No analysis is actually performed. Misalignment is calculated
805 only for trivial cases. TODO. */
807 bool
808 vect_compute_data_ref_alignment (struct data_reference *dr)
810 gimple *stmt = DR_STMT (dr);
811 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
812 vec_base_alignments *base_alignments = &stmt_info->vinfo->base_alignments;
813 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
814 struct loop *loop = NULL;
815 tree ref = DR_REF (dr);
816 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
818 if (dump_enabled_p ())
819 dump_printf_loc (MSG_NOTE, vect_location,
820 "vect_compute_data_ref_alignment:\n");
822 if (loop_vinfo)
823 loop = LOOP_VINFO_LOOP (loop_vinfo);
825 /* Initialize misalignment to unknown. */
826 SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
828 innermost_loop_behavior *drb = vect_dr_behavior (dr);
829 bool step_preserves_misalignment_p;
831 unsigned HOST_WIDE_INT vector_alignment
832 = vect_calculate_target_alignment (dr) / BITS_PER_UNIT;
833 DR_TARGET_ALIGNMENT (dr) = vector_alignment;
835 /* No step for BB vectorization. */
836 if (!loop)
838 gcc_assert (integer_zerop (drb->step));
839 step_preserves_misalignment_p = true;
842 /* In case the dataref is in an inner-loop of the loop that is being
843 vectorized (LOOP), we use the base and misalignment information
844 relative to the outer-loop (LOOP). This is ok only if the misalignment
845 stays the same throughout the execution of the inner-loop, which is why
846 we have to check that the stride of the dataref in the inner-loop evenly
847 divides by the vector alignment. */
848 else if (nested_in_vect_loop_p (loop, stmt))
850 step_preserves_misalignment_p
851 = (DR_STEP_ALIGNMENT (dr) % vector_alignment) == 0;
853 if (dump_enabled_p ())
855 if (step_preserves_misalignment_p)
856 dump_printf_loc (MSG_NOTE, vect_location,
857 "inner step divides the vector alignment.\n");
858 else
859 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
860 "inner step doesn't divide the vector"
861 " alignment.\n");
865 /* Similarly we can only use base and misalignment information relative to
866 an innermost loop if the misalignment stays the same throughout the
867 execution of the loop. As above, this is the case if the stride of
868 the dataref evenly divides by the alignment. */
869 else
871 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
872 step_preserves_misalignment_p
873 = multiple_p (DR_STEP_ALIGNMENT (dr) * vf, vector_alignment);
875 if (!step_preserves_misalignment_p && dump_enabled_p ())
876 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
877 "step doesn't divide the vector alignment.\n");
880 unsigned int base_alignment = drb->base_alignment;
881 unsigned int base_misalignment = drb->base_misalignment;
883 /* Calculate the maximum of the pooled base address alignment and the
884 alignment that we can compute for DR itself. */
885 innermost_loop_behavior **entry = base_alignments->get (drb->base_address);
886 if (entry && base_alignment < (*entry)->base_alignment)
888 base_alignment = (*entry)->base_alignment;
889 base_misalignment = (*entry)->base_misalignment;
892 if (drb->offset_alignment < vector_alignment
893 || !step_preserves_misalignment_p
894 /* We need to know whether the step wrt the vectorized loop is
895 negative when computing the starting misalignment below. */
896 || TREE_CODE (drb->step) != INTEGER_CST)
898 if (dump_enabled_p ())
900 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
901 "Unknown alignment for access: ");
902 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
903 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
905 return true;
908 if (base_alignment < vector_alignment)
910 tree base = drb->base_address;
911 if (TREE_CODE (base) == ADDR_EXPR)
912 base = TREE_OPERAND (base, 0);
913 if (!vect_can_force_dr_alignment_p (base,
914 vector_alignment * BITS_PER_UNIT))
916 if (dump_enabled_p ())
918 dump_printf_loc (MSG_NOTE, vect_location,
919 "can't force alignment of ref: ");
920 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
921 dump_printf (MSG_NOTE, "\n");
923 return true;
926 /* Force the alignment of the decl.
927 NOTE: This is the only change to the code we make during
928 the analysis phase, before deciding to vectorize the loop. */
929 if (dump_enabled_p ())
931 dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
932 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
933 dump_printf (MSG_NOTE, "\n");
936 DR_VECT_AUX (dr)->base_decl = base;
937 DR_VECT_AUX (dr)->base_misaligned = true;
938 base_misalignment = 0;
940 poly_int64 misalignment
941 = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();
943 /* If this is a backward running DR then first access in the larger
944 vectype actually is N-1 elements before the address in the DR.
945 Adjust misalign accordingly. */
946 if (tree_int_cst_sgn (drb->step) < 0)
947 /* PLUS because STEP is negative. */
948 misalignment += ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
949 * TREE_INT_CST_LOW (drb->step));
951 unsigned int const_misalignment;
952 if (!known_misalignment (misalignment, vector_alignment,
953 &const_misalignment))
955 if (dump_enabled_p ())
957 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
958 "Non-constant misalignment for access: ");
959 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
960 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
962 return true;
965 SET_DR_MISALIGNMENT (dr, const_misalignment);
967 if (dump_enabled_p ())
969 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
970 "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
971 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
972 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
975 return true;
978 /* Function vect_update_misalignment_for_peel.
979 Sets DR's misalignment
980 - to 0 if it has the same alignment as DR_PEEL,
981 - to the misalignment computed using NPEEL if DR's salignment is known,
982 - to -1 (unknown) otherwise.
984 DR - the data reference whose misalignment is to be adjusted.
985 DR_PEEL - the data reference whose misalignment is being made
986 zero in the vector loop by the peel.
987 NPEEL - the number of iterations in the peel loop if the misalignment
988 of DR_PEEL is known at compile time. */
990 static void
991 vect_update_misalignment_for_peel (struct data_reference *dr,
992 struct data_reference *dr_peel, int npeel)
994 unsigned int i;
995 vec<dr_p> same_aligned_drs;
996 struct data_reference *current_dr;
997 int dr_size = vect_get_scalar_dr_size (dr);
998 int dr_peel_size = vect_get_scalar_dr_size (dr_peel);
999 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
1000 stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));
1002 /* For interleaved data accesses the step in the loop must be multiplied by
1003 the size of the interleaving group. */
1004 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1005 dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
1006 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
1007 dr_peel_size *= GROUP_SIZE (peel_stmt_info);
1009 /* It can be assumed that the data refs with the same alignment as dr_peel
1010 are aligned in the vector loop. */
1011 same_aligned_drs
1012 = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
1013 FOR_EACH_VEC_ELT (same_aligned_drs, i, current_dr)
1015 if (current_dr != dr)
1016 continue;
1017 gcc_assert (!known_alignment_for_access_p (dr)
1018 || !known_alignment_for_access_p (dr_peel)
1019 || (DR_MISALIGNMENT (dr) / dr_size
1020 == DR_MISALIGNMENT (dr_peel) / dr_peel_size));
1021 SET_DR_MISALIGNMENT (dr, 0);
1022 return;
1025 if (known_alignment_for_access_p (dr)
1026 && known_alignment_for_access_p (dr_peel))
1028 bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
1029 int misal = DR_MISALIGNMENT (dr);
1030 misal += negative ? -npeel * dr_size : npeel * dr_size;
1031 misal &= DR_TARGET_ALIGNMENT (dr) - 1;
1032 SET_DR_MISALIGNMENT (dr, misal);
1033 return;
1036 if (dump_enabled_p ())
1037 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1038 "to unknown (-1).\n");
1039 SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
1043 /* Function verify_data_ref_alignment
1045 Return TRUE if DR can be handled with respect to alignment. */
1047 static bool
1048 verify_data_ref_alignment (data_reference_p dr)
1050 enum dr_alignment_support supportable_dr_alignment
1051 = vect_supportable_dr_alignment (dr, false);
1052 if (!supportable_dr_alignment)
1054 if (dump_enabled_p ())
1056 if (DR_IS_READ (dr))
1057 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1058 "not vectorized: unsupported unaligned load.");
1059 else
1060 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1061 "not vectorized: unsupported unaligned "
1062 "store.");
1064 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
1065 DR_REF (dr));
1066 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1068 return false;
1071 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
1072 dump_printf_loc (MSG_NOTE, vect_location,
1073 "Vectorizing an unaligned access.\n");
1075 return true;
1078 /* Function vect_verify_datarefs_alignment
1080 Return TRUE if all data references in the loop can be
1081 handled with respect to alignment. */
1083 bool
1084 vect_verify_datarefs_alignment (loop_vec_info vinfo)
1086 vec<data_reference_p> datarefs = vinfo->datarefs;
1087 struct data_reference *dr;
1088 unsigned int i;
1090 FOR_EACH_VEC_ELT (datarefs, i, dr)
1092 gimple *stmt = DR_STMT (dr);
1093 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1095 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1096 continue;
1098 /* For interleaving, only the alignment of the first access matters. */
1099 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1100 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1101 continue;
1103 /* Strided accesses perform only component accesses, alignment is
1104 irrelevant for them. */
1105 if (STMT_VINFO_STRIDED_P (stmt_info)
1106 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1107 continue;
1109 if (! verify_data_ref_alignment (dr))
1110 return false;
1113 return true;
1116 /* Given an memory reference EXP return whether its alignment is less
1117 than its size. */
1119 static bool
1120 not_size_aligned (tree exp)
1122 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1123 return true;
1125 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1126 > get_object_alignment (exp));
1129 /* Function vector_alignment_reachable_p
1131 Return true if vector alignment for DR is reachable by peeling
1132 a few loop iterations. Return false otherwise. */
1134 static bool
1135 vector_alignment_reachable_p (struct data_reference *dr)
1137 gimple *stmt = DR_STMT (dr);
1138 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1139 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1141 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1143 /* For interleaved access we peel only if number of iterations in
1144 the prolog loop ({VF - misalignment}), is a multiple of the
1145 number of the interleaved accesses. */
1146 int elem_size, mis_in_elements;
1148 /* FORNOW: handle only known alignment. */
1149 if (!known_alignment_for_access_p (dr))
1150 return false;
1152 poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1153 poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
1154 elem_size = vector_element_size (vector_size, nelements);
1155 mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
1157 if (!multiple_p (nelements - mis_in_elements, GROUP_SIZE (stmt_info)))
1158 return false;
1161 /* If misalignment is known at the compile time then allow peeling
1162 only if natural alignment is reachable through peeling. */
1163 if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
1165 HOST_WIDE_INT elmsize =
1166 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1167 if (dump_enabled_p ())
1169 dump_printf_loc (MSG_NOTE, vect_location,
1170 "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1171 dump_printf (MSG_NOTE,
1172 ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
1174 if (DR_MISALIGNMENT (dr) % elmsize)
1176 if (dump_enabled_p ())
1177 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1178 "data size does not divide the misalignment.\n");
1179 return false;
1183 if (!known_alignment_for_access_p (dr))
1185 tree type = TREE_TYPE (DR_REF (dr));
1186 bool is_packed = not_size_aligned (DR_REF (dr));
1187 if (dump_enabled_p ())
1188 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1189 "Unknown misalignment, %snaturally aligned\n",
1190 is_packed ? "not " : "");
1191 return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1194 return true;
1198 /* Calculate the cost of the memory access represented by DR. */
1200 static void
1201 vect_get_data_access_cost (struct data_reference *dr,
1202 unsigned int *inside_cost,
1203 unsigned int *outside_cost,
1204 stmt_vector_for_cost *body_cost_vec)
1206 gimple *stmt = DR_STMT (dr);
1207 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1208 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1209 int ncopies;
1211 if (PURE_SLP_STMT (stmt_info))
1212 ncopies = 1;
1213 else
1214 ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1216 if (DR_IS_READ (dr))
1217 vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1218 NULL, body_cost_vec, false);
1219 else
1220 vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
1222 if (dump_enabled_p ())
1223 dump_printf_loc (MSG_NOTE, vect_location,
1224 "vect_get_data_access_cost: inside_cost = %d, "
1225 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1229 typedef struct _vect_peel_info
1231 struct data_reference *dr;
1232 int npeel;
1233 unsigned int count;
1234 } *vect_peel_info;
1236 typedef struct _vect_peel_extended_info
1238 struct _vect_peel_info peel_info;
1239 unsigned int inside_cost;
1240 unsigned int outside_cost;
1241 } *vect_peel_extended_info;
1244 /* Peeling hashtable helpers. */
1246 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1248 static inline hashval_t hash (const _vect_peel_info *);
1249 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1252 inline hashval_t
1253 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1255 return (hashval_t) peel_info->npeel;
1258 inline bool
1259 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1261 return (a->npeel == b->npeel);
1265 /* Insert DR into peeling hash table with NPEEL as key. */
1267 static void
1268 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1269 loop_vec_info loop_vinfo, struct data_reference *dr,
1270 int npeel)
1272 struct _vect_peel_info elem, *slot;
1273 _vect_peel_info **new_slot;
1274 bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1276 elem.npeel = npeel;
1277 slot = peeling_htab->find (&elem);
1278 if (slot)
1279 slot->count++;
1280 else
1282 slot = XNEW (struct _vect_peel_info);
1283 slot->npeel = npeel;
1284 slot->dr = dr;
1285 slot->count = 1;
1286 new_slot = peeling_htab->find_slot (slot, INSERT);
1287 *new_slot = slot;
1290 if (!supportable_dr_alignment
1291 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1292 slot->count += VECT_MAX_COST;
1296 /* Traverse peeling hash table to find peeling option that aligns maximum
1297 number of data accesses. */
1300 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1301 _vect_peel_extended_info *max)
1303 vect_peel_info elem = *slot;
1305 if (elem->count > max->peel_info.count
1306 || (elem->count == max->peel_info.count
1307 && max->peel_info.npeel > elem->npeel))
1309 max->peel_info.npeel = elem->npeel;
1310 max->peel_info.count = elem->count;
1311 max->peel_info.dr = elem->dr;
1314 return 1;
1317 /* Get the costs of peeling NPEEL iterations checking data access costs
1318 for all data refs. If UNKNOWN_MISALIGNMENT is true, we assume DR0's
1319 misalignment will be zero after peeling. */
1321 static void
1322 vect_get_peeling_costs_all_drs (vec<data_reference_p> datarefs,
1323 struct data_reference *dr0,
1324 unsigned int *inside_cost,
1325 unsigned int *outside_cost,
1326 stmt_vector_for_cost *body_cost_vec,
1327 unsigned int npeel,
1328 bool unknown_misalignment)
1330 unsigned i;
1331 data_reference *dr;
1333 FOR_EACH_VEC_ELT (datarefs, i, dr)
1335 gimple *stmt = DR_STMT (dr);
1336 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1337 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1338 continue;
1340 /* For interleaving, only the alignment of the first access
1341 matters. */
1342 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1343 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1344 continue;
1346 /* Strided accesses perform only component accesses, alignment is
1347 irrelevant for them. */
1348 if (STMT_VINFO_STRIDED_P (stmt_info)
1349 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1350 continue;
1352 int save_misalignment;
1353 save_misalignment = DR_MISALIGNMENT (dr);
1354 if (npeel == 0)
1356 else if (unknown_misalignment && dr == dr0)
1357 SET_DR_MISALIGNMENT (dr, 0);
1358 else
1359 vect_update_misalignment_for_peel (dr, dr0, npeel);
1360 vect_get_data_access_cost (dr, inside_cost, outside_cost,
1361 body_cost_vec);
1362 SET_DR_MISALIGNMENT (dr, save_misalignment);
1366 /* Traverse peeling hash table and calculate cost for each peeling option.
1367 Find the one with the lowest cost. */
1370 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1371 _vect_peel_extended_info *min)
1373 vect_peel_info elem = *slot;
1374 int dummy;
1375 unsigned int inside_cost = 0, outside_cost = 0;
1376 gimple *stmt = DR_STMT (elem->dr);
1377 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1378 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1379 stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1380 epilogue_cost_vec;
1382 prologue_cost_vec.create (2);
1383 body_cost_vec.create (2);
1384 epilogue_cost_vec.create (2);
1386 vect_get_peeling_costs_all_drs (LOOP_VINFO_DATAREFS (loop_vinfo),
1387 elem->dr, &inside_cost, &outside_cost,
1388 &body_cost_vec, elem->npeel, false);
1390 body_cost_vec.release ();
1392 outside_cost += vect_get_known_peeling_cost
1393 (loop_vinfo, elem->npeel, &dummy,
1394 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1395 &prologue_cost_vec, &epilogue_cost_vec);
1397 /* Prologue and epilogue costs are added to the target model later.
1398 These costs depend only on the scalar iteration cost, the
1399 number of peeling iterations finally chosen, and the number of
1400 misaligned statements. So discard the information found here. */
1401 prologue_cost_vec.release ();
1402 epilogue_cost_vec.release ();
1404 if (inside_cost < min->inside_cost
1405 || (inside_cost == min->inside_cost
1406 && outside_cost < min->outside_cost))
1408 min->inside_cost = inside_cost;
1409 min->outside_cost = outside_cost;
1410 min->peel_info.dr = elem->dr;
1411 min->peel_info.npeel = elem->npeel;
1412 min->peel_info.count = elem->count;
1415 return 1;
1419 /* Choose best peeling option by traversing peeling hash table and either
1420 choosing an option with the lowest cost (if cost model is enabled) or the
1421 option that aligns as many accesses as possible. */
1423 static struct _vect_peel_extended_info
1424 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1425 loop_vec_info loop_vinfo)
1427 struct _vect_peel_extended_info res;
1429 res.peel_info.dr = NULL;
1431 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1433 res.inside_cost = INT_MAX;
1434 res.outside_cost = INT_MAX;
1435 peeling_htab->traverse <_vect_peel_extended_info *,
1436 vect_peeling_hash_get_lowest_cost> (&res);
1438 else
1440 res.peel_info.count = 0;
1441 peeling_htab->traverse <_vect_peel_extended_info *,
1442 vect_peeling_hash_get_most_frequent> (&res);
1443 res.inside_cost = 0;
1444 res.outside_cost = 0;
1447 return res;
1450 /* Return true if the new peeling NPEEL is supported. */
1452 static bool
1453 vect_peeling_supportable (loop_vec_info loop_vinfo, struct data_reference *dr0,
1454 unsigned npeel)
1456 unsigned i;
1457 struct data_reference *dr = NULL;
1458 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1459 gimple *stmt;
1460 stmt_vec_info stmt_info;
1461 enum dr_alignment_support supportable_dr_alignment;
1463 /* Ensure that all data refs can be vectorized after the peel. */
1464 FOR_EACH_VEC_ELT (datarefs, i, dr)
1466 int save_misalignment;
1468 if (dr == dr0)
1469 continue;
1471 stmt = DR_STMT (dr);
1472 stmt_info = vinfo_for_stmt (stmt);
1473 /* For interleaving, only the alignment of the first access
1474 matters. */
1475 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1476 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1477 continue;
1479 /* Strided accesses perform only component accesses, alignment is
1480 irrelevant for them. */
1481 if (STMT_VINFO_STRIDED_P (stmt_info)
1482 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1483 continue;
1485 save_misalignment = DR_MISALIGNMENT (dr);
1486 vect_update_misalignment_for_peel (dr, dr0, npeel);
1487 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1488 SET_DR_MISALIGNMENT (dr, save_misalignment);
1490 if (!supportable_dr_alignment)
1491 return false;
1494 return true;
1497 /* Function vect_enhance_data_refs_alignment
1499 This pass will use loop versioning and loop peeling in order to enhance
1500 the alignment of data references in the loop.
1502 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1503 original loop is to be vectorized. Any other loops that are created by
1504 the transformations performed in this pass - are not supposed to be
1505 vectorized. This restriction will be relaxed.
1507 This pass will require a cost model to guide it whether to apply peeling
1508 or versioning or a combination of the two. For example, the scheme that
1509 intel uses when given a loop with several memory accesses, is as follows:
1510 choose one memory access ('p') which alignment you want to force by doing
1511 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1512 other accesses are not necessarily aligned, or (2) use loop versioning to
1513 generate one loop in which all accesses are aligned, and another loop in
1514 which only 'p' is necessarily aligned.
1516 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1517 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Ximmin Tian, International
1518 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1520 Devising a cost model is the most critical aspect of this work. It will
1521 guide us on which access to peel for, whether to use loop versioning, how
1522 many versions to create, etc. The cost model will probably consist of
1523 generic considerations as well as target specific considerations (on
1524 powerpc for example, misaligned stores are more painful than misaligned
1525 loads).
1527 Here are the general steps involved in alignment enhancements:
1529 -- original loop, before alignment analysis:
1530 for (i=0; i<N; i++){
1531 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1532 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1535 -- After vect_compute_data_refs_alignment:
1536 for (i=0; i<N; i++){
1537 x = q[i]; # DR_MISALIGNMENT(q) = 3
1538 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1541 -- Possibility 1: we do loop versioning:
1542 if (p is aligned) {
1543 for (i=0; i<N; i++){ # loop 1A
1544 x = q[i]; # DR_MISALIGNMENT(q) = 3
1545 p[i] = y; # DR_MISALIGNMENT(p) = 0
1548 else {
1549 for (i=0; i<N; i++){ # loop 1B
1550 x = q[i]; # DR_MISALIGNMENT(q) = 3
1551 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1555 -- Possibility 2: we do loop peeling:
1556 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1557 x = q[i];
1558 p[i] = y;
1560 for (i = 3; i < N; i++){ # loop 2A
1561 x = q[i]; # DR_MISALIGNMENT(q) = 0
1562 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1565 -- Possibility 3: combination of loop peeling and versioning:
1566 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1567 x = q[i];
1568 p[i] = y;
1570 if (p is aligned) {
1571 for (i = 3; i<N; i++){ # loop 3A
1572 x = q[i]; # DR_MISALIGNMENT(q) = 0
1573 p[i] = y; # DR_MISALIGNMENT(p) = 0
1576 else {
1577 for (i = 3; i<N; i++){ # loop 3B
1578 x = q[i]; # DR_MISALIGNMENT(q) = 0
1579 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1583 These loops are later passed to loop_transform to be vectorized. The
1584 vectorizer will use the alignment information to guide the transformation
1585 (whether to generate regular loads/stores, or with special handling for
1586 misalignment). */
1588 bool
1589 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1591 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1592 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1593 enum dr_alignment_support supportable_dr_alignment;
1594 struct data_reference *dr0 = NULL, *first_store = NULL;
1595 struct data_reference *dr;
1596 unsigned int i, j;
1597 bool do_peeling = false;
1598 bool do_versioning = false;
1599 bool stat;
1600 gimple *stmt;
1601 stmt_vec_info stmt_info;
1602 unsigned int npeel = 0;
1603 bool one_misalignment_known = false;
1604 bool one_misalignment_unknown = false;
1605 bool one_dr_unsupportable = false;
1606 struct data_reference *unsupportable_dr = NULL;
1607 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1608 unsigned possible_npeel_number = 1;
1609 tree vectype;
1610 unsigned int mis, same_align_drs_max = 0;
1611 hash_table<peel_info_hasher> peeling_htab (1);
1613 if (dump_enabled_p ())
1614 dump_printf_loc (MSG_NOTE, vect_location,
1615 "=== vect_enhance_data_refs_alignment ===\n");
1617 /* Reset data so we can safely be called multiple times. */
1618 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1619 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1621 /* While cost model enhancements are expected in the future, the high level
1622 view of the code at this time is as follows:
1624 A) If there is a misaligned access then see if peeling to align
1625 this access can make all data references satisfy
1626 vect_supportable_dr_alignment. If so, update data structures
1627 as needed and return true.
1629 B) If peeling wasn't possible and there is a data reference with an
1630 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1631 then see if loop versioning checks can be used to make all data
1632 references satisfy vect_supportable_dr_alignment. If so, update
1633 data structures as needed and return true.
1635 C) If neither peeling nor versioning were successful then return false if
1636 any data reference does not satisfy vect_supportable_dr_alignment.
1638 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1640 Note, Possibility 3 above (which is peeling and versioning together) is not
1641 being done at this time. */
1643 /* (1) Peeling to force alignment. */
1645 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1646 Considerations:
1647 + How many accesses will become aligned due to the peeling
1648 - How many accesses will become unaligned due to the peeling,
1649 and the cost of misaligned accesses.
1650 - The cost of peeling (the extra runtime checks, the increase
1651 in code size). */
1653 FOR_EACH_VEC_ELT (datarefs, i, dr)
1655 stmt = DR_STMT (dr);
1656 stmt_info = vinfo_for_stmt (stmt);
1658 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1659 continue;
1661 /* For interleaving, only the alignment of the first access
1662 matters. */
1663 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1664 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1665 continue;
1667 /* For invariant accesses there is nothing to enhance. */
1668 if (integer_zerop (DR_STEP (dr)))
1669 continue;
1671 /* Strided accesses perform only component accesses, alignment is
1672 irrelevant for them. */
1673 if (STMT_VINFO_STRIDED_P (stmt_info)
1674 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1675 continue;
1677 supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1678 do_peeling = vector_alignment_reachable_p (dr);
1679 if (do_peeling)
1681 if (known_alignment_for_access_p (dr))
1683 unsigned int npeel_tmp = 0;
1684 bool negative = tree_int_cst_compare (DR_STEP (dr),
1685 size_zero_node) < 0;
1687 vectype = STMT_VINFO_VECTYPE (stmt_info);
1688 unsigned int target_align = DR_TARGET_ALIGNMENT (dr);
1689 unsigned int dr_size = vect_get_scalar_dr_size (dr);
1690 mis = (negative ? DR_MISALIGNMENT (dr) : -DR_MISALIGNMENT (dr));
1691 if (DR_MISALIGNMENT (dr) != 0)
1692 npeel_tmp = (mis & (target_align - 1)) / dr_size;
1694 /* For multiple types, it is possible that the bigger type access
1695 will have more than one peeling option. E.g., a loop with two
1696 types: one of size (vector size / 4), and the other one of
1697 size (vector size / 8). Vectorization factor will 8. If both
1698 accesses are misaligned by 3, the first one needs one scalar
1699 iteration to be aligned, and the second one needs 5. But the
1700 first one will be aligned also by peeling 5 scalar
1701 iterations, and in that case both accesses will be aligned.
1702 Hence, except for the immediate peeling amount, we also want
1703 to try to add full vector size, while we don't exceed
1704 vectorization factor.
1705 We do this automatically for cost model, since we calculate
1706 cost for every peeling option. */
1707 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1709 poly_uint64 nscalars = (STMT_SLP_TYPE (stmt_info)
1710 ? vf * GROUP_SIZE (stmt_info) : vf);
1711 possible_npeel_number
1712 = vect_get_num_vectors (nscalars, vectype);
1714 /* NPEEL_TMP is 0 when there is no misalignment, but also
1715 allow peeling NELEMENTS. */
1716 if (DR_MISALIGNMENT (dr) == 0)
1717 possible_npeel_number++;
1720 /* Save info about DR in the hash table. Also include peeling
1721 amounts according to the explanation above. */
1722 for (j = 0; j < possible_npeel_number; j++)
1724 vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1725 dr, npeel_tmp);
1726 npeel_tmp += target_align / dr_size;
1729 one_misalignment_known = true;
1731 else
1733 /* If we don't know any misalignment values, we prefer
1734 peeling for data-ref that has the maximum number of data-refs
1735 with the same alignment, unless the target prefers to align
1736 stores over load. */
1737 unsigned same_align_drs
1738 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1739 if (!dr0
1740 || same_align_drs_max < same_align_drs)
1742 same_align_drs_max = same_align_drs;
1743 dr0 = dr;
1745 /* For data-refs with the same number of related
1746 accesses prefer the one where the misalign
1747 computation will be invariant in the outermost loop. */
1748 else if (same_align_drs_max == same_align_drs)
1750 struct loop *ivloop0, *ivloop;
1751 ivloop0 = outermost_invariant_loop_for_expr
1752 (loop, DR_BASE_ADDRESS (dr0));
1753 ivloop = outermost_invariant_loop_for_expr
1754 (loop, DR_BASE_ADDRESS (dr));
1755 if ((ivloop && !ivloop0)
1756 || (ivloop && ivloop0
1757 && flow_loop_nested_p (ivloop, ivloop0)))
1758 dr0 = dr;
1761 one_misalignment_unknown = true;
1763 /* Check for data refs with unsupportable alignment that
1764 can be peeled. */
1765 if (!supportable_dr_alignment)
1767 one_dr_unsupportable = true;
1768 unsupportable_dr = dr;
1771 if (!first_store && DR_IS_WRITE (dr))
1772 first_store = dr;
1775 else
1777 if (!aligned_access_p (dr))
1779 if (dump_enabled_p ())
1780 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1781 "vector alignment may not be reachable\n");
1782 break;
1787 /* Check if we can possibly peel the loop. */
1788 if (!vect_can_advance_ivs_p (loop_vinfo)
1789 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1790 || loop->inner)
1791 do_peeling = false;
1793 struct _vect_peel_extended_info peel_for_known_alignment;
1794 struct _vect_peel_extended_info peel_for_unknown_alignment;
1795 struct _vect_peel_extended_info best_peel;
1797 peel_for_unknown_alignment.inside_cost = INT_MAX;
1798 peel_for_unknown_alignment.outside_cost = INT_MAX;
1799 peel_for_unknown_alignment.peel_info.count = 0;
1801 if (do_peeling
1802 && one_misalignment_unknown)
1804 /* Check if the target requires to prefer stores over loads, i.e., if
1805 misaligned stores are more expensive than misaligned loads (taking
1806 drs with same alignment into account). */
1807 unsigned int load_inside_cost = 0;
1808 unsigned int load_outside_cost = 0;
1809 unsigned int store_inside_cost = 0;
1810 unsigned int store_outside_cost = 0;
1811 unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;
1813 stmt_vector_for_cost dummy;
1814 dummy.create (2);
1815 vect_get_peeling_costs_all_drs (datarefs, dr0,
1816 &load_inside_cost,
1817 &load_outside_cost,
1818 &dummy, estimated_npeels, true);
1819 dummy.release ();
1821 if (first_store)
1823 dummy.create (2);
1824 vect_get_peeling_costs_all_drs (datarefs, first_store,
1825 &store_inside_cost,
1826 &store_outside_cost,
1827 &dummy, estimated_npeels, true);
1828 dummy.release ();
1830 else
1832 store_inside_cost = INT_MAX;
1833 store_outside_cost = INT_MAX;
1836 if (load_inside_cost > store_inside_cost
1837 || (load_inside_cost == store_inside_cost
1838 && load_outside_cost > store_outside_cost))
1840 dr0 = first_store;
1841 peel_for_unknown_alignment.inside_cost = store_inside_cost;
1842 peel_for_unknown_alignment.outside_cost = store_outside_cost;
1844 else
1846 peel_for_unknown_alignment.inside_cost = load_inside_cost;
1847 peel_for_unknown_alignment.outside_cost = load_outside_cost;
1850 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1851 prologue_cost_vec.create (2);
1852 epilogue_cost_vec.create (2);
1854 int dummy2;
1855 peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
1856 (loop_vinfo, estimated_npeels, &dummy2,
1857 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1858 &prologue_cost_vec, &epilogue_cost_vec);
1860 prologue_cost_vec.release ();
1861 epilogue_cost_vec.release ();
1863 peel_for_unknown_alignment.peel_info.count = 1
1864 + STMT_VINFO_SAME_ALIGN_REFS
1865 (vinfo_for_stmt (DR_STMT (dr0))).length ();
1868 peel_for_unknown_alignment.peel_info.npeel = 0;
1869 peel_for_unknown_alignment.peel_info.dr = dr0;
1871 best_peel = peel_for_unknown_alignment;
1873 peel_for_known_alignment.inside_cost = INT_MAX;
1874 peel_for_known_alignment.outside_cost = INT_MAX;
1875 peel_for_known_alignment.peel_info.count = 0;
1876 peel_for_known_alignment.peel_info.dr = NULL;
1878 if (do_peeling && one_misalignment_known)
1880 /* Peeling is possible, but there is no data access that is not supported
1881 unless aligned. So we try to choose the best possible peeling from
1882 the hash table. */
1883 peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
1884 (&peeling_htab, loop_vinfo);
1887 /* Compare costs of peeling for known and unknown alignment. */
1888 if (peel_for_known_alignment.peel_info.dr != NULL
1889 && peel_for_unknown_alignment.inside_cost
1890 >= peel_for_known_alignment.inside_cost)
1892 best_peel = peel_for_known_alignment;
1894 /* If the best peeling for known alignment has NPEEL == 0, perform no
1895 peeling at all except if there is an unsupportable dr that we can
1896 align. */
1897 if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
1898 do_peeling = false;
1901 /* If there is an unsupportable data ref, prefer this over all choices so far
1902 since we'd have to discard a chosen peeling except when it accidentally
1903 aligned the unsupportable data ref. */
1904 if (one_dr_unsupportable)
1905 dr0 = unsupportable_dr;
1906 else if (do_peeling)
1908 /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
1909 TODO: Use nopeel_outside_cost or get rid of it? */
1910 unsigned nopeel_inside_cost = 0;
1911 unsigned nopeel_outside_cost = 0;
1913 stmt_vector_for_cost dummy;
1914 dummy.create (2);
1915 vect_get_peeling_costs_all_drs (datarefs, NULL, &nopeel_inside_cost,
1916 &nopeel_outside_cost, &dummy, 0, false);
1917 dummy.release ();
1919 /* Add epilogue costs. As we do not peel for alignment here, no prologue
1920 costs will be recorded. */
1921 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1922 prologue_cost_vec.create (2);
1923 epilogue_cost_vec.create (2);
1925 int dummy2;
1926 nopeel_outside_cost += vect_get_known_peeling_cost
1927 (loop_vinfo, 0, &dummy2,
1928 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1929 &prologue_cost_vec, &epilogue_cost_vec);
1931 prologue_cost_vec.release ();
1932 epilogue_cost_vec.release ();
1934 npeel = best_peel.peel_info.npeel;
1935 dr0 = best_peel.peel_info.dr;
1937 /* If no peeling is not more expensive than the best peeling we
1938 have so far, don't perform any peeling. */
1939 if (nopeel_inside_cost <= best_peel.inside_cost)
1940 do_peeling = false;
1943 if (do_peeling)
1945 stmt = DR_STMT (dr0);
1946 stmt_info = vinfo_for_stmt (stmt);
1947 vectype = STMT_VINFO_VECTYPE (stmt_info);
1949 if (known_alignment_for_access_p (dr0))
1951 bool negative = tree_int_cst_compare (DR_STEP (dr0),
1952 size_zero_node) < 0;
1953 if (!npeel)
1955 /* Since it's known at compile time, compute the number of
1956 iterations in the peeled loop (the peeling factor) for use in
1957 updating DR_MISALIGNMENT values. The peeling factor is the
1958 vectorization factor minus the misalignment as an element
1959 count. */
1960 mis = negative ? DR_MISALIGNMENT (dr0) : -DR_MISALIGNMENT (dr0);
1961 unsigned int target_align = DR_TARGET_ALIGNMENT (dr0);
1962 npeel = ((mis & (target_align - 1))
1963 / vect_get_scalar_dr_size (dr0));
1966 /* For interleaved data access every iteration accesses all the
1967 members of the group, therefore we divide the number of iterations
1968 by the group size. */
1969 stmt_info = vinfo_for_stmt (DR_STMT (dr0));
1970 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1971 npeel /= GROUP_SIZE (stmt_info);
1973 if (dump_enabled_p ())
1974 dump_printf_loc (MSG_NOTE, vect_location,
1975 "Try peeling by %d\n", npeel);
1978 /* Ensure that all datarefs can be vectorized after the peel. */
1979 if (!vect_peeling_supportable (loop_vinfo, dr0, npeel))
1980 do_peeling = false;
1982 /* Check if all datarefs are supportable and log. */
1983 if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
1985 stat = vect_verify_datarefs_alignment (loop_vinfo);
1986 if (!stat)
1987 do_peeling = false;
1988 else
1989 return stat;
1992 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
1993 if (do_peeling)
1995 unsigned max_allowed_peel
1996 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
1997 if (max_allowed_peel != (unsigned)-1)
1999 unsigned max_peel = npeel;
2000 if (max_peel == 0)
2002 unsigned int target_align = DR_TARGET_ALIGNMENT (dr0);
2003 max_peel = target_align / vect_get_scalar_dr_size (dr0) - 1;
2005 if (max_peel > max_allowed_peel)
2007 do_peeling = false;
2008 if (dump_enabled_p ())
2009 dump_printf_loc (MSG_NOTE, vect_location,
2010 "Disable peeling, max peels reached: %d\n", max_peel);
2015 /* Cost model #2 - if peeling may result in a remaining loop not
2016 iterating enough to be vectorized then do not peel. Since this
2017 is a cost heuristic rather than a correctness decision, use the
2018 most likely runtime value for variable vectorization factors. */
2019 if (do_peeling
2020 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2022 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2023 unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
2024 if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
2025 < assumed_vf + max_peel)
2026 do_peeling = false;
2029 if (do_peeling)
2031 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2032 If the misalignment of DR_i is identical to that of dr0 then set
2033 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
2034 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2035 by the peeling factor times the element size of DR_i (MOD the
2036 vectorization factor times the size). Otherwise, the
2037 misalignment of DR_i must be set to unknown. */
2038 FOR_EACH_VEC_ELT (datarefs, i, dr)
2039 if (dr != dr0)
2041 /* Strided accesses perform only component accesses, alignment
2042 is irrelevant for them. */
2043 stmt_info = vinfo_for_stmt (DR_STMT (dr));
2044 if (STMT_VINFO_STRIDED_P (stmt_info)
2045 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2046 continue;
2048 vect_update_misalignment_for_peel (dr, dr0, npeel);
2051 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
2052 if (npeel)
2053 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2054 else
2055 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2056 = DR_MISALIGNMENT (dr0);
2057 SET_DR_MISALIGNMENT (dr0, 0);
2058 if (dump_enabled_p ())
2060 dump_printf_loc (MSG_NOTE, vect_location,
2061 "Alignment of access forced using peeling.\n");
2062 dump_printf_loc (MSG_NOTE, vect_location,
2063 "Peeling for alignment will be applied.\n");
2066 /* The inside-loop cost will be accounted for in vectorizable_load
2067 and vectorizable_store correctly with adjusted alignments.
2068 Drop the body_cst_vec on the floor here. */
2069 stat = vect_verify_datarefs_alignment (loop_vinfo);
2070 gcc_assert (stat);
2071 return stat;
2075 /* (2) Versioning to force alignment. */
2077 /* Try versioning if:
2078 1) optimize loop for speed
2079 2) there is at least one unsupported misaligned data ref with an unknown
2080 misalignment, and
2081 3) all misaligned data refs with a known misalignment are supported, and
2082 4) the number of runtime alignment checks is within reason. */
2084 do_versioning =
2085 optimize_loop_nest_for_speed_p (loop)
2086 && (!loop->inner); /* FORNOW */
2088 if (do_versioning)
2090 FOR_EACH_VEC_ELT (datarefs, i, dr)
2092 stmt = DR_STMT (dr);
2093 stmt_info = vinfo_for_stmt (stmt);
2095 /* For interleaving, only the alignment of the first access
2096 matters. */
2097 if (aligned_access_p (dr)
2098 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2099 && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
2100 continue;
2102 if (STMT_VINFO_STRIDED_P (stmt_info))
2104 /* Strided loads perform only component accesses, alignment is
2105 irrelevant for them. */
2106 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
2107 continue;
2108 do_versioning = false;
2109 break;
2112 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
2114 if (!supportable_dr_alignment)
2116 gimple *stmt;
2117 int mask;
2118 tree vectype;
2120 if (known_alignment_for_access_p (dr)
2121 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2122 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
2124 do_versioning = false;
2125 break;
2128 stmt = DR_STMT (dr);
2129 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
2130 gcc_assert (vectype);
2132 /* At present we don't support versioning for alignment
2133 with variable VF, since there's no guarantee that the
2134 VF is a power of two. We could relax this if we added
2135 a way of enforcing a power-of-two size. */
2136 unsigned HOST_WIDE_INT size;
2137 if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
2139 do_versioning = false;
2140 break;
2143 /* The rightmost bits of an aligned address must be zeros.
2144 Construct the mask needed for this test. For example,
2145 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2146 mask must be 15 = 0xf. */
2147 mask = size - 1;
2149 /* FORNOW: use the same mask to test all potentially unaligned
2150 references in the loop. The vectorizer currently supports
2151 a single vector size, see the reference to
2152 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
2153 vectorization factor is computed. */
2154 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
2155 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
2156 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2157 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
2158 DR_STMT (dr));
2162 /* Versioning requires at least one misaligned data reference. */
2163 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2164 do_versioning = false;
2165 else if (!do_versioning)
2166 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2169 if (do_versioning)
2171 vec<gimple *> may_misalign_stmts
2172 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2173 gimple *stmt;
2175 /* It can now be assumed that the data references in the statements
2176 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2177 of the loop being vectorized. */
2178 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
2180 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2181 dr = STMT_VINFO_DATA_REF (stmt_info);
2182 SET_DR_MISALIGNMENT (dr, 0);
2183 if (dump_enabled_p ())
2184 dump_printf_loc (MSG_NOTE, vect_location,
2185 "Alignment of access forced using versioning.\n");
2188 if (dump_enabled_p ())
2189 dump_printf_loc (MSG_NOTE, vect_location,
2190 "Versioning for alignment will be applied.\n");
2192 /* Peeling and versioning can't be done together at this time. */
2193 gcc_assert (! (do_peeling && do_versioning));
2195 stat = vect_verify_datarefs_alignment (loop_vinfo);
2196 gcc_assert (stat);
2197 return stat;
2200 /* This point is reached if neither peeling nor versioning is being done. */
2201 gcc_assert (! (do_peeling || do_versioning));
2203 stat = vect_verify_datarefs_alignment (loop_vinfo);
2204 return stat;
2208 /* Function vect_find_same_alignment_drs.
2210 Update group and alignment relations according to the chosen
2211 vectorization factor. */
2213 static void
2214 vect_find_same_alignment_drs (struct data_dependence_relation *ddr)
2216 struct data_reference *dra = DDR_A (ddr);
2217 struct data_reference *drb = DDR_B (ddr);
2218 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2219 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2221 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
2222 return;
2224 if (dra == drb)
2225 return;
2227 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)
2228 || !operand_equal_p (DR_OFFSET (dra), DR_OFFSET (drb), 0)
2229 || !operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2230 return;
2232 /* Two references with distance zero have the same alignment. */
2233 poly_offset_int diff = (wi::to_poly_offset (DR_INIT (dra))
2234 - wi::to_poly_offset (DR_INIT (drb)));
2235 if (maybe_ne (diff, 0))
2237 /* Get the wider of the two alignments. */
2238 unsigned int align_a = (vect_calculate_target_alignment (dra)
2239 / BITS_PER_UNIT);
2240 unsigned int align_b = (vect_calculate_target_alignment (drb)
2241 / BITS_PER_UNIT);
2242 unsigned int max_align = MAX (align_a, align_b);
2244 /* Require the gap to be a multiple of the larger vector alignment. */
2245 if (!multiple_p (diff, max_align))
2246 return;
2249 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2250 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
2251 if (dump_enabled_p ())
2253 dump_printf_loc (MSG_NOTE, vect_location,
2254 "accesses have the same alignment: ");
2255 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2256 dump_printf (MSG_NOTE, " and ");
2257 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2258 dump_printf (MSG_NOTE, "\n");
2263 /* Function vect_analyze_data_refs_alignment
2265 Analyze the alignment of the data-references in the loop.
2266 Return FALSE if a data reference is found that cannot be vectorized. */
2268 bool
2269 vect_analyze_data_refs_alignment (loop_vec_info vinfo)
2271 if (dump_enabled_p ())
2272 dump_printf_loc (MSG_NOTE, vect_location,
2273 "=== vect_analyze_data_refs_alignment ===\n");
2275 /* Mark groups of data references with same alignment using
2276 data dependence information. */
2277 vec<ddr_p> ddrs = vinfo->ddrs;
2278 struct data_dependence_relation *ddr;
2279 unsigned int i;
2281 FOR_EACH_VEC_ELT (ddrs, i, ddr)
2282 vect_find_same_alignment_drs (ddr);
2284 vec<data_reference_p> datarefs = vinfo->datarefs;
2285 struct data_reference *dr;
2287 vect_record_base_alignments (vinfo);
2288 FOR_EACH_VEC_ELT (datarefs, i, dr)
2290 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
2291 if (STMT_VINFO_VECTORIZABLE (stmt_info)
2292 && !vect_compute_data_ref_alignment (dr))
2294 /* Strided accesses perform only component accesses, misalignment
2295 information is irrelevant for them. */
2296 if (STMT_VINFO_STRIDED_P (stmt_info)
2297 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2298 continue;
2300 if (dump_enabled_p ())
2301 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2302 "not vectorized: can't calculate alignment "
2303 "for data ref.\n");
2305 return false;
2309 return true;
2313 /* Analyze alignment of DRs of stmts in NODE. */
2315 static bool
2316 vect_slp_analyze_and_verify_node_alignment (slp_tree node)
2318 /* We vectorize from the first scalar stmt in the node unless
2319 the node is permuted in which case we start from the first
2320 element in the group. */
2321 gimple *first_stmt = SLP_TREE_SCALAR_STMTS (node)[0];
2322 data_reference_p first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2323 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2324 first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_stmt));
2326 data_reference_p dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2327 if (! vect_compute_data_ref_alignment (dr)
2328 /* For creating the data-ref pointer we need alignment of the
2329 first element anyway. */
2330 || (dr != first_dr
2331 && ! vect_compute_data_ref_alignment (first_dr))
2332 || ! verify_data_ref_alignment (dr))
2334 if (dump_enabled_p ())
2335 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2336 "not vectorized: bad data alignment in basic "
2337 "block.\n");
2338 return false;
2341 return true;
2344 /* Function vect_slp_analyze_instance_alignment
2346 Analyze the alignment of the data-references in the SLP instance.
2347 Return FALSE if a data reference is found that cannot be vectorized. */
2349 bool
2350 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
2352 if (dump_enabled_p ())
2353 dump_printf_loc (MSG_NOTE, vect_location,
2354 "=== vect_slp_analyze_and_verify_instance_alignment ===\n");
2356 slp_tree node;
2357 unsigned i;
2358 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2359 if (! vect_slp_analyze_and_verify_node_alignment (node))
2360 return false;
2362 node = SLP_INSTANCE_TREE (instance);
2363 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]))
2364 && ! vect_slp_analyze_and_verify_node_alignment
2365 (SLP_INSTANCE_TREE (instance)))
2366 return false;
2368 return true;
2372 /* Analyze groups of accesses: check that DR belongs to a group of
2373 accesses of legal size, step, etc. Detect gaps, single element
2374 interleaving, and other special cases. Set grouped access info.
2375 Collect groups of strided stores for further use in SLP analysis.
2376 Worker for vect_analyze_group_access. */
2378 static bool
2379 vect_analyze_group_access_1 (struct data_reference *dr)
2381 tree step = DR_STEP (dr);
2382 tree scalar_type = TREE_TYPE (DR_REF (dr));
2383 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2384 gimple *stmt = DR_STMT (dr);
2385 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2386 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2387 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2388 HOST_WIDE_INT dr_step = -1;
2389 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2390 bool slp_impossible = false;
2392 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2393 size of the interleaving group (including gaps). */
2394 if (tree_fits_shwi_p (step))
2396 dr_step = tree_to_shwi (step);
2397 /* Check that STEP is a multiple of type size. Otherwise there is
2398 a non-element-sized gap at the end of the group which we
2399 cannot represent in GROUP_GAP or GROUP_SIZE.
2400 ??? As we can handle non-constant step fine here we should
2401 simply remove uses of GROUP_GAP between the last and first
2402 element and instead rely on DR_STEP. GROUP_SIZE then would
2403 simply not include that gap. */
2404 if ((dr_step % type_size) != 0)
2406 if (dump_enabled_p ())
2408 dump_printf_loc (MSG_NOTE, vect_location,
2409 "Step ");
2410 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2411 dump_printf (MSG_NOTE,
2412 " is not a multiple of the element size for ");
2413 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2414 dump_printf (MSG_NOTE, "\n");
2416 return false;
2418 groupsize = absu_hwi (dr_step) / type_size;
2420 else
2421 groupsize = 0;
2423 /* Not consecutive access is possible only if it is a part of interleaving. */
2424 if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2426 /* Check if it this DR is a part of interleaving, and is a single
2427 element of the group that is accessed in the loop. */
2429 /* Gaps are supported only for loads. STEP must be a multiple of the type
2430 size. The size of the group must be a power of 2. */
2431 if (DR_IS_READ (dr)
2432 && (dr_step % type_size) == 0
2433 && groupsize > 0
2434 && pow2p_hwi (groupsize))
2436 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2437 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2438 GROUP_GAP (stmt_info) = groupsize - 1;
2439 if (dump_enabled_p ())
2441 dump_printf_loc (MSG_NOTE, vect_location,
2442 "Detected single element interleaving ");
2443 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2444 dump_printf (MSG_NOTE, " step ");
2445 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2446 dump_printf (MSG_NOTE, "\n");
2449 return true;
2452 if (dump_enabled_p ())
2454 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2455 "not consecutive access ");
2456 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2459 if (bb_vinfo)
2461 /* Mark the statement as unvectorizable. */
2462 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2463 return true;
2466 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2467 STMT_VINFO_STRIDED_P (stmt_info) = true;
2468 return true;
2471 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2473 /* First stmt in the interleaving chain. Check the chain. */
2474 gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2475 struct data_reference *data_ref = dr;
2476 unsigned int count = 1;
2477 tree prev_init = DR_INIT (data_ref);
2478 gimple *prev = stmt;
2479 HOST_WIDE_INT diff, gaps = 0;
2481 /* By construction, all group members have INTEGER_CST DR_INITs. */
2482 while (next)
2484 /* Skip same data-refs. In case that two or more stmts share
2485 data-ref (supported only for loads), we vectorize only the first
2486 stmt, and the rest get their vectorized loads from the first
2487 one. */
2488 if (!tree_int_cst_compare (DR_INIT (data_ref),
2489 DR_INIT (STMT_VINFO_DATA_REF (
2490 vinfo_for_stmt (next)))))
2492 if (DR_IS_WRITE (data_ref))
2494 if (dump_enabled_p ())
2495 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2496 "Two store stmts share the same dr.\n");
2497 return false;
2500 if (dump_enabled_p ())
2501 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2502 "Two or more load stmts share the same dr.\n");
2504 /* For load use the same data-ref load. */
2505 GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2507 prev = next;
2508 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2509 continue;
2512 prev = next;
2513 data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2515 /* All group members have the same STEP by construction. */
2516 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2518 /* Check that the distance between two accesses is equal to the type
2519 size. Otherwise, we have gaps. */
2520 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2521 - TREE_INT_CST_LOW (prev_init)) / type_size;
2522 if (diff != 1)
2524 /* FORNOW: SLP of accesses with gaps is not supported. */
2525 slp_impossible = true;
2526 if (DR_IS_WRITE (data_ref))
2528 if (dump_enabled_p ())
2529 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2530 "interleaved store with gaps\n");
2531 return false;
2534 gaps += diff - 1;
2537 last_accessed_element += diff;
2539 /* Store the gap from the previous member of the group. If there is no
2540 gap in the access, GROUP_GAP is always 1. */
2541 GROUP_GAP (vinfo_for_stmt (next)) = diff;
2543 prev_init = DR_INIT (data_ref);
2544 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2545 /* Count the number of data-refs in the chain. */
2546 count++;
2549 if (groupsize == 0)
2550 groupsize = count + gaps;
2552 /* This could be UINT_MAX but as we are generating code in a very
2553 inefficient way we have to cap earlier. See PR78699 for example. */
2554 if (groupsize > 4096)
2556 if (dump_enabled_p ())
2557 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2558 "group is too large\n");
2559 return false;
2562 /* Check that the size of the interleaving is equal to count for stores,
2563 i.e., that there are no gaps. */
2564 if (groupsize != count
2565 && !DR_IS_READ (dr))
2567 if (dump_enabled_p ())
2568 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2569 "interleaved store with gaps\n");
2570 return false;
2573 /* If there is a gap after the last load in the group it is the
2574 difference between the groupsize and the last accessed
2575 element.
2576 When there is no gap, this difference should be 0. */
2577 GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - last_accessed_element;
2579 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2580 if (dump_enabled_p ())
2582 dump_printf_loc (MSG_NOTE, vect_location,
2583 "Detected interleaving ");
2584 if (DR_IS_READ (dr))
2585 dump_printf (MSG_NOTE, "load ");
2586 else
2587 dump_printf (MSG_NOTE, "store ");
2588 dump_printf (MSG_NOTE, "of size %u starting with ",
2589 (unsigned)groupsize);
2590 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
2591 if (GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
2592 dump_printf_loc (MSG_NOTE, vect_location,
2593 "There is a gap of %u elements after the group\n",
2594 GROUP_GAP (vinfo_for_stmt (stmt)));
2597 /* SLP: create an SLP data structure for every interleaving group of
2598 stores for further analysis in vect_analyse_slp. */
2599 if (DR_IS_WRITE (dr) && !slp_impossible)
2601 if (loop_vinfo)
2602 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2603 if (bb_vinfo)
2604 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2608 return true;
2611 /* Analyze groups of accesses: check that DR belongs to a group of
2612 accesses of legal size, step, etc. Detect gaps, single element
2613 interleaving, and other special cases. Set grouped access info.
2614 Collect groups of strided stores for further use in SLP analysis. */
2616 static bool
2617 vect_analyze_group_access (struct data_reference *dr)
2619 if (!vect_analyze_group_access_1 (dr))
2621 /* Dissolve the group if present. */
2622 gimple *next;
2623 gimple *stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dr)));
2624 while (stmt)
2626 stmt_vec_info vinfo = vinfo_for_stmt (stmt);
2627 next = GROUP_NEXT_ELEMENT (vinfo);
2628 GROUP_FIRST_ELEMENT (vinfo) = NULL;
2629 GROUP_NEXT_ELEMENT (vinfo) = NULL;
2630 stmt = next;
2632 return false;
2634 return true;
2637 /* Analyze the access pattern of the data-reference DR.
2638 In case of non-consecutive accesses call vect_analyze_group_access() to
2639 analyze groups of accesses. */
2641 static bool
2642 vect_analyze_data_ref_access (struct data_reference *dr)
2644 tree step = DR_STEP (dr);
2645 tree scalar_type = TREE_TYPE (DR_REF (dr));
2646 gimple *stmt = DR_STMT (dr);
2647 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2648 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2649 struct loop *loop = NULL;
2651 if (loop_vinfo)
2652 loop = LOOP_VINFO_LOOP (loop_vinfo);
2654 if (loop_vinfo && !step)
2656 if (dump_enabled_p ())
2657 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2658 "bad data-ref access in loop\n");
2659 return false;
2662 /* Allow loads with zero step in inner-loop vectorization. */
2663 if (loop_vinfo && integer_zerop (step))
2665 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2666 if (!nested_in_vect_loop_p (loop, stmt))
2667 return DR_IS_READ (dr);
2668 /* Allow references with zero step for outer loops marked
2669 with pragma omp simd only - it guarantees absence of
2670 loop-carried dependencies between inner loop iterations. */
2671 if (!loop->force_vectorize)
2673 if (dump_enabled_p ())
2674 dump_printf_loc (MSG_NOTE, vect_location,
2675 "zero step in inner loop of nest\n");
2676 return false;
2680 if (loop && nested_in_vect_loop_p (loop, stmt))
2682 /* Interleaved accesses are not yet supported within outer-loop
2683 vectorization for references in the inner-loop. */
2684 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2686 /* For the rest of the analysis we use the outer-loop step. */
2687 step = STMT_VINFO_DR_STEP (stmt_info);
2688 if (integer_zerop (step))
2690 if (dump_enabled_p ())
2691 dump_printf_loc (MSG_NOTE, vect_location,
2692 "zero step in outer loop.\n");
2693 return DR_IS_READ (dr);
2697 /* Consecutive? */
2698 if (TREE_CODE (step) == INTEGER_CST)
2700 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2701 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2702 || (dr_step < 0
2703 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2705 /* Mark that it is not interleaving. */
2706 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2707 return true;
2711 if (loop && nested_in_vect_loop_p (loop, stmt))
2713 if (dump_enabled_p ())
2714 dump_printf_loc (MSG_NOTE, vect_location,
2715 "grouped access in outer loop.\n");
2716 return false;
2720 /* Assume this is a DR handled by non-constant strided load case. */
2721 if (TREE_CODE (step) != INTEGER_CST)
2722 return (STMT_VINFO_STRIDED_P (stmt_info)
2723 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2724 || vect_analyze_group_access (dr)));
2726 /* Not consecutive access - check if it's a part of interleaving group. */
2727 return vect_analyze_group_access (dr);
2730 /* Compare two data-references DRA and DRB to group them into chunks
2731 suitable for grouping. */
2733 static int
2734 dr_group_sort_cmp (const void *dra_, const void *drb_)
2736 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2737 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2738 int cmp;
2740 /* Stabilize sort. */
2741 if (dra == drb)
2742 return 0;
2744 /* DRs in different loops never belong to the same group. */
2745 loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father;
2746 loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father;
2747 if (loopa != loopb)
2748 return loopa->num < loopb->num ? -1 : 1;
2750 /* Ordering of DRs according to base. */
2751 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2752 DR_BASE_ADDRESS (drb));
2753 if (cmp != 0)
2754 return cmp;
2756 /* And according to DR_OFFSET. */
2757 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2758 if (cmp != 0)
2759 return cmp;
2761 /* Put reads before writes. */
2762 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2763 return DR_IS_READ (dra) ? -1 : 1;
2765 /* Then sort after access size. */
2766 cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2767 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2768 if (cmp != 0)
2769 return cmp;
2771 /* And after step. */
2772 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2773 if (cmp != 0)
2774 return cmp;
2776 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
2777 cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
2778 if (cmp == 0)
2779 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2780 return cmp;
2783 /* Function vect_analyze_data_ref_accesses.
2785 Analyze the access pattern of all the data references in the loop.
2787 FORNOW: the only access pattern that is considered vectorizable is a
2788 simple step 1 (consecutive) access.
2790 FORNOW: handle only arrays and pointer accesses. */
2792 bool
2793 vect_analyze_data_ref_accesses (vec_info *vinfo)
2795 unsigned int i;
2796 vec<data_reference_p> datarefs = vinfo->datarefs;
2797 struct data_reference *dr;
2799 if (dump_enabled_p ())
2800 dump_printf_loc (MSG_NOTE, vect_location,
2801 "=== vect_analyze_data_ref_accesses ===\n");
2803 if (datarefs.is_empty ())
2804 return true;
2806 /* Sort the array of datarefs to make building the interleaving chains
2807 linear. Don't modify the original vector's order, it is needed for
2808 determining what dependencies are reversed. */
2809 vec<data_reference_p> datarefs_copy = datarefs.copy ();
2810 datarefs_copy.qsort (dr_group_sort_cmp);
2812 /* Build the interleaving chains. */
2813 for (i = 0; i < datarefs_copy.length () - 1;)
2815 data_reference_p dra = datarefs_copy[i];
2816 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2817 stmt_vec_info lastinfo = NULL;
2818 if (! STMT_VINFO_VECTORIZABLE (stmtinfo_a))
2820 ++i;
2821 continue;
2823 for (i = i + 1; i < datarefs_copy.length (); ++i)
2825 data_reference_p drb = datarefs_copy[i];
2826 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2827 if (! STMT_VINFO_VECTORIZABLE (stmtinfo_b))
2828 break;
2830 /* ??? Imperfect sorting (non-compatible types, non-modulo
2831 accesses, same accesses) can lead to a group to be artificially
2832 split here as we don't just skip over those. If it really
2833 matters we can push those to a worklist and re-iterate
2834 over them. The we can just skip ahead to the next DR here. */
2836 /* DRs in a different loop should not be put into the same
2837 interleaving group. */
2838 if (gimple_bb (DR_STMT (dra))->loop_father
2839 != gimple_bb (DR_STMT (drb))->loop_father)
2840 break;
2842 /* Check that the data-refs have same first location (except init)
2843 and they are both either store or load (not load and store,
2844 not masked loads or stores). */
2845 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2846 || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2847 DR_BASE_ADDRESS (drb)) != 0
2848 || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
2849 || !gimple_assign_single_p (DR_STMT (dra))
2850 || !gimple_assign_single_p (DR_STMT (drb)))
2851 break;
2853 /* Check that the data-refs have the same constant size. */
2854 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2855 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2856 if (!tree_fits_uhwi_p (sza)
2857 || !tree_fits_uhwi_p (szb)
2858 || !tree_int_cst_equal (sza, szb))
2859 break;
2861 /* Check that the data-refs have the same step. */
2862 if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
2863 break;
2865 /* Check the types are compatible.
2866 ??? We don't distinguish this during sorting. */
2867 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2868 TREE_TYPE (DR_REF (drb))))
2869 break;
2871 /* Check that the DR_INITs are compile-time constants. */
2872 if (TREE_CODE (DR_INIT (dra)) != INTEGER_CST
2873 || TREE_CODE (DR_INIT (drb)) != INTEGER_CST)
2874 break;
2876 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
2877 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2878 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2879 HOST_WIDE_INT init_prev
2880 = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]));
2881 gcc_assert (init_a <= init_b
2882 && init_a <= init_prev
2883 && init_prev <= init_b);
2885 /* Do not place the same access in the interleaving chain twice. */
2886 if (init_b == init_prev)
2888 gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]))
2889 < gimple_uid (DR_STMT (drb)));
2890 /* ??? For now we simply "drop" the later reference which is
2891 otherwise the same rather than finishing off this group.
2892 In the end we'd want to re-process duplicates forming
2893 multiple groups from the refs, likely by just collecting
2894 all candidates (including duplicates and split points
2895 below) in a vector and then process them together. */
2896 continue;
2899 /* If init_b == init_a + the size of the type * k, we have an
2900 interleaving, and DRA is accessed before DRB. */
2901 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
2902 if (type_size_a == 0
2903 || (init_b - init_a) % type_size_a != 0)
2904 break;
2906 /* If we have a store, the accesses are adjacent. This splits
2907 groups into chunks we support (we don't support vectorization
2908 of stores with gaps). */
2909 if (!DR_IS_READ (dra) && init_b - init_prev != type_size_a)
2910 break;
2912 /* If the step (if not zero or non-constant) is greater than the
2913 difference between data-refs' inits this splits groups into
2914 suitable sizes. */
2915 if (tree_fits_shwi_p (DR_STEP (dra)))
2917 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
2918 if (step != 0 && step <= (init_b - init_a))
2919 break;
2922 if (dump_enabled_p ())
2924 dump_printf_loc (MSG_NOTE, vect_location,
2925 "Detected interleaving ");
2926 if (DR_IS_READ (dra))
2927 dump_printf (MSG_NOTE, "load ");
2928 else
2929 dump_printf (MSG_NOTE, "store ");
2930 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2931 dump_printf (MSG_NOTE, " and ");
2932 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2933 dump_printf (MSG_NOTE, "\n");
2936 /* Link the found element into the group list. */
2937 if (!GROUP_FIRST_ELEMENT (stmtinfo_a))
2939 GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
2940 lastinfo = stmtinfo_a;
2942 GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
2943 GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
2944 lastinfo = stmtinfo_b;
2948 FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
2949 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
2950 && !vect_analyze_data_ref_access (dr))
2952 if (dump_enabled_p ())
2953 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2954 "not vectorized: complicated access pattern.\n");
2956 if (is_a <bb_vec_info> (vinfo))
2958 /* Mark the statement as not vectorizable. */
2959 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2960 continue;
2962 else
2964 datarefs_copy.release ();
2965 return false;
2969 datarefs_copy.release ();
2970 return true;
2973 /* Function vect_vfa_segment_size.
2975 Create an expression that computes the size of segment
2976 that will be accessed for a data reference. The functions takes into
2977 account that realignment loads may access one more vector.
2979 Input:
2980 DR: The data reference.
2981 LENGTH_FACTOR: segment length to consider.
2983 Return an expression whose value is the size of segment which will be
2984 accessed by DR. */
2986 static tree
2987 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
2989 tree segment_length;
2991 if (integer_zerop (DR_STEP (dr)))
2992 segment_length = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
2993 else
2994 segment_length = size_binop (MULT_EXPR,
2995 fold_convert (sizetype, DR_STEP (dr)),
2996 fold_convert (sizetype, length_factor));
2998 if (vect_supportable_dr_alignment (dr, false)
2999 == dr_explicit_realign_optimized)
3001 tree vector_size = TYPE_SIZE_UNIT
3002 (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))));
3004 segment_length = size_binop (PLUS_EXPR, segment_length, vector_size);
3006 return segment_length;
3009 /* Function vect_no_alias_p.
3011 Given data references A and B with equal base and offset, see whether
3012 the alias relation can be decided at compilation time. Return 1 if
3013 it can and the references alias, 0 if it can and the references do
3014 not alias, and -1 if we cannot decide at compile time. SEGMENT_LENGTH_A
3015 and SEGMENT_LENGTH_B are the memory lengths accessed by A and B
3016 respectively. */
3018 static int
3019 vect_compile_time_alias (struct data_reference *a, struct data_reference *b,
3020 tree segment_length_a, tree segment_length_b)
3022 poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a));
3023 poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b));
3024 poly_uint64 const_length_a;
3025 poly_uint64 const_length_b;
3027 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3028 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3029 [a, a+12) */
3030 if (tree_int_cst_compare (DR_STEP (a), size_zero_node) < 0)
3032 const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
3033 offset_a = (offset_a + vect_get_scalar_dr_size (a)) - const_length_a;
3035 else
3036 const_length_a = tree_to_poly_uint64 (segment_length_a);
3037 if (tree_int_cst_compare (DR_STEP (b), size_zero_node) < 0)
3039 const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
3040 offset_b = (offset_b + vect_get_scalar_dr_size (b)) - const_length_b;
3042 else
3043 const_length_b = tree_to_poly_uint64 (segment_length_b);
3045 if (ranges_known_overlap_p (offset_a, const_length_a,
3046 offset_b, const_length_b))
3047 return 1;
3049 if (!ranges_maybe_overlap_p (offset_a, const_length_a,
3050 offset_b, const_length_b))
3051 return 0;
3053 return -1;
3056 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3057 in DDR is >= VF. */
3059 static bool
3060 dependence_distance_ge_vf (data_dependence_relation *ddr,
3061 unsigned int loop_depth, poly_uint64 vf)
3063 if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3064 || DDR_NUM_DIST_VECTS (ddr) == 0)
3065 return false;
3067 /* If the dependence is exact, we should have limited the VF instead. */
3068 gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3070 unsigned int i;
3071 lambda_vector dist_v;
3072 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3074 HOST_WIDE_INT dist = dist_v[loop_depth];
3075 if (dist != 0
3076 && !(dist > 0 && DDR_REVERSED_P (ddr))
3077 && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
3078 return false;
3081 if (dump_enabled_p ())
3083 dump_printf_loc (MSG_NOTE, vect_location,
3084 "dependence distance between ");
3085 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
3086 dump_printf (MSG_NOTE, " and ");
3087 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
3088 dump_printf (MSG_NOTE, " is >= VF\n");
3091 return true;
3094 /* Function vect_prune_runtime_alias_test_list.
3096 Prune a list of ddrs to be tested at run-time by versioning for alias.
3097 Merge several alias checks into one if possible.
3098 Return FALSE if resulting list of ddrs is longer then allowed by
3099 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
3101 bool
3102 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3104 typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3105 hash_set <tree_pair_hash> compared_objects;
3107 vec<ddr_p> may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3108 vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3109 = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3110 vec<vec_object_pair> &check_unequal_addrs
3111 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3112 poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3113 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3115 ddr_p ddr;
3116 unsigned int i;
3117 tree length_factor;
3119 if (dump_enabled_p ())
3120 dump_printf_loc (MSG_NOTE, vect_location,
3121 "=== vect_prune_runtime_alias_test_list ===\n");
3123 if (may_alias_ddrs.is_empty ())
3124 return true;
3126 comp_alias_ddrs.create (may_alias_ddrs.length ());
3128 unsigned int loop_depth
3129 = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3130 LOOP_VINFO_LOOP_NEST (loop_vinfo));
3132 /* First, we collect all data ref pairs for aliasing checks. */
3133 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3135 int comp_res;
3136 struct data_reference *dr_a, *dr_b;
3137 gimple *dr_group_first_a, *dr_group_first_b;
3138 tree segment_length_a, segment_length_b;
3139 gimple *stmt_a, *stmt_b;
3141 /* Ignore the alias if the VF we chose ended up being no greater
3142 than the dependence distance. */
3143 if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3144 continue;
3146 if (DDR_OBJECT_A (ddr))
3148 vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3149 if (!compared_objects.add (new_pair))
3151 if (dump_enabled_p ())
3153 dump_printf_loc (MSG_NOTE, vect_location, "checking that ");
3154 dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.first);
3155 dump_printf (MSG_NOTE, " and ");
3156 dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.second);
3157 dump_printf (MSG_NOTE, " have different addresses\n");
3159 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3161 continue;
3164 dr_a = DDR_A (ddr);
3165 stmt_a = DR_STMT (DDR_A (ddr));
3166 dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
3167 if (dr_group_first_a)
3169 stmt_a = dr_group_first_a;
3170 dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
3173 dr_b = DDR_B (ddr);
3174 stmt_b = DR_STMT (DDR_B (ddr));
3175 dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
3176 if (dr_group_first_b)
3178 stmt_b = dr_group_first_b;
3179 dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
3182 if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
3183 length_factor = scalar_loop_iters;
3184 else
3185 length_factor = size_int (vect_factor);
3186 segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
3187 segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
3189 comp_res = data_ref_compare_tree (DR_BASE_ADDRESS (dr_a),
3190 DR_BASE_ADDRESS (dr_b));
3191 if (comp_res == 0)
3192 comp_res = data_ref_compare_tree (DR_OFFSET (dr_a),
3193 DR_OFFSET (dr_b));
3195 /* See whether the alias is known at compilation time. */
3196 if (comp_res == 0
3197 && TREE_CODE (DR_STEP (dr_a)) == INTEGER_CST
3198 && TREE_CODE (DR_STEP (dr_b)) == INTEGER_CST
3199 && poly_int_tree_p (segment_length_a)
3200 && poly_int_tree_p (segment_length_b))
3202 int res = vect_compile_time_alias (dr_a, dr_b,
3203 segment_length_a,
3204 segment_length_b);
3205 if (res == 0)
3206 continue;
3208 if (res == 1)
3210 if (dump_enabled_p ())
3211 dump_printf_loc (MSG_NOTE, vect_location,
3212 "not vectorized: compilation time alias.\n");
3213 return false;
3217 dr_with_seg_len_pair_t dr_with_seg_len_pair
3218 (dr_with_seg_len (dr_a, segment_length_a),
3219 dr_with_seg_len (dr_b, segment_length_b));
3221 /* Canonicalize pairs by sorting the two DR members. */
3222 if (comp_res > 0)
3223 std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
3225 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3228 prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);
3230 unsigned int count = (comp_alias_ddrs.length ()
3231 + check_unequal_addrs.length ());
3232 dump_printf_loc (MSG_NOTE, vect_location,
3233 "improved number of alias checks from %d to %d\n",
3234 may_alias_ddrs.length (), count);
3235 if ((int) count > PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
3237 if (dump_enabled_p ())
3238 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3239 "number of versioning for alias "
3240 "run-time tests exceeds %d "
3241 "(--param vect-max-version-for-alias-checks)\n",
3242 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
3243 return false;
3246 return true;
3249 /* Return true if a non-affine read or write in STMT is suitable for a
3250 gather load or scatter store. Describe the operation in *INFO if so. */
3252 bool
3253 vect_check_gather_scatter (gimple *stmt, loop_vec_info loop_vinfo,
3254 gather_scatter_info *info)
3256 HOST_WIDE_INT scale = 1;
3257 poly_int64 pbitpos, pbitsize;
3258 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3259 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3260 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3261 tree offtype = NULL_TREE;
3262 tree decl, base, off;
3263 machine_mode pmode;
3264 int punsignedp, reversep, pvolatilep = 0;
3266 base = DR_REF (dr);
3267 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3268 see if we can use the def stmt of the address. */
3269 if (is_gimple_call (stmt)
3270 && gimple_call_internal_p (stmt)
3271 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
3272 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
3273 && TREE_CODE (base) == MEM_REF
3274 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3275 && integer_zerop (TREE_OPERAND (base, 1))
3276 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3278 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3279 if (is_gimple_assign (def_stmt)
3280 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3281 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3284 /* The gather and scatter builtins need address of the form
3285 loop_invariant + vector * {1, 2, 4, 8}
3287 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3288 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3289 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3290 multiplications and additions in it. To get a vector, we need
3291 a single SSA_NAME that will be defined in the loop and will
3292 contain everything that is not loop invariant and that can be
3293 vectorized. The following code attempts to find such a preexistng
3294 SSA_NAME OFF and put the loop invariants into a tree BASE
3295 that can be gimplified before the loop. */
3296 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3297 &punsignedp, &reversep, &pvolatilep);
3298 gcc_assert (base && !reversep);
3299 poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
3301 if (TREE_CODE (base) == MEM_REF)
3303 if (!integer_zerop (TREE_OPERAND (base, 1)))
3305 if (off == NULL_TREE)
3306 off = wide_int_to_tree (sizetype, mem_ref_offset (base));
3307 else
3308 off = size_binop (PLUS_EXPR, off,
3309 fold_convert (sizetype, TREE_OPERAND (base, 1)));
3311 base = TREE_OPERAND (base, 0);
3313 else
3314 base = build_fold_addr_expr (base);
3316 if (off == NULL_TREE)
3317 off = size_zero_node;
3319 /* If base is not loop invariant, either off is 0, then we start with just
3320 the constant offset in the loop invariant BASE and continue with base
3321 as OFF, otherwise give up.
3322 We could handle that case by gimplifying the addition of base + off
3323 into some SSA_NAME and use that as off, but for now punt. */
3324 if (!expr_invariant_in_loop_p (loop, base))
3326 if (!integer_zerop (off))
3327 return false;
3328 off = base;
3329 base = size_int (pbytepos);
3331 /* Otherwise put base + constant offset into the loop invariant BASE
3332 and continue with OFF. */
3333 else
3335 base = fold_convert (sizetype, base);
3336 base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
3339 /* OFF at this point may be either a SSA_NAME or some tree expression
3340 from get_inner_reference. Try to peel off loop invariants from it
3341 into BASE as long as possible. */
3342 STRIP_NOPS (off);
3343 while (offtype == NULL_TREE)
3345 enum tree_code code;
3346 tree op0, op1, add = NULL_TREE;
3348 if (TREE_CODE (off) == SSA_NAME)
3350 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3352 if (expr_invariant_in_loop_p (loop, off))
3353 return false;
3355 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3356 break;
3358 op0 = gimple_assign_rhs1 (def_stmt);
3359 code = gimple_assign_rhs_code (def_stmt);
3360 op1 = gimple_assign_rhs2 (def_stmt);
3362 else
3364 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3365 return false;
3366 code = TREE_CODE (off);
3367 extract_ops_from_tree (off, &code, &op0, &op1);
3369 switch (code)
3371 case POINTER_PLUS_EXPR:
3372 case PLUS_EXPR:
3373 if (expr_invariant_in_loop_p (loop, op0))
3375 add = op0;
3376 off = op1;
3377 do_add:
3378 add = fold_convert (sizetype, add);
3379 if (scale != 1)
3380 add = size_binop (MULT_EXPR, add, size_int (scale));
3381 base = size_binop (PLUS_EXPR, base, add);
3382 continue;
3384 if (expr_invariant_in_loop_p (loop, op1))
3386 add = op1;
3387 off = op0;
3388 goto do_add;
3390 break;
3391 case MINUS_EXPR:
3392 if (expr_invariant_in_loop_p (loop, op1))
3394 add = fold_convert (sizetype, op1);
3395 add = size_binop (MINUS_EXPR, size_zero_node, add);
3396 off = op0;
3397 goto do_add;
3399 break;
3400 case MULT_EXPR:
3401 if (scale == 1 && tree_fits_shwi_p (op1))
3403 scale = tree_to_shwi (op1);
3404 off = op0;
3405 continue;
3407 break;
3408 case SSA_NAME:
3409 off = op0;
3410 continue;
3411 CASE_CONVERT:
3412 if (!POINTER_TYPE_P (TREE_TYPE (op0))
3413 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3414 break;
3415 if (TYPE_PRECISION (TREE_TYPE (op0))
3416 == TYPE_PRECISION (TREE_TYPE (off)))
3418 off = op0;
3419 continue;
3421 if (TYPE_PRECISION (TREE_TYPE (op0))
3422 < TYPE_PRECISION (TREE_TYPE (off)))
3424 off = op0;
3425 offtype = TREE_TYPE (off);
3426 STRIP_NOPS (off);
3427 continue;
3429 break;
3430 default:
3431 break;
3433 break;
3436 /* If at the end OFF still isn't a SSA_NAME or isn't
3437 defined in the loop, punt. */
3438 if (TREE_CODE (off) != SSA_NAME
3439 || expr_invariant_in_loop_p (loop, off))
3440 return false;
3442 if (offtype == NULL_TREE)
3443 offtype = TREE_TYPE (off);
3445 if (DR_IS_READ (dr))
3446 decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info),
3447 offtype, scale);
3448 else
3449 decl = targetm.vectorize.builtin_scatter (STMT_VINFO_VECTYPE (stmt_info),
3450 offtype, scale);
3452 if (decl == NULL_TREE)
3453 return false;
3455 info->decl = decl;
3456 info->base = base;
3457 info->offset = off;
3458 info->offset_dt = vect_unknown_def_type;
3459 info->offset_vectype = NULL_TREE;
3460 info->scale = scale;
3461 return true;
3464 /* Function vect_analyze_data_refs.
3466 Find all the data references in the loop or basic block.
3468 The general structure of the analysis of data refs in the vectorizer is as
3469 follows:
3470 1- vect_analyze_data_refs(loop/bb): call
3471 compute_data_dependences_for_loop/bb to find and analyze all data-refs
3472 in the loop/bb and their dependences.
3473 2- vect_analyze_dependences(): apply dependence testing using ddrs.
3474 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
3475 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
3479 bool
3480 vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf)
3482 struct loop *loop = NULL;
3483 unsigned int i;
3484 struct data_reference *dr;
3485 tree scalar_type;
3487 if (dump_enabled_p ())
3488 dump_printf_loc (MSG_NOTE, vect_location,
3489 "=== vect_analyze_data_refs ===\n");
3491 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3492 loop = LOOP_VINFO_LOOP (loop_vinfo);
3494 /* Go through the data-refs, check that the analysis succeeded. Update
3495 pointer from stmt_vec_info struct to DR and vectype. */
3497 vec<data_reference_p> datarefs = vinfo->datarefs;
3498 FOR_EACH_VEC_ELT (datarefs, i, dr)
3500 gimple *stmt;
3501 stmt_vec_info stmt_info;
3502 tree base, offset, init;
3503 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
3504 bool simd_lane_access = false;
3505 poly_uint64 vf;
3507 again:
3508 if (!dr || !DR_REF (dr))
3510 if (dump_enabled_p ())
3511 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3512 "not vectorized: unhandled data-ref\n");
3513 return false;
3516 stmt = DR_STMT (dr);
3517 stmt_info = vinfo_for_stmt (stmt);
3519 /* Discard clobbers from the dataref vector. We will remove
3520 clobber stmts during vectorization. */
3521 if (gimple_clobber_p (stmt))
3523 free_data_ref (dr);
3524 if (i == datarefs.length () - 1)
3526 datarefs.pop ();
3527 break;
3529 datarefs.ordered_remove (i);
3530 dr = datarefs[i];
3531 goto again;
3534 /* Check that analysis of the data-ref succeeded. */
3535 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
3536 || !DR_STEP (dr))
3538 bool maybe_gather
3539 = DR_IS_READ (dr)
3540 && !TREE_THIS_VOLATILE (DR_REF (dr))
3541 && targetm.vectorize.builtin_gather != NULL;
3542 bool maybe_scatter
3543 = DR_IS_WRITE (dr)
3544 && !TREE_THIS_VOLATILE (DR_REF (dr))
3545 && targetm.vectorize.builtin_scatter != NULL;
3546 bool maybe_simd_lane_access
3547 = is_a <loop_vec_info> (vinfo) && loop->simduid;
3549 /* If target supports vector gather loads or scatter stores, or if
3550 this might be a SIMD lane access, see if they can't be used. */
3551 if (is_a <loop_vec_info> (vinfo)
3552 && (maybe_gather || maybe_scatter || maybe_simd_lane_access)
3553 && !nested_in_vect_loop_p (loop, stmt))
3555 struct data_reference *newdr
3556 = create_data_ref (NULL, loop_containing_stmt (stmt),
3557 DR_REF (dr), stmt, !maybe_scatter,
3558 DR_IS_CONDITIONAL_IN_STMT (dr));
3559 gcc_assert (newdr != NULL && DR_REF (newdr));
3560 if (DR_BASE_ADDRESS (newdr)
3561 && DR_OFFSET (newdr)
3562 && DR_INIT (newdr)
3563 && DR_STEP (newdr)
3564 && integer_zerop (DR_STEP (newdr)))
3566 if (maybe_simd_lane_access)
3568 tree off = DR_OFFSET (newdr);
3569 STRIP_NOPS (off);
3570 if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
3571 && TREE_CODE (off) == MULT_EXPR
3572 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
3574 tree step = TREE_OPERAND (off, 1);
3575 off = TREE_OPERAND (off, 0);
3576 STRIP_NOPS (off);
3577 if (CONVERT_EXPR_P (off)
3578 && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
3579 0)))
3580 < TYPE_PRECISION (TREE_TYPE (off)))
3581 off = TREE_OPERAND (off, 0);
3582 if (TREE_CODE (off) == SSA_NAME)
3584 gimple *def = SSA_NAME_DEF_STMT (off);
3585 tree reft = TREE_TYPE (DR_REF (newdr));
3586 if (is_gimple_call (def)
3587 && gimple_call_internal_p (def)
3588 && (gimple_call_internal_fn (def)
3589 == IFN_GOMP_SIMD_LANE))
3591 tree arg = gimple_call_arg (def, 0);
3592 gcc_assert (TREE_CODE (arg) == SSA_NAME);
3593 arg = SSA_NAME_VAR (arg);
3594 if (arg == loop->simduid
3595 /* For now. */
3596 && tree_int_cst_equal
3597 (TYPE_SIZE_UNIT (reft),
3598 step))
3600 DR_OFFSET (newdr) = ssize_int (0);
3601 DR_STEP (newdr) = step;
3602 DR_OFFSET_ALIGNMENT (newdr)
3603 = BIGGEST_ALIGNMENT;
3604 DR_STEP_ALIGNMENT (newdr)
3605 = highest_pow2_factor (step);
3606 dr = newdr;
3607 simd_lane_access = true;
3613 if (!simd_lane_access && (maybe_gather || maybe_scatter))
3615 dr = newdr;
3616 if (maybe_gather)
3617 gatherscatter = GATHER;
3618 else
3619 gatherscatter = SCATTER;
3622 if (gatherscatter == SG_NONE && !simd_lane_access)
3623 free_data_ref (newdr);
3626 if (gatherscatter == SG_NONE && !simd_lane_access)
3628 if (dump_enabled_p ())
3630 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3631 "not vectorized: data ref analysis "
3632 "failed ");
3633 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3636 if (is_a <bb_vec_info> (vinfo))
3637 break;
3639 return false;
3643 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
3645 if (dump_enabled_p ())
3646 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3647 "not vectorized: base addr of dr is a "
3648 "constant\n");
3650 if (is_a <bb_vec_info> (vinfo))
3651 break;
3653 if (gatherscatter != SG_NONE || simd_lane_access)
3654 free_data_ref (dr);
3655 return false;
3658 if (TREE_THIS_VOLATILE (DR_REF (dr)))
3660 if (dump_enabled_p ())
3662 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3663 "not vectorized: volatile type ");
3664 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3667 if (is_a <bb_vec_info> (vinfo))
3668 break;
3670 return false;
3673 if (stmt_can_throw_internal (stmt))
3675 if (dump_enabled_p ())
3677 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3678 "not vectorized: statement can throw an "
3679 "exception ");
3680 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3683 if (is_a <bb_vec_info> (vinfo))
3684 break;
3686 if (gatherscatter != SG_NONE || simd_lane_access)
3687 free_data_ref (dr);
3688 return false;
3691 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
3692 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
3694 if (dump_enabled_p ())
3696 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3697 "not vectorized: statement is bitfield "
3698 "access ");
3699 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3702 if (is_a <bb_vec_info> (vinfo))
3703 break;
3705 if (gatherscatter != SG_NONE || simd_lane_access)
3706 free_data_ref (dr);
3707 return false;
3710 base = unshare_expr (DR_BASE_ADDRESS (dr));
3711 offset = unshare_expr (DR_OFFSET (dr));
3712 init = unshare_expr (DR_INIT (dr));
3714 if (is_gimple_call (stmt)
3715 && (!gimple_call_internal_p (stmt)
3716 || (gimple_call_internal_fn (stmt) != IFN_MASK_LOAD
3717 && gimple_call_internal_fn (stmt) != IFN_MASK_STORE)))
3719 if (dump_enabled_p ())
3721 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3722 "not vectorized: dr in a call ");
3723 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3726 if (is_a <bb_vec_info> (vinfo))
3727 break;
3729 if (gatherscatter != SG_NONE || simd_lane_access)
3730 free_data_ref (dr);
3731 return false;
3734 /* Update DR field in stmt_vec_info struct. */
3736 /* If the dataref is in an inner-loop of the loop that is considered for
3737 for vectorization, we also want to analyze the access relative to
3738 the outer-loop (DR contains information only relative to the
3739 inner-most enclosing loop). We do that by building a reference to the
3740 first location accessed by the inner-loop, and analyze it relative to
3741 the outer-loop. */
3742 if (loop && nested_in_vect_loop_p (loop, stmt))
3744 /* Build a reference to the first location accessed by the
3745 inner loop: *(BASE + INIT + OFFSET). By construction,
3746 this address must be invariant in the inner loop, so we
3747 can consider it as being used in the outer loop. */
3748 tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
3749 init, offset);
3750 tree init_addr = fold_build_pointer_plus (base, init_offset);
3751 tree init_ref = build_fold_indirect_ref (init_addr);
3753 if (dump_enabled_p ())
3755 dump_printf_loc (MSG_NOTE, vect_location,
3756 "analyze in outer loop: ");
3757 dump_generic_expr (MSG_NOTE, TDF_SLIM, init_ref);
3758 dump_printf (MSG_NOTE, "\n");
3761 if (!dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
3762 init_ref, loop))
3763 /* dr_analyze_innermost already explained the failure. */
3764 return false;
3766 if (dump_enabled_p ())
3768 dump_printf_loc (MSG_NOTE, vect_location,
3769 "\touter base_address: ");
3770 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3771 STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3772 dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
3773 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3774 STMT_VINFO_DR_OFFSET (stmt_info));
3775 dump_printf (MSG_NOTE,
3776 "\n\touter constant offset from base address: ");
3777 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3778 STMT_VINFO_DR_INIT (stmt_info));
3779 dump_printf (MSG_NOTE, "\n\touter step: ");
3780 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3781 STMT_VINFO_DR_STEP (stmt_info));
3782 dump_printf (MSG_NOTE, "\n\touter base alignment: %d\n",
3783 STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info));
3784 dump_printf (MSG_NOTE, "\n\touter base misalignment: %d\n",
3785 STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info));
3786 dump_printf (MSG_NOTE, "\n\touter offset alignment: %d\n",
3787 STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info));
3788 dump_printf (MSG_NOTE, "\n\touter step alignment: %d\n",
3789 STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
3793 if (STMT_VINFO_DATA_REF (stmt_info))
3795 if (dump_enabled_p ())
3797 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3798 "not vectorized: more than one data ref "
3799 "in stmt: ");
3800 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3803 if (is_a <bb_vec_info> (vinfo))
3804 break;
3806 if (gatherscatter != SG_NONE || simd_lane_access)
3807 free_data_ref (dr);
3808 return false;
3811 STMT_VINFO_DATA_REF (stmt_info) = dr;
3812 if (simd_lane_access)
3814 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
3815 free_data_ref (datarefs[i]);
3816 datarefs[i] = dr;
3819 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == ADDR_EXPR
3820 && VAR_P (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0))
3821 && DECL_NONALIASED (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0)))
3823 if (dump_enabled_p ())
3825 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3826 "not vectorized: base object not addressable "
3827 "for stmt: ");
3828 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3830 if (is_a <bb_vec_info> (vinfo))
3832 /* In BB vectorization the ref can still participate
3833 in dependence analysis, we just can't vectorize it. */
3834 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
3835 continue;
3837 return false;
3840 /* Set vectype for STMT. */
3841 scalar_type = TREE_TYPE (DR_REF (dr));
3842 STMT_VINFO_VECTYPE (stmt_info)
3843 = get_vectype_for_scalar_type (scalar_type);
3844 if (!STMT_VINFO_VECTYPE (stmt_info))
3846 if (dump_enabled_p ())
3848 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3849 "not vectorized: no vectype for stmt: ");
3850 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3851 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
3852 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
3853 scalar_type);
3854 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3857 if (is_a <bb_vec_info> (vinfo))
3859 /* No vector type is fine, the ref can still participate
3860 in dependence analysis, we just can't vectorize it. */
3861 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
3862 continue;
3865 if (gatherscatter != SG_NONE || simd_lane_access)
3867 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3868 if (gatherscatter != SG_NONE)
3869 free_data_ref (dr);
3871 return false;
3873 else
3875 if (dump_enabled_p ())
3877 dump_printf_loc (MSG_NOTE, vect_location,
3878 "got vectype for stmt: ");
3879 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3880 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3881 STMT_VINFO_VECTYPE (stmt_info));
3882 dump_printf (MSG_NOTE, "\n");
3886 /* Adjust the minimal vectorization factor according to the
3887 vector type. */
3888 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
3889 *min_vf = upper_bound (*min_vf, vf);
3891 if (gatherscatter != SG_NONE)
3893 gather_scatter_info gs_info;
3894 if (!vect_check_gather_scatter (stmt, as_a <loop_vec_info> (vinfo),
3895 &gs_info)
3896 || !get_vectype_for_scalar_type (TREE_TYPE (gs_info.offset)))
3898 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3899 free_data_ref (dr);
3900 if (dump_enabled_p ())
3902 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3903 (gatherscatter == GATHER) ?
3904 "not vectorized: not suitable for gather "
3905 "load " :
3906 "not vectorized: not suitable for scatter "
3907 "store ");
3908 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3910 return false;
3913 free_data_ref (datarefs[i]);
3914 datarefs[i] = dr;
3915 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
3918 else if (is_a <loop_vec_info> (vinfo)
3919 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
3921 if (nested_in_vect_loop_p (loop, stmt))
3923 if (dump_enabled_p ())
3925 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3926 "not vectorized: not suitable for strided "
3927 "load ");
3928 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3930 return false;
3932 STMT_VINFO_STRIDED_P (stmt_info) = true;
3936 /* If we stopped analysis at the first dataref we could not analyze
3937 when trying to vectorize a basic-block mark the rest of the datarefs
3938 as not vectorizable and truncate the vector of datarefs. That
3939 avoids spending useless time in analyzing their dependence. */
3940 if (i != datarefs.length ())
3942 gcc_assert (is_a <bb_vec_info> (vinfo));
3943 for (unsigned j = i; j < datarefs.length (); ++j)
3945 data_reference_p dr = datarefs[j];
3946 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
3947 free_data_ref (dr);
3949 datarefs.truncate (i);
3952 return true;
3956 /* Function vect_get_new_vect_var.
3958 Returns a name for a new variable. The current naming scheme appends the
3959 prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
3960 the name of vectorizer generated variables, and appends that to NAME if
3961 provided. */
3963 tree
3964 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
3966 const char *prefix;
3967 tree new_vect_var;
3969 switch (var_kind)
3971 case vect_simple_var:
3972 prefix = "vect";
3973 break;
3974 case vect_scalar_var:
3975 prefix = "stmp";
3976 break;
3977 case vect_mask_var:
3978 prefix = "mask";
3979 break;
3980 case vect_pointer_var:
3981 prefix = "vectp";
3982 break;
3983 default:
3984 gcc_unreachable ();
3987 if (name)
3989 char* tmp = concat (prefix, "_", name, NULL);
3990 new_vect_var = create_tmp_reg (type, tmp);
3991 free (tmp);
3993 else
3994 new_vect_var = create_tmp_reg (type, prefix);
3996 return new_vect_var;
3999 /* Like vect_get_new_vect_var but return an SSA name. */
4001 tree
4002 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
4004 const char *prefix;
4005 tree new_vect_var;
4007 switch (var_kind)
4009 case vect_simple_var:
4010 prefix = "vect";
4011 break;
4012 case vect_scalar_var:
4013 prefix = "stmp";
4014 break;
4015 case vect_pointer_var:
4016 prefix = "vectp";
4017 break;
4018 default:
4019 gcc_unreachable ();
4022 if (name)
4024 char* tmp = concat (prefix, "_", name, NULL);
4025 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4026 free (tmp);
4028 else
4029 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4031 return new_vect_var;
4034 /* Duplicate ptr info and set alignment/misaligment on NAME from DR. */
4036 static void
4037 vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr)
4039 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr));
4040 int misalign = DR_MISALIGNMENT (dr);
4041 if (misalign == DR_MISALIGNMENT_UNKNOWN)
4042 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4043 else
4044 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name),
4045 DR_TARGET_ALIGNMENT (dr), misalign);
4048 /* Function vect_create_addr_base_for_vector_ref.
4050 Create an expression that computes the address of the first memory location
4051 that will be accessed for a data reference.
4053 Input:
4054 STMT: The statement containing the data reference.
4055 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4056 OFFSET: Optional. If supplied, it is be added to the initial address.
4057 LOOP: Specify relative to which loop-nest should the address be computed.
4058 For example, when the dataref is in an inner-loop nested in an
4059 outer-loop that is now being vectorized, LOOP can be either the
4060 outer-loop, or the inner-loop. The first memory location accessed
4061 by the following dataref ('in' points to short):
4063 for (i=0; i<N; i++)
4064 for (j=0; j<M; j++)
4065 s += in[i+j]
4067 is as follows:
4068 if LOOP=i_loop: &in (relative to i_loop)
4069 if LOOP=j_loop: &in+i*2B (relative to j_loop)
4070 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
4071 initial address. Unlike OFFSET, which is number of elements to
4072 be added, BYTE_OFFSET is measured in bytes.
4074 Output:
4075 1. Return an SSA_NAME whose value is the address of the memory location of
4076 the first vector of the data reference.
4077 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4078 these statement(s) which define the returned SSA_NAME.
4080 FORNOW: We are only handling array accesses with step 1. */
4082 tree
4083 vect_create_addr_base_for_vector_ref (gimple *stmt,
4084 gimple_seq *new_stmt_list,
4085 tree offset,
4086 tree byte_offset)
4088 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4089 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4090 const char *base_name;
4091 tree addr_base;
4092 tree dest;
4093 gimple_seq seq = NULL;
4094 tree vect_ptr_type;
4095 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
4096 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4097 innermost_loop_behavior *drb = vect_dr_behavior (dr);
4099 tree data_ref_base = unshare_expr (drb->base_address);
4100 tree base_offset = unshare_expr (drb->offset);
4101 tree init = unshare_expr (drb->init);
4103 if (loop_vinfo)
4104 base_name = get_name (data_ref_base);
4105 else
4107 base_offset = ssize_int (0);
4108 init = ssize_int (0);
4109 base_name = get_name (DR_REF (dr));
4112 /* Create base_offset */
4113 base_offset = size_binop (PLUS_EXPR,
4114 fold_convert (sizetype, base_offset),
4115 fold_convert (sizetype, init));
4117 if (offset)
4119 offset = fold_build2 (MULT_EXPR, sizetype,
4120 fold_convert (sizetype, offset), step);
4121 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4122 base_offset, offset);
4124 if (byte_offset)
4126 byte_offset = fold_convert (sizetype, byte_offset);
4127 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4128 base_offset, byte_offset);
4131 /* base + base_offset */
4132 if (loop_vinfo)
4133 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4134 else
4136 addr_base = build1 (ADDR_EXPR,
4137 build_pointer_type (TREE_TYPE (DR_REF (dr))),
4138 unshare_expr (DR_REF (dr)));
4141 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4142 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4143 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4144 gimple_seq_add_seq (new_stmt_list, seq);
4146 if (DR_PTR_INFO (dr)
4147 && TREE_CODE (addr_base) == SSA_NAME
4148 && !SSA_NAME_PTR_INFO (addr_base))
4150 vect_duplicate_ssa_name_ptr_info (addr_base, dr);
4151 if (offset || byte_offset)
4152 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4155 if (dump_enabled_p ())
4157 dump_printf_loc (MSG_NOTE, vect_location, "created ");
4158 dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
4159 dump_printf (MSG_NOTE, "\n");
4162 return addr_base;
4166 /* Function vect_create_data_ref_ptr.
4168 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4169 location accessed in the loop by STMT, along with the def-use update
4170 chain to appropriately advance the pointer through the loop iterations.
4171 Also set aliasing information for the pointer. This pointer is used by
4172 the callers to this function to create a memory reference expression for
4173 vector load/store access.
4175 Input:
4176 1. STMT: a stmt that references memory. Expected to be of the form
4177 GIMPLE_ASSIGN <name, data-ref> or
4178 GIMPLE_ASSIGN <data-ref, name>.
4179 2. AGGR_TYPE: the type of the reference, which should be either a vector
4180 or an array.
4181 3. AT_LOOP: the loop where the vector memref is to be created.
4182 4. OFFSET (optional): an offset to be added to the initial address accessed
4183 by the data-ref in STMT.
4184 5. BSI: location where the new stmts are to be placed if there is no loop
4185 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4186 pointing to the initial address.
4187 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4188 to the initial address accessed by the data-ref in STMT. This is
4189 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4190 in bytes.
4192 Output:
4193 1. Declare a new ptr to vector_type, and have it point to the base of the
4194 data reference (initial addressed accessed by the data reference).
4195 For example, for vector of type V8HI, the following code is generated:
4197 v8hi *ap;
4198 ap = (v8hi *)initial_address;
4200 if OFFSET is not supplied:
4201 initial_address = &a[init];
4202 if OFFSET is supplied:
4203 initial_address = &a[init + OFFSET];
4204 if BYTE_OFFSET is supplied:
4205 initial_address = &a[init] + BYTE_OFFSET;
4207 Return the initial_address in INITIAL_ADDRESS.
4209 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4210 update the pointer in each iteration of the loop.
4212 Return the increment stmt that updates the pointer in PTR_INCR.
4214 3. Set INV_P to true if the access pattern of the data reference in the
4215 vectorized loop is invariant. Set it to false otherwise.
4217 4. Return the pointer. */
4219 tree
4220 vect_create_data_ref_ptr (gimple *stmt, tree aggr_type, struct loop *at_loop,
4221 tree offset, tree *initial_address,
4222 gimple_stmt_iterator *gsi, gimple **ptr_incr,
4223 bool only_init, bool *inv_p, tree byte_offset)
4225 const char *base_name;
4226 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4227 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4228 struct loop *loop = NULL;
4229 bool nested_in_vect_loop = false;
4230 struct loop *containing_loop = NULL;
4231 tree aggr_ptr_type;
4232 tree aggr_ptr;
4233 tree new_temp;
4234 gimple_seq new_stmt_list = NULL;
4235 edge pe = NULL;
4236 basic_block new_bb;
4237 tree aggr_ptr_init;
4238 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4239 tree aptr;
4240 gimple_stmt_iterator incr_gsi;
4241 bool insert_after;
4242 tree indx_before_incr, indx_after_incr;
4243 gimple *incr;
4244 tree step;
4245 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4247 gcc_assert (TREE_CODE (aggr_type) == ARRAY_TYPE
4248 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4250 if (loop_vinfo)
4252 loop = LOOP_VINFO_LOOP (loop_vinfo);
4253 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4254 containing_loop = (gimple_bb (stmt))->loop_father;
4255 pe = loop_preheader_edge (loop);
4257 else
4259 gcc_assert (bb_vinfo);
4260 only_init = true;
4261 *ptr_incr = NULL;
4264 /* Check the step (evolution) of the load in LOOP, and record
4265 whether it's invariant. */
4266 step = vect_dr_behavior (dr)->step;
4267 if (integer_zerop (step))
4268 *inv_p = true;
4269 else
4270 *inv_p = false;
4272 /* Create an expression for the first address accessed by this load
4273 in LOOP. */
4274 base_name = get_name (DR_BASE_ADDRESS (dr));
4276 if (dump_enabled_p ())
4278 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4279 dump_printf_loc (MSG_NOTE, vect_location,
4280 "create %s-pointer variable to type: ",
4281 get_tree_code_name (TREE_CODE (aggr_type)));
4282 dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4283 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4284 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4285 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4286 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4287 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4288 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4289 else
4290 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4291 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4292 dump_printf (MSG_NOTE, "\n");
4295 /* (1) Create the new aggregate-pointer variable.
4296 Vector and array types inherit the alias set of their component
4297 type by default so we need to use a ref-all pointer if the data
4298 reference does not conflict with the created aggregated data
4299 reference because it is not addressable. */
4300 bool need_ref_all = false;
4301 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4302 get_alias_set (DR_REF (dr))))
4303 need_ref_all = true;
4304 /* Likewise for any of the data references in the stmt group. */
4305 else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
4307 gimple *orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
4310 stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
4311 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4312 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4313 get_alias_set (DR_REF (sdr))))
4315 need_ref_all = true;
4316 break;
4318 orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);
4320 while (orig_stmt);
4322 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4323 need_ref_all);
4324 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4327 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4328 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4329 def-use update cycles for the pointer: one relative to the outer-loop
4330 (LOOP), which is what steps (3) and (4) below do. The other is relative
4331 to the inner-loop (which is the inner-most loop containing the dataref),
4332 and this is done be step (5) below.
4334 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4335 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4336 redundant. Steps (3),(4) create the following:
4338 vp0 = &base_addr;
4339 LOOP: vp1 = phi(vp0,vp2)
4342 vp2 = vp1 + step
4343 goto LOOP
4345 If there is an inner-loop nested in loop, then step (5) will also be
4346 applied, and an additional update in the inner-loop will be created:
4348 vp0 = &base_addr;
4349 LOOP: vp1 = phi(vp0,vp2)
4351 inner: vp3 = phi(vp1,vp4)
4352 vp4 = vp3 + inner_step
4353 if () goto inner
4355 vp2 = vp1 + step
4356 if () goto LOOP */
4358 /* (2) Calculate the initial address of the aggregate-pointer, and set
4359 the aggregate-pointer to point to it before the loop. */
4361 /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader. */
4363 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
4364 offset, byte_offset);
4365 if (new_stmt_list)
4367 if (pe)
4369 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4370 gcc_assert (!new_bb);
4372 else
4373 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4376 *initial_address = new_temp;
4377 aggr_ptr_init = new_temp;
4379 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4380 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4381 inner-loop nested in LOOP (during outer-loop vectorization). */
4383 /* No update in loop is required. */
4384 if (only_init && (!loop_vinfo || at_loop == loop))
4385 aptr = aggr_ptr_init;
4386 else
4388 /* The step of the aggregate pointer is the type size. */
4389 tree iv_step = TYPE_SIZE_UNIT (aggr_type);
4390 /* One exception to the above is when the scalar step of the load in
4391 LOOP is zero. In this case the step here is also zero. */
4392 if (*inv_p)
4393 iv_step = size_zero_node;
4394 else if (tree_int_cst_sgn (step) == -1)
4395 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4397 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4399 create_iv (aggr_ptr_init,
4400 fold_convert (aggr_ptr_type, iv_step),
4401 aggr_ptr, loop, &incr_gsi, insert_after,
4402 &indx_before_incr, &indx_after_incr);
4403 incr = gsi_stmt (incr_gsi);
4404 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4406 /* Copy the points-to information if it exists. */
4407 if (DR_PTR_INFO (dr))
4409 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr);
4410 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr);
4412 if (ptr_incr)
4413 *ptr_incr = incr;
4415 aptr = indx_before_incr;
4418 if (!nested_in_vect_loop || only_init)
4419 return aptr;
4422 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4423 nested in LOOP, if exists. */
4425 gcc_assert (nested_in_vect_loop);
4426 if (!only_init)
4428 standard_iv_increment_position (containing_loop, &incr_gsi,
4429 &insert_after);
4430 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4431 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4432 &indx_after_incr);
4433 incr = gsi_stmt (incr_gsi);
4434 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4436 /* Copy the points-to information if it exists. */
4437 if (DR_PTR_INFO (dr))
4439 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr);
4440 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr);
4442 if (ptr_incr)
4443 *ptr_incr = incr;
4445 return indx_before_incr;
4447 else
4448 gcc_unreachable ();
4452 /* Function bump_vector_ptr
4454 Increment a pointer (to a vector type) by vector-size. If requested,
4455 i.e. if PTR-INCR is given, then also connect the new increment stmt
4456 to the existing def-use update-chain of the pointer, by modifying
4457 the PTR_INCR as illustrated below:
4459 The pointer def-use update-chain before this function:
4460 DATAREF_PTR = phi (p_0, p_2)
4461 ....
4462 PTR_INCR: p_2 = DATAREF_PTR + step
4464 The pointer def-use update-chain after this function:
4465 DATAREF_PTR = phi (p_0, p_2)
4466 ....
4467 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4468 ....
4469 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4471 Input:
4472 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4473 in the loop.
4474 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4475 the loop. The increment amount across iterations is expected
4476 to be vector_size.
4477 BSI - location where the new update stmt is to be placed.
4478 STMT - the original scalar memory-access stmt that is being vectorized.
4479 BUMP - optional. The offset by which to bump the pointer. If not given,
4480 the offset is assumed to be vector_size.
4482 Output: Return NEW_DATAREF_PTR as illustrated above.
4486 tree
4487 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4488 gimple *stmt, tree bump)
4490 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4491 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4492 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4493 tree update = TYPE_SIZE_UNIT (vectype);
4494 gassign *incr_stmt;
4495 ssa_op_iter iter;
4496 use_operand_p use_p;
4497 tree new_dataref_ptr;
4499 if (bump)
4500 update = bump;
4502 if (TREE_CODE (dataref_ptr) == SSA_NAME)
4503 new_dataref_ptr = copy_ssa_name (dataref_ptr);
4504 else
4505 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4506 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4507 dataref_ptr, update);
4508 vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4510 /* Copy the points-to information if it exists. */
4511 if (DR_PTR_INFO (dr))
4513 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4514 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4517 if (!ptr_incr)
4518 return new_dataref_ptr;
4520 /* Update the vector-pointer's cross-iteration increment. */
4521 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4523 tree use = USE_FROM_PTR (use_p);
4525 if (use == dataref_ptr)
4526 SET_USE (use_p, new_dataref_ptr);
4527 else
4528 gcc_assert (tree_int_cst_compare (use, update) == 0);
4531 return new_dataref_ptr;
4535 /* Function vect_create_destination_var.
4537 Create a new temporary of type VECTYPE. */
4539 tree
4540 vect_create_destination_var (tree scalar_dest, tree vectype)
4542 tree vec_dest;
4543 const char *name;
4544 char *new_name;
4545 tree type;
4546 enum vect_var_kind kind;
4548 kind = vectype
4549 ? VECTOR_BOOLEAN_TYPE_P (vectype)
4550 ? vect_mask_var
4551 : vect_simple_var
4552 : vect_scalar_var;
4553 type = vectype ? vectype : TREE_TYPE (scalar_dest);
4555 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
4557 name = get_name (scalar_dest);
4558 if (name)
4559 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
4560 else
4561 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
4562 vec_dest = vect_get_new_vect_var (type, kind, new_name);
4563 free (new_name);
4565 return vec_dest;
4568 /* Function vect_grouped_store_supported.
4570 Returns TRUE if interleave high and interleave low permutations
4571 are supported, and FALSE otherwise. */
4573 bool
4574 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
4576 machine_mode mode = TYPE_MODE (vectype);
4578 /* vect_permute_store_chain requires the group size to be equal to 3 or
4579 be a power of two. */
4580 if (count != 3 && exact_log2 (count) == -1)
4582 if (dump_enabled_p ())
4583 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4584 "the size of the group of accesses"
4585 " is not a power of 2 or not eqaul to 3\n");
4586 return false;
4589 /* Check that the permutation is supported. */
4590 if (VECTOR_MODE_P (mode))
4592 unsigned int i;
4593 if (count == 3)
4595 unsigned int j0 = 0, j1 = 0, j2 = 0;
4596 unsigned int i, j;
4598 unsigned int nelt;
4599 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
4601 if (dump_enabled_p ())
4602 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4603 "cannot handle groups of 3 stores for"
4604 " variable-length vectors\n");
4605 return false;
4608 vec_perm_builder sel (nelt, nelt, 1);
4609 sel.quick_grow (nelt);
4610 vec_perm_indices indices;
4611 for (j = 0; j < 3; j++)
4613 int nelt0 = ((3 - j) * nelt) % 3;
4614 int nelt1 = ((3 - j) * nelt + 1) % 3;
4615 int nelt2 = ((3 - j) * nelt + 2) % 3;
4616 for (i = 0; i < nelt; i++)
4618 if (3 * i + nelt0 < nelt)
4619 sel[3 * i + nelt0] = j0++;
4620 if (3 * i + nelt1 < nelt)
4621 sel[3 * i + nelt1] = nelt + j1++;
4622 if (3 * i + nelt2 < nelt)
4623 sel[3 * i + nelt2] = 0;
4625 indices.new_vector (sel, 2, nelt);
4626 if (!can_vec_perm_const_p (mode, indices))
4628 if (dump_enabled_p ())
4629 dump_printf (MSG_MISSED_OPTIMIZATION,
4630 "permutation op not supported by target.\n");
4631 return false;
4634 for (i = 0; i < nelt; i++)
4636 if (3 * i + nelt0 < nelt)
4637 sel[3 * i + nelt0] = 3 * i + nelt0;
4638 if (3 * i + nelt1 < nelt)
4639 sel[3 * i + nelt1] = 3 * i + nelt1;
4640 if (3 * i + nelt2 < nelt)
4641 sel[3 * i + nelt2] = nelt + j2++;
4643 indices.new_vector (sel, 2, nelt);
4644 if (!can_vec_perm_const_p (mode, indices))
4646 if (dump_enabled_p ())
4647 dump_printf (MSG_MISSED_OPTIMIZATION,
4648 "permutation op not supported by target.\n");
4649 return false;
4652 return true;
4654 else
4656 /* If length is not equal to 3 then only power of 2 is supported. */
4657 gcc_assert (pow2p_hwi (count));
4658 poly_uint64 nelt = GET_MODE_NUNITS (mode);
4660 /* The encoding has 2 interleaved stepped patterns. */
4661 vec_perm_builder sel (nelt, 2, 3);
4662 sel.quick_grow (6);
4663 for (i = 0; i < 3; i++)
4665 sel[i * 2] = i;
4666 sel[i * 2 + 1] = i + nelt;
4668 vec_perm_indices indices (sel, 2, nelt);
4669 if (can_vec_perm_const_p (mode, indices))
4671 for (i = 0; i < 6; i++)
4672 sel[i] += exact_div (nelt, 2);
4673 indices.new_vector (sel, 2, nelt);
4674 if (can_vec_perm_const_p (mode, indices))
4675 return true;
4680 if (dump_enabled_p ())
4681 dump_printf (MSG_MISSED_OPTIMIZATION,
4682 "permutaion op not supported by target.\n");
4683 return false;
4687 /* Return TRUE if vec_store_lanes is available for COUNT vectors of
4688 type VECTYPE. */
4690 bool
4691 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
4693 return vect_lanes_optab_supported_p ("vec_store_lanes",
4694 vec_store_lanes_optab,
4695 vectype, count);
4699 /* Function vect_permute_store_chain.
4701 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4702 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
4703 the data correctly for the stores. Return the final references for stores
4704 in RESULT_CHAIN.
4706 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4707 The input is 4 vectors each containing 8 elements. We assign a number to
4708 each element, the input sequence is:
4710 1st vec: 0 1 2 3 4 5 6 7
4711 2nd vec: 8 9 10 11 12 13 14 15
4712 3rd vec: 16 17 18 19 20 21 22 23
4713 4th vec: 24 25 26 27 28 29 30 31
4715 The output sequence should be:
4717 1st vec: 0 8 16 24 1 9 17 25
4718 2nd vec: 2 10 18 26 3 11 19 27
4719 3rd vec: 4 12 20 28 5 13 21 30
4720 4th vec: 6 14 22 30 7 15 23 31
4722 i.e., we interleave the contents of the four vectors in their order.
4724 We use interleave_high/low instructions to create such output. The input of
4725 each interleave_high/low operation is two vectors:
4726 1st vec 2nd vec
4727 0 1 2 3 4 5 6 7
4728 the even elements of the result vector are obtained left-to-right from the
4729 high/low elements of the first vector. The odd elements of the result are
4730 obtained left-to-right from the high/low elements of the second vector.
4731 The output of interleave_high will be: 0 4 1 5
4732 and of interleave_low: 2 6 3 7
4735 The permutation is done in log LENGTH stages. In each stage interleave_high
4736 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4737 where the first argument is taken from the first half of DR_CHAIN and the
4738 second argument from it's second half.
4739 In our example,
4741 I1: interleave_high (1st vec, 3rd vec)
4742 I2: interleave_low (1st vec, 3rd vec)
4743 I3: interleave_high (2nd vec, 4th vec)
4744 I4: interleave_low (2nd vec, 4th vec)
4746 The output for the first stage is:
4748 I1: 0 16 1 17 2 18 3 19
4749 I2: 4 20 5 21 6 22 7 23
4750 I3: 8 24 9 25 10 26 11 27
4751 I4: 12 28 13 29 14 30 15 31
4753 The output of the second stage, i.e. the final result is:
4755 I1: 0 8 16 24 1 9 17 25
4756 I2: 2 10 18 26 3 11 19 27
4757 I3: 4 12 20 28 5 13 21 30
4758 I4: 6 14 22 30 7 15 23 31. */
4760 void
4761 vect_permute_store_chain (vec<tree> dr_chain,
4762 unsigned int length,
4763 gimple *stmt,
4764 gimple_stmt_iterator *gsi,
4765 vec<tree> *result_chain)
4767 tree vect1, vect2, high, low;
4768 gimple *perm_stmt;
4769 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4770 tree perm_mask_low, perm_mask_high;
4771 tree data_ref;
4772 tree perm3_mask_low, perm3_mask_high;
4773 unsigned int i, j, n, log_length = exact_log2 (length);
4775 result_chain->quick_grow (length);
4776 memcpy (result_chain->address (), dr_chain.address (),
4777 length * sizeof (tree));
4779 if (length == 3)
4781 /* vect_grouped_store_supported ensures that this is constant. */
4782 unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
4783 unsigned int j0 = 0, j1 = 0, j2 = 0;
4785 vec_perm_builder sel (nelt, nelt, 1);
4786 sel.quick_grow (nelt);
4787 vec_perm_indices indices;
4788 for (j = 0; j < 3; j++)
4790 int nelt0 = ((3 - j) * nelt) % 3;
4791 int nelt1 = ((3 - j) * nelt + 1) % 3;
4792 int nelt2 = ((3 - j) * nelt + 2) % 3;
4794 for (i = 0; i < nelt; i++)
4796 if (3 * i + nelt0 < nelt)
4797 sel[3 * i + nelt0] = j0++;
4798 if (3 * i + nelt1 < nelt)
4799 sel[3 * i + nelt1] = nelt + j1++;
4800 if (3 * i + nelt2 < nelt)
4801 sel[3 * i + nelt2] = 0;
4803 indices.new_vector (sel, 2, nelt);
4804 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
4806 for (i = 0; i < nelt; i++)
4808 if (3 * i + nelt0 < nelt)
4809 sel[3 * i + nelt0] = 3 * i + nelt0;
4810 if (3 * i + nelt1 < nelt)
4811 sel[3 * i + nelt1] = 3 * i + nelt1;
4812 if (3 * i + nelt2 < nelt)
4813 sel[3 * i + nelt2] = nelt + j2++;
4815 indices.new_vector (sel, 2, nelt);
4816 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
4818 vect1 = dr_chain[0];
4819 vect2 = dr_chain[1];
4821 /* Create interleaving stmt:
4822 low = VEC_PERM_EXPR <vect1, vect2,
4823 {j, nelt, *, j + 1, nelt + j + 1, *,
4824 j + 2, nelt + j + 2, *, ...}> */
4825 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
4826 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4827 vect2, perm3_mask_low);
4828 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4830 vect1 = data_ref;
4831 vect2 = dr_chain[2];
4832 /* Create interleaving stmt:
4833 low = VEC_PERM_EXPR <vect1, vect2,
4834 {0, 1, nelt + j, 3, 4, nelt + j + 1,
4835 6, 7, nelt + j + 2, ...}> */
4836 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
4837 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4838 vect2, perm3_mask_high);
4839 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4840 (*result_chain)[j] = data_ref;
4843 else
4845 /* If length is not equal to 3 then only power of 2 is supported. */
4846 gcc_assert (pow2p_hwi (length));
4848 /* The encoding has 2 interleaved stepped patterns. */
4849 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
4850 vec_perm_builder sel (nelt, 2, 3);
4851 sel.quick_grow (6);
4852 for (i = 0; i < 3; i++)
4854 sel[i * 2] = i;
4855 sel[i * 2 + 1] = i + nelt;
4857 vec_perm_indices indices (sel, 2, nelt);
4858 perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
4860 for (i = 0; i < 6; i++)
4861 sel[i] += exact_div (nelt, 2);
4862 indices.new_vector (sel, 2, nelt);
4863 perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
4865 for (i = 0, n = log_length; i < n; i++)
4867 for (j = 0; j < length/2; j++)
4869 vect1 = dr_chain[j];
4870 vect2 = dr_chain[j+length/2];
4872 /* Create interleaving stmt:
4873 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
4874 ...}> */
4875 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
4876 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
4877 vect2, perm_mask_high);
4878 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4879 (*result_chain)[2*j] = high;
4881 /* Create interleaving stmt:
4882 low = VEC_PERM_EXPR <vect1, vect2,
4883 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
4884 ...}> */
4885 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
4886 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
4887 vect2, perm_mask_low);
4888 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4889 (*result_chain)[2*j+1] = low;
4891 memcpy (dr_chain.address (), result_chain->address (),
4892 length * sizeof (tree));
4897 /* Function vect_setup_realignment
4899 This function is called when vectorizing an unaligned load using
4900 the dr_explicit_realign[_optimized] scheme.
4901 This function generates the following code at the loop prolog:
4903 p = initial_addr;
4904 x msq_init = *(floor(p)); # prolog load
4905 realignment_token = call target_builtin;
4906 loop:
4907 x msq = phi (msq_init, ---)
4909 The stmts marked with x are generated only for the case of
4910 dr_explicit_realign_optimized.
4912 The code above sets up a new (vector) pointer, pointing to the first
4913 location accessed by STMT, and a "floor-aligned" load using that pointer.
4914 It also generates code to compute the "realignment-token" (if the relevant
4915 target hook was defined), and creates a phi-node at the loop-header bb
4916 whose arguments are the result of the prolog-load (created by this
4917 function) and the result of a load that takes place in the loop (to be
4918 created by the caller to this function).
4920 For the case of dr_explicit_realign_optimized:
4921 The caller to this function uses the phi-result (msq) to create the
4922 realignment code inside the loop, and sets up the missing phi argument,
4923 as follows:
4924 loop:
4925 msq = phi (msq_init, lsq)
4926 lsq = *(floor(p')); # load in loop
4927 result = realign_load (msq, lsq, realignment_token);
4929 For the case of dr_explicit_realign:
4930 loop:
4931 msq = *(floor(p)); # load in loop
4932 p' = p + (VS-1);
4933 lsq = *(floor(p')); # load in loop
4934 result = realign_load (msq, lsq, realignment_token);
4936 Input:
4937 STMT - (scalar) load stmt to be vectorized. This load accesses
4938 a memory location that may be unaligned.
4939 BSI - place where new code is to be inserted.
4940 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
4941 is used.
4943 Output:
4944 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
4945 target hook, if defined.
4946 Return value - the result of the loop-header phi node. */
4948 tree
4949 vect_setup_realignment (gimple *stmt, gimple_stmt_iterator *gsi,
4950 tree *realignment_token,
4951 enum dr_alignment_support alignment_support_scheme,
4952 tree init_addr,
4953 struct loop **at_loop)
4955 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4956 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4957 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4958 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4959 struct loop *loop = NULL;
4960 edge pe = NULL;
4961 tree scalar_dest = gimple_assign_lhs (stmt);
4962 tree vec_dest;
4963 gimple *inc;
4964 tree ptr;
4965 tree data_ref;
4966 basic_block new_bb;
4967 tree msq_init = NULL_TREE;
4968 tree new_temp;
4969 gphi *phi_stmt;
4970 tree msq = NULL_TREE;
4971 gimple_seq stmts = NULL;
4972 bool inv_p;
4973 bool compute_in_loop = false;
4974 bool nested_in_vect_loop = false;
4975 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
4976 struct loop *loop_for_initial_load = NULL;
4978 if (loop_vinfo)
4980 loop = LOOP_VINFO_LOOP (loop_vinfo);
4981 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4984 gcc_assert (alignment_support_scheme == dr_explicit_realign
4985 || alignment_support_scheme == dr_explicit_realign_optimized);
4987 /* We need to generate three things:
4988 1. the misalignment computation
4989 2. the extra vector load (for the optimized realignment scheme).
4990 3. the phi node for the two vectors from which the realignment is
4991 done (for the optimized realignment scheme). */
4993 /* 1. Determine where to generate the misalignment computation.
4995 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
4996 calculation will be generated by this function, outside the loop (in the
4997 preheader). Otherwise, INIT_ADDR had already been computed for us by the
4998 caller, inside the loop.
5000 Background: If the misalignment remains fixed throughout the iterations of
5001 the loop, then both realignment schemes are applicable, and also the
5002 misalignment computation can be done outside LOOP. This is because we are
5003 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5004 are a multiple of VS (the Vector Size), and therefore the misalignment in
5005 different vectorized LOOP iterations is always the same.
5006 The problem arises only if the memory access is in an inner-loop nested
5007 inside LOOP, which is now being vectorized using outer-loop vectorization.
5008 This is the only case when the misalignment of the memory access may not
5009 remain fixed throughout the iterations of the inner-loop (as explained in
5010 detail in vect_supportable_dr_alignment). In this case, not only is the
5011 optimized realignment scheme not applicable, but also the misalignment
5012 computation (and generation of the realignment token that is passed to
5013 REALIGN_LOAD) have to be done inside the loop.
5015 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5016 or not, which in turn determines if the misalignment is computed inside
5017 the inner-loop, or outside LOOP. */
5019 if (init_addr != NULL_TREE || !loop_vinfo)
5021 compute_in_loop = true;
5022 gcc_assert (alignment_support_scheme == dr_explicit_realign);
5026 /* 2. Determine where to generate the extra vector load.
5028 For the optimized realignment scheme, instead of generating two vector
5029 loads in each iteration, we generate a single extra vector load in the
5030 preheader of the loop, and in each iteration reuse the result of the
5031 vector load from the previous iteration. In case the memory access is in
5032 an inner-loop nested inside LOOP, which is now being vectorized using
5033 outer-loop vectorization, we need to determine whether this initial vector
5034 load should be generated at the preheader of the inner-loop, or can be
5035 generated at the preheader of LOOP. If the memory access has no evolution
5036 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5037 to be generated inside LOOP (in the preheader of the inner-loop). */
5039 if (nested_in_vect_loop)
5041 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5042 bool invariant_in_outerloop =
5043 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5044 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5046 else
5047 loop_for_initial_load = loop;
5048 if (at_loop)
5049 *at_loop = loop_for_initial_load;
5051 if (loop_for_initial_load)
5052 pe = loop_preheader_edge (loop_for_initial_load);
5054 /* 3. For the case of the optimized realignment, create the first vector
5055 load at the loop preheader. */
5057 if (alignment_support_scheme == dr_explicit_realign_optimized)
5059 /* Create msq_init = *(floor(p1)) in the loop preheader */
5060 gassign *new_stmt;
5062 gcc_assert (!compute_in_loop);
5063 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5064 ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
5065 NULL_TREE, &init_addr, NULL, &inc,
5066 true, &inv_p);
5067 if (TREE_CODE (ptr) == SSA_NAME)
5068 new_temp = copy_ssa_name (ptr);
5069 else
5070 new_temp = make_ssa_name (TREE_TYPE (ptr));
5071 unsigned int align = DR_TARGET_ALIGNMENT (dr);
5072 new_stmt = gimple_build_assign
5073 (new_temp, BIT_AND_EXPR, ptr,
5074 build_int_cst (TREE_TYPE (ptr), -(HOST_WIDE_INT) align));
5075 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5076 gcc_assert (!new_bb);
5077 data_ref
5078 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5079 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5080 new_stmt = gimple_build_assign (vec_dest, data_ref);
5081 new_temp = make_ssa_name (vec_dest, new_stmt);
5082 gimple_assign_set_lhs (new_stmt, new_temp);
5083 if (pe)
5085 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5086 gcc_assert (!new_bb);
5088 else
5089 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5091 msq_init = gimple_assign_lhs (new_stmt);
5094 /* 4. Create realignment token using a target builtin, if available.
5095 It is done either inside the containing loop, or before LOOP (as
5096 determined above). */
5098 if (targetm.vectorize.builtin_mask_for_load)
5100 gcall *new_stmt;
5101 tree builtin_decl;
5103 /* Compute INIT_ADDR - the initial addressed accessed by this memref. */
5104 if (!init_addr)
5106 /* Generate the INIT_ADDR computation outside LOOP. */
5107 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
5108 NULL_TREE);
5109 if (loop)
5111 pe = loop_preheader_edge (loop);
5112 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5113 gcc_assert (!new_bb);
5115 else
5116 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5119 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5120 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5121 vec_dest =
5122 vect_create_destination_var (scalar_dest,
5123 gimple_call_return_type (new_stmt));
5124 new_temp = make_ssa_name (vec_dest, new_stmt);
5125 gimple_call_set_lhs (new_stmt, new_temp);
5127 if (compute_in_loop)
5128 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5129 else
5131 /* Generate the misalignment computation outside LOOP. */
5132 pe = loop_preheader_edge (loop);
5133 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5134 gcc_assert (!new_bb);
5137 *realignment_token = gimple_call_lhs (new_stmt);
5139 /* The result of the CALL_EXPR to this builtin is determined from
5140 the value of the parameter and no global variables are touched
5141 which makes the builtin a "const" function. Requiring the
5142 builtin to have the "const" attribute makes it unnecessary
5143 to call mark_call_clobbered. */
5144 gcc_assert (TREE_READONLY (builtin_decl));
5147 if (alignment_support_scheme == dr_explicit_realign)
5148 return msq;
5150 gcc_assert (!compute_in_loop);
5151 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5154 /* 5. Create msq = phi <msq_init, lsq> in loop */
5156 pe = loop_preheader_edge (containing_loop);
5157 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5158 msq = make_ssa_name (vec_dest);
5159 phi_stmt = create_phi_node (msq, containing_loop->header);
5160 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5162 return msq;
5166 /* Function vect_grouped_load_supported.
5168 COUNT is the size of the load group (the number of statements plus the
5169 number of gaps). SINGLE_ELEMENT_P is true if there is actually
5170 only one statement, with a gap of COUNT - 1.
5172 Returns true if a suitable permute exists. */
5174 bool
5175 vect_grouped_load_supported (tree vectype, bool single_element_p,
5176 unsigned HOST_WIDE_INT count)
5178 machine_mode mode = TYPE_MODE (vectype);
5180 /* If this is single-element interleaving with an element distance
5181 that leaves unused vector loads around punt - we at least create
5182 very sub-optimal code in that case (and blow up memory,
5183 see PR65518). */
5184 if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype)))
5186 if (dump_enabled_p ())
5187 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5188 "single-element interleaving not supported "
5189 "for not adjacent vector loads\n");
5190 return false;
5193 /* vect_permute_load_chain requires the group size to be equal to 3 or
5194 be a power of two. */
5195 if (count != 3 && exact_log2 (count) == -1)
5197 if (dump_enabled_p ())
5198 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5199 "the size of the group of accesses"
5200 " is not a power of 2 or not equal to 3\n");
5201 return false;
5204 /* Check that the permutation is supported. */
5205 if (VECTOR_MODE_P (mode))
5207 unsigned int i, j;
5208 if (count == 3)
5210 unsigned int nelt;
5211 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5213 if (dump_enabled_p ())
5214 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5215 "cannot handle groups of 3 loads for"
5216 " variable-length vectors\n");
5217 return false;
5220 vec_perm_builder sel (nelt, nelt, 1);
5221 sel.quick_grow (nelt);
5222 vec_perm_indices indices;
5223 unsigned int k;
5224 for (k = 0; k < 3; k++)
5226 for (i = 0; i < nelt; i++)
5227 if (3 * i + k < 2 * nelt)
5228 sel[i] = 3 * i + k;
5229 else
5230 sel[i] = 0;
5231 indices.new_vector (sel, 2, nelt);
5232 if (!can_vec_perm_const_p (mode, indices))
5234 if (dump_enabled_p ())
5235 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5236 "shuffle of 3 loads is not supported by"
5237 " target\n");
5238 return false;
5240 for (i = 0, j = 0; i < nelt; i++)
5241 if (3 * i + k < 2 * nelt)
5242 sel[i] = i;
5243 else
5244 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5245 indices.new_vector (sel, 2, nelt);
5246 if (!can_vec_perm_const_p (mode, indices))
5248 if (dump_enabled_p ())
5249 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5250 "shuffle of 3 loads is not supported by"
5251 " target\n");
5252 return false;
5255 return true;
5257 else
5259 /* If length is not equal to 3 then only power of 2 is supported. */
5260 gcc_assert (pow2p_hwi (count));
5261 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5263 /* The encoding has a single stepped pattern. */
5264 vec_perm_builder sel (nelt, 1, 3);
5265 sel.quick_grow (3);
5266 for (i = 0; i < 3; i++)
5267 sel[i] = i * 2;
5268 vec_perm_indices indices (sel, 2, nelt);
5269 if (can_vec_perm_const_p (mode, indices))
5271 for (i = 0; i < 3; i++)
5272 sel[i] = i * 2 + 1;
5273 indices.new_vector (sel, 2, nelt);
5274 if (can_vec_perm_const_p (mode, indices))
5275 return true;
5280 if (dump_enabled_p ())
5281 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5282 "extract even/odd not supported by target\n");
5283 return false;
5286 /* Return TRUE if vec_load_lanes is available for COUNT vectors of
5287 type VECTYPE. */
5289 bool
5290 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
5292 return vect_lanes_optab_supported_p ("vec_load_lanes",
5293 vec_load_lanes_optab,
5294 vectype, count);
5297 /* Function vect_permute_load_chain.
5299 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5300 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5301 the input data correctly. Return the final references for loads in
5302 RESULT_CHAIN.
5304 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5305 The input is 4 vectors each containing 8 elements. We assign a number to each
5306 element, the input sequence is:
5308 1st vec: 0 1 2 3 4 5 6 7
5309 2nd vec: 8 9 10 11 12 13 14 15
5310 3rd vec: 16 17 18 19 20 21 22 23
5311 4th vec: 24 25 26 27 28 29 30 31
5313 The output sequence should be:
5315 1st vec: 0 4 8 12 16 20 24 28
5316 2nd vec: 1 5 9 13 17 21 25 29
5317 3rd vec: 2 6 10 14 18 22 26 30
5318 4th vec: 3 7 11 15 19 23 27 31
5320 i.e., the first output vector should contain the first elements of each
5321 interleaving group, etc.
5323 We use extract_even/odd instructions to create such output. The input of
5324 each extract_even/odd operation is two vectors
5325 1st vec 2nd vec
5326 0 1 2 3 4 5 6 7
5328 and the output is the vector of extracted even/odd elements. The output of
5329 extract_even will be: 0 2 4 6
5330 and of extract_odd: 1 3 5 7
5333 The permutation is done in log LENGTH stages. In each stage extract_even
5334 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5335 their order. In our example,
5337 E1: extract_even (1st vec, 2nd vec)
5338 E2: extract_odd (1st vec, 2nd vec)
5339 E3: extract_even (3rd vec, 4th vec)
5340 E4: extract_odd (3rd vec, 4th vec)
5342 The output for the first stage will be:
5344 E1: 0 2 4 6 8 10 12 14
5345 E2: 1 3 5 7 9 11 13 15
5346 E3: 16 18 20 22 24 26 28 30
5347 E4: 17 19 21 23 25 27 29 31
5349 In order to proceed and create the correct sequence for the next stage (or
5350 for the correct output, if the second stage is the last one, as in our
5351 example), we first put the output of extract_even operation and then the
5352 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5353 The input for the second stage is:
5355 1st vec (E1): 0 2 4 6 8 10 12 14
5356 2nd vec (E3): 16 18 20 22 24 26 28 30
5357 3rd vec (E2): 1 3 5 7 9 11 13 15
5358 4th vec (E4): 17 19 21 23 25 27 29 31
5360 The output of the second stage:
5362 E1: 0 4 8 12 16 20 24 28
5363 E2: 2 6 10 14 18 22 26 30
5364 E3: 1 5 9 13 17 21 25 29
5365 E4: 3 7 11 15 19 23 27 31
5367 And RESULT_CHAIN after reordering:
5369 1st vec (E1): 0 4 8 12 16 20 24 28
5370 2nd vec (E3): 1 5 9 13 17 21 25 29
5371 3rd vec (E2): 2 6 10 14 18 22 26 30
5372 4th vec (E4): 3 7 11 15 19 23 27 31. */
5374 static void
5375 vect_permute_load_chain (vec<tree> dr_chain,
5376 unsigned int length,
5377 gimple *stmt,
5378 gimple_stmt_iterator *gsi,
5379 vec<tree> *result_chain)
5381 tree data_ref, first_vect, second_vect;
5382 tree perm_mask_even, perm_mask_odd;
5383 tree perm3_mask_low, perm3_mask_high;
5384 gimple *perm_stmt;
5385 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5386 unsigned int i, j, log_length = exact_log2 (length);
5388 result_chain->quick_grow (length);
5389 memcpy (result_chain->address (), dr_chain.address (),
5390 length * sizeof (tree));
5392 if (length == 3)
5394 /* vect_grouped_load_supported ensures that this is constant. */
5395 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5396 unsigned int k;
5398 vec_perm_builder sel (nelt, nelt, 1);
5399 sel.quick_grow (nelt);
5400 vec_perm_indices indices;
5401 for (k = 0; k < 3; k++)
5403 for (i = 0; i < nelt; i++)
5404 if (3 * i + k < 2 * nelt)
5405 sel[i] = 3 * i + k;
5406 else
5407 sel[i] = 0;
5408 indices.new_vector (sel, 2, nelt);
5409 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5411 for (i = 0, j = 0; i < nelt; i++)
5412 if (3 * i + k < 2 * nelt)
5413 sel[i] = i;
5414 else
5415 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5416 indices.new_vector (sel, 2, nelt);
5417 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5419 first_vect = dr_chain[0];
5420 second_vect = dr_chain[1];
5422 /* Create interleaving stmt (low part of):
5423 low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5424 ...}> */
5425 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5426 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5427 second_vect, perm3_mask_low);
5428 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5430 /* Create interleaving stmt (high part of):
5431 high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5432 ...}> */
5433 first_vect = data_ref;
5434 second_vect = dr_chain[2];
5435 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5436 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5437 second_vect, perm3_mask_high);
5438 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5439 (*result_chain)[k] = data_ref;
5442 else
5444 /* If length is not equal to 3 then only power of 2 is supported. */
5445 gcc_assert (pow2p_hwi (length));
5447 /* The encoding has a single stepped pattern. */
5448 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5449 vec_perm_builder sel (nelt, 1, 3);
5450 sel.quick_grow (3);
5451 for (i = 0; i < 3; ++i)
5452 sel[i] = i * 2;
5453 vec_perm_indices indices (sel, 2, nelt);
5454 perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);
5456 for (i = 0; i < 3; ++i)
5457 sel[i] = i * 2 + 1;
5458 indices.new_vector (sel, 2, nelt);
5459 perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
5461 for (i = 0; i < log_length; i++)
5463 for (j = 0; j < length; j += 2)
5465 first_vect = dr_chain[j];
5466 second_vect = dr_chain[j+1];
5468 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5469 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5470 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5471 first_vect, second_vect,
5472 perm_mask_even);
5473 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5474 (*result_chain)[j/2] = data_ref;
5476 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5477 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5478 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5479 first_vect, second_vect,
5480 perm_mask_odd);
5481 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5482 (*result_chain)[j/2+length/2] = data_ref;
5484 memcpy (dr_chain.address (), result_chain->address (),
5485 length * sizeof (tree));
5490 /* Function vect_shift_permute_load_chain.
5492 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
5493 sequence of stmts to reorder the input data accordingly.
5494 Return the final references for loads in RESULT_CHAIN.
5495 Return true if successed, false otherwise.
5497 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5498 The input is 3 vectors each containing 8 elements. We assign a
5499 number to each element, the input sequence is:
5501 1st vec: 0 1 2 3 4 5 6 7
5502 2nd vec: 8 9 10 11 12 13 14 15
5503 3rd vec: 16 17 18 19 20 21 22 23
5505 The output sequence should be:
5507 1st vec: 0 3 6 9 12 15 18 21
5508 2nd vec: 1 4 7 10 13 16 19 22
5509 3rd vec: 2 5 8 11 14 17 20 23
5511 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
5513 First we shuffle all 3 vectors to get correct elements order:
5515 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
5516 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
5517 3rd vec: (16 19 22) (17 20 23) (18 21)
5519 Next we unite and shift vector 3 times:
5521 1st step:
5522 shift right by 6 the concatenation of:
5523 "1st vec" and "2nd vec"
5524 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
5525 "2nd vec" and "3rd vec"
5526 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
5527 "3rd vec" and "1st vec"
5528 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
5529 | New vectors |
5531 So that now new vectors are:
5533 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
5534 2nd vec: (10 13) (16 19 22) (17 20 23)
5535 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
5537 2nd step:
5538 shift right by 5 the concatenation of:
5539 "1st vec" and "3rd vec"
5540 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
5541 "2nd vec" and "1st vec"
5542 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
5543 "3rd vec" and "2nd vec"
5544 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
5545 | New vectors |
5547 So that now new vectors are:
5549 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
5550 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
5551 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
5553 3rd step:
5554 shift right by 5 the concatenation of:
5555 "1st vec" and "1st vec"
5556 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
5557 shift right by 3 the concatenation of:
5558 "2nd vec" and "2nd vec"
5559 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
5560 | New vectors |
5562 So that now all vectors are READY:
5563 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
5564 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
5565 3rd vec: ( 1 4 7) (10 13) (16 19 22)
5567 This algorithm is faster than one in vect_permute_load_chain if:
5568 1. "shift of a concatination" is faster than general permutation.
5569 This is usually so.
5570 2. The TARGET machine can't execute vector instructions in parallel.
5571 This is because each step of the algorithm depends on previous.
5572 The algorithm in vect_permute_load_chain is much more parallel.
5574 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
5577 static bool
5578 vect_shift_permute_load_chain (vec<tree> dr_chain,
5579 unsigned int length,
5580 gimple *stmt,
5581 gimple_stmt_iterator *gsi,
5582 vec<tree> *result_chain)
5584 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
5585 tree perm2_mask1, perm2_mask2, perm3_mask;
5586 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
5587 gimple *perm_stmt;
5589 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5590 unsigned int i;
5591 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5592 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5594 unsigned HOST_WIDE_INT nelt, vf;
5595 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt)
5596 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
5597 /* Not supported for variable-length vectors. */
5598 return false;
5600 vec_perm_builder sel (nelt, nelt, 1);
5601 sel.quick_grow (nelt);
5603 result_chain->quick_grow (length);
5604 memcpy (result_chain->address (), dr_chain.address (),
5605 length * sizeof (tree));
5607 if (pow2p_hwi (length) && vf > 4)
5609 unsigned int j, log_length = exact_log2 (length);
5610 for (i = 0; i < nelt / 2; ++i)
5611 sel[i] = i * 2;
5612 for (i = 0; i < nelt / 2; ++i)
5613 sel[nelt / 2 + i] = i * 2 + 1;
5614 vec_perm_indices indices (sel, 2, nelt);
5615 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5617 if (dump_enabled_p ())
5618 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5619 "shuffle of 2 fields structure is not \
5620 supported by target\n");
5621 return false;
5623 perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);
5625 for (i = 0; i < nelt / 2; ++i)
5626 sel[i] = i * 2 + 1;
5627 for (i = 0; i < nelt / 2; ++i)
5628 sel[nelt / 2 + i] = i * 2;
5629 indices.new_vector (sel, 2, nelt);
5630 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5632 if (dump_enabled_p ())
5633 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5634 "shuffle of 2 fields structure is not \
5635 supported by target\n");
5636 return false;
5638 perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);
5640 /* Generating permutation constant to shift all elements.
5641 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
5642 for (i = 0; i < nelt; i++)
5643 sel[i] = nelt / 2 + i;
5644 indices.new_vector (sel, 2, nelt);
5645 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5647 if (dump_enabled_p ())
5648 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5649 "shift permutation is not supported by target\n");
5650 return false;
5652 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
5654 /* Generating permutation constant to select vector from 2.
5655 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
5656 for (i = 0; i < nelt / 2; i++)
5657 sel[i] = i;
5658 for (i = nelt / 2; i < nelt; i++)
5659 sel[i] = nelt + i;
5660 indices.new_vector (sel, 2, nelt);
5661 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5663 if (dump_enabled_p ())
5664 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5665 "select is not supported by target\n");
5666 return false;
5668 select_mask = vect_gen_perm_mask_checked (vectype, indices);
5670 for (i = 0; i < log_length; i++)
5672 for (j = 0; j < length; j += 2)
5674 first_vect = dr_chain[j];
5675 second_vect = dr_chain[j + 1];
5677 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5678 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5679 first_vect, first_vect,
5680 perm2_mask1);
5681 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5682 vect[0] = data_ref;
5684 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5685 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5686 second_vect, second_vect,
5687 perm2_mask2);
5688 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5689 vect[1] = data_ref;
5691 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
5692 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5693 vect[0], vect[1], shift1_mask);
5694 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5695 (*result_chain)[j/2 + length/2] = data_ref;
5697 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
5698 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5699 vect[0], vect[1], select_mask);
5700 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5701 (*result_chain)[j/2] = data_ref;
5703 memcpy (dr_chain.address (), result_chain->address (),
5704 length * sizeof (tree));
5706 return true;
5708 if (length == 3 && vf > 2)
5710 unsigned int k = 0, l = 0;
5712 /* Generating permutation constant to get all elements in rigth order.
5713 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
5714 for (i = 0; i < nelt; i++)
5716 if (3 * k + (l % 3) >= nelt)
5718 k = 0;
5719 l += (3 - (nelt % 3));
5721 sel[i] = 3 * k + (l % 3);
5722 k++;
5724 vec_perm_indices indices (sel, 2, nelt);
5725 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5727 if (dump_enabled_p ())
5728 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5729 "shuffle of 3 fields structure is not \
5730 supported by target\n");
5731 return false;
5733 perm3_mask = vect_gen_perm_mask_checked (vectype, indices);
5735 /* Generating permutation constant to shift all elements.
5736 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
5737 for (i = 0; i < nelt; i++)
5738 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
5739 indices.new_vector (sel, 2, nelt);
5740 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5742 if (dump_enabled_p ())
5743 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5744 "shift permutation is not supported by target\n");
5745 return false;
5747 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
5749 /* Generating permutation constant to shift all elements.
5750 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5751 for (i = 0; i < nelt; i++)
5752 sel[i] = 2 * (nelt / 3) + 1 + i;
5753 indices.new_vector (sel, 2, nelt);
5754 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5756 if (dump_enabled_p ())
5757 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5758 "shift permutation is not supported by target\n");
5759 return false;
5761 shift2_mask = vect_gen_perm_mask_checked (vectype, indices);
5763 /* Generating permutation constant to shift all elements.
5764 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
5765 for (i = 0; i < nelt; i++)
5766 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
5767 indices.new_vector (sel, 2, nelt);
5768 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5770 if (dump_enabled_p ())
5771 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5772 "shift permutation is not supported by target\n");
5773 return false;
5775 shift3_mask = vect_gen_perm_mask_checked (vectype, indices);
5777 /* Generating permutation constant to shift all elements.
5778 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5779 for (i = 0; i < nelt; i++)
5780 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
5781 indices.new_vector (sel, 2, nelt);
5782 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5784 if (dump_enabled_p ())
5785 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5786 "shift permutation is not supported by target\n");
5787 return false;
5789 shift4_mask = vect_gen_perm_mask_checked (vectype, indices);
5791 for (k = 0; k < 3; k++)
5793 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
5794 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5795 dr_chain[k], dr_chain[k],
5796 perm3_mask);
5797 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5798 vect[k] = data_ref;
5801 for (k = 0; k < 3; k++)
5803 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
5804 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5805 vect[k % 3], vect[(k + 1) % 3],
5806 shift1_mask);
5807 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5808 vect_shift[k] = data_ref;
5811 for (k = 0; k < 3; k++)
5813 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
5814 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5815 vect_shift[(4 - k) % 3],
5816 vect_shift[(3 - k) % 3],
5817 shift2_mask);
5818 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5819 vect[k] = data_ref;
5822 (*result_chain)[3 - (nelt % 3)] = vect[2];
5824 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
5825 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
5826 vect[0], shift3_mask);
5827 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5828 (*result_chain)[nelt % 3] = data_ref;
5830 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
5831 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
5832 vect[1], shift4_mask);
5833 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5834 (*result_chain)[0] = data_ref;
5835 return true;
5837 return false;
5840 /* Function vect_transform_grouped_load.
5842 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
5843 to perform their permutation and ascribe the result vectorized statements to
5844 the scalar statements.
5847 void
5848 vect_transform_grouped_load (gimple *stmt, vec<tree> dr_chain, int size,
5849 gimple_stmt_iterator *gsi)
5851 machine_mode mode;
5852 vec<tree> result_chain = vNULL;
5854 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
5855 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
5856 vectors, that are ready for vector computation. */
5857 result_chain.create (size);
5859 /* If reassociation width for vector type is 2 or greater target machine can
5860 execute 2 or more vector instructions in parallel. Otherwise try to
5861 get chain for loads group using vect_shift_permute_load_chain. */
5862 mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
5863 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
5864 || pow2p_hwi (size)
5865 || !vect_shift_permute_load_chain (dr_chain, size, stmt,
5866 gsi, &result_chain))
5867 vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
5868 vect_record_grouped_load_vectors (stmt, result_chain);
5869 result_chain.release ();
5872 /* RESULT_CHAIN contains the output of a group of grouped loads that were
5873 generated as part of the vectorization of STMT. Assign the statement
5874 for each vector to the associated scalar statement. */
5876 void
5877 vect_record_grouped_load_vectors (gimple *stmt, vec<tree> result_chain)
5879 gimple *first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
5880 gimple *next_stmt, *new_stmt;
5881 unsigned int i, gap_count;
5882 tree tmp_data_ref;
5884 /* Put a permuted data-ref in the VECTORIZED_STMT field.
5885 Since we scan the chain starting from it's first node, their order
5886 corresponds the order of data-refs in RESULT_CHAIN. */
5887 next_stmt = first_stmt;
5888 gap_count = 1;
5889 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
5891 if (!next_stmt)
5892 break;
5894 /* Skip the gaps. Loads created for the gaps will be removed by dead
5895 code elimination pass later. No need to check for the first stmt in
5896 the group, since it always exists.
5897 GROUP_GAP is the number of steps in elements from the previous
5898 access (if there is no gap GROUP_GAP is 1). We skip loads that
5899 correspond to the gaps. */
5900 if (next_stmt != first_stmt
5901 && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
5903 gap_count++;
5904 continue;
5907 while (next_stmt)
5909 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
5910 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
5911 copies, and we put the new vector statement in the first available
5912 RELATED_STMT. */
5913 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
5914 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
5915 else
5917 if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5919 gimple *prev_stmt =
5920 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
5921 gimple *rel_stmt =
5922 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
5923 while (rel_stmt)
5925 prev_stmt = rel_stmt;
5926 rel_stmt =
5927 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
5930 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
5931 new_stmt;
5935 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
5936 gap_count = 1;
5937 /* If NEXT_STMT accesses the same DR as the previous statement,
5938 put the same TMP_DATA_REF as its vectorized statement; otherwise
5939 get the next data-ref from RESULT_CHAIN. */
5940 if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5941 break;
5946 /* Function vect_force_dr_alignment_p.
5948 Returns whether the alignment of a DECL can be forced to be aligned
5949 on ALIGNMENT bit boundary. */
5951 bool
5952 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
5954 if (!VAR_P (decl))
5955 return false;
5957 if (decl_in_symtab_p (decl)
5958 && !symtab_node::get (decl)->can_increase_alignment_p ())
5959 return false;
5961 if (TREE_STATIC (decl))
5962 return (alignment <= MAX_OFILE_ALIGNMENT);
5963 else
5964 return (alignment <= MAX_STACK_ALIGNMENT);
5968 /* Return whether the data reference DR is supported with respect to its
5969 alignment.
5970 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
5971 it is aligned, i.e., check if it is possible to vectorize it with different
5972 alignment. */
5974 enum dr_alignment_support
5975 vect_supportable_dr_alignment (struct data_reference *dr,
5976 bool check_aligned_accesses)
5978 gimple *stmt = DR_STMT (dr);
5979 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5980 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5981 machine_mode mode = TYPE_MODE (vectype);
5982 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5983 struct loop *vect_loop = NULL;
5984 bool nested_in_vect_loop = false;
5986 if (aligned_access_p (dr) && !check_aligned_accesses)
5987 return dr_aligned;
5989 /* For now assume all conditional loads/stores support unaligned
5990 access without any special code. */
5991 if (is_gimple_call (stmt)
5992 && gimple_call_internal_p (stmt)
5993 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
5994 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
5995 return dr_unaligned_supported;
5997 if (loop_vinfo)
5999 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
6000 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
6003 /* Possibly unaligned access. */
6005 /* We can choose between using the implicit realignment scheme (generating
6006 a misaligned_move stmt) and the explicit realignment scheme (generating
6007 aligned loads with a REALIGN_LOAD). There are two variants to the
6008 explicit realignment scheme: optimized, and unoptimized.
6009 We can optimize the realignment only if the step between consecutive
6010 vector loads is equal to the vector size. Since the vector memory
6011 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
6012 is guaranteed that the misalignment amount remains the same throughout the
6013 execution of the vectorized loop. Therefore, we can create the
6014 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
6015 at the loop preheader.
6017 However, in the case of outer-loop vectorization, when vectorizing a
6018 memory access in the inner-loop nested within the LOOP that is now being
6019 vectorized, while it is guaranteed that the misalignment of the
6020 vectorized memory access will remain the same in different outer-loop
6021 iterations, it is *not* guaranteed that is will remain the same throughout
6022 the execution of the inner-loop. This is because the inner-loop advances
6023 with the original scalar step (and not in steps of VS). If the inner-loop
6024 step happens to be a multiple of VS, then the misalignment remains fixed
6025 and we can use the optimized realignment scheme. For example:
6027 for (i=0; i<N; i++)
6028 for (j=0; j<M; j++)
6029 s += a[i+j];
6031 When vectorizing the i-loop in the above example, the step between
6032 consecutive vector loads is 1, and so the misalignment does not remain
6033 fixed across the execution of the inner-loop, and the realignment cannot
6034 be optimized (as illustrated in the following pseudo vectorized loop):
6036 for (i=0; i<N; i+=4)
6037 for (j=0; j<M; j++){
6038 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
6039 // when j is {0,1,2,3,4,5,6,7,...} respectively.
6040 // (assuming that we start from an aligned address).
6043 We therefore have to use the unoptimized realignment scheme:
6045 for (i=0; i<N; i+=4)
6046 for (j=k; j<M; j+=4)
6047 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
6048 // that the misalignment of the initial address is
6049 // 0).
6051 The loop can then be vectorized as follows:
6053 for (k=0; k<4; k++){
6054 rt = get_realignment_token (&vp[k]);
6055 for (i=0; i<N; i+=4){
6056 v1 = vp[i+k];
6057 for (j=k; j<M; j+=4){
6058 v2 = vp[i+j+VS-1];
6059 va = REALIGN_LOAD <v1,v2,rt>;
6060 vs += va;
6061 v1 = v2;
6064 } */
6066 if (DR_IS_READ (dr))
6068 bool is_packed = false;
6069 tree type = (TREE_TYPE (DR_REF (dr)));
6071 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
6072 && (!targetm.vectorize.builtin_mask_for_load
6073 || targetm.vectorize.builtin_mask_for_load ()))
6075 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6077 /* If we are doing SLP then the accesses need not have the
6078 same alignment, instead it depends on the SLP group size. */
6079 if (loop_vinfo
6080 && STMT_SLP_TYPE (stmt_info)
6081 && !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6082 * GROUP_SIZE (vinfo_for_stmt
6083 (GROUP_FIRST_ELEMENT (stmt_info))),
6084 TYPE_VECTOR_SUBPARTS (vectype)))
6086 else if (!loop_vinfo
6087 || (nested_in_vect_loop
6088 && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
6089 GET_MODE_SIZE (TYPE_MODE (vectype)))))
6090 return dr_explicit_realign;
6091 else
6092 return dr_explicit_realign_optimized;
6094 if (!known_alignment_for_access_p (dr))
6095 is_packed = not_size_aligned (DR_REF (dr));
6097 if (targetm.vectorize.support_vector_misalignment
6098 (mode, type, DR_MISALIGNMENT (dr), is_packed))
6099 /* Can't software pipeline the loads, but can at least do them. */
6100 return dr_unaligned_supported;
6102 else
6104 bool is_packed = false;
6105 tree type = (TREE_TYPE (DR_REF (dr)));
6107 if (!known_alignment_for_access_p (dr))
6108 is_packed = not_size_aligned (DR_REF (dr));
6110 if (targetm.vectorize.support_vector_misalignment
6111 (mode, type, DR_MISALIGNMENT (dr), is_packed))
6112 return dr_unaligned_supported;
6115 /* Unsupported. */
6116 return dr_unaligned_unsupported;