poly_int: vector_alignment_reachable_p
[official-gcc.git] / gcc / tree-vect-data-refs.c
blob 2e02be5df7fada831d1fa4253a6921cd8d3d25dc
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2017 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "memmodel.h"
32 #include "tm_p.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "cgraph.h"
36 #include "dumpfile.h"
37 #include "alias.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "tree-eh.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop.h"
47 #include "cfgloop.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
50 #include "expr.h"
51 #include "builtins.h"
52 #include "params.h"
53 #include "tree-cfg.h"
54 #include "tree-hash-traits.h"
55 #include "vec-perm-indices.h"
57 /* Return true if load- or store-lanes optab OPTAB is implemented for
58 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
60 static bool
61 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
62 tree vectype, unsigned HOST_WIDE_INT count)
64 machine_mode mode;
65 scalar_int_mode array_mode;
66 bool limit_p;
68 mode = TYPE_MODE (vectype);
69 limit_p = !targetm.array_mode_supported_p (mode, count);
70 if (!int_mode_for_size (count * GET_MODE_BITSIZE (mode),
71 limit_p).exists (&array_mode))
73 if (dump_enabled_p ())
74 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
75 "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC "]\n",
76 GET_MODE_NAME (mode), count);
77 return false;
80 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
82 if (dump_enabled_p ())
83 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
84 "cannot use %s<%s><%s>\n", name,
85 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
86 return false;
89 if (dump_enabled_p ())
90 dump_printf_loc (MSG_NOTE, vect_location,
91 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
92 GET_MODE_NAME (mode));
94 return true;
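/* Worked example (illustrative only, not tied to a particular target):
   for a 128-bit VECTYPE such as V4SI and COUNT == 2, the array must hold
   2 * 128 = 256 bits, so int_mode_for_size is asked for a 256-bit integer
   mode, and the lanes optab is then queried with that array mode and
   V4SImode.  If either lookup fails the function returns false and the
   caller has to fall back to a different (e.g. permutation-based)
   strategy for the grouped access.  */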
98 /* Return the smallest scalar part of STMT.
99 This is used to determine the vectype of the stmt. We generally set the
100 vectype according to the type of the result (lhs). For stmts whose
101 result-type is different than the type of the arguments (e.g., demotion,
102 promotion), vectype will be reset appropriately (later). Note that we have
103 to visit the smallest datatype in this function, because that determines the
104 VF. If the smallest datatype in the loop is present only as the rhs of a
105 promotion operation - we'd miss it.
106 Such a case, where a variable of this datatype does not appear in the lhs
107 anywhere in the loop, can only occur if it's an invariant: e.g.:
108 'int_x = (int) short_inv', which we'd expect to have been optimized away by
109 invariant motion. However, we cannot rely on invariant motion to always
110 take invariants out of the loop, and so in the case of promotion we also
111 have to check the rhs.
112 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
113 types. */
115 tree
116 vect_get_smallest_scalar_type (gimple *stmt, HOST_WIDE_INT *lhs_size_unit,
117 HOST_WIDE_INT *rhs_size_unit)
119 tree scalar_type = gimple_expr_type (stmt);
120 HOST_WIDE_INT lhs, rhs;
122 /* During the analysis phase, this function is called on arbitrary
123 statements that might not have scalar results. */
124 if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
125 return scalar_type;
127 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
129 if (is_gimple_assign (stmt)
130 && (gimple_assign_cast_p (stmt)
131 || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
132 || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
133 || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
135 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
137 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
138 if (rhs < lhs)
139 scalar_type = rhs_type;
142 *lhs_size_unit = lhs;
143 *rhs_size_unit = rhs;
144 return scalar_type;
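/* Illustrative example: in a loop body containing only

     short_y = a[i];
     int_x = (int) short_y;
     b[i] = int_x;

   the statement producing INT_X has an int lhs but a short rhs, so the
   function returns the short type for that statement; it is this smaller
   type that ends up determining the VF, as described above.  */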
148 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
149 tested at run-time. Return TRUE if DDR was successfully inserted.
150 Return false if versioning is not supported. */
152 static bool
153 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
155 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
157 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
158 return false;
160 if (!runtime_alias_check_p (ddr, loop,
161 optimize_loop_nest_for_speed_p (loop)))
162 return false;
164 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
165 return true;
169 /* A subroutine of vect_analyze_data_ref_dependence. Handle
170 DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
171 distances. These distances are conservatively correct but they don't
172 reflect a guaranteed dependence.
174 Return true if this function does all the work necessary to avoid
175 an alias or false if the caller should use the dependence distances
176 to limit the vectorization factor in the usual way. LOOP_DEPTH is
177 the depth of the loop described by LOOP_VINFO and the other arguments
178 are as for vect_analyze_data_ref_dependence. */
180 static bool
181 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
182 loop_vec_info loop_vinfo,
183 int loop_depth, unsigned int *max_vf)
185 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
186 lambda_vector dist_v;
187 unsigned int i;
188 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
190 int dist = dist_v[loop_depth];
191 if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
193 /* If the user asserted safelen >= DIST consecutive iterations
194 can be executed concurrently, assume independence.
196 ??? An alternative would be to add the alias check even
197 in this case, and vectorize the fallback loop with the
198 maximum VF set to safelen. However, if the user has
199 explicitly given a length, it's less likely that that
200 would be a win. */
201 if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
203 if ((unsigned int) loop->safelen < *max_vf)
204 *max_vf = loop->safelen;
205 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
206 continue;
209 /* For dependence distances of 2 or more, we have the option
210 of limiting VF or checking for an alias at runtime.
211 Prefer to check at runtime if we can, to avoid limiting
212 the VF unnecessarily when the bases are in fact independent.
214 Note that the alias checks will be removed if the VF ends up
215 being small enough. */
216 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
219 return true;
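/* Example of the safelen case above: a loop annotated with
     #pragma omp simd safelen(8)
   asserts that any 8 consecutive iterations may execute concurrently, so
   a conservatively computed distance of, say, 3 can be ignored as long
   as the vectorization factor is capped at 8.  */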
223 /* Function vect_analyze_data_ref_dependence.
225 Return TRUE if there (might) exist a dependence between a memory-reference
226 DRA and a memory-reference DRB. When versioning for alias can check the
227 dependence at run time, return FALSE. Adjust *MAX_VF according to
228 the data dependence. */
230 static bool
231 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
232 loop_vec_info loop_vinfo,
233 unsigned int *max_vf)
235 unsigned int i;
236 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
237 struct data_reference *dra = DDR_A (ddr);
238 struct data_reference *drb = DDR_B (ddr);
239 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
240 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
241 lambda_vector dist_v;
242 unsigned int loop_depth;
244 /* In loop analysis all data references should be vectorizable. */
245 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
246 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
247 gcc_unreachable ();
249 /* Independent data accesses. */
250 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
251 return false;
253 if (dra == drb
254 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
255 return false;
257 /* We do not have to consider dependences between accesses that belong
258 to the same group. */
259 if (GROUP_FIRST_ELEMENT (stmtinfo_a)
260 && GROUP_FIRST_ELEMENT (stmtinfo_a) == GROUP_FIRST_ELEMENT (stmtinfo_b))
261 return false;
263 /* Even if we have an anti-dependence then, as the vectorized loop covers at
264 least two scalar iterations, there is always also a true dependence.
265 As the vectorizer does not re-order loads and stores we can ignore
266 the anti-dependence if TBAA can disambiguate both DRs similar to the
267 case with known negative distance anti-dependences (positive
268 distance anti-dependences would violate TBAA constraints). */
269 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
270 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
271 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
272 get_alias_set (DR_REF (drb))))
273 return false;
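/* For instance, an anti-dependence between a read through an 'int *'
   and a write through a 'float *' is disambiguated here by TBAA: the
   two alias sets do not conflict, so the pair can be skipped.  */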
275 /* Unknown data dependence. */
276 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
278 /* If user asserted safelen consecutive iterations can be
279 executed concurrently, assume independence. */
280 if (loop->safelen >= 2)
282 if ((unsigned int) loop->safelen < *max_vf)
283 *max_vf = loop->safelen;
284 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
285 return false;
288 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
289 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
291 if (dump_enabled_p ())
293 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
294 "versioning for alias not supported for: "
295 "can't determine dependence between ");
296 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
297 DR_REF (dra));
298 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
299 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
300 DR_REF (drb));
301 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
303 return true;
306 if (dump_enabled_p ())
308 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
309 "versioning for alias required: "
310 "can't determine dependence between ");
311 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
312 DR_REF (dra));
313 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
314 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
315 DR_REF (drb));
316 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
319 /* Add to list of ddrs that need to be tested at run-time. */
320 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
323 /* Known data dependence. */
324 if (DDR_NUM_DIST_VECTS (ddr) == 0)
326 /* If user asserted safelen consecutive iterations can be
327 executed concurrently, assume independence. */
328 if (loop->safelen >= 2)
330 if ((unsigned int) loop->safelen < *max_vf)
331 *max_vf = loop->safelen;
332 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
333 return false;
336 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
337 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
339 if (dump_enabled_p ())
341 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
342 "versioning for alias not supported for: "
343 "bad dist vector for ");
344 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
345 DR_REF (dra));
346 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
347 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
348 DR_REF (drb));
349 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
351 return true;
354 if (dump_enabled_p ())
356 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
357 "versioning for alias required: "
358 "bad dist vector for ");
359 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
360 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
361 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
362 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
364 /* Add to list of ddrs that need to be tested at run-time. */
365 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
368 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
370 if (DDR_COULD_BE_INDEPENDENT_P (ddr)
371 && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
372 loop_depth, max_vf))
373 return false;
375 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
377 int dist = dist_v[loop_depth];
379 if (dump_enabled_p ())
380 dump_printf_loc (MSG_NOTE, vect_location,
381 "dependence distance = %d.\n", dist);
383 if (dist == 0)
385 if (dump_enabled_p ())
387 dump_printf_loc (MSG_NOTE, vect_location,
388 "dependence distance == 0 between ");
389 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
390 dump_printf (MSG_NOTE, " and ");
391 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
392 dump_printf (MSG_NOTE, "\n");
395 /* When we perform grouped accesses and perform implicit CSE
396 by detecting equal accesses and doing disambiguation with
397 runtime alias tests like for
398 .. = a[i];
399 .. = a[i+1];
400 a[i] = ..;
401 a[i+1] = ..;
402 *p = ..;
403 .. = a[i];
404 .. = a[i+1];
405 where we will end up loading { a[i], a[i+1] } once, make
406 sure that inserting group loads before the first load and
407 stores after the last store will do the right thing.
408 Similar for groups like
409 a[i] = ...;
410 ... = a[i];
411 a[i+1] = ...;
412 where loads from the group interleave with the store. */
413 if (STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
414 || STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
416 gimple *earlier_stmt;
417 earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
418 if (DR_IS_WRITE
419 (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
421 if (dump_enabled_p ())
422 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
423 "READ_WRITE dependence in interleaving."
424 "\n");
425 return true;
429 continue;
432 if (dist > 0 && DDR_REVERSED_P (ddr))
434 /* If DDR_REVERSED_P the order of the data-refs in DDR was
435 reversed (to make distance vector positive), and the actual
436 distance is negative. */
437 if (dump_enabled_p ())
438 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
439 "dependence distance negative.\n");
440 /* Record a negative dependence distance to later limit the
441 amount of stmt copying / unrolling we can perform.
442 Only need to handle read-after-write dependence. */
443 if (DR_IS_READ (drb)
444 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
445 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
446 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
447 continue;
450 unsigned int abs_dist = abs (dist);
451 if (abs_dist >= 2 && abs_dist < *max_vf)
453 /* The dependence distance requires reduction of the maximal
454 vectorization factor. */
455 *max_vf = abs (dist);
456 if (dump_enabled_p ())
457 dump_printf_loc (MSG_NOTE, vect_location,
458 "adjusting maximal vectorization factor to %i\n",
459 *max_vf);
462 if (abs_dist >= *max_vf)
464 /* Dependence distance does not create dependence, as far as
465 vectorization is concerned, in this case. */
466 if (dump_enabled_p ())
467 dump_printf_loc (MSG_NOTE, vect_location,
468 "dependence distance >= VF.\n");
469 continue;
472 if (dump_enabled_p ())
474 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
475 "not vectorized, possible dependence "
476 "between data-refs ");
477 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
478 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
479 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
480 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
483 return true;
486 return false;
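/* Worked example for the distance handling above: for

     for (i = 0; i < n; i++)
       a[i + 3] = a[i] + 1;

   the dependence distance is 3, so *MAX_VF is clamped to 3; with a
   vectorization factor of 4 or more, the vector load covering
   iterations i .. i+3 would read a[i + 3] before the store from
   iteration i had been performed.  */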
489 /* Function vect_analyze_data_ref_dependences.
491 Examine all the data references in the loop, and make sure there do not
492 exist any data dependences between them. Set *MAX_VF according to
493 the maximum vectorization factor the data dependences allow. */
495 bool
496 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
497 unsigned int *max_vf)
499 unsigned int i;
500 struct data_dependence_relation *ddr;
502 if (dump_enabled_p ())
503 dump_printf_loc (MSG_NOTE, vect_location,
504 "=== vect_analyze_data_ref_dependences ===\n");
506 LOOP_VINFO_DDRS (loop_vinfo)
507 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
508 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
509 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
510 /* We need read-read dependences to compute STMT_VINFO_SAME_ALIGN_REFS. */
511 if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
512 &LOOP_VINFO_DDRS (loop_vinfo),
513 LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
514 return false;
516 /* For epilogues we either have no aliases or alias versioning
517 was applied to original loop. Therefore we may just get max_vf
518 using VF of original loop. */
519 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
520 *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
521 else
522 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
523 if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
524 return false;
526 return true;
530 /* Function vect_slp_analyze_data_ref_dependence.
532 Return TRUE if there (might) exist a dependence between a memory-reference
533 DRA and a memory-reference DRB. Unlike the loop analysis, there is no
534 *MAX_VF to adjust and no runtime alias versioning to fall back on, so a
535 possible dependence makes the caller give up on the SLP instance. */
537 static bool
538 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
540 struct data_reference *dra = DDR_A (ddr);
541 struct data_reference *drb = DDR_B (ddr);
543 /* We need to check dependences of statements marked as unvectorizable
544 as well, they still can prohibit vectorization. */
546 /* Independent data accesses. */
547 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
548 return false;
550 if (dra == drb)
551 return false;
553 /* Read-read is OK. */
554 if (DR_IS_READ (dra) && DR_IS_READ (drb))
555 return false;
557 /* If dra and drb are part of the same interleaving chain consider
558 them independent. */
559 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
560 && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
561 == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
562 return false;
564 /* Unknown data dependence. */
565 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
567 if (dump_enabled_p ())
569 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
570 "can't determine dependence between ");
571 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
572 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
573 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
574 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
577 else if (dump_enabled_p ())
579 dump_printf_loc (MSG_NOTE, vect_location,
580 "determined dependence between ");
581 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
582 dump_printf (MSG_NOTE, " and ");
583 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
584 dump_printf (MSG_NOTE, "\n");
587 return true;
591 /* Analyze dependences involved in the transform of SLP NODE. STORES
592 contain the vector of scalar stores of this instance if we are
593 disambiguating the loads. */
595 static bool
596 vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
597 vec<gimple *> stores, gimple *last_store)
599 /* This walks over all stmts involved in the SLP load/store done
600 in NODE verifying we can sink them up to the last stmt in the
601 group. */
602 gimple *last_access = vect_find_last_scalar_stmt_in_slp (node);
603 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
605 gimple *access = SLP_TREE_SCALAR_STMTS (node)[k];
606 if (access == last_access)
607 continue;
608 data_reference *dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (access));
609 for (gimple_stmt_iterator gsi = gsi_for_stmt (access);
610 gsi_stmt (gsi) != last_access; gsi_next (&gsi))
612 gimple *stmt = gsi_stmt (gsi);
613 if (! gimple_vuse (stmt)
614 || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
615 continue;
617 /* If we couldn't record a (single) data reference for this
618 stmt we have to give up. */
619 /* ??? Here and below if dependence analysis fails we can resort
620 to the alias oracle which can handle more kinds of stmts. */
621 data_reference *dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
622 if (!dr_b)
623 return false;
625 bool dependent = false;
626 /* If we run into a store of this same instance (we've just
627 marked those) then delay dependence checking until we run
628 into the last store because this is where it will have
629 been sunk to (and we verify if we can do that as well). */
630 if (gimple_visited_p (stmt))
632 if (stmt != last_store)
633 continue;
634 unsigned i;
635 gimple *store;
636 FOR_EACH_VEC_ELT (stores, i, store)
638 data_reference *store_dr
639 = STMT_VINFO_DATA_REF (vinfo_for_stmt (store));
640 ddr_p ddr = initialize_data_dependence_relation
641 (dr_a, store_dr, vNULL);
642 dependent = vect_slp_analyze_data_ref_dependence (ddr);
643 free_dependence_relation (ddr);
644 if (dependent)
645 break;
648 else
650 ddr_p ddr = initialize_data_dependence_relation (dr_a,
651 dr_b, vNULL);
652 dependent = vect_slp_analyze_data_ref_dependence (ddr);
653 free_dependence_relation (ddr);
655 if (dependent)
656 return false;
659 return true;
663 /* Function vect_slp_analyze_instance_dependence.
665 Examine all the data references in the SLP instance, and make sure there
666 do not exist any data dependences that would prevent sinking the scalar
667 loads and stores to the vectorized statement positions. */
669 bool
670 vect_slp_analyze_instance_dependence (slp_instance instance)
672 if (dump_enabled_p ())
673 dump_printf_loc (MSG_NOTE, vect_location,
674 "=== vect_slp_analyze_instance_dependence ===\n");
676 /* The stores of this instance are at the root of the SLP tree. */
677 slp_tree store = SLP_INSTANCE_TREE (instance);
678 if (! STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (store)[0])))
679 store = NULL;
681 /* Verify we can sink stores to the vectorized stmt insert location. */
682 gimple *last_store = NULL;
683 if (store)
685 if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
686 return false;
688 /* Mark stores in this instance and remember the last one. */
689 last_store = vect_find_last_scalar_stmt_in_slp (store);
690 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
691 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], true);
694 bool res = true;
696 /* Verify we can sink loads to the vectorized stmt insert location,
697 special-casing stores of this instance. */
698 slp_tree load;
699 unsigned int i;
700 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
701 if (! vect_slp_analyze_node_dependences (instance, load,
702 store
703 ? SLP_TREE_SCALAR_STMTS (store)
704 : vNULL, last_store))
706 res = false;
707 break;
710 /* Unset the visited flag. */
711 if (store)
712 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
713 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], false);
715 return res;
718 /* Record in VINFO the base alignment guarantee given by DRB. STMT is
719 the statement that contains DRB, which is useful for recording in the
720 dump file. */
722 static void
723 vect_record_base_alignment (vec_info *vinfo, gimple *stmt,
724 innermost_loop_behavior *drb)
726 bool existed;
727 innermost_loop_behavior *&entry
728 = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
729 if (!existed || entry->base_alignment < drb->base_alignment)
731 entry = drb;
732 if (dump_enabled_p ())
734 dump_printf_loc (MSG_NOTE, vect_location,
735 "recording new base alignment for ");
736 dump_generic_expr (MSG_NOTE, TDF_SLIM, drb->base_address);
737 dump_printf (MSG_NOTE, "\n");
738 dump_printf_loc (MSG_NOTE, vect_location,
739 " alignment: %d\n", drb->base_alignment);
740 dump_printf_loc (MSG_NOTE, vect_location,
741 " misalignment: %d\n", drb->base_misalignment);
742 dump_printf_loc (MSG_NOTE, vect_location,
743 " based on: ");
744 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
749 /* If the region we're going to vectorize is reached, all unconditional
750 data references occur at least once. We can therefore pool the base
751 alignment guarantees from each unconditional reference. Do this by
752 going through all the data references in VINFO and checking whether
753 the containing statement makes the reference unconditionally. If so,
754 record the alignment of the base address in VINFO so that it can be
755 used for all other references with the same base. */
757 void
758 vect_record_base_alignments (vec_info *vinfo)
760 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
761 struct loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
762 data_reference *dr;
763 unsigned int i;
764 FOR_EACH_VEC_ELT (vinfo->datarefs, i, dr)
765 if (!DR_IS_CONDITIONAL_IN_STMT (dr))
767 gimple *stmt = DR_STMT (dr);
768 vect_record_base_alignment (vinfo, stmt, &DR_INNERMOST (dr));
770 /* If DR is nested in the loop that is being vectorized, we can also
771 record the alignment of the base wrt the outer loop. */
772 if (loop && nested_in_vect_loop_p (loop, stmt))
774 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
775 vect_record_base_alignment
776 (vinfo, stmt, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
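/* For example (hypothetical numbers): if the region contains an
   unconditional access p[0] whose base is known to be 32-byte aligned
   and a conditional access p[i] with the same base address, the 32-byte
   guarantee recorded here can later be reused when computing the
   misalignment of p[i].  */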
781 /* Return the target alignment for the vectorized form of DR. */
783 static unsigned int
784 vect_calculate_target_alignment (struct data_reference *dr)
786 gimple *stmt = DR_STMT (dr);
787 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
788 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
789 return targetm.vectorize.preferred_vector_alignment (vectype);
792 /* Function vect_compute_data_ref_alignment
794 Compute the misalignment of the data reference DR.
796 Output:
797 1. If during the misalignment computation it is found that the data reference
798 cannot be vectorized then false is returned.
799 2. DR_MISALIGNMENT (DR) is defined.
801 FOR NOW: No analysis is actually performed. Misalignment is calculated
802 only for trivial cases. TODO. */
804 bool
805 vect_compute_data_ref_alignment (struct data_reference *dr)
807 gimple *stmt = DR_STMT (dr);
808 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
809 vec_base_alignments *base_alignments = &stmt_info->vinfo->base_alignments;
810 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
811 struct loop *loop = NULL;
812 tree ref = DR_REF (dr);
813 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
815 if (dump_enabled_p ())
816 dump_printf_loc (MSG_NOTE, vect_location,
817 "vect_compute_data_ref_alignment:\n");
819 if (loop_vinfo)
820 loop = LOOP_VINFO_LOOP (loop_vinfo);
822 /* Initialize misalignment to unknown. */
823 SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
825 innermost_loop_behavior *drb = vect_dr_behavior (dr);
826 bool step_preserves_misalignment_p;
828 unsigned HOST_WIDE_INT vector_alignment
829 = vect_calculate_target_alignment (dr) / BITS_PER_UNIT;
830 DR_TARGET_ALIGNMENT (dr) = vector_alignment;
832 /* No step for BB vectorization. */
833 if (!loop)
835 gcc_assert (integer_zerop (drb->step));
836 step_preserves_misalignment_p = true;
839 /* In case the dataref is in an inner-loop of the loop that is being
840 vectorized (LOOP), we use the base and misalignment information
841 relative to the outer-loop (LOOP). This is ok only if the misalignment
842 stays the same throughout the execution of the inner-loop, which is why
843 we have to check that the stride of the dataref in the inner-loop is a
844 multiple of the vector alignment. */
845 else if (nested_in_vect_loop_p (loop, stmt))
847 step_preserves_misalignment_p
848 = (DR_STEP_ALIGNMENT (dr) % vector_alignment) == 0;
850 if (dump_enabled_p ())
852 if (step_preserves_misalignment_p)
853 dump_printf_loc (MSG_NOTE, vect_location,
854 "inner step divides the vector alignment.\n");
855 else
856 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
857 "inner step doesn't divide the vector"
858 " alignment.\n");
862 /* Similarly we can only use base and misalignment information relative to
863 an innermost loop if the misalignment stays the same throughout the
864 execution of the loop. As above, this is the case if the stride of
865 the dataref is a multiple of the alignment. */
866 else
868 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
869 step_preserves_misalignment_p
870 = multiple_p (DR_STEP_ALIGNMENT (dr) * vf, vector_alignment);
872 if (!step_preserves_misalignment_p && dump_enabled_p ())
873 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
874 "step doesn't divide the vector alignment.\n");
877 unsigned int base_alignment = drb->base_alignment;
878 unsigned int base_misalignment = drb->base_misalignment;
880 /* Calculate the maximum of the pooled base address alignment and the
881 alignment that we can compute for DR itself. */
882 innermost_loop_behavior **entry = base_alignments->get (drb->base_address);
883 if (entry && base_alignment < (*entry)->base_alignment)
885 base_alignment = (*entry)->base_alignment;
886 base_misalignment = (*entry)->base_misalignment;
889 if (drb->offset_alignment < vector_alignment
890 || !step_preserves_misalignment_p
891 /* We need to know whether the step wrt the vectorized loop is
892 negative when computing the starting misalignment below. */
893 || TREE_CODE (drb->step) != INTEGER_CST)
895 if (dump_enabled_p ())
897 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
898 "Unknown alignment for access: ");
899 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
900 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
902 return true;
905 if (base_alignment < vector_alignment)
907 tree base = drb->base_address;
908 if (TREE_CODE (base) == ADDR_EXPR)
909 base = TREE_OPERAND (base, 0);
910 if (!vect_can_force_dr_alignment_p (base,
911 vector_alignment * BITS_PER_UNIT))
913 if (dump_enabled_p ())
915 dump_printf_loc (MSG_NOTE, vect_location,
916 "can't force alignment of ref: ");
917 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
918 dump_printf (MSG_NOTE, "\n");
920 return true;
923 if (DECL_USER_ALIGN (base))
925 if (dump_enabled_p ())
927 dump_printf_loc (MSG_NOTE, vect_location,
928 "not forcing alignment of user-aligned "
929 "variable: ");
930 dump_generic_expr (MSG_NOTE, TDF_SLIM, base);
931 dump_printf (MSG_NOTE, "\n");
933 return true;
936 /* Force the alignment of the decl.
937 NOTE: This is the only change to the code we make during
938 the analysis phase, before deciding to vectorize the loop. */
939 if (dump_enabled_p ())
941 dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
942 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
943 dump_printf (MSG_NOTE, "\n");
946 DR_VECT_AUX (dr)->base_decl = base;
947 DR_VECT_AUX (dr)->base_misaligned = true;
948 base_misalignment = 0;
950 poly_int64 misalignment
951 = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();
953 /* If this is a backward running DR then first access in the larger
954 vectype actually is N-1 elements before the address in the DR.
955 Adjust misalign accordingly. */
956 if (tree_int_cst_sgn (drb->step) < 0)
957 /* PLUS because STEP is negative. */
958 misalignment += ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
959 * TREE_INT_CST_LOW (drb->step));
961 unsigned int const_misalignment;
962 if (!known_misalignment (misalignment, vector_alignment,
963 &const_misalignment))
965 if (dump_enabled_p ())
967 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
968 "Non-constant misalignment for access: ");
969 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
970 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
972 return true;
975 SET_DR_MISALIGNMENT (dr, const_misalignment);
977 if (dump_enabled_p ())
979 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
980 "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
981 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
982 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
985 return true;
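/* Worked example (hypothetical numbers): with a 16-byte target
   alignment, a base known to be 4 bytes past a 32-byte boundary and a
   DRB->INIT of 8, the total misalignment is 4 + 8 = 12, so
   DR_MISALIGNMENT becomes 12.  If the offset or step alignment had been
   smaller than 16, the misalignment would have stayed unknown.  */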
988 /* Function vect_update_misalignment_for_peel.
989 Sets DR's misalignment
990 - to 0 if it has the same alignment as DR_PEEL,
991 - to the misalignment computed using NPEEL if DR's misalignment is known,
992 - to -1 (unknown) otherwise.
994 DR - the data reference whose misalignment is to be adjusted.
995 DR_PEEL - the data reference whose misalignment is being made
996 zero in the vector loop by the peel.
997 NPEEL - the number of iterations in the peel loop if the misalignment
998 of DR_PEEL is known at compile time. */
1000 static void
1001 vect_update_misalignment_for_peel (struct data_reference *dr,
1002 struct data_reference *dr_peel, int npeel)
1004 unsigned int i;
1005 vec<dr_p> same_aligned_drs;
1006 struct data_reference *current_dr;
1007 int dr_size = vect_get_scalar_dr_size (dr);
1008 int dr_peel_size = vect_get_scalar_dr_size (dr_peel);
1009 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
1010 stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));
1012 /* For interleaved data accesses the step in the loop must be multiplied by
1013 the size of the interleaving group. */
1014 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1015 dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
1016 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
1017 dr_peel_size *= GROUP_SIZE (peel_stmt_info);
1019 /* It can be assumed that the data refs with the same alignment as dr_peel
1020 are aligned in the vector loop. */
1021 same_aligned_drs
1022 = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
1023 FOR_EACH_VEC_ELT (same_aligned_drs, i, current_dr)
1025 if (current_dr != dr)
1026 continue;
1027 gcc_assert (!known_alignment_for_access_p (dr)
1028 || !known_alignment_for_access_p (dr_peel)
1029 || (DR_MISALIGNMENT (dr) / dr_size
1030 == DR_MISALIGNMENT (dr_peel) / dr_peel_size));
1031 SET_DR_MISALIGNMENT (dr, 0);
1032 return;
1035 if (known_alignment_for_access_p (dr)
1036 && known_alignment_for_access_p (dr_peel))
1038 bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
1039 int misal = DR_MISALIGNMENT (dr);
1040 misal += negative ? -npeel * dr_size : npeel * dr_size;
1041 misal &= DR_TARGET_ALIGNMENT (dr) - 1;
1042 SET_DR_MISALIGNMENT (dr, misal);
1043 return;
1046 if (dump_enabled_p ())
1047 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1048 "to unknown (-1).\n");
1049 SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
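/* Worked example (hypothetical numbers): a DR with misalignment 8,
   scalar size 4, a 16-byte target alignment and a positive step,
   adjusted for NPEEL == 2 peeled iterations, becomes
   (8 + 2 * 4) & 15 == 0, i.e. aligned; with NPEEL == 1 it would
   become 12.  */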
1053 /* Function verify_data_ref_alignment
1055 Return TRUE if DR can be handled with respect to alignment. */
1057 static bool
1058 verify_data_ref_alignment (data_reference_p dr)
1060 enum dr_alignment_support supportable_dr_alignment
1061 = vect_supportable_dr_alignment (dr, false);
1062 if (!supportable_dr_alignment)
1064 if (dump_enabled_p ())
1066 if (DR_IS_READ (dr))
1067 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1068 "not vectorized: unsupported unaligned load.");
1069 else
1070 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1071 "not vectorized: unsupported unaligned "
1072 "store.");
1074 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
1075 DR_REF (dr));
1076 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1078 return false;
1081 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
1082 dump_printf_loc (MSG_NOTE, vect_location,
1083 "Vectorizing an unaligned access.\n");
1085 return true;
1088 /* Function vect_verify_datarefs_alignment
1090 Return TRUE if all data references in the loop can be
1091 handled with respect to alignment. */
1093 bool
1094 vect_verify_datarefs_alignment (loop_vec_info vinfo)
1096 vec<data_reference_p> datarefs = vinfo->datarefs;
1097 struct data_reference *dr;
1098 unsigned int i;
1100 FOR_EACH_VEC_ELT (datarefs, i, dr)
1102 gimple *stmt = DR_STMT (dr);
1103 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1105 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1106 continue;
1108 /* For interleaving, only the alignment of the first access matters. */
1109 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1110 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1111 continue;
1113 /* Strided accesses perform only component accesses, alignment is
1114 irrelevant for them. */
1115 if (STMT_VINFO_STRIDED_P (stmt_info)
1116 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1117 continue;
1119 if (! verify_data_ref_alignment (dr))
1120 return false;
1123 return true;
1126 /* Given a memory reference EXP return whether its alignment is less
1127 than its size. */
1129 static bool
1130 not_size_aligned (tree exp)
1132 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1133 return true;
1135 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1136 > get_object_alignment (exp));
1139 /* Function vector_alignment_reachable_p
1141 Return true if vector alignment for DR is reachable by peeling
1142 a few loop iterations. Return false otherwise. */
1144 static bool
1145 vector_alignment_reachable_p (struct data_reference *dr)
1147 gimple *stmt = DR_STMT (dr);
1148 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1149 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1151 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1153 /* For interleaved accesses we peel only if the number of iterations in
1154 the prolog loop (VF - misalignment) is a multiple of the
1155 number of interleaved accesses. */
1156 int elem_size, mis_in_elements;
1158 /* FORNOW: handle only known alignment. */
1159 if (!known_alignment_for_access_p (dr))
1160 return false;
1162 poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1163 poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
1164 elem_size = vector_element_size (vector_size, nelements);
1165 mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
1167 if (!multiple_p (nelements - mis_in_elements, GROUP_SIZE (stmt_info)))
1168 return false;
1171 /* If misalignment is known at the compile time then allow peeling
1172 only if natural alignment is reachable through peeling. */
1173 if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
1175 HOST_WIDE_INT elmsize =
1176 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1177 if (dump_enabled_p ())
1179 dump_printf_loc (MSG_NOTE, vect_location,
1180 "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1181 dump_printf (MSG_NOTE,
1182 ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
1184 if (DR_MISALIGNMENT (dr) % elmsize)
1186 if (dump_enabled_p ())
1187 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1188 "data size does not divide the misalignment.\n");
1189 return false;
1193 if (!known_alignment_for_access_p (dr))
1195 tree type = TREE_TYPE (DR_REF (dr));
1196 bool is_packed = not_size_aligned (DR_REF (dr));
1197 if (dump_enabled_p ())
1198 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1199 "Unknown misalignment, %snaturally aligned\n",
1200 is_packed ? "not " : "");
1201 return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1204 return true;
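/* Worked example for the grouped case (hypothetical numbers): with
   4 elements per vector, a known misalignment of 1 element and a group
   size of 2, the prologue would have to peel 3 scalar iterations, which
   is not a multiple of the group size, so peeling is rejected.  */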
1208 /* Calculate the cost of the memory access represented by DR. */
1210 static void
1211 vect_get_data_access_cost (struct data_reference *dr,
1212 unsigned int *inside_cost,
1213 unsigned int *outside_cost,
1214 stmt_vector_for_cost *body_cost_vec)
1216 gimple *stmt = DR_STMT (dr);
1217 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1218 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1219 int ncopies;
1221 if (PURE_SLP_STMT (stmt_info))
1222 ncopies = 1;
1223 else
1224 ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1226 if (DR_IS_READ (dr))
1227 vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1228 NULL, body_cost_vec, false);
1229 else
1230 vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
1232 if (dump_enabled_p ())
1233 dump_printf_loc (MSG_NOTE, vect_location,
1234 "vect_get_data_access_cost: inside_cost = %d, "
1235 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1239 typedef struct _vect_peel_info
1241 struct data_reference *dr;
1242 int npeel;
1243 unsigned int count;
1244 } *vect_peel_info;
1246 typedef struct _vect_peel_extended_info
1248 struct _vect_peel_info peel_info;
1249 unsigned int inside_cost;
1250 unsigned int outside_cost;
1251 } *vect_peel_extended_info;
1254 /* Peeling hashtable helpers. */
1256 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1258 static inline hashval_t hash (const _vect_peel_info *);
1259 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1262 inline hashval_t
1263 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1265 return (hashval_t) peel_info->npeel;
1268 inline bool
1269 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1271 return (a->npeel == b->npeel);
1275 /* Insert DR into peeling hash table with NPEEL as key. */
1277 static void
1278 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1279 loop_vec_info loop_vinfo, struct data_reference *dr,
1280 int npeel)
1282 struct _vect_peel_info elem, *slot;
1283 _vect_peel_info **new_slot;
1284 bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1286 elem.npeel = npeel;
1287 slot = peeling_htab->find (&elem);
1288 if (slot)
1289 slot->count++;
1290 else
1292 slot = XNEW (struct _vect_peel_info);
1293 slot->npeel = npeel;
1294 slot->dr = dr;
1295 slot->count = 1;
1296 new_slot = peeling_htab->find_slot (slot, INSERT);
1297 *new_slot = slot;
1300 if (!supportable_dr_alignment
1301 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1302 slot->count += VECT_MAX_COST;
1306 /* Traverse peeling hash table to find peeling option that aligns maximum
1307 number of data accesses. */
1310 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1311 _vect_peel_extended_info *max)
1313 vect_peel_info elem = *slot;
1315 if (elem->count > max->peel_info.count
1316 || (elem->count == max->peel_info.count
1317 && max->peel_info.npeel > elem->npeel))
1319 max->peel_info.npeel = elem->npeel;
1320 max->peel_info.count = elem->count;
1321 max->peel_info.dr = elem->dr;
1324 return 1;
1327 /* Get the costs of peeling NPEEL iterations checking data access costs
1328 for all data refs. If UNKNOWN_MISALIGNMENT is true, we assume DR0's
1329 misalignment will be zero after peeling. */
1331 static void
1332 vect_get_peeling_costs_all_drs (vec<data_reference_p> datarefs,
1333 struct data_reference *dr0,
1334 unsigned int *inside_cost,
1335 unsigned int *outside_cost,
1336 stmt_vector_for_cost *body_cost_vec,
1337 unsigned int npeel,
1338 bool unknown_misalignment)
1340 unsigned i;
1341 data_reference *dr;
1343 FOR_EACH_VEC_ELT (datarefs, i, dr)
1345 gimple *stmt = DR_STMT (dr);
1346 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1347 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1348 continue;
1350 /* For interleaving, only the alignment of the first access
1351 matters. */
1352 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1353 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1354 continue;
1356 /* Strided accesses perform only component accesses, alignment is
1357 irrelevant for them. */
1358 if (STMT_VINFO_STRIDED_P (stmt_info)
1359 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1360 continue;
1362 int save_misalignment;
1363 save_misalignment = DR_MISALIGNMENT (dr);
1364 if (npeel == 0)
1366 else if (unknown_misalignment && dr == dr0)
1367 SET_DR_MISALIGNMENT (dr, 0);
1368 else
1369 vect_update_misalignment_for_peel (dr, dr0, npeel);
1370 vect_get_data_access_cost (dr, inside_cost, outside_cost,
1371 body_cost_vec);
1372 SET_DR_MISALIGNMENT (dr, save_misalignment);
1376 /* Traverse peeling hash table and calculate cost for each peeling option.
1377 Find the one with the lowest cost. */
1380 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1381 _vect_peel_extended_info *min)
1383 vect_peel_info elem = *slot;
1384 int dummy;
1385 unsigned int inside_cost = 0, outside_cost = 0;
1386 gimple *stmt = DR_STMT (elem->dr);
1387 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1388 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1389 stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1390 epilogue_cost_vec;
1392 prologue_cost_vec.create (2);
1393 body_cost_vec.create (2);
1394 epilogue_cost_vec.create (2);
1396 vect_get_peeling_costs_all_drs (LOOP_VINFO_DATAREFS (loop_vinfo),
1397 elem->dr, &inside_cost, &outside_cost,
1398 &body_cost_vec, elem->npeel, false);
1400 body_cost_vec.release ();
1402 outside_cost += vect_get_known_peeling_cost
1403 (loop_vinfo, elem->npeel, &dummy,
1404 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1405 &prologue_cost_vec, &epilogue_cost_vec);
1407 /* Prologue and epilogue costs are added to the target model later.
1408 These costs depend only on the scalar iteration cost, the
1409 number of peeling iterations finally chosen, and the number of
1410 misaligned statements. So discard the information found here. */
1411 prologue_cost_vec.release ();
1412 epilogue_cost_vec.release ();
1414 if (inside_cost < min->inside_cost
1415 || (inside_cost == min->inside_cost
1416 && outside_cost < min->outside_cost))
1418 min->inside_cost = inside_cost;
1419 min->outside_cost = outside_cost;
1420 min->peel_info.dr = elem->dr;
1421 min->peel_info.npeel = elem->npeel;
1422 min->peel_info.count = elem->count;
1425 return 1;
1429 /* Choose best peeling option by traversing peeling hash table and either
1430 choosing an option with the lowest cost (if cost model is enabled) or the
1431 option that aligns as many accesses as possible. */
1433 static struct _vect_peel_extended_info
1434 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1435 loop_vec_info loop_vinfo)
1437 struct _vect_peel_extended_info res;
1439 res.peel_info.dr = NULL;
1441 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1443 res.inside_cost = INT_MAX;
1444 res.outside_cost = INT_MAX;
1445 peeling_htab->traverse <_vect_peel_extended_info *,
1446 vect_peeling_hash_get_lowest_cost> (&res);
1448 else
1450 res.peel_info.count = 0;
1451 peeling_htab->traverse <_vect_peel_extended_info *,
1452 vect_peeling_hash_get_most_frequent> (&res);
1453 res.inside_cost = 0;
1454 res.outside_cost = 0;
1457 return res;
1460 /* Return true if the new peeling NPEEL is supported. */
1462 static bool
1463 vect_peeling_supportable (loop_vec_info loop_vinfo, struct data_reference *dr0,
1464 unsigned npeel)
1466 unsigned i;
1467 struct data_reference *dr = NULL;
1468 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1469 gimple *stmt;
1470 stmt_vec_info stmt_info;
1471 enum dr_alignment_support supportable_dr_alignment;
1473 /* Ensure that all data refs can be vectorized after the peel. */
1474 FOR_EACH_VEC_ELT (datarefs, i, dr)
1476 int save_misalignment;
1478 if (dr == dr0)
1479 continue;
1481 stmt = DR_STMT (dr);
1482 stmt_info = vinfo_for_stmt (stmt);
1483 /* For interleaving, only the alignment of the first access
1484 matters. */
1485 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1486 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1487 continue;
1489 /* Strided accesses perform only component accesses, alignment is
1490 irrelevant for them. */
1491 if (STMT_VINFO_STRIDED_P (stmt_info)
1492 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1493 continue;
1495 save_misalignment = DR_MISALIGNMENT (dr);
1496 vect_update_misalignment_for_peel (dr, dr0, npeel);
1497 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1498 SET_DR_MISALIGNMENT (dr, save_misalignment);
1500 if (!supportable_dr_alignment)
1501 return false;
1504 return true;
1507 /* Function vect_enhance_data_refs_alignment
1509 This pass will use loop versioning and loop peeling in order to enhance
1510 the alignment of data references in the loop.
1512 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1513 original loop is to be vectorized. Any other loops that are created by
1514 the transformations performed in this pass - are not supposed to be
1515 vectorized. This restriction will be relaxed.
1517 This pass will require a cost model to guide it whether to apply peeling
1518 or versioning or a combination of the two. For example, the scheme that
1519 Intel uses when given a loop with several memory accesses is as follows:
1520 choose one memory access ('p') whose alignment you want to force by doing
1521 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1522 other accesses are not necessarily aligned, or (2) use loop versioning to
1523 generate one loop in which all accesses are aligned, and another loop in
1524 which only 'p' is necessarily aligned.
1526 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1527 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1528 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1530 Devising a cost model is the most critical aspect of this work. It will
1531 guide us on which access to peel for, whether to use loop versioning, how
1532 many versions to create, etc. The cost model will probably consist of
1533 generic considerations as well as target specific considerations (on
1534 powerpc for example, misaligned stores are more painful than misaligned
1535 loads).
1537 Here are the general steps involved in alignment enhancements:
1539 -- original loop, before alignment analysis:
1540 for (i=0; i<N; i++){
1541 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1542 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1545 -- After vect_compute_data_refs_alignment:
1546 for (i=0; i<N; i++){
1547 x = q[i]; # DR_MISALIGNMENT(q) = 3
1548 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1551 -- Possibility 1: we do loop versioning:
1552 if (p is aligned) {
1553 for (i=0; i<N; i++){ # loop 1A
1554 x = q[i]; # DR_MISALIGNMENT(q) = 3
1555 p[i] = y; # DR_MISALIGNMENT(p) = 0
1558 else {
1559 for (i=0; i<N; i++){ # loop 1B
1560 x = q[i]; # DR_MISALIGNMENT(q) = 3
1561 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1565 -- Possibility 2: we do loop peeling:
1566 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1567 x = q[i];
1568 p[i] = y;
1570 for (i = 3; i < N; i++){ # loop 2A
1571 x = q[i]; # DR_MISALIGNMENT(q) = 0
1572 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1575 -- Possibility 3: combination of loop peeling and versioning:
1576 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1577 x = q[i];
1578 p[i] = y;
1580 if (p is aligned) {
1581 for (i = 3; i<N; i++){ # loop 3A
1582 x = q[i]; # DR_MISALIGNMENT(q) = 0
1583 p[i] = y; # DR_MISALIGNMENT(p) = 0
1586 else {
1587 for (i = 3; i<N; i++){ # loop 3B
1588 x = q[i]; # DR_MISALIGNMENT(q) = 0
1589 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1593 These loops are later passed to loop_transform to be vectorized. The
1594 vectorizer will use the alignment information to guide the transformation
1595 (whether to generate regular loads/stores, or with special handling for
1596 misalignment). */
1598 bool
1599 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1601 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1602 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1603 enum dr_alignment_support supportable_dr_alignment;
1604 struct data_reference *dr0 = NULL, *first_store = NULL;
1605 struct data_reference *dr;
1606 unsigned int i, j;
1607 bool do_peeling = false;
1608 bool do_versioning = false;
1609 bool stat;
1610 gimple *stmt;
1611 stmt_vec_info stmt_info;
1612 unsigned int npeel = 0;
1613 bool one_misalignment_known = false;
1614 bool one_misalignment_unknown = false;
1615 bool one_dr_unsupportable = false;
1616 struct data_reference *unsupportable_dr = NULL;
1617 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1618 unsigned possible_npeel_number = 1;
1619 tree vectype;
1620 unsigned int mis, same_align_drs_max = 0;
1621 hash_table<peel_info_hasher> peeling_htab (1);
1623 if (dump_enabled_p ())
1624 dump_printf_loc (MSG_NOTE, vect_location,
1625 "=== vect_enhance_data_refs_alignment ===\n");
1627 /* Reset data so we can safely be called multiple times. */
1628 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1629 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1631 /* While cost model enhancements are expected in the future, the high level
1632 view of the code at this time is as follows:
1634 A) If there is a misaligned access then see if peeling to align
1635 this access can make all data references satisfy
1636 vect_supportable_dr_alignment. If so, update data structures
1637 as needed and return true.
1639 B) If peeling wasn't possible and there is a data reference with an
1640 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1641 then see if loop versioning checks can be used to make all data
1642 references satisfy vect_supportable_dr_alignment. If so, update
1643 data structures as needed and return true.
1645 C) If neither peeling nor versioning were successful then return false if
1646 any data reference does not satisfy vect_supportable_dr_alignment.
1648 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1650 Note, Possibility 3 above (which is peeling and versioning together) is not
1651 being done at this time. */
1653 /* (1) Peeling to force alignment. */
1655 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1656 Considerations:
1657 + How many accesses will become aligned due to the peeling
1658 - How many accesses will become unaligned due to the peeling,
1659 and the cost of misaligned accesses.
1660 - The cost of peeling (the extra runtime checks, the increase
1661 in code size). */
1663 FOR_EACH_VEC_ELT (datarefs, i, dr)
1665 stmt = DR_STMT (dr);
1666 stmt_info = vinfo_for_stmt (stmt);
1668 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1669 continue;
1671 /* For interleaving, only the alignment of the first access
1672 matters. */
1673 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1674 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1675 continue;
1677 /* For invariant accesses there is nothing to enhance. */
1678 if (integer_zerop (DR_STEP (dr)))
1679 continue;
1681 /* Strided accesses perform only component accesses, alignment is
1682 irrelevant for them. */
1683 if (STMT_VINFO_STRIDED_P (stmt_info)
1684 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1685 continue;
1687 supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1688 do_peeling = vector_alignment_reachable_p (dr);
1689 if (do_peeling)
1691 if (known_alignment_for_access_p (dr))
1693 unsigned int npeel_tmp = 0;
1694 bool negative = tree_int_cst_compare (DR_STEP (dr),
1695 size_zero_node) < 0;
1697 vectype = STMT_VINFO_VECTYPE (stmt_info);
1698 unsigned int target_align = DR_TARGET_ALIGNMENT (dr);
1699 unsigned int dr_size = vect_get_scalar_dr_size (dr);
1700 mis = (negative ? DR_MISALIGNMENT (dr) : -DR_MISALIGNMENT (dr));
1701 if (DR_MISALIGNMENT (dr) != 0)
1702 npeel_tmp = (mis & (target_align - 1)) / dr_size;
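/* Worked example (hypothetical numbers): for an int access misaligned
   by 4 bytes against a 16-byte target alignment and a positive step,
   MIS is -4, (MIS & 15) is 12, and 12 / 4 gives NPEEL_TMP == 3 scalar
   iterations to reach the target alignment.  */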
1704 /* For multiple types, it is possible that the bigger type access
1705 will have more than one peeling option. E.g., a loop with two
1706 types: one of size (vector size / 4), and the other one of
1707 size (vector size / 8). Vectorization factor will be 8. If both
1708 accesses are misaligned by 3, the first one needs one scalar
1709 iteration to be aligned, and the second one needs 5. But the
1710 first one will be aligned also by peeling 5 scalar
1711 iterations, and in that case both accesses will be aligned.
1712 Hence, except for the immediate peeling amount, we also want
1713 to try to add a full vector's worth of scalar iterations, while we
1714 don't exceed the vectorization factor.
1715 We do this automatically for the cost model, since we calculate
1716 the cost for every peeling option. */
1717 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1719 poly_uint64 nscalars = (STMT_SLP_TYPE (stmt_info)
1720 ? vf * GROUP_SIZE (stmt_info) : vf);
1721 possible_npeel_number
1722 = vect_get_num_vectors (nscalars, vectype);
1724 /* NPEEL_TMP is 0 when there is no misalignment, but also
1725 allow peeling NELEMENTS. */
1726 if (DR_MISALIGNMENT (dr) == 0)
1727 possible_npeel_number++;
1730 /* Save info about DR in the hash table. Also include peeling
1731 amounts according to the explanation above. */
1732 for (j = 0; j < possible_npeel_number; j++)
1734 vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1735 dr, npeel_tmp);
1736 npeel_tmp += target_align / dr_size;
1739 one_misalignment_known = true;
1741 else
1743 /* If we don't know any misalignment values, we prefer
1744 peeling for the data-ref that has the maximum number of data-refs
1745 with the same alignment, unless the target prefers to align
1746 stores over loads. */
1747 unsigned same_align_drs
1748 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1749 if (!dr0
1750 || same_align_drs_max < same_align_drs)
1752 same_align_drs_max = same_align_drs;
1753 dr0 = dr;
1755 /* For data-refs with the same number of related
1756 accesses prefer the one where the misalign
1757 computation will be invariant in the outermost loop. */
1758 else if (same_align_drs_max == same_align_drs)
1760 struct loop *ivloop0, *ivloop;
1761 ivloop0 = outermost_invariant_loop_for_expr
1762 (loop, DR_BASE_ADDRESS (dr0));
1763 ivloop = outermost_invariant_loop_for_expr
1764 (loop, DR_BASE_ADDRESS (dr));
1765 if ((ivloop && !ivloop0)
1766 || (ivloop && ivloop0
1767 && flow_loop_nested_p (ivloop, ivloop0)))
1768 dr0 = dr;
1771 one_misalignment_unknown = true;
1773 /* Check for data refs with unsupportable alignment that
1774 can be peeled. */
1775 if (!supportable_dr_alignment)
1777 one_dr_unsupportable = true;
1778 unsupportable_dr = dr;
1781 if (!first_store && DR_IS_WRITE (dr))
1782 first_store = dr;
1785 else
1787 if (!aligned_access_p (dr))
1789 if (dump_enabled_p ())
1790 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1791 "vector alignment may not be reachable\n");
1792 break;
1797 /* Check if we can possibly peel the loop. */
1798 if (!vect_can_advance_ivs_p (loop_vinfo)
1799 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1800 || loop->inner)
1801 do_peeling = false;
1803 struct _vect_peel_extended_info peel_for_known_alignment;
1804 struct _vect_peel_extended_info peel_for_unknown_alignment;
1805 struct _vect_peel_extended_info best_peel;
1807 peel_for_unknown_alignment.inside_cost = INT_MAX;
1808 peel_for_unknown_alignment.outside_cost = INT_MAX;
1809 peel_for_unknown_alignment.peel_info.count = 0;
1811 if (do_peeling
1812 && one_misalignment_unknown)
1814 /* Check if the target requires us to prefer stores over loads, i.e., if
1815 misaligned stores are more expensive than misaligned loads (taking
1816 drs with the same alignment into account). */
1817 unsigned int load_inside_cost = 0;
1818 unsigned int load_outside_cost = 0;
1819 unsigned int store_inside_cost = 0;
1820 unsigned int store_outside_cost = 0;
1821 unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;
1823 stmt_vector_for_cost dummy;
1824 dummy.create (2);
1825 vect_get_peeling_costs_all_drs (datarefs, dr0,
1826 &load_inside_cost,
1827 &load_outside_cost,
1828 &dummy, estimated_npeels, true);
1829 dummy.release ();
1831 if (first_store)
1833 dummy.create (2);
1834 vect_get_peeling_costs_all_drs (datarefs, first_store,
1835 &store_inside_cost,
1836 &store_outside_cost,
1837 &dummy, estimated_npeels, true);
1838 dummy.release ();
1840 else
1842 store_inside_cost = INT_MAX;
1843 store_outside_cost = INT_MAX;
1846 if (load_inside_cost > store_inside_cost
1847 || (load_inside_cost == store_inside_cost
1848 && load_outside_cost > store_outside_cost))
1850 dr0 = first_store;
1851 peel_for_unknown_alignment.inside_cost = store_inside_cost;
1852 peel_for_unknown_alignment.outside_cost = store_outside_cost;
1854 else
1856 peel_for_unknown_alignment.inside_cost = load_inside_cost;
1857 peel_for_unknown_alignment.outside_cost = load_outside_cost;
1860 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1861 prologue_cost_vec.create (2);
1862 epilogue_cost_vec.create (2);
1864 int dummy2;
1865 peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
1866 (loop_vinfo, estimated_npeels, &dummy2,
1867 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1868 &prologue_cost_vec, &epilogue_cost_vec);
1870 prologue_cost_vec.release ();
1871 epilogue_cost_vec.release ();
1873 peel_for_unknown_alignment.peel_info.count = 1
1874 + STMT_VINFO_SAME_ALIGN_REFS
1875 (vinfo_for_stmt (DR_STMT (dr0))).length ();
1878 peel_for_unknown_alignment.peel_info.npeel = 0;
1879 peel_for_unknown_alignment.peel_info.dr = dr0;
1881 best_peel = peel_for_unknown_alignment;
1883 peel_for_known_alignment.inside_cost = INT_MAX;
1884 peel_for_known_alignment.outside_cost = INT_MAX;
1885 peel_for_known_alignment.peel_info.count = 0;
1886 peel_for_known_alignment.peel_info.dr = NULL;
1888 if (do_peeling && one_misalignment_known)
1890 /* Peeling is possible, but there is no data access that can only be
1891 supported when aligned. So we try to choose the best possible peeling
1892 from the hash table. */
1893 peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
1894 (&peeling_htab, loop_vinfo);
1897 /* Compare costs of peeling for known and unknown alignment. */
1898 if (peel_for_known_alignment.peel_info.dr != NULL
1899 && peel_for_unknown_alignment.inside_cost
1900 >= peel_for_known_alignment.inside_cost)
1902 best_peel = peel_for_known_alignment;
1904 /* If the best peeling for known alignment has NPEEL == 0, perform no
1905 peeling at all except if there is an unsupportable dr that we can
1906 align. */
1907 if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
1908 do_peeling = false;
1911 /* If there is an unsupportable data ref, prefer this over all choices so far
1912 since we'd have to discard a chosen peeling except when it accidentally
1913 aligned the unsupportable data ref. */
1914 if (one_dr_unsupportable)
1915 dr0 = unsupportable_dr;
1916 else if (do_peeling)
1918 /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
1919 TODO: Use nopeel_outside_cost or get rid of it? */
1920 unsigned nopeel_inside_cost = 0;
1921 unsigned nopeel_outside_cost = 0;
1923 stmt_vector_for_cost dummy;
1924 dummy.create (2);
1925 vect_get_peeling_costs_all_drs (datarefs, NULL, &nopeel_inside_cost,
1926 &nopeel_outside_cost, &dummy, 0, false);
1927 dummy.release ();
1929 /* Add epilogue costs. As we do not peel for alignment here, no prologue
1930 costs will be recorded. */
1931 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1932 prologue_cost_vec.create (2);
1933 epilogue_cost_vec.create (2);
1935 int dummy2;
1936 nopeel_outside_cost += vect_get_known_peeling_cost
1937 (loop_vinfo, 0, &dummy2,
1938 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1939 &prologue_cost_vec, &epilogue_cost_vec);
1941 prologue_cost_vec.release ();
1942 epilogue_cost_vec.release ();
1944 npeel = best_peel.peel_info.npeel;
1945 dr0 = best_peel.peel_info.dr;
1947 /* If not peeling is no more expensive than the best peeling found
1948 so far, don't perform any peeling. */
1949 if (nopeel_inside_cost <= best_peel.inside_cost)
1950 do_peeling = false;
1953 if (do_peeling)
1955 stmt = DR_STMT (dr0);
1956 stmt_info = vinfo_for_stmt (stmt);
1957 vectype = STMT_VINFO_VECTYPE (stmt_info);
1959 if (known_alignment_for_access_p (dr0))
1961 bool negative = tree_int_cst_compare (DR_STEP (dr0),
1962 size_zero_node) < 0;
1963 if (!npeel)
1965 /* Since it's known at compile time, compute the number of
1966 iterations in the peeled loop (the peeling factor) for use in
1967 updating DR_MISALIGNMENT values. The peeling factor is the
1968 vectorization factor minus the misalignment as an element
1969 count. */
1970 mis = negative ? DR_MISALIGNMENT (dr0) : -DR_MISALIGNMENT (dr0);
1971 unsigned int target_align = DR_TARGET_ALIGNMENT (dr0);
1972 npeel = ((mis & (target_align - 1))
1973 / vect_get_scalar_dr_size (dr0));
1976 /* For interleaved data access every iteration accesses all the
1977 members of the group, therefore we divide the number of iterations
1978 by the group size. */
1979 stmt_info = vinfo_for_stmt (DR_STMT (dr0));
1980 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1981 npeel /= GROUP_SIZE (stmt_info);
1983 if (dump_enabled_p ())
1984 dump_printf_loc (MSG_NOTE, vect_location,
1985 "Try peeling by %d\n", npeel);
1988 /* Ensure that all datarefs can be vectorized after the peel. */
1989 if (!vect_peeling_supportable (loop_vinfo, dr0, npeel))
1990 do_peeling = false;
1992 /* Check if all datarefs are supportable and log. */
1993 if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
1995 stat = vect_verify_datarefs_alignment (loop_vinfo);
1996 if (!stat)
1997 do_peeling = false;
1998 else
1999 return stat;
2002 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
2003 if (do_peeling)
2005 unsigned max_allowed_peel
2006 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
2007 if (max_allowed_peel != (unsigned)-1)
2009 unsigned max_peel = npeel;
2010 if (max_peel == 0)
2012 unsigned int target_align = DR_TARGET_ALIGNMENT (dr0);
2013 max_peel = target_align / vect_get_scalar_dr_size (dr0) - 1;
2015 if (max_peel > max_allowed_peel)
2017 do_peeling = false;
2018 if (dump_enabled_p ())
2019 dump_printf_loc (MSG_NOTE, vect_location,
2020 "Disable peeling, max peels reached: %d\n", max_peel);
2025 /* Cost model #2 - if peeling may result in a remaining loop not
2026 iterating enough to be vectorized then do not peel. Since this
2027 is a cost heuristic rather than a correctness decision, use the
2028 most likely runtime value for variable vectorization factors. */
2029 if (do_peeling
2030 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2032 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2033 unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
2034 if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
2035 < assumed_vf + max_peel)
2036 do_peeling = false;
2039 if (do_peeling)
2041 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2042 If the misalignment of DR_i is identical to that of dr0 then set
2043 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
2044 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2045 by the peeling factor times the element size of DR_i (MOD the
2046 vectorization factor times the size). Otherwise, the
2047 misalignment of DR_i must be set to unknown. */
2048 FOR_EACH_VEC_ELT (datarefs, i, dr)
2049 if (dr != dr0)
2051 /* Strided accesses perform only component accesses, alignment
2052 is irrelevant for them. */
2053 stmt_info = vinfo_for_stmt (DR_STMT (dr));
2054 if (STMT_VINFO_STRIDED_P (stmt_info)
2055 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2056 continue;
2058 vect_update_misalignment_for_peel (dr, dr0, npeel);
2061 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
2062 if (npeel)
2063 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2064 else
2065 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2066 = DR_MISALIGNMENT (dr0);
2067 SET_DR_MISALIGNMENT (dr0, 0);
2068 if (dump_enabled_p ())
2070 dump_printf_loc (MSG_NOTE, vect_location,
2071 "Alignment of access forced using peeling.\n");
2072 dump_printf_loc (MSG_NOTE, vect_location,
2073 "Peeling for alignment will be applied.\n");
2076 /* The inside-loop cost will be accounted for in vectorizable_load
2077 and vectorizable_store correctly with adjusted alignments.
2078 Drop the body_cst_vec on the floor here. */
2079 stat = vect_verify_datarefs_alignment (loop_vinfo);
2080 gcc_assert (stat);
2081 return stat;
2085 /* (2) Versioning to force alignment. */
2087 /* Try versioning if:
2088 1) optimize loop for speed
2089 2) there is at least one unsupported misaligned data ref with an unknown
2090 misalignment, and
2091 3) all misaligned data refs with a known misalignment are supported, and
2092 4) the number of runtime alignment checks is within reason. */
2094 do_versioning =
2095 optimize_loop_nest_for_speed_p (loop)
2096 && (!loop->inner); /* FORNOW */
2098 if (do_versioning)
2100 FOR_EACH_VEC_ELT (datarefs, i, dr)
2102 stmt = DR_STMT (dr);
2103 stmt_info = vinfo_for_stmt (stmt);
2105 /* For interleaving, only the alignment of the first access
2106 matters. */
2107 if (aligned_access_p (dr)
2108 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2109 && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
2110 continue;
2112 if (STMT_VINFO_STRIDED_P (stmt_info))
2114 /* Strided loads perform only component accesses, alignment is
2115 irrelevant for them. */
2116 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
2117 continue;
2118 do_versioning = false;
2119 break;
2122 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
2124 if (!supportable_dr_alignment)
2126 gimple *stmt;
2127 int mask;
2128 tree vectype;
2130 if (known_alignment_for_access_p (dr)
2131 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2132 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
2134 do_versioning = false;
2135 break;
2138 stmt = DR_STMT (dr);
2139 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
2140 gcc_assert (vectype);
2142 /* The rightmost bits of an aligned address must be zeros.
2143 Construct the mask needed for this test. For example,
2144 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2145 mask must be 15 = 0xf. */
2146 mask = GET_MODE_SIZE (TYPE_MODE (vectype)) - 1;
2148 /* FORNOW: use the same mask to test all potentially unaligned
2149 references in the loop. The vectorizer currently supports
2150 a single vector size, see the reference to
2151 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
2152 vectorization factor is computed. */
2153 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
2154 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
2155 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2156 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
2157 DR_STMT (dr));
2161 /* Versioning requires at least one misaligned data reference. */
2162 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2163 do_versioning = false;
2164 else if (!do_versioning)
2165 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2168 if (do_versioning)
2170 vec<gimple *> may_misalign_stmts
2171 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2172 gimple *stmt;
2174 /* It can now be assumed that the data references in the statements
2175 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2176 of the loop being vectorized. */
2177 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
2179 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2180 dr = STMT_VINFO_DATA_REF (stmt_info);
2181 SET_DR_MISALIGNMENT (dr, 0);
2182 if (dump_enabled_p ())
2183 dump_printf_loc (MSG_NOTE, vect_location,
2184 "Alignment of access forced using versioning.\n");
2187 if (dump_enabled_p ())
2188 dump_printf_loc (MSG_NOTE, vect_location,
2189 "Versioning for alignment will be applied.\n");
2191 /* Peeling and versioning can't be done together at this time. */
2192 gcc_assert (! (do_peeling && do_versioning));
2194 stat = vect_verify_datarefs_alignment (loop_vinfo);
2195 gcc_assert (stat);
2196 return stat;
2199 /* This point is reached if neither peeling nor versioning is being done. */
2200 gcc_assert (! (do_peeling || do_versioning));
2202 stat = vect_verify_datarefs_alignment (loop_vinfo);
2203 return stat;
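/* A minimal standalone sketch of the peeling arithmetic used above, on the
   assumption of a power-of-two TARGET_ALIGN in bytes, a byte misalignment
   MIS of the first access and a positive step; the names below are
   illustrative only, not GCC internals.  Peeling NPEEL scalar iterations
   advances the address by NPEEL * ELEM_SIZE bytes, so we want the smallest
   NPEEL with (MIS + NPEEL * ELEM_SIZE) % TARGET_ALIGN == 0.  */

static unsigned int
example_npeel_for_alignment (unsigned int mis, unsigned int elem_size,
			     unsigned int target_align)
{
  /* (-MIS) & (TARGET_ALIGN - 1) is the number of bytes still missing up
     to the next aligned address, the same form as the code above.  */
  unsigned int missing_bytes = (-mis) & (target_align - 1);
  return missing_bytes / elem_size;
}

/* For example, a 4-byte access misaligned by 12 bytes against a 16-byte
   target alignment needs (16 - 12) / 4 == 1 peeled iteration, and an
   already aligned access needs 0.  */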
2207 /* Function vect_find_same_alignment_drs.
2209 Update group and alignment relations according to the chosen
2210 vectorization factor. */
2212 static void
2213 vect_find_same_alignment_drs (struct data_dependence_relation *ddr)
2215 struct data_reference *dra = DDR_A (ddr);
2216 struct data_reference *drb = DDR_B (ddr);
2217 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2218 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2220 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
2221 return;
2223 if (dra == drb)
2224 return;
2226 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)
2227 || !operand_equal_p (DR_OFFSET (dra), DR_OFFSET (drb), 0)
2228 || !operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2229 return;
2231 /* Two references with distance zero have the same alignment. */
2232 offset_int diff = (wi::to_offset (DR_INIT (dra))
2233 - wi::to_offset (DR_INIT (drb)));
2234 if (diff != 0)
2236 /* Get the wider of the two alignments. */
2237 unsigned int align_a = (vect_calculate_target_alignment (dra)
2238 / BITS_PER_UNIT);
2239 unsigned int align_b = (vect_calculate_target_alignment (drb)
2240 / BITS_PER_UNIT);
2241 unsigned int max_align = MAX (align_a, align_b);
2243 /* Require the gap to be a multiple of the larger vector alignment. */
2244 if (!wi::multiple_of_p (diff, max_align, SIGNED))
2245 return;
2248 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2249 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
2250 if (dump_enabled_p ())
2252 dump_printf_loc (MSG_NOTE, vect_location,
2253 "accesses have the same alignment: ");
2254 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2255 dump_printf (MSG_NOTE, " and ");
2256 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2257 dump_printf (MSG_NOTE, "\n");
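/* A small self-contained sketch of the distance test above, assuming two
   accesses that already share base, offset and step and differ only by
   constant byte offsets INIT_A and INIT_B; MAX_ALIGN is the larger of the
   two target alignments in bytes.  The names are illustrative only.  */

static bool
example_same_alignment_p (long init_a, long init_b, long max_align)
{
  long diff = init_a - init_b;
  if (diff < 0)
    diff = -diff;
  /* Distance zero, or a gap that is a multiple of the larger vector
     alignment, keeps the two accesses equally (mis)aligned.  */
  return diff == 0 || diff % max_align == 0;
}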
2262 /* Function vect_analyze_data_refs_alignment
2264 Analyze the alignment of the data-references in the loop.
2265 Return FALSE if a data reference is found that cannot be vectorized. */
2267 bool
2268 vect_analyze_data_refs_alignment (loop_vec_info vinfo)
2270 if (dump_enabled_p ())
2271 dump_printf_loc (MSG_NOTE, vect_location,
2272 "=== vect_analyze_data_refs_alignment ===\n");
2274 /* Mark groups of data references with same alignment using
2275 data dependence information. */
2276 vec<ddr_p> ddrs = vinfo->ddrs;
2277 struct data_dependence_relation *ddr;
2278 unsigned int i;
2280 FOR_EACH_VEC_ELT (ddrs, i, ddr)
2281 vect_find_same_alignment_drs (ddr);
2283 vec<data_reference_p> datarefs = vinfo->datarefs;
2284 struct data_reference *dr;
2286 vect_record_base_alignments (vinfo);
2287 FOR_EACH_VEC_ELT (datarefs, i, dr)
2289 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
2290 if (STMT_VINFO_VECTORIZABLE (stmt_info)
2291 && !vect_compute_data_ref_alignment (dr))
2293 /* Strided accesses perform only component accesses, misalignment
2294 information is irrelevant for them. */
2295 if (STMT_VINFO_STRIDED_P (stmt_info)
2296 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2297 continue;
2299 if (dump_enabled_p ())
2300 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2301 "not vectorized: can't calculate alignment "
2302 "for data ref.\n");
2304 return false;
2308 return true;
2312 /* Analyze alignment of DRs of stmts in NODE. */
2314 static bool
2315 vect_slp_analyze_and_verify_node_alignment (slp_tree node)
2317 /* We vectorize from the first scalar stmt in the node unless
2318 the node is permuted in which case we start from the first
2319 element in the group. */
2320 gimple *first_stmt = SLP_TREE_SCALAR_STMTS (node)[0];
2321 data_reference_p first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2322 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2323 first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_stmt));
2325 data_reference_p dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2326 if (! vect_compute_data_ref_alignment (dr)
2327 /* For creating the data-ref pointer we need alignment of the
2328 first element anyway. */
2329 || (dr != first_dr
2330 && ! vect_compute_data_ref_alignment (first_dr))
2331 || ! verify_data_ref_alignment (dr))
2333 if (dump_enabled_p ())
2334 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2335 "not vectorized: bad data alignment in basic "
2336 "block.\n");
2337 return false;
2340 return true;
2343 /* Function vect_slp_analyze_instance_alignment
2345 Analyze the alignment of the data-references in the SLP instance.
2346 Return FALSE if a data reference is found that cannot be vectorized. */
2348 bool
2349 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
2351 if (dump_enabled_p ())
2352 dump_printf_loc (MSG_NOTE, vect_location,
2353 "=== vect_slp_analyze_and_verify_instance_alignment ===\n");
2355 slp_tree node;
2356 unsigned i;
2357 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2358 if (! vect_slp_analyze_and_verify_node_alignment (node))
2359 return false;
2361 node = SLP_INSTANCE_TREE (instance);
2362 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]))
2363 && ! vect_slp_analyze_and_verify_node_alignment
2364 (SLP_INSTANCE_TREE (instance)))
2365 return false;
2367 return true;
2371 /* Analyze groups of accesses: check that DR belongs to a group of
2372 accesses of legal size, step, etc. Detect gaps, single element
2373 interleaving, and other special cases. Set grouped access info.
2374 Collect groups of strided stores for further use in SLP analysis.
2375 Worker for vect_analyze_group_access. */
2377 static bool
2378 vect_analyze_group_access_1 (struct data_reference *dr)
2380 tree step = DR_STEP (dr);
2381 tree scalar_type = TREE_TYPE (DR_REF (dr));
2382 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2383 gimple *stmt = DR_STMT (dr);
2384 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2385 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2386 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2387 HOST_WIDE_INT dr_step = -1;
2388 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2389 bool slp_impossible = false;
2391 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2392 size of the interleaving group (including gaps). */
2393 if (tree_fits_shwi_p (step))
2395 dr_step = tree_to_shwi (step);
2396 /* Check that STEP is a multiple of the type size. Otherwise there is
2397 a non-element-sized gap at the end of the group which we
2398 cannot represent in GROUP_GAP or GROUP_SIZE.
2399 ??? As we can handle a non-constant step fine here, we should
2400 simply remove uses of GROUP_GAP between the last and first
2401 element and instead rely on DR_STEP. GROUP_SIZE then would
2402 simply not include that gap. */
2403 if ((dr_step % type_size) != 0)
2405 if (dump_enabled_p ())
2407 dump_printf_loc (MSG_NOTE, vect_location,
2408 "Step ");
2409 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2410 dump_printf (MSG_NOTE,
2411 " is not a multiple of the element size for ");
2412 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2413 dump_printf (MSG_NOTE, "\n");
2415 return false;
2417 groupsize = absu_hwi (dr_step) / type_size;
2419 else
2420 groupsize = 0;
2422 /* A non-consecutive access is possible only if it is part of an interleaving group. */
2423 if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2425 /* Check if this DR is part of an interleaving group, and is a single
2426 element of the group that is accessed in the loop. */
2428 /* Gaps are supported only for loads. STEP must be a multiple of the type
2429 size. The size of the group must be a power of 2. */
2430 if (DR_IS_READ (dr)
2431 && (dr_step % type_size) == 0
2432 && groupsize > 0
2433 && pow2p_hwi (groupsize))
2435 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2436 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2437 GROUP_GAP (stmt_info) = groupsize - 1;
2438 if (dump_enabled_p ())
2440 dump_printf_loc (MSG_NOTE, vect_location,
2441 "Detected single element interleaving ");
2442 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2443 dump_printf (MSG_NOTE, " step ");
2444 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2445 dump_printf (MSG_NOTE, "\n");
2448 return true;
2451 if (dump_enabled_p ())
2453 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2454 "not consecutive access ");
2455 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2458 if (bb_vinfo)
2460 /* Mark the statement as unvectorizable. */
2461 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2462 return true;
2465 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2466 STMT_VINFO_STRIDED_P (stmt_info) = true;
2467 return true;
2470 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2472 /* First stmt in the interleaving chain. Check the chain. */
2473 gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2474 struct data_reference *data_ref = dr;
2475 unsigned int count = 1;
2476 tree prev_init = DR_INIT (data_ref);
2477 gimple *prev = stmt;
2478 HOST_WIDE_INT diff, gaps = 0;
2480 while (next)
2482 /* Skip same data-refs. In case two or more stmts share a
2483 data-ref (supported only for loads), we vectorize only the first
2484 stmt, and the rest get their vectorized loads from the first
2485 one. */
2486 if (!tree_int_cst_compare (DR_INIT (data_ref),
2487 DR_INIT (STMT_VINFO_DATA_REF (
2488 vinfo_for_stmt (next)))))
2490 if (DR_IS_WRITE (data_ref))
2492 if (dump_enabled_p ())
2493 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2494 "Two store stmts share the same dr.\n");
2495 return false;
2498 if (dump_enabled_p ())
2499 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2500 "Two or more load stmts share the same dr.\n");
2502 /* For load use the same data-ref load. */
2503 GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2505 prev = next;
2506 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2507 continue;
2510 prev = next;
2511 data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2513 /* All group members have the same STEP by construction. */
2514 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2516 /* Check that the distance between two accesses is equal to the type
2517 size. Otherwise, we have gaps. */
2518 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2519 - TREE_INT_CST_LOW (prev_init)) / type_size;
2520 if (diff != 1)
2522 /* FORNOW: SLP of accesses with gaps is not supported. */
2523 slp_impossible = true;
2524 if (DR_IS_WRITE (data_ref))
2526 if (dump_enabled_p ())
2527 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2528 "interleaved store with gaps\n");
2529 return false;
2532 gaps += diff - 1;
2535 last_accessed_element += diff;
2537 /* Store the gap from the previous member of the group. If there is no
2538 gap in the access, GROUP_GAP is always 1. */
2539 GROUP_GAP (vinfo_for_stmt (next)) = diff;
2541 prev_init = DR_INIT (data_ref);
2542 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2543 /* Count the number of data-refs in the chain. */
2544 count++;
2547 if (groupsize == 0)
2548 groupsize = count + gaps;
2550 /* This could be UINT_MAX but as we are generating code in a very
2551 inefficient way we have to cap earlier. See PR78699 for example. */
2552 if (groupsize > 4096)
2554 if (dump_enabled_p ())
2555 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2556 "group is too large\n");
2557 return false;
2560 /* Check that the size of the interleaving is equal to count for stores,
2561 i.e., that there are no gaps. */
2562 if (groupsize != count
2563 && !DR_IS_READ (dr))
2565 if (dump_enabled_p ())
2566 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2567 "interleaved store with gaps\n");
2568 return false;
2571 /* If there is a gap after the last load in the group it is the
2572 difference between the groupsize and the last accessed
2573 element.
2574 When there is no gap, this difference should be 0. */
2575 GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - last_accessed_element;
2577 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2578 if (dump_enabled_p ())
2580 dump_printf_loc (MSG_NOTE, vect_location,
2581 "Detected interleaving ");
2582 if (DR_IS_READ (dr))
2583 dump_printf (MSG_NOTE, "load ");
2584 else
2585 dump_printf (MSG_NOTE, "store ");
2586 dump_printf (MSG_NOTE, "of size %u starting with ",
2587 (unsigned)groupsize);
2588 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
2589 if (GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
2590 dump_printf_loc (MSG_NOTE, vect_location,
2591 "There is a gap of %u elements after the group\n",
2592 GROUP_GAP (vinfo_for_stmt (stmt)));
2595 /* SLP: create an SLP data structure for every interleaving group of
2596 stores for further analysis in vect_analyze_slp. */
2597 if (DR_IS_WRITE (dr) && !slp_impossible)
2599 if (loop_vinfo)
2600 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2601 if (bb_vinfo)
2602 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2606 return true;
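/* An illustrative, self-contained reduction of the bookkeeping above for a
   constant-step load group.  INITS holds the constant byte offsets of the
   group members in sorted order, STEP is the constant step in bytes and
   TYPE_SIZE the element size; the names are not GCC internals.  Returns
   the gap (in elements) left after the last member, i.e. what the code
   above records as the GROUP_GAP of the first statement.  */

static long
example_trailing_gap (const long *inits, unsigned int count,
		      long step, long type_size)
{
  long groupsize = (step < 0 ? -step : step) / type_size;
  long last_accessed_element = 1;
  for (unsigned int k = 1; k < count; k++)
    /* Distance to the previous member in elements; 1 means no gap.  */
    last_accessed_element += (inits[k] - inits[k - 1]) / type_size;
  return groupsize - last_accessed_element;
}

/* E.g. int loads of a[4*i] and a[4*i+1] give inits {0, 4}, step 16 and
   type_size 4, hence groupsize 4 and a trailing gap of 2 elements.  */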
2609 /* Analyze groups of accesses: check that DR belongs to a group of
2610 accesses of legal size, step, etc. Detect gaps, single element
2611 interleaving, and other special cases. Set grouped access info.
2612 Collect groups of strided stores for further use in SLP analysis. */
2614 static bool
2615 vect_analyze_group_access (struct data_reference *dr)
2617 if (!vect_analyze_group_access_1 (dr))
2619 /* Dissolve the group if present. */
2620 gimple *next;
2621 gimple *stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dr)));
2622 while (stmt)
2624 stmt_vec_info vinfo = vinfo_for_stmt (stmt);
2625 next = GROUP_NEXT_ELEMENT (vinfo);
2626 GROUP_FIRST_ELEMENT (vinfo) = NULL;
2627 GROUP_NEXT_ELEMENT (vinfo) = NULL;
2628 stmt = next;
2630 return false;
2632 return true;
2635 /* Analyze the access pattern of the data-reference DR.
2636 In case of non-consecutive accesses call vect_analyze_group_access() to
2637 analyze groups of accesses. */
2639 static bool
2640 vect_analyze_data_ref_access (struct data_reference *dr)
2642 tree step = DR_STEP (dr);
2643 tree scalar_type = TREE_TYPE (DR_REF (dr));
2644 gimple *stmt = DR_STMT (dr);
2645 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2646 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2647 struct loop *loop = NULL;
2649 if (loop_vinfo)
2650 loop = LOOP_VINFO_LOOP (loop_vinfo);
2652 if (loop_vinfo && !step)
2654 if (dump_enabled_p ())
2655 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2656 "bad data-ref access in loop\n");
2657 return false;
2660 /* Allow loads with zero step in inner-loop vectorization. */
2661 if (loop_vinfo && integer_zerop (step))
2663 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2664 if (!nested_in_vect_loop_p (loop, stmt))
2665 return DR_IS_READ (dr);
2666 /* Allow references with zero step for outer loops marked
2667 with pragma omp simd only - it guarantees absence of
2668 loop-carried dependencies between inner loop iterations. */
2669 if (!loop->force_vectorize)
2671 if (dump_enabled_p ())
2672 dump_printf_loc (MSG_NOTE, vect_location,
2673 "zero step in inner loop of nest\n");
2674 return false;
2678 if (loop && nested_in_vect_loop_p (loop, stmt))
2680 /* Interleaved accesses are not yet supported within outer-loop
2681 vectorization for references in the inner-loop. */
2682 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2684 /* For the rest of the analysis we use the outer-loop step. */
2685 step = STMT_VINFO_DR_STEP (stmt_info);
2686 if (integer_zerop (step))
2688 if (dump_enabled_p ())
2689 dump_printf_loc (MSG_NOTE, vect_location,
2690 "zero step in outer loop.\n");
2691 return DR_IS_READ (dr);
2695 /* Consecutive? */
2696 if (TREE_CODE (step) == INTEGER_CST)
2698 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2699 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2700 || (dr_step < 0
2701 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2703 /* Mark that it is not interleaving. */
2704 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2705 return true;
2709 if (loop && nested_in_vect_loop_p (loop, stmt))
2711 if (dump_enabled_p ())
2712 dump_printf_loc (MSG_NOTE, vect_location,
2713 "grouped access in outer loop.\n");
2714 return false;
2718 /* Assume this is a DR handled by non-constant strided load case. */
2719 if (TREE_CODE (step) != INTEGER_CST)
2720 return (STMT_VINFO_STRIDED_P (stmt_info)
2721 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2722 || vect_analyze_group_access (dr)));
2724 /* Non-consecutive access - check if it's part of an interleaving group. */
2725 return vect_analyze_group_access (dr);
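/* A rough standalone sketch of the classification performed above for a
   known constant STEP in bytes and scalar TYPE_SIZE; the enum and function
   names are illustrative only, and the zero-step and variable-step cases
   receive extra handling in the real code.  */

enum example_access_kind { EXAMPLE_INVARIANT, EXAMPLE_CONSECUTIVE,
			   EXAMPLE_GROUPED_OR_STRIDED };

static enum example_access_kind
example_classify_access (long step, long type_size)
{
  if (step == 0)
    return EXAMPLE_INVARIANT;	      /* Same location every iteration.  */
  if (step == type_size || step == -type_size)
    return EXAMPLE_CONSECUTIVE;	      /* Simple unit-stride access.  */
  return EXAMPLE_GROUPED_OR_STRIDED;  /* Needs interleaving analysis.  */
}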
2728 /* Compare two data-references DRA and DRB so that they can be sorted
2729 into chunks suitable for grouping. */
2731 static int
2732 dr_group_sort_cmp (const void *dra_, const void *drb_)
2734 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2735 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2736 int cmp;
2738 /* Stabilize sort. */
2739 if (dra == drb)
2740 return 0;
2742 /* DRs in different loops never belong to the same group. */
2743 loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father;
2744 loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father;
2745 if (loopa != loopb)
2746 return loopa->num < loopb->num ? -1 : 1;
2748 /* Ordering of DRs according to base. */
2749 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2750 DR_BASE_ADDRESS (drb));
2751 if (cmp != 0)
2752 return cmp;
2754 /* And according to DR_OFFSET. */
2755 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2756 if (cmp != 0)
2757 return cmp;
2759 /* Put reads before writes. */
2760 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2761 return DR_IS_READ (dra) ? -1 : 1;
2763 /* Then sort by access size. */
2764 cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2765 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2766 if (cmp != 0)
2767 return cmp;
2769 /* And by step. */
2770 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2771 if (cmp != 0)
2772 return cmp;
2774 /* Then sort by DR_INIT. In case of identical DRs sort by stmt UID. */
2775 cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
2776 if (cmp == 0)
2777 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2778 return cmp;
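/* A reduced illustration of the multi-key comparison strategy above,
   assuming a plain struct with a base key, an access size and a unique id
   used as the final tie-break so that qsort cannot reorder otherwise-equal
   elements; all names are hypothetical.  */

struct example_ref { long base; long size; unsigned int uid; };

static int
example_ref_cmp (const void *pa, const void *pb)
{
  const struct example_ref *a = (const struct example_ref *) pa;
  const struct example_ref *b = (const struct example_ref *) pb;
  if (a->base != b->base)
    return a->base < b->base ? -1 : 1;
  if (a->size != b->size)
    return a->size < b->size ? -1 : 1;
  /* Identical keys: fall back to the unique id, like the stmt UID
     tie-break above, so the sort result is deterministic.  */
  if (a->uid != b->uid)
    return a->uid < b->uid ? -1 : 1;
  return 0;
}

/* Typical use: qsort (refs, n, sizeof (struct example_ref), example_ref_cmp);  */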
2781 /* Function vect_analyze_data_ref_accesses.
2783 Analyze the access pattern of all the data references in the loop.
2785 FORNOW: the only access pattern that is considered vectorizable is a
2786 simple step 1 (consecutive) access.
2788 FORNOW: handle only arrays and pointer accesses. */
2790 bool
2791 vect_analyze_data_ref_accesses (vec_info *vinfo)
2793 unsigned int i;
2794 vec<data_reference_p> datarefs = vinfo->datarefs;
2795 struct data_reference *dr;
2797 if (dump_enabled_p ())
2798 dump_printf_loc (MSG_NOTE, vect_location,
2799 "=== vect_analyze_data_ref_accesses ===\n");
2801 if (datarefs.is_empty ())
2802 return true;
2804 /* Sort the array of datarefs to make building the interleaving chains
2805 linear. Don't modify the original vector's order; it is needed for
2806 determining what dependencies are reversed. */
2807 vec<data_reference_p> datarefs_copy = datarefs.copy ();
2808 datarefs_copy.qsort (dr_group_sort_cmp);
2810 /* Build the interleaving chains. */
2811 for (i = 0; i < datarefs_copy.length () - 1;)
2813 data_reference_p dra = datarefs_copy[i];
2814 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2815 stmt_vec_info lastinfo = NULL;
2816 if (! STMT_VINFO_VECTORIZABLE (stmtinfo_a))
2818 ++i;
2819 continue;
2821 for (i = i + 1; i < datarefs_copy.length (); ++i)
2823 data_reference_p drb = datarefs_copy[i];
2824 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2825 if (! STMT_VINFO_VECTORIZABLE (stmtinfo_b))
2826 break;
2828 /* ??? Imperfect sorting (non-compatible types, non-modulo
2829 accesses, same accesses) can lead to a group being artificially
2830 split here as we don't just skip over those. If it really
2831 matters we can push those to a worklist and re-iterate
2832 over them. Then we can just skip ahead to the next DR here. */
2834 /* DRs in a different loop should not be put into the same
2835 interleaving group. */
2836 if (gimple_bb (DR_STMT (dra))->loop_father
2837 != gimple_bb (DR_STMT (drb))->loop_father)
2838 break;
2840 /* Check that the data-refs have the same first location (except init)
2841 and that they are both either stores or loads (not a load and a store,
2842 and not masked loads or stores). */
2843 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2844 || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2845 DR_BASE_ADDRESS (drb)) != 0
2846 || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
2847 || !gimple_assign_single_p (DR_STMT (dra))
2848 || !gimple_assign_single_p (DR_STMT (drb)))
2849 break;
2851 /* Check that the data-refs have the same constant size. */
2852 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2853 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2854 if (!tree_fits_uhwi_p (sza)
2855 || !tree_fits_uhwi_p (szb)
2856 || !tree_int_cst_equal (sza, szb))
2857 break;
2859 /* Check that the data-refs have the same step. */
2860 if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
2861 break;
2863 /* Check the types are compatible.
2864 ??? We don't distinguish this during sorting. */
2865 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2866 TREE_TYPE (DR_REF (drb))))
2867 break;
2869 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
2870 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2871 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2872 HOST_WIDE_INT init_prev
2873 = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]));
2874 gcc_assert (init_a <= init_b
2875 && init_a <= init_prev
2876 && init_prev <= init_b);
2878 /* Do not place the same access in the interleaving chain twice. */
2879 if (init_b == init_prev)
2881 gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]))
2882 < gimple_uid (DR_STMT (drb)));
2883 /* ??? For now we simply "drop" the later reference which is
2884 otherwise the same rather than finishing off this group.
2885 In the end we'd want to re-process duplicates forming
2886 multiple groups from the refs, likely by just collecting
2887 all candidates (including duplicates and split points
2888 below) in a vector and then processing them together. */
2889 continue;
2892 /* If init_b == init_a + the size of the type * k, we have an
2893 interleaving, and DRA is accessed before DRB. */
2894 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
2895 if (type_size_a == 0
2896 || (init_b - init_a) % type_size_a != 0)
2897 break;
2899 /* If we have a store, the accesses are adjacent. This splits
2900 groups into chunks we support (we don't support vectorization
2901 of stores with gaps). */
2902 if (!DR_IS_READ (dra) && init_b - init_prev != type_size_a)
2903 break;
2905 /* If the step (when constant and nonzero) is greater than the
2906 difference between the data-refs' inits, this splits groups into
2907 suitable sizes. */
2908 if (tree_fits_shwi_p (DR_STEP (dra)))
2910 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
2911 if (step != 0 && step <= (init_b - init_a))
2912 break;
2915 if (dump_enabled_p ())
2917 dump_printf_loc (MSG_NOTE, vect_location,
2918 "Detected interleaving ");
2919 if (DR_IS_READ (dra))
2920 dump_printf (MSG_NOTE, "load ");
2921 else
2922 dump_printf (MSG_NOTE, "store ");
2923 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2924 dump_printf (MSG_NOTE, " and ");
2925 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2926 dump_printf (MSG_NOTE, "\n");
2929 /* Link the found element into the group list. */
2930 if (!GROUP_FIRST_ELEMENT (stmtinfo_a))
2932 GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
2933 lastinfo = stmtinfo_a;
2935 GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
2936 GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
2937 lastinfo = stmtinfo_b;
2941 FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
2942 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
2943 && !vect_analyze_data_ref_access (dr))
2945 if (dump_enabled_p ())
2946 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2947 "not vectorized: complicated access pattern.\n");
2949 if (is_a <bb_vec_info> (vinfo))
2951 /* Mark the statement as not vectorizable. */
2952 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2953 continue;
2955 else
2957 datarefs_copy.release ();
2958 return false;
2962 datarefs_copy.release ();
2963 return true;
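/* A simplified stand-alone form of the "can DRB join DRA's group" test
   above, assuming both refs already share base, offset, step and element
   size.  INIT_A, INIT_PREV and INIT_B are the constant byte offsets of the
   group leader, the previous member and the candidate, STEP is the
   constant step in bytes and IS_READ says whether this is a load group;
   the helper name is illustrative only.  */

static bool
example_can_join_group (long init_a, long init_prev, long init_b,
			long type_size, long step, bool is_read)
{
  /* DRB must sit a whole number of elements after DRA.  */
  if ((init_b - init_a) % type_size != 0)
    return false;
  /* Store groups may not contain gaps.  */
  if (!is_read && init_b - init_prev != type_size)
    return false;
  /* A nonzero step no larger than the distance means the next iteration
     of DRA already reaches DRB, so the group is split here.  */
  if (step != 0 && step <= init_b - init_a)
    return false;
  return true;
}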
2966 /* Function vect_vfa_segment_size.
2968 Create an expression that computes the size of the segment
2969 that will be accessed for a data reference. The function takes into
2970 account that realignment loads may access one more vector.
2972 Input:
2973 DR: The data reference.
2974 LENGTH_FACTOR: segment length to consider.
2976 Return an expression whose value is the size of the segment which will be
2977 accessed by DR. */
2979 static tree
2980 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
2982 tree segment_length;
2984 if (integer_zerop (DR_STEP (dr)))
2985 segment_length = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
2986 else
2987 segment_length = size_binop (MULT_EXPR,
2988 fold_convert (sizetype, DR_STEP (dr)),
2989 fold_convert (sizetype, length_factor));
2991 if (vect_supportable_dr_alignment (dr, false)
2992 == dr_explicit_realign_optimized)
2994 tree vector_size = TYPE_SIZE_UNIT
2995 (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))));
2997 segment_length = size_binop (PLUS_EXPR, segment_length, vector_size);
2999 return segment_length;
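/* The segment-size formula above in isolation, assuming a constant STEP in
   bytes, a LENGTH_FACTOR (the vectorization factor, or the scalar
   iteration count when the two steps differ) and an optional extra vector
   for optimized realignment loads; the names are illustrative only.  */

static long
example_segment_size (long step, long length_factor, long type_size,
		      long extra_vector_bytes)
{
  long segment = step == 0 ? type_size : step * length_factor;
  return segment + extra_vector_bytes;
}

/* E.g. a unit-stride 4-byte access with a vectorization factor of 8 and no
   realignment covers 4 * 8 == 32 bytes per vector iteration.  */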
3002 /* Function vect_no_alias_p.
3004 Given data references A and B with equal base and offset, whose alias
3005 relation can be decided at compilation time, return TRUE if they do
3006 not alias each other; return FALSE otherwise. SEGMENT_LENGTH_A
3007 and SEGMENT_LENGTH_B are the memory lengths accessed by A and B
3008 respectively. */
3010 static bool
3011 vect_no_alias_p (struct data_reference *a, struct data_reference *b,
3012 tree segment_length_a, tree segment_length_b)
3014 gcc_assert (TREE_CODE (DR_INIT (a)) == INTEGER_CST
3015 && TREE_CODE (DR_INIT (b)) == INTEGER_CST);
3016 if (tree_int_cst_equal (DR_INIT (a), DR_INIT (b)))
3017 return false;
3019 tree seg_a_min = DR_INIT (a);
3020 tree seg_a_max = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_a_min),
3021 seg_a_min, segment_length_a);
3022 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3023 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3024 [a, a+12) */
3025 if (tree_int_cst_compare (DR_STEP (a), size_zero_node) < 0)
3027 tree unit_size = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (a)));
3028 seg_a_min = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_a_max),
3029 seg_a_max, unit_size);
3030 seg_a_max = fold_build2 (PLUS_EXPR, TREE_TYPE (DR_INIT (a)),
3031 DR_INIT (a), unit_size);
3033 tree seg_b_min = DR_INIT (b);
3034 tree seg_b_max = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_b_min),
3035 seg_b_min, segment_length_b);
3036 if (tree_int_cst_compare (DR_STEP (b), size_zero_node) < 0)
3038 tree unit_size = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (b)));
3039 seg_b_min = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_b_max),
3040 seg_b_max, unit_size);
3041 seg_b_max = fold_build2 (PLUS_EXPR, TREE_TYPE (DR_INIT (b)),
3042 DR_INIT (b), unit_size);
3045 if (tree_int_cst_le (seg_a_max, seg_b_min)
3046 || tree_int_cst_le (seg_b_max, seg_a_min))
3047 return true;
3049 return false;
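/* A minimal version of the interval test above for the common
   positive-step case, assuming constant byte offsets INIT_A/INIT_B and
   constant segment lengths; the negative-step adjustment made by the real
   code is omitted and the names are illustrative only.  */

static bool
example_no_alias_p (long init_a, long seg_len_a,
		    long init_b, long seg_len_b)
{
  if (init_a == init_b)
    return false;		/* Same start: the accesses do alias.  */
  long end_a = init_a + seg_len_a;
  long end_b = init_b + seg_len_b;
  /* Disjoint half-open intervals cannot alias.  */
  return end_a <= init_b || end_b <= init_a;
}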
3052 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3053 in DDR is >= VF. */
3055 static bool
3056 dependence_distance_ge_vf (data_dependence_relation *ddr,
3057 unsigned int loop_depth, poly_uint64 vf)
3059 if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3060 || DDR_NUM_DIST_VECTS (ddr) == 0)
3061 return false;
3063 /* If the dependence is exact, we should have limited the VF instead. */
3064 gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3066 unsigned int i;
3067 lambda_vector dist_v;
3068 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3070 HOST_WIDE_INT dist = dist_v[loop_depth];
3071 if (dist != 0
3072 && !(dist > 0 && DDR_REVERSED_P (ddr))
3073 && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
3074 return false;
3077 if (dump_enabled_p ())
3079 dump_printf_loc (MSG_NOTE, vect_location,
3080 "dependence distance between ");
3081 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
3082 dump_printf (MSG_NOTE, " and ");
3083 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
3084 dump_printf (MSG_NOTE, " is >= VF\n");
3087 return true;
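/* The distance test above reduced to plain integers, assuming the
   dependence distances for the relevant loop level are already available
   as an array; the DDR_REVERSED_P refinement of the real code is left out
   and the helper name is illustrative.  A distance of 0 (same iteration)
   is harmless; every other distance must be at least VF before the
   corresponding runtime alias check may be dropped.  */

static bool
example_distances_ge_vf (const long *dists, unsigned int n,
			 unsigned long vf)
{
  for (unsigned int k = 0; k < n; k++)
    {
      long dist = dists[k];
      if (dist < 0)
	dist = -dist;
      if (dist != 0 && (unsigned long) dist < vf)
	return false;
    }
  return true;
}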
3090 /* Function vect_prune_runtime_alias_test_list.
3092 Prune a list of ddrs to be tested at run-time by versioning for alias.
3093 Merge several alias checks into one if possible.
3094 Return FALSE if the resulting list of ddrs is longer than allowed by
3095 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
3097 bool
3098 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3100 typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3101 hash_set <tree_pair_hash> compared_objects;
3103 vec<ddr_p> may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3104 vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3105 = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3106 vec<vec_object_pair> &check_unequal_addrs
3107 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3108 poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3109 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3111 ddr_p ddr;
3112 unsigned int i;
3113 tree length_factor;
3115 if (dump_enabled_p ())
3116 dump_printf_loc (MSG_NOTE, vect_location,
3117 "=== vect_prune_runtime_alias_test_list ===\n");
3119 if (may_alias_ddrs.is_empty ())
3120 return true;
3122 comp_alias_ddrs.create (may_alias_ddrs.length ());
3124 unsigned int loop_depth
3125 = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3126 LOOP_VINFO_LOOP_NEST (loop_vinfo));
3128 /* First, we collect all data ref pairs for aliasing checks. */
3129 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3131 int comp_res;
3132 struct data_reference *dr_a, *dr_b;
3133 gimple *dr_group_first_a, *dr_group_first_b;
3134 tree segment_length_a, segment_length_b;
3135 gimple *stmt_a, *stmt_b;
3137 /* Ignore the alias if the VF we chose ended up being no greater
3138 than the dependence distance. */
3139 if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3140 continue;
3142 if (DDR_OBJECT_A (ddr))
3144 vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3145 if (!compared_objects.add (new_pair))
3147 if (dump_enabled_p ())
3149 dump_printf_loc (MSG_NOTE, vect_location, "checking that ");
3150 dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.first);
3151 dump_printf (MSG_NOTE, " and ");
3152 dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.second);
3153 dump_printf (MSG_NOTE, " have different addresses\n");
3155 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3157 continue;
3160 dr_a = DDR_A (ddr);
3161 stmt_a = DR_STMT (DDR_A (ddr));
3162 dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
3163 if (dr_group_first_a)
3165 stmt_a = dr_group_first_a;
3166 dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
3169 dr_b = DDR_B (ddr);
3170 stmt_b = DR_STMT (DDR_B (ddr));
3171 dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
3172 if (dr_group_first_b)
3174 stmt_b = dr_group_first_b;
3175 dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
3178 if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
3179 length_factor = scalar_loop_iters;
3180 else
3181 length_factor = size_int (vect_factor);
3182 segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
3183 segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
3185 comp_res = data_ref_compare_tree (DR_BASE_ADDRESS (dr_a),
3186 DR_BASE_ADDRESS (dr_b));
3187 if (comp_res == 0)
3188 comp_res = data_ref_compare_tree (DR_OFFSET (dr_a),
3189 DR_OFFSET (dr_b));
3191 /* Alias is known at compilation time. */
3192 if (comp_res == 0
3193 && TREE_CODE (DR_STEP (dr_a)) == INTEGER_CST
3194 && TREE_CODE (DR_STEP (dr_b)) == INTEGER_CST
3195 && TREE_CODE (segment_length_a) == INTEGER_CST
3196 && TREE_CODE (segment_length_b) == INTEGER_CST)
3198 if (vect_no_alias_p (dr_a, dr_b, segment_length_a, segment_length_b))
3199 continue;
3201 if (dump_enabled_p ())
3202 dump_printf_loc (MSG_NOTE, vect_location,
3203 "not vectorized: compilation time alias.\n");
3205 return false;
3208 dr_with_seg_len_pair_t dr_with_seg_len_pair
3209 (dr_with_seg_len (dr_a, segment_length_a),
3210 dr_with_seg_len (dr_b, segment_length_b));
3212 /* Canonicalize pairs by sorting the two DR members. */
3213 if (comp_res > 0)
3214 std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
3216 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3219 prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);
3221 unsigned int count = (comp_alias_ddrs.length ()
3222 + check_unequal_addrs.length ());
3223 dump_printf_loc (MSG_NOTE, vect_location,
3224 "improved number of alias checks from %d to %d\n",
3225 may_alias_ddrs.length (), count);
3226 if ((int) count > PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
3228 if (dump_enabled_p ())
3229 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3230 "number of versioning for alias "
3231 "run-time tests exceeds %d "
3232 "(--param vect-max-version-for-alias-checks)\n",
3233 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
3234 return false;
3237 return true;
3240 /* Return true if a non-affine read or write in STMT is suitable for a
3241 gather load or scatter store. Describe the operation in *INFO if so. */
3243 bool
3244 vect_check_gather_scatter (gimple *stmt, loop_vec_info loop_vinfo,
3245 gather_scatter_info *info)
3247 HOST_WIDE_INT scale = 1;
3248 poly_int64 pbitpos, pbitsize;
3249 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3250 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3251 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3252 tree offtype = NULL_TREE;
3253 tree decl, base, off;
3254 machine_mode pmode;
3255 int punsignedp, reversep, pvolatilep = 0;
3257 base = DR_REF (dr);
3258 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF;
3259 see if we can use the def stmt of the address. */
3260 if (is_gimple_call (stmt)
3261 && gimple_call_internal_p (stmt)
3262 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
3263 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
3264 && TREE_CODE (base) == MEM_REF
3265 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3266 && integer_zerop (TREE_OPERAND (base, 1))
3267 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3269 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3270 if (is_gimple_assign (def_stmt)
3271 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3272 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3275 /* The gather and scatter builtins need an address of the form
3276 loop_invariant + vector * {1, 2, 4, 8}
3277 or
3278 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3279 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3280 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3281 multiplications and additions in it. To get a vector, we need
3282 a single SSA_NAME that will be defined in the loop and will
3283 contain everything that is not loop invariant and that can be
3284 vectorized. The following code attempts to find such a preexisting
3285 SSA_NAME OFF and put the loop invariants into a tree BASE
3286 that can be gimplified before the loop. */
3287 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3288 &punsignedp, &reversep, &pvolatilep);
3289 gcc_assert (base && !reversep);
3290 poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
3292 if (TREE_CODE (base) == MEM_REF)
3294 if (!integer_zerop (TREE_OPERAND (base, 1)))
3296 if (off == NULL_TREE)
3297 off = wide_int_to_tree (sizetype, mem_ref_offset (base));
3298 else
3299 off = size_binop (PLUS_EXPR, off,
3300 fold_convert (sizetype, TREE_OPERAND (base, 1)));
3302 base = TREE_OPERAND (base, 0);
3304 else
3305 base = build_fold_addr_expr (base);
3307 if (off == NULL_TREE)
3308 off = size_zero_node;
3310 /* If base is not loop invariant, then if off is 0 we start with just
3311 the constant offset in the loop invariant BASE and continue with base
3312 as OFF, otherwise we give up.
3313 We could handle that case by gimplifying the addition of base + off
3314 into some SSA_NAME and using that as off, but for now punt. */
3315 if (!expr_invariant_in_loop_p (loop, base))
3317 if (!integer_zerop (off))
3318 return false;
3319 off = base;
3320 base = size_int (pbytepos);
3322 /* Otherwise put base + constant offset into the loop invariant BASE
3323 and continue with OFF. */
3324 else
3326 base = fold_convert (sizetype, base);
3327 base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
3330 /* OFF at this point may be either a SSA_NAME or some tree expression
3331 from get_inner_reference. Try to peel off loop invariants from it
3332 into BASE as long as possible. */
3333 STRIP_NOPS (off);
3334 while (offtype == NULL_TREE)
3336 enum tree_code code;
3337 tree op0, op1, add = NULL_TREE;
3339 if (TREE_CODE (off) == SSA_NAME)
3341 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3343 if (expr_invariant_in_loop_p (loop, off))
3344 return false;
3346 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3347 break;
3349 op0 = gimple_assign_rhs1 (def_stmt);
3350 code = gimple_assign_rhs_code (def_stmt);
3351 op1 = gimple_assign_rhs2 (def_stmt);
3353 else
3355 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3356 return false;
3357 code = TREE_CODE (off);
3358 extract_ops_from_tree (off, &code, &op0, &op1);
3360 switch (code)
3362 case POINTER_PLUS_EXPR:
3363 case PLUS_EXPR:
3364 if (expr_invariant_in_loop_p (loop, op0))
3366 add = op0;
3367 off = op1;
3368 do_add:
3369 add = fold_convert (sizetype, add);
3370 if (scale != 1)
3371 add = size_binop (MULT_EXPR, add, size_int (scale));
3372 base = size_binop (PLUS_EXPR, base, add);
3373 continue;
3375 if (expr_invariant_in_loop_p (loop, op1))
3377 add = op1;
3378 off = op0;
3379 goto do_add;
3381 break;
3382 case MINUS_EXPR:
3383 if (expr_invariant_in_loop_p (loop, op1))
3385 add = fold_convert (sizetype, op1);
3386 add = size_binop (MINUS_EXPR, size_zero_node, add);
3387 off = op0;
3388 goto do_add;
3390 break;
3391 case MULT_EXPR:
3392 if (scale == 1 && tree_fits_shwi_p (op1))
3394 scale = tree_to_shwi (op1);
3395 off = op0;
3396 continue;
3398 break;
3399 case SSA_NAME:
3400 off = op0;
3401 continue;
3402 CASE_CONVERT:
3403 if (!POINTER_TYPE_P (TREE_TYPE (op0))
3404 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3405 break;
3406 if (TYPE_PRECISION (TREE_TYPE (op0))
3407 == TYPE_PRECISION (TREE_TYPE (off)))
3409 off = op0;
3410 continue;
3412 if (TYPE_PRECISION (TREE_TYPE (op0))
3413 < TYPE_PRECISION (TREE_TYPE (off)))
3415 off = op0;
3416 offtype = TREE_TYPE (off);
3417 STRIP_NOPS (off);
3418 continue;
3420 break;
3421 default:
3422 break;
3424 break;
3427 /* If at the end OFF still isn't an SSA_NAME or isn't
3428 defined in the loop, punt. */
3429 if (TREE_CODE (off) != SSA_NAME
3430 || expr_invariant_in_loop_p (loop, off))
3431 return false;
3433 if (offtype == NULL_TREE)
3434 offtype = TREE_TYPE (off);
3436 if (DR_IS_READ (dr))
3437 decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info),
3438 offtype, scale);
3439 else
3440 decl = targetm.vectorize.builtin_scatter (STMT_VINFO_VECTYPE (stmt_info),
3441 offtype, scale);
3443 if (decl == NULL_TREE)
3444 return false;
3446 info->decl = decl;
3447 info->base = base;
3448 info->offset = off;
3449 info->offset_dt = vect_unknown_def_type;
3450 info->offset_vectype = NULL_TREE;
3451 info->scale = scale;
3452 return true;
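/* A scalar emulation of the addressing form that the decomposition above
   is aiming for, loop_invariant + off * scale, with illustrative names;
   the real gather/scatter builtins consume a whole vector of OFF values
   at once.  For a source access like data[idx[i] * 2 + 7] with 4-byte
   elements, BASE would end up as &data[7], OFF as the in-loop value
   idx[i] and SCALE as 8.  */

static float
example_gather_lane (const char *base, long off, long scale)
{
  /* One lane of a gather: load from BASE + OFF * SCALE bytes.  */
  return *(const float *) (base + off * scale);
}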
3455 /* Function vect_analyze_data_refs.
3457 Find all the data references in the loop or basic block.
3459 The general structure of the analysis of data refs in the vectorizer is as
3460 follows:
3461 1- vect_analyze_data_refs(loop/bb): call
3462 compute_data_dependences_for_loop/bb to find and analyze all data-refs
3463 in the loop/bb and their dependences.
3464 2- vect_analyze_dependences(): apply dependence testing using ddrs.
3465 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
3466 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
3470 bool
3471 vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf)
3473 struct loop *loop = NULL;
3474 unsigned int i;
3475 struct data_reference *dr;
3476 tree scalar_type;
3478 if (dump_enabled_p ())
3479 dump_printf_loc (MSG_NOTE, vect_location,
3480 "=== vect_analyze_data_refs ===\n");
3482 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3483 loop = LOOP_VINFO_LOOP (loop_vinfo);
3485 /* Go through the data-refs, check that the analysis succeeded. Update
3486 pointer from stmt_vec_info struct to DR and vectype. */
3488 vec<data_reference_p> datarefs = vinfo->datarefs;
3489 FOR_EACH_VEC_ELT (datarefs, i, dr)
3491 gimple *stmt;
3492 stmt_vec_info stmt_info;
3493 tree base, offset, init;
3494 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
3495 bool simd_lane_access = false;
3496 poly_uint64 vf;
3498 again:
3499 if (!dr || !DR_REF (dr))
3501 if (dump_enabled_p ())
3502 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3503 "not vectorized: unhandled data-ref\n");
3504 return false;
3507 stmt = DR_STMT (dr);
3508 stmt_info = vinfo_for_stmt (stmt);
3510 /* Discard clobbers from the dataref vector. We will remove
3511 clobber stmts during vectorization. */
3512 if (gimple_clobber_p (stmt))
3514 free_data_ref (dr);
3515 if (i == datarefs.length () - 1)
3517 datarefs.pop ();
3518 break;
3520 datarefs.ordered_remove (i);
3521 dr = datarefs[i];
3522 goto again;
3525 /* Check that analysis of the data-ref succeeded. */
3526 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
3527 || !DR_STEP (dr))
3529 bool maybe_gather
3530 = DR_IS_READ (dr)
3531 && !TREE_THIS_VOLATILE (DR_REF (dr))
3532 && targetm.vectorize.builtin_gather != NULL;
3533 bool maybe_scatter
3534 = DR_IS_WRITE (dr)
3535 && !TREE_THIS_VOLATILE (DR_REF (dr))
3536 && targetm.vectorize.builtin_scatter != NULL;
3537 bool maybe_simd_lane_access
3538 = is_a <loop_vec_info> (vinfo) && loop->simduid;
3540 /* If the target supports vector gather loads or scatter stores, or if
3541 this might be a SIMD lane access, check whether they can be used. */
3542 if (is_a <loop_vec_info> (vinfo)
3543 && (maybe_gather || maybe_scatter || maybe_simd_lane_access)
3544 && !nested_in_vect_loop_p (loop, stmt))
3546 struct data_reference *newdr
3547 = create_data_ref (NULL, loop_containing_stmt (stmt),
3548 DR_REF (dr), stmt, !maybe_scatter,
3549 DR_IS_CONDITIONAL_IN_STMT (dr));
3550 gcc_assert (newdr != NULL && DR_REF (newdr));
3551 if (DR_BASE_ADDRESS (newdr)
3552 && DR_OFFSET (newdr)
3553 && DR_INIT (newdr)
3554 && DR_STEP (newdr)
3555 && integer_zerop (DR_STEP (newdr)))
3557 if (maybe_simd_lane_access)
3559 tree off = DR_OFFSET (newdr);
3560 STRIP_NOPS (off);
3561 if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
3562 && TREE_CODE (off) == MULT_EXPR
3563 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
3565 tree step = TREE_OPERAND (off, 1);
3566 off = TREE_OPERAND (off, 0);
3567 STRIP_NOPS (off);
3568 if (CONVERT_EXPR_P (off)
3569 && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
3570 0)))
3571 < TYPE_PRECISION (TREE_TYPE (off)))
3572 off = TREE_OPERAND (off, 0);
3573 if (TREE_CODE (off) == SSA_NAME)
3575 gimple *def = SSA_NAME_DEF_STMT (off);
3576 tree reft = TREE_TYPE (DR_REF (newdr));
3577 if (is_gimple_call (def)
3578 && gimple_call_internal_p (def)
3579 && (gimple_call_internal_fn (def)
3580 == IFN_GOMP_SIMD_LANE))
3582 tree arg = gimple_call_arg (def, 0);
3583 gcc_assert (TREE_CODE (arg) == SSA_NAME);
3584 arg = SSA_NAME_VAR (arg);
3585 if (arg == loop->simduid
3586 /* For now. */
3587 && tree_int_cst_equal
3588 (TYPE_SIZE_UNIT (reft),
3589 step))
3591 DR_OFFSET (newdr) = ssize_int (0);
3592 DR_STEP (newdr) = step;
3593 DR_OFFSET_ALIGNMENT (newdr)
3594 = BIGGEST_ALIGNMENT;
3595 DR_STEP_ALIGNMENT (newdr)
3596 = highest_pow2_factor (step);
3597 dr = newdr;
3598 simd_lane_access = true;
3604 if (!simd_lane_access && (maybe_gather || maybe_scatter))
3606 dr = newdr;
3607 if (maybe_gather)
3608 gatherscatter = GATHER;
3609 else
3610 gatherscatter = SCATTER;
3613 if (gatherscatter == SG_NONE && !simd_lane_access)
3614 free_data_ref (newdr);
3617 if (gatherscatter == SG_NONE && !simd_lane_access)
3619 if (dump_enabled_p ())
3621 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3622 "not vectorized: data ref analysis "
3623 "failed ");
3624 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3627 if (is_a <bb_vec_info> (vinfo))
3628 break;
3630 return false;
3634 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
3636 if (dump_enabled_p ())
3637 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3638 "not vectorized: base addr of dr is a "
3639 "constant\n");
3641 if (is_a <bb_vec_info> (vinfo))
3642 break;
3644 if (gatherscatter != SG_NONE || simd_lane_access)
3645 free_data_ref (dr);
3646 return false;
3649 if (TREE_THIS_VOLATILE (DR_REF (dr)))
3651 if (dump_enabled_p ())
3653 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3654 "not vectorized: volatile type ");
3655 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3658 if (is_a <bb_vec_info> (vinfo))
3659 break;
3661 return false;
3664 if (stmt_can_throw_internal (stmt))
3666 if (dump_enabled_p ())
3668 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3669 "not vectorized: statement can throw an "
3670 "exception ");
3671 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3674 if (is_a <bb_vec_info> (vinfo))
3675 break;
3677 if (gatherscatter != SG_NONE || simd_lane_access)
3678 free_data_ref (dr);
3679 return false;
3682 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
3683 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
3685 if (dump_enabled_p ())
3687 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3688 "not vectorized: statement is bitfield "
3689 "access ");
3690 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3693 if (is_a <bb_vec_info> (vinfo))
3694 break;
3696 if (gatherscatter != SG_NONE || simd_lane_access)
3697 free_data_ref (dr);
3698 return false;
3701 base = unshare_expr (DR_BASE_ADDRESS (dr));
3702 offset = unshare_expr (DR_OFFSET (dr));
3703 init = unshare_expr (DR_INIT (dr));
3705 if (is_gimple_call (stmt)
3706 && (!gimple_call_internal_p (stmt)
3707 || (gimple_call_internal_fn (stmt) != IFN_MASK_LOAD
3708 && gimple_call_internal_fn (stmt) != IFN_MASK_STORE)))
3710 if (dump_enabled_p ())
3712 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3713 "not vectorized: dr in a call ");
3714 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3717 if (is_a <bb_vec_info> (vinfo))
3718 break;
3720 if (gatherscatter != SG_NONE || simd_lane_access)
3721 free_data_ref (dr);
3722 return false;
3725 /* Update DR field in stmt_vec_info struct. */
3727 /* If the dataref is in an inner-loop of the loop that is considered
3728 for vectorization, we also want to analyze the access relative to
3729 the outer-loop (DR contains information only relative to the
3730 inner-most enclosing loop). We do that by building a reference to the
3731 first location accessed by the inner-loop, and analyze it relative to
3732 the outer-loop. */
3733 if (loop && nested_in_vect_loop_p (loop, stmt))
3735 /* Build a reference to the first location accessed by the
3736 inner loop: *(BASE + INIT + OFFSET). By construction,
3737 this address must be invariant in the inner loop, so we
3738 can consider it as being used in the outer loop. */
3739 tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
3740 init, offset);
3741 tree init_addr = fold_build_pointer_plus (base, init_offset);
3742 tree init_ref = build_fold_indirect_ref (init_addr);
3744 if (dump_enabled_p ())
3746 dump_printf_loc (MSG_NOTE, vect_location,
3747 "analyze in outer loop: ");
3748 dump_generic_expr (MSG_NOTE, TDF_SLIM, init_ref);
3749 dump_printf (MSG_NOTE, "\n");
3752 if (!dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
3753 init_ref, loop))
3754 /* dr_analyze_innermost already explained the failure. */
3755 return false;
3757 if (dump_enabled_p ())
3759 dump_printf_loc (MSG_NOTE, vect_location,
3760 "\touter base_address: ");
3761 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3762 STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3763 dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
3764 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3765 STMT_VINFO_DR_OFFSET (stmt_info));
3766 dump_printf (MSG_NOTE,
3767 "\n\touter constant offset from base address: ");
3768 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3769 STMT_VINFO_DR_INIT (stmt_info));
3770 dump_printf (MSG_NOTE, "\n\touter step: ");
3771 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3772 STMT_VINFO_DR_STEP (stmt_info));
3773 dump_printf (MSG_NOTE, "\n\touter base alignment: %d\n",
3774 STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info));
3775 dump_printf (MSG_NOTE, "\n\touter base misalignment: %d\n",
3776 STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info));
3777 dump_printf (MSG_NOTE, "\n\touter offset alignment: %d\n",
3778 STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info));
3779 dump_printf (MSG_NOTE, "\n\touter step alignment: %d\n",
3780 STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
3784 if (STMT_VINFO_DATA_REF (stmt_info))
3786 if (dump_enabled_p ())
3788 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3789 "not vectorized: more than one data ref "
3790 "in stmt: ");
3791 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3794 if (is_a <bb_vec_info> (vinfo))
3795 break;
3797 if (gatherscatter != SG_NONE || simd_lane_access)
3798 free_data_ref (dr);
3799 return false;
3802 STMT_VINFO_DATA_REF (stmt_info) = dr;
3803 if (simd_lane_access)
3805 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
3806 free_data_ref (datarefs[i]);
3807 datarefs[i] = dr;
3810 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == ADDR_EXPR
3811 && VAR_P (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0))
3812 && DECL_NONALIASED (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0)))
3814 if (dump_enabled_p ())
3816 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3817 "not vectorized: base object not addressable "
3818 "for stmt: ");
3819 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3821 if (is_a <bb_vec_info> (vinfo))
3823 /* In BB vectorization the ref can still participate
3824 in dependence analysis; we just can't vectorize it. */
3825 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
3826 continue;
3828 return false;
3831 /* Set vectype for STMT. */
3832 scalar_type = TREE_TYPE (DR_REF (dr));
3833 STMT_VINFO_VECTYPE (stmt_info)
3834 = get_vectype_for_scalar_type (scalar_type);
3835 if (!STMT_VINFO_VECTYPE (stmt_info))
3837 if (dump_enabled_p ())
3839 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3840 "not vectorized: no vectype for stmt: ");
3841 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3842 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
3843 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
3844 scalar_type);
3845 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3848 if (is_a <bb_vec_info> (vinfo))
3850 /* Having no vector type is fine: the ref can still participate
3851 in dependence analysis; we just can't vectorize it. */
3852 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
3853 continue;
3856 if (gatherscatter != SG_NONE || simd_lane_access)
3858 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3859 if (gatherscatter != SG_NONE)
3860 free_data_ref (dr);
3862 return false;
3864 else
3866 if (dump_enabled_p ())
3868 dump_printf_loc (MSG_NOTE, vect_location,
3869 "got vectype for stmt: ");
3870 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3871 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3872 STMT_VINFO_VECTYPE (stmt_info));
3873 dump_printf (MSG_NOTE, "\n");
3877 /* Adjust the minimal vectorization factor according to the
3878 vector type. */
3879 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
3880 *min_vf = upper_bound (*min_vf, vf);
3882 if (gatherscatter != SG_NONE)
3884 gather_scatter_info gs_info;
3885 if (!vect_check_gather_scatter (stmt, as_a <loop_vec_info> (vinfo),
3886 &gs_info)
3887 || !get_vectype_for_scalar_type (TREE_TYPE (gs_info.offset)))
3889 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3890 free_data_ref (dr);
3891 if (dump_enabled_p ())
3893 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3894 (gatherscatter == GATHER) ?
3895 "not vectorized: not suitable for gather "
3896 "load " :
3897 "not vectorized: not suitable for scatter "
3898 "store ");
3899 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3901 return false;
3904 free_data_ref (datarefs[i]);
3905 datarefs[i] = dr;
3906 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
3909 else if (is_a <loop_vec_info> (vinfo)
3910 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
3912 if (nested_in_vect_loop_p (loop, stmt))
3914 if (dump_enabled_p ())
3916 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3917 "not vectorized: not suitable for strided "
3918 "load ");
3919 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3921 return false;
3923 STMT_VINFO_STRIDED_P (stmt_info) = true;
3927 /* If we stopped the analysis at the first dataref we could not analyze
3928 while trying to vectorize a basic block, mark the rest of the datarefs
3929 as not vectorizable and truncate the vector of datarefs. That
3930 avoids wasting time analyzing their dependences. */
3931 if (i != datarefs.length ())
3933 gcc_assert (is_a <bb_vec_info> (vinfo));
3934 for (unsigned j = i; j < datarefs.length (); ++j)
3936 data_reference_p dr = datarefs[j];
3937 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
3938 free_data_ref (dr);
3940 datarefs.truncate (i);
3943 return true;
3947 /* Function vect_get_new_vect_var.
3949 Returns a name for a new variable. The current naming scheme prepends
3950 a prefix chosen by VAR_KIND ("vect", "vectp", "stmp" or "mask") to the
3951 names of vectorizer generated variables, joining the prefix to NAME if
3952 NAME is provided. */
3954 tree
3955 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
3957 const char *prefix;
3958 tree new_vect_var;
3960 switch (var_kind)
3962 case vect_simple_var:
3963 prefix = "vect";
3964 break;
3965 case vect_scalar_var:
3966 prefix = "stmp";
3967 break;
3968 case vect_mask_var:
3969 prefix = "mask";
3970 break;
3971 case vect_pointer_var:
3972 prefix = "vectp";
3973 break;
3974 default:
3975 gcc_unreachable ();
3978 if (name)
3980 char* tmp = concat (prefix, "_", name, NULL);
3981 new_vect_var = create_tmp_reg (type, tmp);
3982 free (tmp);
3984 else
3985 new_vect_var = create_tmp_reg (type, prefix);
3987 return new_vect_var;
3990 /* Like vect_get_new_vect_var but return an SSA name. */
3992 tree
3993 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
3995 const char *prefix;
3996 tree new_vect_var;
3998 switch (var_kind)
4000 case vect_simple_var:
4001 prefix = "vect";
4002 break;
4003 case vect_scalar_var:
4004 prefix = "stmp";
4005 break;
4006 case vect_pointer_var:
4007 prefix = "vectp";
4008 break;
4009 default:
4010 gcc_unreachable ();
4013 if (name)
4015 char* tmp = concat (prefix, "_", name, NULL);
4016 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4017 free (tmp);
4019 else
4020 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4022 return new_vect_var;
4025 /* Duplicate ptr info and set alignment/misalignment on NAME from DR. */
4027 static void
4028 vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr)
4030 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr));
4031 int misalign = DR_MISALIGNMENT (dr);
4032 if (misalign == DR_MISALIGNMENT_UNKNOWN)
4033 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4034 else
4035 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name),
4036 DR_TARGET_ALIGNMENT (dr), misalign);
4039 /* Function vect_create_addr_base_for_vector_ref.
4041 Create an expression that computes the address of the first memory location
4042 that will be accessed for a data reference.
4044 Input:
4045 STMT: The statement containing the data reference.
4046 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4047 OFFSET: Optional. If supplied, it is added to the initial address.
4048 LOOP: Specify the loop-nest relative to which the address should be computed.
4049 For example, when the dataref is in an inner-loop nested in an
4050 outer-loop that is now being vectorized, LOOP can be either the
4051 outer-loop, or the inner-loop. The first memory location accessed
4052 by the following dataref ('in' points to short):
4054 for (i=0; i<N; i++)
4055 for (j=0; j<M; j++)
4056 s += in[i+j]
4058 is as follows:
4059 if LOOP=i_loop: &in (relative to i_loop)
4060 if LOOP=j_loop: &in+i*2B (relative to j_loop)
4061 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
4062 initial address. Unlike OFFSET, which is a number of elements to
4063 be added, BYTE_OFFSET is measured in bytes.
4065 Output:
4066 1. Return an SSA_NAME whose value is the address of the memory location of
4067 the first vector of the data reference.
4068 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4069 these statement(s) which define the returned SSA_NAME.
4071 FORNOW: We are only handling array accesses with step 1. */
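/* Illustrative sketch, not part of GCC: how the returned address is formed
   from the pieces described above.  BASE/OFFSET_BYTES/INIT_BYTES stand in
   for the data-ref's base address, offset and init; ELEM_OFFSET and
   BYTE_OFFSET mirror the OFFSET (counted in elements of ELEM_SIZE bytes)
   and BYTE_OFFSET parameters.  The real function emits GIMPLE statements
   that compute the same sum.  */

static char *
example_first_access_address (char *base, long offset_bytes, long init_bytes,
                              long elem_offset, long elem_size,
                              long byte_offset)
{
  long base_offset = offset_bytes + init_bytes;  /* base_offset = offset + init */
  base_offset += elem_offset * elem_size;        /* OFFSET is in elements       */
  base_offset += byte_offset;                    /* BYTE_OFFSET is in bytes     */
  return base + base_offset;                     /* addr_base                   */
}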
4073 tree
4074 vect_create_addr_base_for_vector_ref (gimple *stmt,
4075 gimple_seq *new_stmt_list,
4076 tree offset,
4077 tree byte_offset)
4079 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4080 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4081 const char *base_name;
4082 tree addr_base;
4083 tree dest;
4084 gimple_seq seq = NULL;
4085 tree vect_ptr_type;
4086 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
4087 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4088 innermost_loop_behavior *drb = vect_dr_behavior (dr);
4090 tree data_ref_base = unshare_expr (drb->base_address);
4091 tree base_offset = unshare_expr (drb->offset);
4092 tree init = unshare_expr (drb->init);
4094 if (loop_vinfo)
4095 base_name = get_name (data_ref_base);
4096 else
4098 base_offset = ssize_int (0);
4099 init = ssize_int (0);
4100 base_name = get_name (DR_REF (dr));
4103 /* Create base_offset */
4104 base_offset = size_binop (PLUS_EXPR,
4105 fold_convert (sizetype, base_offset),
4106 fold_convert (sizetype, init));
4108 if (offset)
4110 offset = fold_build2 (MULT_EXPR, sizetype,
4111 fold_convert (sizetype, offset), step);
4112 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4113 base_offset, offset);
4115 if (byte_offset)
4117 byte_offset = fold_convert (sizetype, byte_offset);
4118 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4119 base_offset, byte_offset);
4122 /* base + base_offset */
4123 if (loop_vinfo)
4124 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4125 else
4127 addr_base = build1 (ADDR_EXPR,
4128 build_pointer_type (TREE_TYPE (DR_REF (dr))),
4129 unshare_expr (DR_REF (dr)));
4132 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4133 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4134 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4135 gimple_seq_add_seq (new_stmt_list, seq);
4137 if (DR_PTR_INFO (dr)
4138 && TREE_CODE (addr_base) == SSA_NAME
4139 && !SSA_NAME_PTR_INFO (addr_base))
4141 vect_duplicate_ssa_name_ptr_info (addr_base, dr);
4142 if (offset || byte_offset)
4143 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4146 if (dump_enabled_p ())
4148 dump_printf_loc (MSG_NOTE, vect_location, "created ");
4149 dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
4150 dump_printf (MSG_NOTE, "\n");
4153 return addr_base;
4157 /* Function vect_create_data_ref_ptr.
4159 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4160 location accessed in the loop by STMT, along with the def-use update
4161 chain to appropriately advance the pointer through the loop iterations.
4162 Also set aliasing information for the pointer. This pointer is used by
4163 the callers to this function to create a memory reference expression for
4164 vector load/store access.
4166 Input:
4167 1. STMT: a stmt that references memory. Expected to be of the form
4168 GIMPLE_ASSIGN <name, data-ref> or
4169 GIMPLE_ASSIGN <data-ref, name>.
4170 2. AGGR_TYPE: the type of the reference, which should be either a vector
4171 or an array.
4172 3. AT_LOOP: the loop where the vector memref is to be created.
4173 4. OFFSET (optional): an offset to be added to the initial address accessed
4174 by the data-ref in STMT.
4175 5. BSI: location where the new stmts are to be placed if there is no loop
4176 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4177 pointing to the initial address.
4178 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4179 to the initial address accessed by the data-ref in STMT. This is
4180 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4181 in bytes.
4183 Output:
4184 1. Declare a new ptr to vector_type, and have it point to the base of the
4185 data reference (the initial address accessed by the data reference).
4186 For example, for vector of type V8HI, the following code is generated:
4188 v8hi *ap;
4189 ap = (v8hi *)initial_address;
4191 if OFFSET is not supplied:
4192 initial_address = &a[init];
4193 if OFFSET is supplied:
4194 initial_address = &a[init + OFFSET];
4195 if BYTE_OFFSET is supplied:
4196 initial_address = &a[init] + BYTE_OFFSET;
4198 Return the initial_address in INITIAL_ADDRESS.
4200 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4201 update the pointer in each iteration of the loop.
4203 Return the increment stmt that updates the pointer in PTR_INCR.
4205 3. Set INV_P to true if the access pattern of the data reference in the
4206 vectorized loop is invariant. Set it to false otherwise.
4208 4. Return the pointer. */
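/* Illustrative sketch, not part of GCC: the pointer-and-update pattern the
   function arranges for, written as source-level C.  The v8hi_sketch typedef
   is only a stand-in for the vector type; the real code builds GIMPLE
   statements and an induction variable instead, and also handles offsets,
   negative steps and the nested-loop case described above.  Assumes DST and
   SRC are suitably aligned for the vector-sized accesses.  */

typedef struct { short e[8]; } v8hi_sketch;

static void
example_vector_pointer_walk (short *dst, const short *src, int nvectors)
{
  const v8hi_sketch *ap = (const v8hi_sketch *) src; /* ap = (v8hi *) initial_address */
  v8hi_sketch *dp = (v8hi_sketch *) dst;
  for (int i = 0; i < nvectors; i++)
    {
      *dp = *ap;     /* vector load and store through the created pointers */
      ap = ap + 1;   /* vp2 = vp1 + step, with step == sizeof (v8hi)       */
      dp = dp + 1;
    }
}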
4210 tree
4211 vect_create_data_ref_ptr (gimple *stmt, tree aggr_type, struct loop *at_loop,
4212 tree offset, tree *initial_address,
4213 gimple_stmt_iterator *gsi, gimple **ptr_incr,
4214 bool only_init, bool *inv_p, tree byte_offset)
4216 const char *base_name;
4217 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4218 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4219 struct loop *loop = NULL;
4220 bool nested_in_vect_loop = false;
4221 struct loop *containing_loop = NULL;
4222 tree aggr_ptr_type;
4223 tree aggr_ptr;
4224 tree new_temp;
4225 gimple_seq new_stmt_list = NULL;
4226 edge pe = NULL;
4227 basic_block new_bb;
4228 tree aggr_ptr_init;
4229 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4230 tree aptr;
4231 gimple_stmt_iterator incr_gsi;
4232 bool insert_after;
4233 tree indx_before_incr, indx_after_incr;
4234 gimple *incr;
4235 tree step;
4236 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4238 gcc_assert (TREE_CODE (aggr_type) == ARRAY_TYPE
4239 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4241 if (loop_vinfo)
4243 loop = LOOP_VINFO_LOOP (loop_vinfo);
4244 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4245 containing_loop = (gimple_bb (stmt))->loop_father;
4246 pe = loop_preheader_edge (loop);
4248 else
4250 gcc_assert (bb_vinfo);
4251 only_init = true;
4252 *ptr_incr = NULL;
4255 /* Check the step (evolution) of the load in LOOP, and record
4256 whether it's invariant. */
4257 step = vect_dr_behavior (dr)->step;
4258 if (integer_zerop (step))
4259 *inv_p = true;
4260 else
4261 *inv_p = false;
4263 /* Create an expression for the first address accessed by this load
4264 in LOOP. */
4265 base_name = get_name (DR_BASE_ADDRESS (dr));
4267 if (dump_enabled_p ())
4269 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4270 dump_printf_loc (MSG_NOTE, vect_location,
4271 "create %s-pointer variable to type: ",
4272 get_tree_code_name (TREE_CODE (aggr_type)));
4273 dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4274 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4275 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4276 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4277 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4278 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4279 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4280 else
4281 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4282 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4283 dump_printf (MSG_NOTE, "\n");
4286 /* (1) Create the new aggregate-pointer variable.
4287 Vector and array types inherit the alias set of their component
4288 type by default so we need to use a ref-all pointer if the data
4289 reference does not conflict with the created aggregated data
4290 reference because it is not addressable. */
4291 bool need_ref_all = false;
4292 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4293 get_alias_set (DR_REF (dr))))
4294 need_ref_all = true;
4295 /* Likewise for any of the data references in the stmt group. */
4296 else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
4298 gimple *orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
4301 stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
4302 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4303 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4304 get_alias_set (DR_REF (sdr))))
4306 need_ref_all = true;
4307 break;
4309 orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);
4311 while (orig_stmt);
4313 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4314 need_ref_all);
4315 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4318 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4319 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4320 def-use update cycles for the pointer: one relative to the outer-loop
4321 (LOOP), which is what steps (3) and (4) below do. The other is relative
4322 to the inner-loop (which is the inner-most loop containing the dataref),
4323 and this is done by step (5) below.
4325 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4326 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4327 redundant. Steps (3),(4) create the following:
4329 vp0 = &base_addr;
4330 LOOP: vp1 = phi(vp0,vp2)
4333 vp2 = vp1 + step
4334 goto LOOP
4336 If there is an inner-loop nested in loop, then step (5) will also be
4337 applied, and an additional update in the inner-loop will be created:
4339 vp0 = &base_addr;
4340 LOOP: vp1 = phi(vp0,vp2)
4342 inner: vp3 = phi(vp1,vp4)
4343 vp4 = vp3 + inner_step
4344 if () goto inner
4346 vp2 = vp1 + step
4347 if () goto LOOP */
4349 /* (2) Calculate the initial address of the aggregate-pointer, and set
4350 the aggregate-pointer to point to it before the loop. */
4352 /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader. */
4354 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
4355 offset, byte_offset);
4356 if (new_stmt_list)
4358 if (pe)
4360 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4361 gcc_assert (!new_bb);
4363 else
4364 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4367 *initial_address = new_temp;
4368 aggr_ptr_init = new_temp;
4370 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4371 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4372 inner-loop nested in LOOP (during outer-loop vectorization). */
4374 /* No update in loop is required. */
4375 if (only_init && (!loop_vinfo || at_loop == loop))
4376 aptr = aggr_ptr_init;
4377 else
4379 /* The step of the aggregate pointer is the type size. */
4380 tree iv_step = TYPE_SIZE_UNIT (aggr_type);
4381 /* One exception to the above is when the scalar step of the load in
4382 LOOP is zero. In this case the step here is also zero. */
4383 if (*inv_p)
4384 iv_step = size_zero_node;
4385 else if (tree_int_cst_sgn (step) == -1)
4386 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4388 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4390 create_iv (aggr_ptr_init,
4391 fold_convert (aggr_ptr_type, iv_step),
4392 aggr_ptr, loop, &incr_gsi, insert_after,
4393 &indx_before_incr, &indx_after_incr);
4394 incr = gsi_stmt (incr_gsi);
4395 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4397 /* Copy the points-to information if it exists. */
4398 if (DR_PTR_INFO (dr))
4400 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr);
4401 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr);
4403 if (ptr_incr)
4404 *ptr_incr = incr;
4406 aptr = indx_before_incr;
4409 if (!nested_in_vect_loop || only_init)
4410 return aptr;
4413 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4414 nested in LOOP, if it exists. */
4416 gcc_assert (nested_in_vect_loop);
4417 if (!only_init)
4419 standard_iv_increment_position (containing_loop, &incr_gsi,
4420 &insert_after);
4421 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4422 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4423 &indx_after_incr);
4424 incr = gsi_stmt (incr_gsi);
4425 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4427 /* Copy the points-to information if it exists. */
4428 if (DR_PTR_INFO (dr))
4430 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr);
4431 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr);
4433 if (ptr_incr)
4434 *ptr_incr = incr;
4436 return indx_before_incr;
4438 else
4439 gcc_unreachable ();
4443 /* Function bump_vector_ptr
4445 Increment a pointer (to a vector type) by vector-size. If requested,
4446 i.e. if PTR_INCR is given, then also connect the new increment stmt
4447 to the existing def-use update-chain of the pointer, by modifying
4448 the PTR_INCR as illustrated below:
4450 The pointer def-use update-chain before this function:
4451 DATAREF_PTR = phi (p_0, p_2)
4452 ....
4453 PTR_INCR: p_2 = DATAREF_PTR + step
4455 The pointer def-use update-chain after this function:
4456 DATAREF_PTR = phi (p_0, p_2)
4457 ....
4458 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4459 ....
4460 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4462 Input:
4463 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4464 in the loop.
4465 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4466 the loop. The increment amount across iterations is expected
4467 to be vector_size.
4468 BSI - location where the new update stmt is to be placed.
4469 STMT - the original scalar memory-access stmt that is being vectorized.
4470 BUMP - optional. The offset by which to bump the pointer. If not given,
4471 the offset is assumed to be vector_size.
4473 Output: Return NEW_DATAREF_PTR as illustrated above.
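/* Illustrative sketch, not part of GCC: the access pattern that results from
   bumping the pointer, mirroring the def-use chain shown above.  USE stands
   for an arbitrary access through the pointer; BUMP defaults to the vector
   size in the vectorizer.  The real function rewrites the uses in the GIMPLE
   increment statement rather than generating C like this.  */

static void
example_bumped_pointer_chain (char *p, long bump, long step, int iters,
                              void (*use) (char *))
{
  for (int i = 0; i < iters; i++)
    {
      use (p);              /* access through DATAREF_PTR                  */
      char *np = p + bump;  /* NEW_DATAREF_PTR = DATAREF_PTR + BUMP        */
      use (np);             /* access through the bumped pointer           */
      p = np + step;        /* PTR_INCR: p_2 = NEW_DATAREF_PTR + step      */
    }
}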
4477 tree
4478 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4479 gimple *stmt, tree bump)
4481 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4482 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4483 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4484 tree update = TYPE_SIZE_UNIT (vectype);
4485 gassign *incr_stmt;
4486 ssa_op_iter iter;
4487 use_operand_p use_p;
4488 tree new_dataref_ptr;
4490 if (bump)
4491 update = bump;
4493 if (TREE_CODE (dataref_ptr) == SSA_NAME)
4494 new_dataref_ptr = copy_ssa_name (dataref_ptr);
4495 else
4496 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4497 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4498 dataref_ptr, update);
4499 vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4501 /* Copy the points-to information if it exists. */
4502 if (DR_PTR_INFO (dr))
4504 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4505 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4508 if (!ptr_incr)
4509 return new_dataref_ptr;
4511 /* Update the vector-pointer's cross-iteration increment. */
4512 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4514 tree use = USE_FROM_PTR (use_p);
4516 if (use == dataref_ptr)
4517 SET_USE (use_p, new_dataref_ptr);
4518 else
4519 gcc_assert (tree_int_cst_compare (use, update) == 0);
4522 return new_dataref_ptr;
4526 /* Function vect_create_destination_var.
4528 Create a new temporary of type VECTYPE. */
4530 tree
4531 vect_create_destination_var (tree scalar_dest, tree vectype)
4533 tree vec_dest;
4534 const char *name;
4535 char *new_name;
4536 tree type;
4537 enum vect_var_kind kind;
4539 kind = vectype
4540 ? VECTOR_BOOLEAN_TYPE_P (vectype)
4541 ? vect_mask_var
4542 : vect_simple_var
4543 : vect_scalar_var;
4544 type = vectype ? vectype : TREE_TYPE (scalar_dest);
4546 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
4548 name = get_name (scalar_dest);
4549 if (name)
4550 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
4551 else
4552 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
4553 vec_dest = vect_get_new_vect_var (type, kind, new_name);
4554 free (new_name);
4556 return vec_dest;
4559 /* Function vect_grouped_store_supported.
4561 Returns TRUE if interleave high and interleave low permutations
4562 are supported, and FALSE otherwise. */
4564 bool
4565 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
4567 machine_mode mode = TYPE_MODE (vectype);
4569 /* vect_permute_store_chain requires the group size to be equal to 3 or
4570 be a power of two. */
4571 if (count != 3 && exact_log2 (count) == -1)
4573 if (dump_enabled_p ())
4574 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4575 "the size of the group of accesses"
4576 " is not a power of 2 or not eqaul to 3\n");
4577 return false;
4580 /* Check that the permutation is supported. */
4581 if (VECTOR_MODE_P (mode))
4583 unsigned int i, nelt = GET_MODE_NUNITS (mode);
4584 if (count == 3)
4586 unsigned int j0 = 0, j1 = 0, j2 = 0;
4587 unsigned int i, j;
4589 vec_perm_builder sel (nelt, nelt, 1);
4590 sel.quick_grow (nelt);
4591 vec_perm_indices indices;
4592 for (j = 0; j < 3; j++)
4594 int nelt0 = ((3 - j) * nelt) % 3;
4595 int nelt1 = ((3 - j) * nelt + 1) % 3;
4596 int nelt2 = ((3 - j) * nelt + 2) % 3;
4597 for (i = 0; i < nelt; i++)
4599 if (3 * i + nelt0 < nelt)
4600 sel[3 * i + nelt0] = j0++;
4601 if (3 * i + nelt1 < nelt)
4602 sel[3 * i + nelt1] = nelt + j1++;
4603 if (3 * i + nelt2 < nelt)
4604 sel[3 * i + nelt2] = 0;
4606 indices.new_vector (sel, 2, nelt);
4607 if (!can_vec_perm_const_p (mode, indices))
4609 if (dump_enabled_p ())
4610 dump_printf (MSG_MISSED_OPTIMIZATION,
4611 "permutation op not supported by target.\n");
4612 return false;
4615 for (i = 0; i < nelt; i++)
4617 if (3 * i + nelt0 < nelt)
4618 sel[3 * i + nelt0] = 3 * i + nelt0;
4619 if (3 * i + nelt1 < nelt)
4620 sel[3 * i + nelt1] = 3 * i + nelt1;
4621 if (3 * i + nelt2 < nelt)
4622 sel[3 * i + nelt2] = nelt + j2++;
4624 indices.new_vector (sel, 2, nelt);
4625 if (!can_vec_perm_const_p (mode, indices))
4627 if (dump_enabled_p ())
4628 dump_printf (MSG_MISSED_OPTIMIZATION,
4629 "permutation op not supported by target.\n");
4630 return false;
4633 return true;
4635 else
4637 /* If length is not equal to 3 then only power of 2 is supported. */
4638 gcc_assert (pow2p_hwi (count));
4640 /* The encoding has 2 interleaved stepped patterns. */
4641 vec_perm_builder sel (nelt, 2, 3);
4642 sel.quick_grow (6);
4643 for (i = 0; i < 3; i++)
4645 sel[i * 2] = i;
4646 sel[i * 2 + 1] = i + nelt;
4648 vec_perm_indices indices (sel, 2, nelt);
4649 if (can_vec_perm_const_p (mode, indices))
4651 for (i = 0; i < 6; i++)
4652 sel[i] += nelt / 2;
4653 indices.new_vector (sel, 2, nelt);
4654 if (can_vec_perm_const_p (mode, indices))
4655 return true;
4660 if (dump_enabled_p ())
4661 dump_printf (MSG_MISSED_OPTIMIZATION,
4662 "permutaion op not supported by target.\n");
4663 return false;
4667 /* Return TRUE if vec_store_lanes is available for COUNT vectors of
4668 type VECTYPE. */
4670 bool
4671 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
4673 return vect_lanes_optab_supported_p ("vec_store_lanes",
4674 vec_store_lanes_optab,
4675 vectype, count);
4679 /* Function vect_permute_store_chain.
4681 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4682 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
4683 the data correctly for the stores. Return the final references for stores
4684 in RESULT_CHAIN.
4686 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4687 The input is 4 vectors each containing 8 elements. We assign a number to
4688 each element, the input sequence is:
4690 1st vec: 0 1 2 3 4 5 6 7
4691 2nd vec: 8 9 10 11 12 13 14 15
4692 3rd vec: 16 17 18 19 20 21 22 23
4693 4th vec: 24 25 26 27 28 29 30 31
4695 The output sequence should be:
4697 1st vec: 0 8 16 24 1 9 17 25
4698 2nd vec: 2 10 18 26 3 11 19 27
4699 3rd vec: 4 12 20 28 5 13 21 29
4700 4th vec: 6 14 22 30 7 15 23 31
4702 i.e., we interleave the contents of the four vectors in their order.
4704 We use interleave_high/low instructions to create such output. The input of
4705 each interleave_high/low operation is two vectors:
4706 1st vec 2nd vec
4707 0 1 2 3 4 5 6 7
4708 the even elements of the result vector are obtained left-to-right from the
4709 high/low elements of the first vector. The odd elements of the result are
4710 obtained left-to-right from the high/low elements of the second vector.
4711 The output of interleave_high will be: 0 4 1 5
4712 and of interleave_low: 2 6 3 7
4715 The permutation is done in log LENGTH stages. In each stage interleave_high
4716 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4717 where the first argument is taken from the first half of DR_CHAIN and the
4718 second argument from its second half.
4719 In our example,
4721 I1: interleave_high (1st vec, 3rd vec)
4722 I2: interleave_low (1st vec, 3rd vec)
4723 I3: interleave_high (2nd vec, 4th vec)
4724 I4: interleave_low (2nd vec, 4th vec)
4726 The output for the first stage is:
4728 I1: 0 16 1 17 2 18 3 19
4729 I2: 4 20 5 21 6 22 7 23
4730 I3: 8 24 9 25 10 26 11 27
4731 I4: 12 28 13 29 14 30 15 31
4733 The output of the second stage, i.e. the final result is:
4735 I1: 0 8 16 24 1 9 17 25
4736 I2: 2 10 18 26 3 11 19 27
4737 I3: 4 12 20 28 5 13 21 29
4738 I4: 6 14 22 30 7 15 23 31. */
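/* Illustrative sketch, not part of GCC: the two interleave stages described
   above, performed on plain arrays for the LENGTH == 4, 8-element example.
   Here "high" takes the first halves of its two inputs and "low" the second
   halves, matching the convention of the comment; the real code emits
   VEC_PERM_EXPRs using the permute masks built below.  */

static void
sketch_interleave (const int a[8], const int b[8], int first_half, int out[8])
{
  int from = first_half ? 0 : 4;
  for (int k = 0; k < 4; k++)
    {
      out[2 * k] = a[from + k];
      out[2 * k + 1] = b[from + k];
    }
}

static void
sketch_permute_store_chain (int chain[4][8])
{
  int result[4][8];
  for (int stage = 0; stage < 2; stage++)        /* log2 (LENGTH) stages    */
    {
      for (int j = 0; j < 2; j++)
        {
          sketch_interleave (chain[j], chain[j + 2], 1, result[2 * j]);
          sketch_interleave (chain[j], chain[j + 2], 0, result[2 * j + 1]);
        }
      for (int v = 0; v < 4; v++)                /* copy back into DR_CHAIN */
        for (int e = 0; e < 8; e++)
          chain[v][e] = result[v][e];
    }
}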
4740 void
4741 vect_permute_store_chain (vec<tree> dr_chain,
4742 unsigned int length,
4743 gimple *stmt,
4744 gimple_stmt_iterator *gsi,
4745 vec<tree> *result_chain)
4747 tree vect1, vect2, high, low;
4748 gimple *perm_stmt;
4749 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4750 tree perm_mask_low, perm_mask_high;
4751 tree data_ref;
4752 tree perm3_mask_low, perm3_mask_high;
4753 unsigned int i, n, log_length = exact_log2 (length);
4754 unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype);
4756 result_chain->quick_grow (length);
4757 memcpy (result_chain->address (), dr_chain.address (),
4758 length * sizeof (tree));
4760 if (length == 3)
4762 unsigned int j0 = 0, j1 = 0, j2 = 0;
4764 vec_perm_builder sel (nelt, nelt, 1);
4765 sel.quick_grow (nelt);
4766 vec_perm_indices indices;
4767 for (j = 0; j < 3; j++)
4769 int nelt0 = ((3 - j) * nelt) % 3;
4770 int nelt1 = ((3 - j) * nelt + 1) % 3;
4771 int nelt2 = ((3 - j) * nelt + 2) % 3;
4773 for (i = 0; i < nelt; i++)
4775 if (3 * i + nelt0 < nelt)
4776 sel[3 * i + nelt0] = j0++;
4777 if (3 * i + nelt1 < nelt)
4778 sel[3 * i + nelt1] = nelt + j1++;
4779 if (3 * i + nelt2 < nelt)
4780 sel[3 * i + nelt2] = 0;
4782 indices.new_vector (sel, 2, nelt);
4783 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
4785 for (i = 0; i < nelt; i++)
4787 if (3 * i + nelt0 < nelt)
4788 sel[3 * i + nelt0] = 3 * i + nelt0;
4789 if (3 * i + nelt1 < nelt)
4790 sel[3 * i + nelt1] = 3 * i + nelt1;
4791 if (3 * i + nelt2 < nelt)
4792 sel[3 * i + nelt2] = nelt + j2++;
4794 indices.new_vector (sel, 2, nelt);
4795 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
4797 vect1 = dr_chain[0];
4798 vect2 = dr_chain[1];
4800 /* Create interleaving stmt:
4801 low = VEC_PERM_EXPR <vect1, vect2,
4802 {j, nelt, *, j + 1, nelt + j + 1, *,
4803 j + 2, nelt + j + 2, *, ...}> */
4804 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
4805 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4806 vect2, perm3_mask_low);
4807 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4809 vect1 = data_ref;
4810 vect2 = dr_chain[2];
4811 /* Create interleaving stmt:
4812 low = VEC_PERM_EXPR <vect1, vect2,
4813 {0, 1, nelt + j, 3, 4, nelt + j + 1,
4814 6, 7, nelt + j + 2, ...}> */
4815 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
4816 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4817 vect2, perm3_mask_high);
4818 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4819 (*result_chain)[j] = data_ref;
4822 else
4824 /* If length is not equal to 3 then only power of 2 is supported. */
4825 gcc_assert (pow2p_hwi (length));
4827 /* The encoding has 2 interleaved stepped patterns. */
4828 vec_perm_builder sel (nelt, 2, 3);
4829 sel.quick_grow (6);
4830 for (i = 0; i < 3; i++)
4832 sel[i * 2] = i;
4833 sel[i * 2 + 1] = i + nelt;
4835 vec_perm_indices indices (sel, 2, nelt);
4836 perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
4838 for (i = 0; i < 6; i++)
4839 sel[i] += nelt / 2;
4840 indices.new_vector (sel, 2, nelt);
4841 perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
4843 for (i = 0, n = log_length; i < n; i++)
4845 for (j = 0; j < length/2; j++)
4847 vect1 = dr_chain[j];
4848 vect2 = dr_chain[j+length/2];
4850 /* Create interleaving stmt:
4851 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
4852 ...}> */
4853 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
4854 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
4855 vect2, perm_mask_high);
4856 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4857 (*result_chain)[2*j] = high;
4859 /* Create interleaving stmt:
4860 low = VEC_PERM_EXPR <vect1, vect2,
4861 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
4862 ...}> */
4863 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
4864 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
4865 vect2, perm_mask_low);
4866 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4867 (*result_chain)[2*j+1] = low;
4869 memcpy (dr_chain.address (), result_chain->address (),
4870 length * sizeof (tree));
4875 /* Function vect_setup_realignment
4877 This function is called when vectorizing an unaligned load using
4878 the dr_explicit_realign[_optimized] scheme.
4879 This function generates the following code at the loop prolog:
4881 p = initial_addr;
4882 x msq_init = *(floor(p)); # prolog load
4883 realignment_token = call target_builtin;
4884 loop:
4885 x msq = phi (msq_init, ---)
4887 The stmts marked with x are generated only for the case of
4888 dr_explicit_realign_optimized.
4890 The code above sets up a new (vector) pointer, pointing to the first
4891 location accessed by STMT, and a "floor-aligned" load using that pointer.
4892 It also generates code to compute the "realignment-token" (if the relevant
4893 target hook was defined), and creates a phi-node at the loop-header bb
4894 whose arguments are the result of the prolog-load (created by this
4895 function) and the result of a load that takes place in the loop (to be
4896 created by the caller to this function).
4898 For the case of dr_explicit_realign_optimized:
4899 The caller to this function uses the phi-result (msq) to create the
4900 realignment code inside the loop, and sets up the missing phi argument,
4901 as follows:
4902 loop:
4903 msq = phi (msq_init, lsq)
4904 lsq = *(floor(p')); # load in loop
4905 result = realign_load (msq, lsq, realignment_token);
4907 For the case of dr_explicit_realign:
4908 loop:
4909 msq = *(floor(p)); # load in loop
4910 p' = p + (VS-1);
4911 lsq = *(floor(p')); # load in loop
4912 result = realign_load (msq, lsq, realignment_token);
4914 Input:
4915 STMT - (scalar) load stmt to be vectorized. This load accesses
4916 a memory location that may be unaligned.
4917 BSI - place where new code is to be inserted.
4918 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
4919 is used.
4921 Output:
4922 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
4923 target hook, if defined.
4924 Return value - the result of the loop-header phi node. */
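/* Illustrative sketch, not part of GCC: the realignment idea on a plain
   array, using 4-element "vectors".  MSQ and LSQ are loaded from addresses
   rounded down to a vector boundary and the in-vector shift plays the role
   of the realignment token; the real code emits the target's REALIGN_LOAD
   operation and, in the optimized scheme, carries MSQ across iterations
   through the phi node created below.  */

static void
example_realign_load (const int *base, unsigned off, int out[4])
{
  const int *msq_p = base + (off / 4) * 4;        /* floor (p)              */
  const int *lsq_p = base + ((off + 3) / 4) * 4;  /* floor (p + VS - 1)     */
  unsigned shift = off % 4;                       /* realignment token      */
  for (unsigned i = 0; i < 4; i++)
    out[i] = (shift + i < 4
              ? msq_p[shift + i]                  /* element taken from msq */
              : lsq_p[shift + i - 4]);            /* element taken from lsq */
}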
4926 tree
4927 vect_setup_realignment (gimple *stmt, gimple_stmt_iterator *gsi,
4928 tree *realignment_token,
4929 enum dr_alignment_support alignment_support_scheme,
4930 tree init_addr,
4931 struct loop **at_loop)
4933 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4934 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4935 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4936 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4937 struct loop *loop = NULL;
4938 edge pe = NULL;
4939 tree scalar_dest = gimple_assign_lhs (stmt);
4940 tree vec_dest;
4941 gimple *inc;
4942 tree ptr;
4943 tree data_ref;
4944 basic_block new_bb;
4945 tree msq_init = NULL_TREE;
4946 tree new_temp;
4947 gphi *phi_stmt;
4948 tree msq = NULL_TREE;
4949 gimple_seq stmts = NULL;
4950 bool inv_p;
4951 bool compute_in_loop = false;
4952 bool nested_in_vect_loop = false;
4953 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
4954 struct loop *loop_for_initial_load = NULL;
4956 if (loop_vinfo)
4958 loop = LOOP_VINFO_LOOP (loop_vinfo);
4959 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4962 gcc_assert (alignment_support_scheme == dr_explicit_realign
4963 || alignment_support_scheme == dr_explicit_realign_optimized);
4965 /* We need to generate three things:
4966 1. the misalignment computation
4967 2. the extra vector load (for the optimized realignment scheme).
4968 3. the phi node for the two vectors from which the realignment is
4969 done (for the optimized realignment scheme). */
4971 /* 1. Determine where to generate the misalignment computation.
4973 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
4974 calculation will be generated by this function, outside the loop (in the
4975 preheader). Otherwise, INIT_ADDR had already been computed for us by the
4976 caller, inside the loop.
4978 Background: If the misalignment remains fixed throughout the iterations of
4979 the loop, then both realignment schemes are applicable, and also the
4980 misalignment computation can be done outside LOOP. This is because we are
4981 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
4982 are a multiple of VS (the Vector Size), and therefore the misalignment in
4983 different vectorized LOOP iterations is always the same.
4984 The problem arises only if the memory access is in an inner-loop nested
4985 inside LOOP, which is now being vectorized using outer-loop vectorization.
4986 This is the only case when the misalignment of the memory access may not
4987 remain fixed throughout the iterations of the inner-loop (as explained in
4988 detail in vect_supportable_dr_alignment). In this case, not only is the
4989 optimized realignment scheme not applicable, but also the misalignment
4990 computation (and generation of the realignment token that is passed to
4991 REALIGN_LOAD) have to be done inside the loop.
4993 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
4994 or not, which in turn determines if the misalignment is computed inside
4995 the inner-loop, or outside LOOP. */
4997 if (init_addr != NULL_TREE || !loop_vinfo)
4999 compute_in_loop = true;
5000 gcc_assert (alignment_support_scheme == dr_explicit_realign);
5004 /* 2. Determine where to generate the extra vector load.
5006 For the optimized realignment scheme, instead of generating two vector
5007 loads in each iteration, we generate a single extra vector load in the
5008 preheader of the loop, and in each iteration reuse the result of the
5009 vector load from the previous iteration. In case the memory access is in
5010 an inner-loop nested inside LOOP, which is now being vectorized using
5011 outer-loop vectorization, we need to determine whether this initial vector
5012 load should be generated at the preheader of the inner-loop, or can be
5013 generated at the preheader of LOOP. If the memory access has no evolution
5014 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5015 to be generated inside LOOP (in the preheader of the inner-loop). */
5017 if (nested_in_vect_loop)
5019 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5020 bool invariant_in_outerloop =
5021 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5022 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5024 else
5025 loop_for_initial_load = loop;
5026 if (at_loop)
5027 *at_loop = loop_for_initial_load;
5029 if (loop_for_initial_load)
5030 pe = loop_preheader_edge (loop_for_initial_load);
5032 /* 3. For the case of the optimized realignment, create the first vector
5033 load at the loop preheader. */
5035 if (alignment_support_scheme == dr_explicit_realign_optimized)
5037 /* Create msq_init = *(floor(p1)) in the loop preheader */
5038 gassign *new_stmt;
5040 gcc_assert (!compute_in_loop);
5041 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5042 ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
5043 NULL_TREE, &init_addr, NULL, &inc,
5044 true, &inv_p);
5045 if (TREE_CODE (ptr) == SSA_NAME)
5046 new_temp = copy_ssa_name (ptr);
5047 else
5048 new_temp = make_ssa_name (TREE_TYPE (ptr));
5049 unsigned int align = DR_TARGET_ALIGNMENT (dr);
5050 new_stmt = gimple_build_assign
5051 (new_temp, BIT_AND_EXPR, ptr,
5052 build_int_cst (TREE_TYPE (ptr), -(HOST_WIDE_INT) align));
5053 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5054 gcc_assert (!new_bb);
5055 data_ref
5056 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5057 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5058 new_stmt = gimple_build_assign (vec_dest, data_ref);
5059 new_temp = make_ssa_name (vec_dest, new_stmt);
5060 gimple_assign_set_lhs (new_stmt, new_temp);
5061 if (pe)
5063 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5064 gcc_assert (!new_bb);
5066 else
5067 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5069 msq_init = gimple_assign_lhs (new_stmt);
5072 /* 4. Create realignment token using a target builtin, if available.
5073 It is done either inside the containing loop, or before LOOP (as
5074 determined above). */
5076 if (targetm.vectorize.builtin_mask_for_load)
5078 gcall *new_stmt;
5079 tree builtin_decl;
5081 /* Compute INIT_ADDR - the initial address accessed by this memref. */
5082 if (!init_addr)
5084 /* Generate the INIT_ADDR computation outside LOOP. */
5085 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
5086 NULL_TREE);
5087 if (loop)
5089 pe = loop_preheader_edge (loop);
5090 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5091 gcc_assert (!new_bb);
5093 else
5094 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5097 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5098 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5099 vec_dest =
5100 vect_create_destination_var (scalar_dest,
5101 gimple_call_return_type (new_stmt));
5102 new_temp = make_ssa_name (vec_dest, new_stmt);
5103 gimple_call_set_lhs (new_stmt, new_temp);
5105 if (compute_in_loop)
5106 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5107 else
5109 /* Generate the misalignment computation outside LOOP. */
5110 pe = loop_preheader_edge (loop);
5111 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5112 gcc_assert (!new_bb);
5115 *realignment_token = gimple_call_lhs (new_stmt);
5117 /* The result of the CALL_EXPR to this builtin is determined from
5118 the value of the parameter and no global variables are touched
5119 which makes the builtin a "const" function. Requiring the
5120 builtin to have the "const" attribute makes it unnecessary
5121 to call mark_call_clobbered. */
5122 gcc_assert (TREE_READONLY (builtin_decl));
5125 if (alignment_support_scheme == dr_explicit_realign)
5126 return msq;
5128 gcc_assert (!compute_in_loop);
5129 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5132 /* 5. Create msq = phi <msq_init, lsq> in loop */
5134 pe = loop_preheader_edge (containing_loop);
5135 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5136 msq = make_ssa_name (vec_dest);
5137 phi_stmt = create_phi_node (msq, containing_loop->header);
5138 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5140 return msq;
5144 /* Function vect_grouped_load_supported.
5146 COUNT is the size of the load group (the number of statements plus the
5147 number of gaps). SINGLE_ELEMENT_P is true if there is actually
5148 only one statement, with a gap of COUNT - 1.
5150 Returns true if a suitable permute exists. */
5152 bool
5153 vect_grouped_load_supported (tree vectype, bool single_element_p,
5154 unsigned HOST_WIDE_INT count)
5156 machine_mode mode = TYPE_MODE (vectype);
5158 /* If this is single-element interleaving with an element distance
5159 that leaves unused vector loads around, punt - we at least create
5160 very sub-optimal code in that case (and blow up memory,
5161 see PR65518). */
5162 if (single_element_p && count > TYPE_VECTOR_SUBPARTS (vectype))
5164 if (dump_enabled_p ())
5165 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5166 "single-element interleaving not supported "
5167 "for not adjacent vector loads\n");
5168 return false;
5171 /* vect_permute_load_chain requires the group size to be equal to 3 or
5172 be a power of two. */
5173 if (count != 3 && exact_log2 (count) == -1)
5175 if (dump_enabled_p ())
5176 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5177 "the size of the group of accesses"
5178 " is not a power of 2 or not equal to 3\n");
5179 return false;
5182 /* Check that the permutation is supported. */
5183 if (VECTOR_MODE_P (mode))
5185 unsigned int i, j, nelt = GET_MODE_NUNITS (mode);
5187 if (count == 3)
5189 vec_perm_builder sel (nelt, nelt, 1);
5190 sel.quick_grow (nelt);
5191 vec_perm_indices indices;
5192 unsigned int k;
5193 for (k = 0; k < 3; k++)
5195 for (i = 0; i < nelt; i++)
5196 if (3 * i + k < 2 * nelt)
5197 sel[i] = 3 * i + k;
5198 else
5199 sel[i] = 0;
5200 indices.new_vector (sel, 2, nelt);
5201 if (!can_vec_perm_const_p (mode, indices))
5203 if (dump_enabled_p ())
5204 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5205 "shuffle of 3 loads is not supported by"
5206 " target\n");
5207 return false;
5209 for (i = 0, j = 0; i < nelt; i++)
5210 if (3 * i + k < 2 * nelt)
5211 sel[i] = i;
5212 else
5213 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5214 indices.new_vector (sel, 2, nelt);
5215 if (!can_vec_perm_const_p (mode, indices))
5217 if (dump_enabled_p ())
5218 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5219 "shuffle of 3 loads is not supported by"
5220 " target\n");
5221 return false;
5224 return true;
5226 else
5228 /* If length is not equal to 3 then only power of 2 is supported. */
5229 gcc_assert (pow2p_hwi (count));
5231 /* The encoding has a single stepped pattern. */
5232 vec_perm_builder sel (nelt, 1, 3);
5233 sel.quick_grow (3);
5234 for (i = 0; i < 3; i++)
5235 sel[i] = i * 2;
5236 vec_perm_indices indices (sel, 2, nelt);
5237 if (can_vec_perm_const_p (mode, indices))
5239 for (i = 0; i < 3; i++)
5240 sel[i] = i * 2 + 1;
5241 indices.new_vector (sel, 2, nelt);
5242 if (can_vec_perm_const_p (mode, indices))
5243 return true;
5248 if (dump_enabled_p ())
5249 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5250 "extract even/odd not supported by target\n");
5251 return false;
5254 /* Return TRUE if vec_load_lanes is available for COUNT vectors of
5255 type VECTYPE. */
5257 bool
5258 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
5260 return vect_lanes_optab_supported_p ("vec_load_lanes",
5261 vec_load_lanes_optab,
5262 vectype, count);
5265 /* Function vect_permute_load_chain.
5267 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5268 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5269 the input data correctly. Return the final references for loads in
5270 RESULT_CHAIN.
5272 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5273 The input is 4 vectors each containing 8 elements. We assign a number to each
5274 element, the input sequence is:
5276 1st vec: 0 1 2 3 4 5 6 7
5277 2nd vec: 8 9 10 11 12 13 14 15
5278 3rd vec: 16 17 18 19 20 21 22 23
5279 4th vec: 24 25 26 27 28 29 30 31
5281 The output sequence should be:
5283 1st vec: 0 4 8 12 16 20 24 28
5284 2nd vec: 1 5 9 13 17 21 25 29
5285 3rd vec: 2 6 10 14 18 22 26 30
5286 4th vec: 3 7 11 15 19 23 27 31
5288 i.e., the first output vector should contain the first elements of each
5289 interleaving group, etc.
5291 We use extract_even/odd instructions to create such output. The input of
5292 each extract_even/odd operation is two vectors
5293 1st vec 2nd vec
5294 0 1 2 3 4 5 6 7
5296 and the output is the vector of extracted even/odd elements. The output of
5297 extract_even will be: 0 2 4 6
5298 and of extract_odd: 1 3 5 7
5301 The permutation is done in log LENGTH stages. In each stage extract_even
5302 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5303 their order. In our example,
5305 E1: extract_even (1st vec, 2nd vec)
5306 E2: extract_odd (1st vec, 2nd vec)
5307 E3: extract_even (3rd vec, 4th vec)
5308 E4: extract_odd (3rd vec, 4th vec)
5310 The output for the first stage will be:
5312 E1: 0 2 4 6 8 10 12 14
5313 E2: 1 3 5 7 9 11 13 15
5314 E3: 16 18 20 22 24 26 28 30
5315 E4: 17 19 21 23 25 27 29 31
5317 In order to proceed and create the correct sequence for the next stage (or
5318 for the correct output, if the second stage is the last one, as in our
5319 example), we first put the output of extract_even operation and then the
5320 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5321 The input for the second stage is:
5323 1st vec (E1): 0 2 4 6 8 10 12 14
5324 2nd vec (E3): 16 18 20 22 24 26 28 30
5325 3rd vec (E2): 1 3 5 7 9 11 13 15
5326 4th vec (E4): 17 19 21 23 25 27 29 31
5328 The output of the second stage:
5330 E1: 0 4 8 12 16 20 24 28
5331 E2: 2 6 10 14 18 22 26 30
5332 E3: 1 5 9 13 17 21 25 29
5333 E4: 3 7 11 15 19 23 27 31
5335 And RESULT_CHAIN after reordering:
5337 1st vec (E1): 0 4 8 12 16 20 24 28
5338 2nd vec (E3): 1 5 9 13 17 21 25 29
5339 3rd vec (E2): 2 6 10 14 18 22 26 30
5340 4th vec (E4): 3 7 11 15 19 23 27 31. */
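/* Illustrative sketch, not part of GCC: the extract-even/odd stages described
   above, performed on plain arrays for the LENGTH == 4, 8-element example.
   Each extract keeps the even or odd lanes of the concatenation of its two
   inputs; the real code emits VEC_PERM_EXPRs using the masks built below.  */

static void
sketch_extract (const int a[8], const int b[8], int odd, int out[8])
{
  for (int k = 0; k < 8; k++)
    {
      int lane = 2 * k + odd;                     /* lane in the 16-element */
      out[k] = lane < 8 ? a[lane] : b[lane - 8];  /* concatenation of A, B  */
    }
}

static void
sketch_permute_load_chain (int chain[4][8])
{
  int result[4][8];
  for (int stage = 0; stage < 2; stage++)         /* log2 (LENGTH) stages    */
    {
      for (int j = 0; j < 4; j += 2)
        {
          sketch_extract (chain[j], chain[j + 1], 0, result[j / 2]);
          sketch_extract (chain[j], chain[j + 1], 1, result[j / 2 + 2]);
        }
      for (int v = 0; v < 4; v++)                 /* copy back into DR_CHAIN */
        for (int e = 0; e < 8; e++)
          chain[v][e] = result[v][e];
    }
}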
5342 static void
5343 vect_permute_load_chain (vec<tree> dr_chain,
5344 unsigned int length,
5345 gimple *stmt,
5346 gimple_stmt_iterator *gsi,
5347 vec<tree> *result_chain)
5349 tree data_ref, first_vect, second_vect;
5350 tree perm_mask_even, perm_mask_odd;
5351 tree perm3_mask_low, perm3_mask_high;
5352 gimple *perm_stmt;
5353 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5354 unsigned int i, j, log_length = exact_log2 (length);
5355 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5357 result_chain->quick_grow (length);
5358 memcpy (result_chain->address (), dr_chain.address (),
5359 length * sizeof (tree));
5361 if (length == 3)
5363 unsigned int k;
5365 vec_perm_builder sel (nelt, nelt, 1);
5366 sel.quick_grow (nelt);
5367 vec_perm_indices indices;
5368 for (k = 0; k < 3; k++)
5370 for (i = 0; i < nelt; i++)
5371 if (3 * i + k < 2 * nelt)
5372 sel[i] = 3 * i + k;
5373 else
5374 sel[i] = 0;
5375 indices.new_vector (sel, 2, nelt);
5376 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5378 for (i = 0, j = 0; i < nelt; i++)
5379 if (3 * i + k < 2 * nelt)
5380 sel[i] = i;
5381 else
5382 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5383 indices.new_vector (sel, 2, nelt);
5384 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5386 first_vect = dr_chain[0];
5387 second_vect = dr_chain[1];
5389 /* Create interleaving stmt (low part of):
5390 low = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5391 ...}> */
5392 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5393 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5394 second_vect, perm3_mask_low);
5395 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5397 /* Create interleaving stmt (high part of):
5398 high = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5399 ...}> */
5400 first_vect = data_ref;
5401 second_vect = dr_chain[2];
5402 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5403 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5404 second_vect, perm3_mask_high);
5405 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5406 (*result_chain)[k] = data_ref;
5409 else
5411 /* If length is not equal to 3, then only a power of 2 is supported. */
5412 gcc_assert (pow2p_hwi (length));
5414 /* The encoding has a single stepped pattern. */
5415 vec_perm_builder sel (nelt, 1, 3);
5416 sel.quick_grow (3);
5417 for (i = 0; i < 3; ++i)
5418 sel[i] = i * 2;
5419 vec_perm_indices indices (sel, 2, nelt);
5420 perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);
5422 for (i = 0; i < 3; ++i)
5423 sel[i] = i * 2 + 1;
5424 indices.new_vector (sel, 2, nelt);
5425 perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
5427 for (i = 0; i < log_length; i++)
5429 for (j = 0; j < length; j += 2)
5431 first_vect = dr_chain[j];
5432 second_vect = dr_chain[j+1];
5434 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5435 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5436 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5437 first_vect, second_vect,
5438 perm_mask_even);
5439 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5440 (*result_chain)[j/2] = data_ref;
5442 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5443 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5444 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5445 first_vect, second_vect,
5446 perm_mask_odd);
5447 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5448 (*result_chain)[j/2+length/2] = data_ref;
5450 memcpy (dr_chain.address (), result_chain->address (),
5451 length * sizeof (tree));
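/* Illustrative scalar model only (not part of the vectorizer): the
   power-of-2 branch of vect_permute_load_chain above, with every vector
   replaced by an array of NELT element numbers.  CHAIN holds LENGTH such
   arrays laid out contiguously and SCRATCH is a caller-provided buffer of
   the same size.  After the log2(LENGTH) stages the K-th array holds the
   K-th, (K+LENGTH)-th, (K+2*LENGTH)-th, ... scalar elements, matching the
   example in the function comment.  Names are hypothetical.  */

static void
sketch_permute_load_chain (unsigned int *chain, unsigned int *scratch,
                           unsigned int length, unsigned int nelt)
{
  unsigned int stage, i, j;

  for (stage = 1; stage < length; stage *= 2)
    {
      for (j = 0; j < length; j += 2)
        {
          unsigned int *first = chain + j * nelt;
          unsigned int *second = chain + (j + 1) * nelt;
          unsigned int *even = scratch + (j / 2) * nelt;
          unsigned int *odd = scratch + (j / 2 + length / 2) * nelt;

          /* Model of the extract_even / extract_odd permutations on the
             concatenation of FIRST and SECOND.  */
          for (i = 0; i < nelt; i++)
            {
              even[i] = (2 * i < nelt
                         ? first[2 * i] : second[2 * i - nelt]);
              odd[i] = (2 * i + 1 < nelt
                        ? first[2 * i + 1] : second[2 * i + 1 - nelt]);
            }
        }
      /* Like the memcpy back into DR_CHAIN above.  */
      for (i = 0; i < length * nelt; i++)
        chain[i] = scratch[i];
    }
}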
5456 /* Function vect_shift_permute_load_chain.
5458 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
5459 a sequence of stmts to reorder the input data accordingly.
5460 Return the final references for loads in RESULT_CHAIN.
5461 Return true if successful, false otherwise.
5463 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5464 The input is 3 vectors each containing 8 elements. We assign a
5465 number to each element; the input sequence is:
5467 1st vec: 0 1 2 3 4 5 6 7
5468 2nd vec: 8 9 10 11 12 13 14 15
5469 3rd vec: 16 17 18 19 20 21 22 23
5471 The output sequence should be:
5473 1st vec: 0 3 6 9 12 15 18 21
5474 2nd vec: 1 4 7 10 13 16 19 22
5475 3rd vec: 2 5 8 11 14 17 20 23
5477 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
5479 First we shuffle all 3 vectors to get correct elements order:
5481 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
5482 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
5483 3rd vec: (16 19 22) (17 20 23) (18 21)
5485 Next we unite and shift vector 3 times:
5487 1st step:
5488 shift right by 6 the concatenation of:
5489 "1st vec" and "2nd vec"
5490 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
5491 "2nd vec" and "3rd vec"
5492 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
5493 "3rd vec" and "1st vec"
5494 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
5495 | New vectors |
5497 So that now new vectors are:
5499 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
5500 2nd vec: (10 13) (16 19 22) (17 20 23)
5501 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
5503 2nd step:
5504 shift right by 5 the concatenation of:
5505 "1st vec" and "3rd vec"
5506 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
5507 "2nd vec" and "1st vec"
5508 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
5509 "3rd vec" and "2nd vec"
5510 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
5511 | New vectors |
5513 So that now new vectors are:
5515 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
5516 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
5517 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
5519 3rd step:
5520 shift right by 5 the concatenation of:
5521 "1st vec" and "1st vec"
5522 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
5523 shift right by 3 the concatenation of:
5524 "2nd vec" and "2nd vec"
5525 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
5526 | New vectors |
5528 So that now all vectors are READY:
5529 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
5530 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
5531 3rd vec: ( 1 4 7) (10 13) (16 19 22)
5533 This algorithm is faster than the one in vect_permute_load_chain if:
5534 1. "shift of a concatenation" is faster than general permutation.
5535 This is usually so.
5536 2. The TARGET machine can't execute vector instructions in parallel.
5537 This is because each step of the algorithm depends on the previous one.
5538 The algorithm in vect_permute_load_chain is much more parallel.
5540 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
5543 static bool
5544 vect_shift_permute_load_chain (vec<tree> dr_chain,
5545 unsigned int length,
5546 gimple *stmt,
5547 gimple_stmt_iterator *gsi,
5548 vec<tree> *result_chain)
5550 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
5551 tree perm2_mask1, perm2_mask2, perm3_mask;
5552 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
5553 gimple *perm_stmt;
5555 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5556 unsigned int i;
5557 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5558 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5559 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5561 unsigned HOST_WIDE_INT vf;
5562 if (!LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
5563 /* Not supported for variable-length vectors. */
5564 return false;
5566 vec_perm_builder sel (nelt, nelt, 1);
5567 sel.quick_grow (nelt);
5569 result_chain->quick_grow (length);
5570 memcpy (result_chain->address (), dr_chain.address (),
5571 length * sizeof (tree));
5573 if (pow2p_hwi (length) && vf > 4)
5575 unsigned int j, log_length = exact_log2 (length);
5576 for (i = 0; i < nelt / 2; ++i)
5577 sel[i] = i * 2;
5578 for (i = 0; i < nelt / 2; ++i)
5579 sel[nelt / 2 + i] = i * 2 + 1;
5580 vec_perm_indices indices (sel, 2, nelt);
5581 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5583 if (dump_enabled_p ())
5584 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5585 "shuffle of 2 fields structure is not \
5586 supported by target\n");
5587 return false;
5589 perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);
5591 for (i = 0; i < nelt / 2; ++i)
5592 sel[i] = i * 2 + 1;
5593 for (i = 0; i < nelt / 2; ++i)
5594 sel[nelt / 2 + i] = i * 2;
5595 indices.new_vector (sel, 2, nelt);
5596 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5598 if (dump_enabled_p ())
5599 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5600 "shuffle of 2 fields structure is not \
5601 supported by target\n");
5602 return false;
5604 perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);
5606 /* Generating permutation constant to shift all elements.
5607 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
5608 for (i = 0; i < nelt; i++)
5609 sel[i] = nelt / 2 + i;
5610 indices.new_vector (sel, 2, nelt);
5611 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5613 if (dump_enabled_p ())
5614 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5615 "shift permutation is not supported by target\n");
5616 return false;
5618 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
5620 /* Generating permutation constant to select vector from 2.
5621 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
5622 for (i = 0; i < nelt / 2; i++)
5623 sel[i] = i;
5624 for (i = nelt / 2; i < nelt; i++)
5625 sel[i] = nelt + i;
5626 indices.new_vector (sel, 2, nelt);
5627 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5629 if (dump_enabled_p ())
5630 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5631 "select is not supported by target\n");
5632 return false;
5634 select_mask = vect_gen_perm_mask_checked (vectype, indices);
5636 for (i = 0; i < log_length; i++)
5638 for (j = 0; j < length; j += 2)
5640 first_vect = dr_chain[j];
5641 second_vect = dr_chain[j + 1];
5643 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5644 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5645 first_vect, first_vect,
5646 perm2_mask1);
5647 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5648 vect[0] = data_ref;
5650 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5651 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5652 second_vect, second_vect,
5653 perm2_mask2);
5654 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5655 vect[1] = data_ref;
5657 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
5658 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5659 vect[0], vect[1], shift1_mask);
5660 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5661 (*result_chain)[j/2 + length/2] = data_ref;
5663 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
5664 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5665 vect[0], vect[1], select_mask);
5666 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5667 (*result_chain)[j/2] = data_ref;
5669 memcpy (dr_chain.address (), result_chain->address (),
5670 length * sizeof (tree));
5672 return true;
5674 if (length == 3 && vf > 2)
5676 unsigned int k = 0, l = 0;
5678 /* Generating permutation constant to get all elements in the right order.
5679 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
5680 for (i = 0; i < nelt; i++)
5682 if (3 * k + (l % 3) >= nelt)
5684 k = 0;
5685 l += (3 - (nelt % 3));
5687 sel[i] = 3 * k + (l % 3);
5688 k++;
5690 vec_perm_indices indices (sel, 2, nelt);
5691 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5693 if (dump_enabled_p ())
5694 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5695 "shuffle of 3 fields structure is not \
5696 supported by target\n");
5697 return false;
5699 perm3_mask = vect_gen_perm_mask_checked (vectype, indices);
5701 /* Generating permutation constant to shift all elements.
5702 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
5703 for (i = 0; i < nelt; i++)
5704 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
5705 indices.new_vector (sel, 2, nelt);
5706 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5708 if (dump_enabled_p ())
5709 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5710 "shift permutation is not supported by target\n");
5711 return false;
5713 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
5715 /* Generating permutation constant to shift all elements.
5716 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5717 for (i = 0; i < nelt; i++)
5718 sel[i] = 2 * (nelt / 3) + 1 + i;
5719 indices.new_vector (sel, 2, nelt);
5720 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5722 if (dump_enabled_p ())
5723 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5724 "shift permutation is not supported by target\n");
5725 return false;
5727 shift2_mask = vect_gen_perm_mask_checked (vectype, indices);
5729 /* Generating permutation constant to shift all elements.
5730 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
5731 for (i = 0; i < nelt; i++)
5732 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
5733 indices.new_vector (sel, 2, nelt);
5734 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5736 if (dump_enabled_p ())
5737 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5738 "shift permutation is not supported by target\n");
5739 return false;
5741 shift3_mask = vect_gen_perm_mask_checked (vectype, indices);
5743 /* Generating permutation constant to shift all elements.
5744 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5745 for (i = 0; i < nelt; i++)
5746 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
5747 indices.new_vector (sel, 2, nelt);
5748 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5750 if (dump_enabled_p ())
5751 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5752 "shift permutation is not supported by target\n");
5753 return false;
5755 shift4_mask = vect_gen_perm_mask_checked (vectype, indices);
5757 for (k = 0; k < 3; k++)
5759 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
5760 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5761 dr_chain[k], dr_chain[k],
5762 perm3_mask);
5763 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5764 vect[k] = data_ref;
5767 for (k = 0; k < 3; k++)
5769 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
5770 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5771 vect[k % 3], vect[(k + 1) % 3],
5772 shift1_mask);
5773 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5774 vect_shift[k] = data_ref;
5777 for (k = 0; k < 3; k++)
5779 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
5780 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5781 vect_shift[(4 - k) % 3],
5782 vect_shift[(3 - k) % 3],
5783 shift2_mask);
5784 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5785 vect[k] = data_ref;
5788 (*result_chain)[3 - (nelt % 3)] = vect[2];
5790 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
5791 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
5792 vect[0], shift3_mask);
5793 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5794 (*result_chain)[nelt % 3] = data_ref;
5796 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
5797 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
5798 vect[1], shift4_mask);
5799 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5800 (*result_chain)[0] = data_ref;
5801 return true;
5803 return false;
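/* Illustrative scalar model only (not part of the vectorizer): one pair
   of the power-of-2 branch of vect_shift_permute_load_chain above, with
   vectors modeled as arrays of NELT element numbers.  FIRST and SECOND are
   one pair from the load chain; OUT_EVEN and OUT_ODD receive what the real
   code stores at result positions j/2 and j/2 + length/2.  NELT is assumed
   to be at most 64.  Names are hypothetical.  */

static void
sketch_shift_permute_pair (const unsigned int *first,
                           const unsigned int *second,
                           unsigned int nelt,
                           unsigned int *out_even, unsigned int *out_odd)
{
  unsigned int v0[64], v1[64];
  unsigned int i;

  /* perm2_mask1 applied to (FIRST, FIRST): evens of FIRST, then odds.  */
  for (i = 0; i < nelt / 2; i++)
    {
      v0[i] = first[2 * i];
      v0[nelt / 2 + i] = first[2 * i + 1];
    }

  /* perm2_mask2 applied to (SECOND, SECOND): odds of SECOND, then evens.  */
  for (i = 0; i < nelt / 2; i++)
    {
      v1[i] = second[2 * i + 1];
      v1[nelt / 2 + i] = second[2 * i];
    }

  /* shift1_mask applied to (V0, V1): drop the first NELT/2 elements of the
     concatenation, leaving odds of FIRST followed by odds of SECOND.  */
  for (i = 0; i < nelt / 2; i++)
    {
      out_odd[i] = v0[nelt / 2 + i];
      out_odd[nelt / 2 + i] = v1[i];
    }

  /* select_mask applied to (V0, V1): first half of V0 and second half of
     V1, i.e. evens of FIRST followed by evens of SECOND.  */
  for (i = 0; i < nelt / 2; i++)
    {
      out_even[i] = v0[i];
      out_even[nelt / 2 + i] = v1[nelt / 2 + i];
    }
}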
5806 /* Function vect_transform_grouped_load.
5808 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
5809 to perform their permutation and ascribe the resulting vectorized statements to
5810 the scalar statements.
5813 void
5814 vect_transform_grouped_load (gimple *stmt, vec<tree> dr_chain, int size,
5815 gimple_stmt_iterator *gsi)
5817 machine_mode mode;
5818 vec<tree> result_chain = vNULL;
5820 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
5821 RESULT_CHAIN is the output of vect_permute_load_chain; it contains permuted
5822 vectors that are ready for vector computation. */
5823 result_chain.create (size);
5825 /* If the reassociation width for the vector type is 2 or greater, the target
5826 machine can execute 2 or more vector instructions in parallel. Otherwise
5827 try to get the chain for the load group using vect_shift_permute_load_chain. */
5828 mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
5829 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
5830 || pow2p_hwi (size)
5831 || !vect_shift_permute_load_chain (dr_chain, size, stmt,
5832 gsi, &result_chain))
5833 vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
5834 vect_record_grouped_load_vectors (stmt, result_chain);
5835 result_chain.release ();
5838 /* RESULT_CHAIN contains the output of a group of grouped loads that were
5839 generated as part of the vectorization of STMT. Assign the statement
5840 for each vector to the associated scalar statement. */
5842 void
5843 vect_record_grouped_load_vectors (gimple *stmt, vec<tree> result_chain)
5845 gimple *first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
5846 gimple *next_stmt, *new_stmt;
5847 unsigned int i, gap_count;
5848 tree tmp_data_ref;
5850 /* Put a permuted data-ref in the VECTORIZED_STMT field.
5851 Since we scan the chain starting from its first node, their order
5852 corresponds to the order of data-refs in RESULT_CHAIN. */
5853 next_stmt = first_stmt;
5854 gap_count = 1;
5855 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
5857 if (!next_stmt)
5858 break;
5860 /* Skip the gaps. Loads created for the gaps will be removed by the dead
5861 code elimination pass later. No need to check for the first stmt in
5862 the group, since it always exists.
5863 GROUP_GAP is the number of steps in elements from the previous
5864 access (if there is no gap GROUP_GAP is 1). We skip loads that
5865 correspond to the gaps. */
5866 if (next_stmt != first_stmt
5867 && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
5869 gap_count++;
5870 continue;
5873 while (next_stmt)
5875 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
5876 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
5877 copies, and we put the new vector statement in the first available
5878 RELATED_STMT. */
5879 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
5880 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
5881 else
5883 if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5885 gimple *prev_stmt =
5886 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
5887 gimple *rel_stmt =
5888 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
5889 while (rel_stmt)
5891 prev_stmt = rel_stmt;
5892 rel_stmt =
5893 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
5896 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
5897 new_stmt;
5901 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
5902 gap_count = 1;
5903 /* If NEXT_STMT accesses the same DR as the previous statement,
5904 put the same TMP_DATA_REF as its vectorized statement; otherwise
5905 get the next data-ref from RESULT_CHAIN. */
5906 if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5907 break;
5912 /* Function vect_can_force_dr_alignment_p.
5914 Returns whether the alignment of a DECL can be forced to be aligned
5915 on an ALIGNMENT-bit boundary. */
5917 bool
5918 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
5920 if (!VAR_P (decl))
5921 return false;
5923 if (decl_in_symtab_p (decl)
5924 && !symtab_node::get (decl)->can_increase_alignment_p ())
5925 return false;
5927 if (TREE_STATIC (decl))
5928 return (alignment <= MAX_OFILE_ALIGNMENT);
5929 else
5930 return (alignment <= MAX_STACK_ALIGNMENT);
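/* Minimal usage sketch only (not part of the vectorizer): a hypothetical
   caller that raises a variable's alignment to the vector alignment when
   the predicate above says that is possible.  Whether to also set
   DECL_USER_ALIGN is an assumption made for the sketch.  */

static void
sketch_force_vector_alignment (tree decl, tree vectype)
{
  unsigned int align = TYPE_ALIGN (vectype);

  if (vect_can_force_dr_alignment_p (decl, align)
      && DECL_ALIGN (decl) < align)
    {
      SET_DECL_ALIGN (decl, align);
      DECL_USER_ALIGN (decl) = 1;
    }
}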
5934 /* Return whether the data reference DR is supported with respect to its
5935 alignment.
5936 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
5937 if it is aligned, i.e., check if it is possible to vectorize it with different
5938 alignment. */
5940 enum dr_alignment_support
5941 vect_supportable_dr_alignment (struct data_reference *dr,
5942 bool check_aligned_accesses)
5944 gimple *stmt = DR_STMT (dr);
5945 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5946 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5947 machine_mode mode = TYPE_MODE (vectype);
5948 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5949 struct loop *vect_loop = NULL;
5950 bool nested_in_vect_loop = false;
5952 if (aligned_access_p (dr) && !check_aligned_accesses)
5953 return dr_aligned;
5955 /* For now assume all conditional loads/stores support unaligned
5956 access without any special code. */
5957 if (is_gimple_call (stmt)
5958 && gimple_call_internal_p (stmt)
5959 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
5960 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
5961 return dr_unaligned_supported;
5963 if (loop_vinfo)
5965 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
5966 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
5969 /* Possibly unaligned access. */
5971 /* We can choose between using the implicit realignment scheme (generating
5972 a misaligned_move stmt) and the explicit realignment scheme (generating
5973 aligned loads with a REALIGN_LOAD). There are two variants of the
5974 explicit realignment scheme: optimized and unoptimized.
5975 We can optimize the realignment only if the step between consecutive
5976 vector loads is equal to the vector size. Since the vector memory
5977 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
5978 is guaranteed that the misalignment amount remains the same throughout the
5979 execution of the vectorized loop. Therefore, we can create the
5980 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
5981 at the loop preheader.
5983 However, in the case of outer-loop vectorization, when vectorizing a
5984 memory access in the inner-loop nested within the LOOP that is now being
5985 vectorized, while it is guaranteed that the misalignment of the
5986 vectorized memory access will remain the same in different outer-loop
5987 iterations, it is *not* guaranteed that it will remain the same throughout
5988 the execution of the inner-loop. This is because the inner-loop advances
5989 with the original scalar step (and not in steps of VS). If the inner-loop
5990 step happens to be a multiple of VS, then the misalignment remains fixed
5991 and we can use the optimized realignment scheme. For example:
5993 for (i=0; i<N; i++)
5994 for (j=0; j<M; j++)
5995 s += a[i+j];
5997 When vectorizing the i-loop in the above example, the step between
5998 consecutive vector loads is 1, and so the misalignment does not remain
5999 fixed across the execution of the inner-loop, and the realignment cannot
6000 be optimized (as illustrated in the following pseudo vectorized loop):
6002 for (i=0; i<N; i+=4)
6003 for (j=0; j<M; j++){
6004 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
6005 // when j is {0,1,2,3,4,5,6,7,...} respectively.
6006 // (assuming that we start from an aligned address).
6009 We therefore have to use the unoptimized realignment scheme:
6011 for (i=0; i<N; i+=4)
6012 for (j=k; j<M; j+=4)
6013 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
6014 // that the misalignment of the initial address is
6015 // 0).
6017 The loop can then be vectorized as follows:
6019 for (k=0; k<4; k++){
6020 rt = get_realignment_token (&vp[k]);
6021 for (i=0; i<N; i+=4){
6022 v1 = vp[i+k];
6023 for (j=k; j<M; j+=4){
6024 v2 = vp[i+j+VS-1];
6025 va = REALIGN_LOAD <v1,v2,rt>;
6026 vs += va;
6027 v1 = v2;
6030 } */
6032 if (DR_IS_READ (dr))
6034 bool is_packed = false;
6035 tree type = (TREE_TYPE (DR_REF (dr)));
6037 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
6038 && (!targetm.vectorize.builtin_mask_for_load
6039 || targetm.vectorize.builtin_mask_for_load ()))
6041 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6043 /* If we are doing SLP then the accesses need not have the
6044 same alignment; instead it depends on the SLP group size. */
6045 if (loop_vinfo
6046 && STMT_SLP_TYPE (stmt_info)
6047 && !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6048 * GROUP_SIZE (vinfo_for_stmt
6049 (GROUP_FIRST_ELEMENT (stmt_info))),
6050 TYPE_VECTOR_SUBPARTS (vectype)))
6052 else if (!loop_vinfo
6053 || (nested_in_vect_loop
6054 && (TREE_INT_CST_LOW (DR_STEP (dr))
6055 != GET_MODE_SIZE (TYPE_MODE (vectype)))))
6056 return dr_explicit_realign;
6057 else
6058 return dr_explicit_realign_optimized;
6060 if (!known_alignment_for_access_p (dr))
6061 is_packed = not_size_aligned (DR_REF (dr));
6063 if (targetm.vectorize.support_vector_misalignment
6064 (mode, type, DR_MISALIGNMENT (dr), is_packed))
6065 /* Can't software pipeline the loads, but can at least do them. */
6066 return dr_unaligned_supported;
6068 else
6070 bool is_packed = false;
6071 tree type = (TREE_TYPE (DR_REF (dr)));
6073 if (!known_alignment_for_access_p (dr))
6074 is_packed = not_size_aligned (DR_REF (dr));
6076 if (targetm.vectorize.support_vector_misalignment
6077 (mode, type, DR_MISALIGNMENT (dr), is_packed))
6078 return dr_unaligned_supported;
6081 /* Unsupported. */
6082 return dr_unaligned_unsupported;
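/* Illustrative scalar model only (not part of the vectorizer): the
   REALIGN_LOAD step that the explicit realignment schemes returned above
   rely on, with vectors modeled as arrays of NELT elements and the
   realignment token modeled as the misalignment OFFSET in elements.  V1
   and V2 are two consecutive aligned loads that together cover the desired
   misaligned vector; the result is the OFFSET..OFFSET+NELT-1 window of
   their concatenation.  Names are hypothetical.  */

static void
sketch_realign_load (const unsigned int *v1, const unsigned int *v2,
                     unsigned int nelt, unsigned int offset,
                     unsigned int *result)
{
  unsigned int i;

  for (i = 0; i < nelt; i++)
    result[i] = (offset + i < nelt
                 ? v1[offset + i] : v2[offset + i - nelt]);
}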