poly_int: vectoriser vf and uf
[official-gcc.git] / gcc / tree-vect-data-refs.c
blob: 02a32950ee34fc12a42a0c7b759c06a3555b6cf8
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2017 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "memmodel.h"
32 #include "tm_p.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "cgraph.h"
36 #include "dumpfile.h"
37 #include "alias.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "tree-eh.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop.h"
47 #include "cfgloop.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
50 #include "expr.h"
51 #include "builtins.h"
52 #include "params.h"
53 #include "tree-cfg.h"
54 #include "tree-hash-traits.h"
55 #include "vec-perm-indices.h"
57 /* Return true if load- or store-lanes optab OPTAB is implemented for
58 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
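   /* For example (illustrative numbers only): for COUNT == 2 vectors of a
      16-byte V4SI VECTYPE the array occupies 256 bits, so on a target that
      provides a 256-bit integer mode (OImode) this checks whether the
      corresponding vec_load_lanes<OI><V4SI> / vec_store_lanes<OI><V4SI>
      pattern is implemented.  */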
60 static bool
61 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
62 tree vectype, unsigned HOST_WIDE_INT count)
64 machine_mode mode;
65 scalar_int_mode array_mode;
66 bool limit_p;
68 mode = TYPE_MODE (vectype);
69 limit_p = !targetm.array_mode_supported_p (mode, count);
70 if (!int_mode_for_size (count * GET_MODE_BITSIZE (mode),
71 limit_p).exists (&array_mode))
73 if (dump_enabled_p ())
74 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
75 "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC "]\n",
76 GET_MODE_NAME (mode), count);
77 return false;
80 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
82 if (dump_enabled_p ())
83 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
84 "cannot use %s<%s><%s>\n", name,
85 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
86 return false;
89 if (dump_enabled_p ())
90 dump_printf_loc (MSG_NOTE, vect_location,
91 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
92 GET_MODE_NAME (mode));
94 return true;
98 /* Return the smallest scalar part of STMT.
99 This is used to determine the vectype of the stmt. We generally set the
100 vectype according to the type of the result (lhs). For stmts whose
101 result-type is different than the type of the arguments (e.g., demotion,
102 promotion), vectype will be reset appropriately (later). Note that we have
103 to visit the smallest datatype in this function, because that determines the
104 VF. If the smallest datatype in the loop is present only as the rhs of a
105 promotion operation - we'd miss it.
106 Such a case, where a variable of this datatype does not appear in the lhs
107 anywhere in the loop, can only occur if it's an invariant: e.g.:
108 'int_x = (int) short_inv', which we'd expect to have been optimized away by
109 invariant motion. However, we cannot rely on invariant motion to always
110 take invariants out of the loop, and so in the case of promotion we also
111 have to check the rhs.
112 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
113 types. */
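   /* For example, in a loop like

	short *a;  int *b;
	for (i = 0; i < n; i++)
	  b[i] = (int) a[i];

      the statement's result type is 'int', but the smallest scalar type
      involved is 'short', and it is the 'short' accesses that must
      determine the VF.  (Illustrative loop only.)  */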
115 tree
116 vect_get_smallest_scalar_type (gimple *stmt, HOST_WIDE_INT *lhs_size_unit,
117 HOST_WIDE_INT *rhs_size_unit)
119 tree scalar_type = gimple_expr_type (stmt);
120 HOST_WIDE_INT lhs, rhs;
122 /* During the analysis phase, this function is called on arbitrary
123 statements that might not have scalar results. */
124 if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
125 return scalar_type;
127 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
129 if (is_gimple_assign (stmt)
130 && (gimple_assign_cast_p (stmt)
131 || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
132 || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
133 || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
135 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
137 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
138 if (rhs < lhs)
139 scalar_type = rhs_type;
142 *lhs_size_unit = lhs;
143 *rhs_size_unit = rhs;
144 return scalar_type;
148 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
149 tested at run-time. Return TRUE if DDR was successfully inserted.
150 Return false if versioning is not supported. */
152 static bool
153 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
155 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
157 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
158 return false;
160 if (!runtime_alias_check_p (ddr, loop,
161 optimize_loop_nest_for_speed_p (loop)))
162 return false;
164 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
165 return true;
169 /* A subroutine of vect_analyze_data_ref_dependence. Handle
170 DDR, a DDR_COULD_BE_INDEPENDENT_P ddr that has a known set of dependence
171 distances. These distances are conservatively correct but they don't
172 reflect a guaranteed dependence.
174 Return true if this function does all the work necessary to avoid
175 an alias or false if the caller should use the dependence distances
176 to limit the vectorization factor in the usual way. LOOP_DEPTH is
177 the depth of the loop described by LOOP_VINFO and the other arguments
178 are as for vect_analyze_data_ref_dependence. */
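   /* For example (illustrative only), given

	void f (int *a, int *b, int n) {
	  for (int i = 0; i < n; i++)
	    a[i + 2] = b[i] + 1;
	}

      the recorded distance of 2 is only meaningful if A and B refer to
      the same array; if they are distinct the accesses are independent,
      so a run-time alias check is usually better than capping the VF
      at 2.  */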
180 static bool
181 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
182 loop_vec_info loop_vinfo,
183 int loop_depth, unsigned int *max_vf)
185 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
186 lambda_vector dist_v;
187 unsigned int i;
188 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
190 int dist = dist_v[loop_depth];
191 if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
193 /* If the user asserted safelen >= DIST consecutive iterations
194 can be executed concurrently, assume independence.
196 ??? An alternative would be to add the alias check even
197 in this case, and vectorize the fallback loop with the
198 maximum VF set to safelen. However, if the user has
199 explicitly given a length, it's less likely that that
200 would be a win. */
201 if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
203 if ((unsigned int) loop->safelen < *max_vf)
204 *max_vf = loop->safelen;
205 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
206 continue;
209 /* For dependence distances of 2 or more, we have the option
210 of limiting VF or checking for an alias at runtime.
211 Prefer to check at runtime if we can, to avoid limiting
212 the VF unnecessarily when the bases are in fact independent.
214 Note that the alias checks will be removed if the VF ends up
215 being small enough. */
216 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
219 return true;
223 /* Function vect_analyze_data_ref_dependence.
225 Return TRUE if there (might) exist a dependence between a memory-reference
226 DRA and a memory-reference DRB. When versioning for alias can instead
227 check the dependence at run time, return FALSE. Adjust *MAX_VF according to
228 the data dependence. */
230 static bool
231 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
232 loop_vec_info loop_vinfo,
233 unsigned int *max_vf)
235 unsigned int i;
236 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
237 struct data_reference *dra = DDR_A (ddr);
238 struct data_reference *drb = DDR_B (ddr);
239 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
240 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
241 lambda_vector dist_v;
242 unsigned int loop_depth;
244 /* In loop analysis all data references should be vectorizable. */
245 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
246 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
247 gcc_unreachable ();
249 /* Independent data accesses. */
250 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
251 return false;
253 if (dra == drb
254 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
255 return false;
257 /* We do not have to consider dependences between accesses that belong
258 to the same group. */
259 if (GROUP_FIRST_ELEMENT (stmtinfo_a)
260 && GROUP_FIRST_ELEMENT (stmtinfo_a) == GROUP_FIRST_ELEMENT (stmtinfo_b))
261 return false;
263 /* Even if we have an anti-dependence then, as the vectorized loop covers at
264 least two scalar iterations, there is always also a true dependence.
265 As the vectorizer does not re-order loads and stores we can ignore
266 the anti-dependence if TBAA can disambiguate both DRs similar to the
267 case with known negative distance anti-dependences (positive
268 distance anti-dependences would violate TBAA constraints). */
269 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
270 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
271 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
272 get_alias_set (DR_REF (drb))))
273 return false;
275 /* Unknown data dependence. */
276 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
278 /* If user asserted safelen consecutive iterations can be
279 executed concurrently, assume independence. */
280 if (loop->safelen >= 2)
282 if ((unsigned int) loop->safelen < *max_vf)
283 *max_vf = loop->safelen;
284 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
285 return false;
288 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
289 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
291 if (dump_enabled_p ())
293 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
294 "versioning for alias not supported for: "
295 "can't determine dependence between ");
296 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
297 DR_REF (dra));
298 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
299 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
300 DR_REF (drb));
301 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
303 return true;
306 if (dump_enabled_p ())
308 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
309 "versioning for alias required: "
310 "can't determine dependence between ");
311 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
312 DR_REF (dra));
313 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
314 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
315 DR_REF (drb));
316 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
319 /* Add to list of ddrs that need to be tested at run-time. */
320 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
323 /* Known data dependence. */
324 if (DDR_NUM_DIST_VECTS (ddr) == 0)
326 /* If user asserted safelen consecutive iterations can be
327 executed concurrently, assume independence. */
328 if (loop->safelen >= 2)
330 if ((unsigned int) loop->safelen < *max_vf)
331 *max_vf = loop->safelen;
332 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
333 return false;
336 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
337 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
339 if (dump_enabled_p ())
341 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
342 "versioning for alias not supported for: "
343 "bad dist vector for ");
344 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
345 DR_REF (dra));
346 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
347 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
348 DR_REF (drb));
349 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
351 return true;
354 if (dump_enabled_p ())
356 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
357 "versioning for alias required: "
358 "bad dist vector for ");
359 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
360 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
361 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
362 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
364 /* Add to list of ddrs that need to be tested at run-time. */
365 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
368 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
370 if (DDR_COULD_BE_INDEPENDENT_P (ddr)
371 && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
372 loop_depth, max_vf))
373 return false;
375 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
377 int dist = dist_v[loop_depth];
379 if (dump_enabled_p ())
380 dump_printf_loc (MSG_NOTE, vect_location,
381 "dependence distance = %d.\n", dist);
383 if (dist == 0)
385 if (dump_enabled_p ())
387 dump_printf_loc (MSG_NOTE, vect_location,
388 "dependence distance == 0 between ");
389 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
390 dump_printf (MSG_NOTE, " and ");
391 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
392 dump_printf (MSG_NOTE, "\n");
395 /* When we perform grouped accesses and perform implicit CSE
396 by detecting equal accesses and doing disambiguation with
397 runtime alias tests like for
398 .. = a[i];
399 .. = a[i+1];
400 a[i] = ..;
401 a[i+1] = ..;
402 *p = ..;
403 .. = a[i];
404 .. = a[i+1];
405 where we will end up loading { a[i], a[i+1] } once, make
406 sure that inserting group loads before the first load and
407 stores after the last store will do the right thing.
408 Similar for groups like
409 a[i] = ...;
410 ... = a[i];
411 a[i+1] = ...;
412 where loads from the group interleave with the store. */
413 if (STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
414 || STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
416 gimple *earlier_stmt;
417 earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
418 if (DR_IS_WRITE
419 (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
421 if (dump_enabled_p ())
422 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
423 "READ_WRITE dependence in interleaving."
424 "\n");
425 return true;
429 continue;
432 if (dist > 0 && DDR_REVERSED_P (ddr))
434 /* If DDR_REVERSED_P the order of the data-refs in DDR was
435 reversed (to make distance vector positive), and the actual
436 distance is negative. */
437 if (dump_enabled_p ())
438 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
439 "dependence distance negative.\n");
440 /* Record a negative dependence distance to later limit the
441 amount of stmt copying / unrolling we can perform.
442 Only need to handle read-after-write dependence. */
443 if (DR_IS_READ (drb)
444 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
445 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
446 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
447 continue;
450 unsigned int abs_dist = abs (dist);
451 if (abs_dist >= 2 && abs_dist < *max_vf)
453 /* The dependence distance requires reduction of the maximal
454 vectorization factor. */
455 *max_vf = abs (dist);
456 if (dump_enabled_p ())
457 dump_printf_loc (MSG_NOTE, vect_location,
458 "adjusting maximal vectorization factor to %i\n",
459 *max_vf);
462 if (abs_dist >= *max_vf)
464 /* Dependence distance does not create dependence, as far as
465 vectorization is concerned, in this case. */
466 if (dump_enabled_p ())
467 dump_printf_loc (MSG_NOTE, vect_location,
468 "dependence distance >= VF.\n");
469 continue;
472 if (dump_enabled_p ())
474 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
475 "not vectorized, possible dependence "
476 "between data-refs ");
477 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
478 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
479 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
480 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
483 return true;
486 return false;
489 /* Function vect_analyze_data_ref_dependences.
491 Examine all the data references in the loop, and make sure there do not
492 exist any data dependences between them. Set *MAX_VF according to
493 the maximum vectorization factor the data dependences allow. */
495 bool
496 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
497 unsigned int *max_vf)
499 unsigned int i;
500 struct data_dependence_relation *ddr;
502 if (dump_enabled_p ())
503 dump_printf_loc (MSG_NOTE, vect_location,
504 "=== vect_analyze_data_ref_dependences ===\n");
506 LOOP_VINFO_DDRS (loop_vinfo)
507 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
508 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
509 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
510 /* We need read-read dependences to compute STMT_VINFO_SAME_ALIGN_REFS. */
511 if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
512 &LOOP_VINFO_DDRS (loop_vinfo),
513 LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
514 return false;
516 /* For epilogues we either have no aliases or alias versioning
517 was applied to the original loop. Therefore we may just get max_vf
518 using the VF of the original loop. */
519 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
520 *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
521 else
522 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
523 if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
524 return false;
526 return true;
530 /* Function vect_slp_analyze_data_ref_dependence.
532 Return TRUE if there (might) exist a dependence between a memory-reference
533 DRA and a memory-reference DRB. When versioning for alias can instead
534 check the dependence at run time, return FALSE. */
537 static bool
538 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
540 struct data_reference *dra = DDR_A (ddr);
541 struct data_reference *drb = DDR_B (ddr);
543 /* We need to check dependences of statements marked as unvectorizable
544 as well; they can still prohibit vectorization. */
546 /* Independent data accesses. */
547 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
548 return false;
550 if (dra == drb)
551 return false;
553 /* Read-read is OK. */
554 if (DR_IS_READ (dra) && DR_IS_READ (drb))
555 return false;
557 /* If dra and drb are part of the same interleaving chain consider
558 them independent. */
559 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
560 && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
561 == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
562 return false;
564 /* Unknown data dependence. */
565 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
567 if (dump_enabled_p ())
569 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
570 "can't determine dependence between ");
571 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
572 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
573 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
574 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
577 else if (dump_enabled_p ())
579 dump_printf_loc (MSG_NOTE, vect_location,
580 "determined dependence between ");
581 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
582 dump_printf (MSG_NOTE, " and ");
583 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
584 dump_printf (MSG_NOTE, "\n");
587 return true;
591 /* Analyze dependences involved in the transform of SLP NODE. STORES
592 contain the vector of scalar stores of this instance if we are
593 disambiguating the loads. */
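   /* For example (illustrative only), for a two-element store group

	a[0] = x;
	*p = 1;
	a[1] = y;

      vectorizing the group means sinking "a[0] = x" to the position of
      the last store, so its dependence on the intervening "*p = 1"
      (and, for loads of the instance, on the group's stores) has to be
      checked here.  */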
595 static bool
596 vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
597 vec<gimple *> stores, gimple *last_store)
599 /* This walks over all stmts involved in the SLP load/store done
600 in NODE verifying we can sink them up to the last stmt in the
601 group. */
602 gimple *last_access = vect_find_last_scalar_stmt_in_slp (node);
603 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
605 gimple *access = SLP_TREE_SCALAR_STMTS (node)[k];
606 if (access == last_access)
607 continue;
608 data_reference *dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (access));
609 for (gimple_stmt_iterator gsi = gsi_for_stmt (access);
610 gsi_stmt (gsi) != last_access; gsi_next (&gsi))
612 gimple *stmt = gsi_stmt (gsi);
613 if (! gimple_vuse (stmt)
614 || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
615 continue;
617 /* If we couldn't record a (single) data reference for this
618 stmt we have to give up. */
619 /* ??? Here and below if dependence analysis fails we can resort
620 to the alias oracle which can handle more kinds of stmts. */
621 data_reference *dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
622 if (!dr_b)
623 return false;
625 bool dependent = false;
626 /* If we run into a store of this same instance (we've just
627 marked those) then delay dependence checking until we run
628 into the last store because this is where it will have
629 been sunk to (and we verify if we can do that as well). */
630 if (gimple_visited_p (stmt))
632 if (stmt != last_store)
633 continue;
634 unsigned i;
635 gimple *store;
636 FOR_EACH_VEC_ELT (stores, i, store)
638 data_reference *store_dr
639 = STMT_VINFO_DATA_REF (vinfo_for_stmt (store));
640 ddr_p ddr = initialize_data_dependence_relation
641 (dr_a, store_dr, vNULL);
642 dependent = vect_slp_analyze_data_ref_dependence (ddr);
643 free_dependence_relation (ddr);
644 if (dependent)
645 break;
648 else
650 ddr_p ddr = initialize_data_dependence_relation (dr_a,
651 dr_b, vNULL);
652 dependent = vect_slp_analyze_data_ref_dependence (ddr);
653 free_dependence_relation (ddr);
655 if (dependent)
656 return false;
659 return true;
663 /* Function vect_slp_analyze_instance_dependence.
665 Examine all the data references in the basic-block, and make sure there
666 do not exist any data dependences between them. */
669 bool
670 vect_slp_analyze_instance_dependence (slp_instance instance)
672 if (dump_enabled_p ())
673 dump_printf_loc (MSG_NOTE, vect_location,
674 "=== vect_slp_analyze_instance_dependence ===\n");
676 /* The stores of this instance are at the root of the SLP tree. */
677 slp_tree store = SLP_INSTANCE_TREE (instance);
678 if (! STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (store)[0])))
679 store = NULL;
681 /* Verify we can sink stores to the vectorized stmt insert location. */
682 gimple *last_store = NULL;
683 if (store)
685 if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
686 return false;
688 /* Mark stores in this instance and remember the last one. */
689 last_store = vect_find_last_scalar_stmt_in_slp (store);
690 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
691 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], true);
694 bool res = true;
696 /* Verify we can sink loads to the vectorized stmt insert location,
697 special-casing stores of this instance. */
698 slp_tree load;
699 unsigned int i;
700 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
701 if (! vect_slp_analyze_node_dependences (instance, load,
702 store
703 ? SLP_TREE_SCALAR_STMTS (store)
704 : vNULL, last_store))
706 res = false;
707 break;
710 /* Unset the visited flag. */
711 if (store)
712 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
713 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], false);
715 return res;
718 /* Record in VINFO the base alignment guarantee given by DRB. STMT is
719 the statement that contains DRB, which is useful for recording in the
720 dump file. */
722 static void
723 vect_record_base_alignment (vec_info *vinfo, gimple *stmt,
724 innermost_loop_behavior *drb)
726 bool existed;
727 innermost_loop_behavior *&entry
728 = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
729 if (!existed || entry->base_alignment < drb->base_alignment)
731 entry = drb;
732 if (dump_enabled_p ())
734 dump_printf_loc (MSG_NOTE, vect_location,
735 "recording new base alignment for ");
736 dump_generic_expr (MSG_NOTE, TDF_SLIM, drb->base_address);
737 dump_printf (MSG_NOTE, "\n");
738 dump_printf_loc (MSG_NOTE, vect_location,
739 " alignment: %d\n", drb->base_alignment);
740 dump_printf_loc (MSG_NOTE, vect_location,
741 " misalignment: %d\n", drb->base_misalignment);
742 dump_printf_loc (MSG_NOTE, vect_location,
743 " based on: ");
744 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
749 /* If the region we're going to vectorize is reached, all unconditional
750 data references occur at least once. We can therefore pool the base
751 alignment guarantees from each unconditional reference. Do this by
752 going through all the data references in VINFO and checking whether
753 the containing statement makes the reference unconditionally. If so,
754 record the alignment of the base address in VINFO so that it can be
755 used for all other references with the same base. */
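/* For example (illustrative only): if the region contains an unconditional
   access whose base address is known to be 16-byte aligned, a conditional
   access elsewhere in the region that uses the same base address can reuse
   that 16-byte guarantee even though it might not be executed itself.  */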
757 void
758 vect_record_base_alignments (vec_info *vinfo)
760 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
761 struct loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
762 data_reference *dr;
763 unsigned int i;
764 FOR_EACH_VEC_ELT (vinfo->datarefs, i, dr)
765 if (!DR_IS_CONDITIONAL_IN_STMT (dr))
767 gimple *stmt = DR_STMT (dr);
768 vect_record_base_alignment (vinfo, stmt, &DR_INNERMOST (dr));
770 /* If DR is nested in the loop that is being vectorized, we can also
771 record the alignment of the base wrt the outer loop. */
772 if (loop && nested_in_vect_loop_p (loop, stmt))
774 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
775 vect_record_base_alignment
776 (vinfo, stmt, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
781 /* Return the target alignment for the vectorized form of DR. */
783 static unsigned int
784 vect_calculate_target_alignment (struct data_reference *dr)
786 gimple *stmt = DR_STMT (dr);
787 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
788 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
789 return targetm.vectorize.preferred_vector_alignment (vectype);
792 /* Function vect_compute_data_ref_alignment
794 Compute the misalignment of the data reference DR.
796 Output:
797 1. If during the misalignment computation it is found that the data reference
798 cannot be vectorized then false is returned.
799 2. DR_MISALIGNMENT (DR) is defined.
801 FOR NOW: No analysis is actually performed. Misalignment is calculated
802 only for trivial cases. TODO. */
804 bool
805 vect_compute_data_ref_alignment (struct data_reference *dr)
807 gimple *stmt = DR_STMT (dr);
808 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
809 vec_base_alignments *base_alignments = &stmt_info->vinfo->base_alignments;
810 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
811 struct loop *loop = NULL;
812 tree ref = DR_REF (dr);
813 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
815 if (dump_enabled_p ())
816 dump_printf_loc (MSG_NOTE, vect_location,
817 "vect_compute_data_ref_alignment:\n");
819 if (loop_vinfo)
820 loop = LOOP_VINFO_LOOP (loop_vinfo);
822 /* Initialize misalignment to unknown. */
823 SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
825 innermost_loop_behavior *drb = vect_dr_behavior (dr);
826 bool step_preserves_misalignment_p;
828 unsigned HOST_WIDE_INT vector_alignment
829 = vect_calculate_target_alignment (dr) / BITS_PER_UNIT;
830 DR_TARGET_ALIGNMENT (dr) = vector_alignment;
832 /* No step for BB vectorization. */
833 if (!loop)
835 gcc_assert (integer_zerop (drb->step));
836 step_preserves_misalignment_p = true;
839 /* In case the dataref is in an inner-loop of the loop that is being
840 vectorized (LOOP), we use the base and misalignment information
841 relative to the outer-loop (LOOP). This is ok only if the misalignment
842 stays the same throughout the execution of the inner-loop, which is why
843 we have to check that the stride of the dataref in the inner-loop is
844 divisible by the vector alignment. */
845 else if (nested_in_vect_loop_p (loop, stmt))
847 step_preserves_misalignment_p
848 = (DR_STEP_ALIGNMENT (dr) % vector_alignment) == 0;
850 if (dump_enabled_p ())
852 if (step_preserves_misalignment_p)
853 dump_printf_loc (MSG_NOTE, vect_location,
854 "inner step divides the vector alignment.\n");
855 else
856 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
857 "inner step doesn't divide the vector"
858 " alignment.\n");
862 /* Similarly we can only use base and misalignment information relative to
863 an innermost loop if the misalignment stays the same throughout the
864 execution of the loop. As above, this is the case if the stride of
865 the dataref is divisible by the alignment. */
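      /* For example (illustrative numbers): with a 4-byte scalar step, a VF
	 of 4 and a 16-byte vector alignment, each vector iteration advances
	 the address by 16 bytes, so the misalignment seen in the first
	 iteration holds for all of them.  */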
866 else
868 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
869 step_preserves_misalignment_p
870 = multiple_p (DR_STEP_ALIGNMENT (dr) * vf, vector_alignment);
872 if (!step_preserves_misalignment_p && dump_enabled_p ())
873 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
874 "step doesn't divide the vector alignment.\n");
877 unsigned int base_alignment = drb->base_alignment;
878 unsigned int base_misalignment = drb->base_misalignment;
880 /* Calculate the maximum of the pooled base address alignment and the
881 alignment that we can compute for DR itself. */
882 innermost_loop_behavior **entry = base_alignments->get (drb->base_address);
883 if (entry && base_alignment < (*entry)->base_alignment)
885 base_alignment = (*entry)->base_alignment;
886 base_misalignment = (*entry)->base_misalignment;
889 if (drb->offset_alignment < vector_alignment
890 || !step_preserves_misalignment_p
891 /* We need to know whether the step wrt the vectorized loop is
892 negative when computing the starting misalignment below. */
893 || TREE_CODE (drb->step) != INTEGER_CST)
895 if (dump_enabled_p ())
897 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
898 "Unknown alignment for access: ");
899 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
900 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
902 return true;
905 if (base_alignment < vector_alignment)
907 tree base = drb->base_address;
908 if (TREE_CODE (base) == ADDR_EXPR)
909 base = TREE_OPERAND (base, 0);
910 if (!vect_can_force_dr_alignment_p (base,
911 vector_alignment * BITS_PER_UNIT))
913 if (dump_enabled_p ())
915 dump_printf_loc (MSG_NOTE, vect_location,
916 "can't force alignment of ref: ");
917 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
918 dump_printf (MSG_NOTE, "\n");
920 return true;
923 if (DECL_USER_ALIGN (base))
925 if (dump_enabled_p ())
927 dump_printf_loc (MSG_NOTE, vect_location,
928 "not forcing alignment of user-aligned "
929 "variable: ");
930 dump_generic_expr (MSG_NOTE, TDF_SLIM, base);
931 dump_printf (MSG_NOTE, "\n");
933 return true;
936 /* Force the alignment of the decl.
937 NOTE: This is the only change to the code we make during
938 the analysis phase, before deciding to vectorize the loop. */
939 if (dump_enabled_p ())
941 dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
942 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
943 dump_printf (MSG_NOTE, "\n");
946 DR_VECT_AUX (dr)->base_decl = base;
947 DR_VECT_AUX (dr)->base_misaligned = true;
948 base_misalignment = 0;
950 poly_int64 misalignment
951 = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();
953 /* If this is a backward running DR then the first access in the larger
954 vectype is actually N-1 elements before the address in the DR.
955 Adjust misalign accordingly. */
956 if (tree_int_cst_sgn (drb->step) < 0)
957 /* PLUS because STEP is negative. */
958 misalignment += ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
959 * TREE_INT_CST_LOW (drb->step));
961 unsigned int const_misalignment;
962 if (!known_misalignment (misalignment, vector_alignment,
963 &const_misalignment))
965 if (dump_enabled_p ())
967 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
968 "Non-constant misalignment for access: ");
969 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
970 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
972 return true;
975 SET_DR_MISALIGNMENT (dr, const_misalignment);
977 if (dump_enabled_p ())
979 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
980 "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
981 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
982 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
985 return true;
988 /* Function vect_update_misalignment_for_peel.
989 Sets DR's misalignment
990 - to 0 if it has the same alignment as DR_PEEL,
991 - to the misalignment computed using NPEEL if DR's misalignment is known,
992 - to -1 (unknown) otherwise.
994 DR - the data reference whose misalignment is to be adjusted.
995 DR_PEEL - the data reference whose misalignment is being made
996 zero in the vector loop by the peel.
997 NPEEL - the number of iterations in the peel loop if the misalignment
998 of DR_PEEL is known at compile time. */
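/* For example (illustrative numbers): with a 16-byte target alignment,
   4-byte elements, DR_MISALIGNMENT (DR) == 8 and NPEEL == 2, the updated
   misalignment is (8 + 2 * 4) & 15 == 0.  */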
1000 static void
1001 vect_update_misalignment_for_peel (struct data_reference *dr,
1002 struct data_reference *dr_peel, int npeel)
1004 unsigned int i;
1005 vec<dr_p> same_aligned_drs;
1006 struct data_reference *current_dr;
1007 int dr_size = vect_get_scalar_dr_size (dr);
1008 int dr_peel_size = vect_get_scalar_dr_size (dr_peel);
1009 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
1010 stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));
1012 /* For interleaved data accesses the step in the loop must be multiplied by
1013 the size of the interleaving group. */
1014 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1015 dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
1016 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
1017 dr_peel_size *= GROUP_SIZE (peel_stmt_info);
1019 /* It can be assumed that the data refs with the same alignment as dr_peel
1020 are aligned in the vector loop. */
1021 same_aligned_drs
1022 = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
1023 FOR_EACH_VEC_ELT (same_aligned_drs, i, current_dr)
1025 if (current_dr != dr)
1026 continue;
1027 gcc_assert (!known_alignment_for_access_p (dr)
1028 || !known_alignment_for_access_p (dr_peel)
1029 || (DR_MISALIGNMENT (dr) / dr_size
1030 == DR_MISALIGNMENT (dr_peel) / dr_peel_size));
1031 SET_DR_MISALIGNMENT (dr, 0);
1032 return;
1035 if (known_alignment_for_access_p (dr)
1036 && known_alignment_for_access_p (dr_peel))
1038 bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
1039 int misal = DR_MISALIGNMENT (dr);
1040 misal += negative ? -npeel * dr_size : npeel * dr_size;
1041 misal &= DR_TARGET_ALIGNMENT (dr) - 1;
1042 SET_DR_MISALIGNMENT (dr, misal);
1043 return;
1046 if (dump_enabled_p ())
1047 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1048 "to unknown (-1).\n");
1049 SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
1053 /* Function verify_data_ref_alignment
1055 Return TRUE if DR can be handled with respect to alignment. */
1057 static bool
1058 verify_data_ref_alignment (data_reference_p dr)
1060 enum dr_alignment_support supportable_dr_alignment
1061 = vect_supportable_dr_alignment (dr, false);
1062 if (!supportable_dr_alignment)
1064 if (dump_enabled_p ())
1066 if (DR_IS_READ (dr))
1067 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1068 "not vectorized: unsupported unaligned load.");
1069 else
1070 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1071 "not vectorized: unsupported unaligned "
1072 "store.");
1074 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
1075 DR_REF (dr));
1076 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1078 return false;
1081 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
1082 dump_printf_loc (MSG_NOTE, vect_location,
1083 "Vectorizing an unaligned access.\n");
1085 return true;
1088 /* Function vect_verify_datarefs_alignment
1090 Return TRUE if all data references in the loop can be
1091 handled with respect to alignment. */
1093 bool
1094 vect_verify_datarefs_alignment (loop_vec_info vinfo)
1096 vec<data_reference_p> datarefs = vinfo->datarefs;
1097 struct data_reference *dr;
1098 unsigned int i;
1100 FOR_EACH_VEC_ELT (datarefs, i, dr)
1102 gimple *stmt = DR_STMT (dr);
1103 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1105 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1106 continue;
1108 /* For interleaving, only the alignment of the first access matters. */
1109 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1110 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1111 continue;
1113 /* Strided accesses perform only component accesses, alignment is
1114 irrelevant for them. */
1115 if (STMT_VINFO_STRIDED_P (stmt_info)
1116 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1117 continue;
1119 if (! verify_data_ref_alignment (dr))
1120 return false;
1123 return true;
1126 /* Given a memory reference EXP return whether its alignment is less
1127 than its size. */
1129 static bool
1130 not_size_aligned (tree exp)
1132 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1133 return true;
1135 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1136 > get_object_alignment (exp));
1139 /* Function vector_alignment_reachable_p
1141 Return true if vector alignment for DR is reachable by peeling
1142 a few loop iterations. Return false otherwise. */
1144 static bool
1145 vector_alignment_reachable_p (struct data_reference *dr)
1147 gimple *stmt = DR_STMT (dr);
1148 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1149 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1151 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1153 /* For interleaved access we peel only if the number of iterations in
1154 the prolog loop ({VF - misalignment}) is a multiple of the
1155 number of interleaved accesses.
1156 int elem_size, mis_in_elements;
1157 int nelements = TYPE_VECTOR_SUBPARTS (vectype);
1159 /* FORNOW: handle only known alignment. */
1160 if (!known_alignment_for_access_p (dr))
1161 return false;
1163 elem_size = GET_MODE_SIZE (TYPE_MODE (vectype)) / nelements;
1164 mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
1166 if ((nelements - mis_in_elements) % GROUP_SIZE (stmt_info))
1167 return false;
1170 /* If the misalignment is known at compile time then allow peeling
1171 only if natural alignment is reachable through peeling. */
1172 if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
1174 HOST_WIDE_INT elmsize =
1175 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1176 if (dump_enabled_p ())
1178 dump_printf_loc (MSG_NOTE, vect_location,
1179 "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1180 dump_printf (MSG_NOTE,
1181 ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
1183 if (DR_MISALIGNMENT (dr) % elmsize)
1185 if (dump_enabled_p ())
1186 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1187 "data size does not divide the misalignment.\n");
1188 return false;
1192 if (!known_alignment_for_access_p (dr))
1194 tree type = TREE_TYPE (DR_REF (dr));
1195 bool is_packed = not_size_aligned (DR_REF (dr));
1196 if (dump_enabled_p ())
1197 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1198 "Unknown misalignment, %snaturally aligned\n",
1199 is_packed ? "not " : "");
1200 return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1203 return true;
1207 /* Calculate the cost of the memory access represented by DR. */
1209 static void
1210 vect_get_data_access_cost (struct data_reference *dr,
1211 unsigned int *inside_cost,
1212 unsigned int *outside_cost,
1213 stmt_vector_for_cost *body_cost_vec)
1215 gimple *stmt = DR_STMT (dr);
1216 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1217 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1218 int ncopies;
1220 if (PURE_SLP_STMT (stmt_info))
1221 ncopies = 1;
1222 else
1223 ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1225 if (DR_IS_READ (dr))
1226 vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1227 NULL, body_cost_vec, false);
1228 else
1229 vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
1231 if (dump_enabled_p ())
1232 dump_printf_loc (MSG_NOTE, vect_location,
1233 "vect_get_data_access_cost: inside_cost = %d, "
1234 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1238 typedef struct _vect_peel_info
1240 struct data_reference *dr;
1241 int npeel;
1242 unsigned int count;
1243 } *vect_peel_info;
1245 typedef struct _vect_peel_extended_info
1247 struct _vect_peel_info peel_info;
1248 unsigned int inside_cost;
1249 unsigned int outside_cost;
1250 } *vect_peel_extended_info;
1253 /* Peeling hashtable helpers. */
1255 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1257 static inline hashval_t hash (const _vect_peel_info *);
1258 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1261 inline hashval_t
1262 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1264 return (hashval_t) peel_info->npeel;
1267 inline bool
1268 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1270 return (a->npeel == b->npeel);
1274 /* Insert DR into peeling hash table with NPEEL as key. */
1276 static void
1277 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1278 loop_vec_info loop_vinfo, struct data_reference *dr,
1279 int npeel)
1281 struct _vect_peel_info elem, *slot;
1282 _vect_peel_info **new_slot;
1283 bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1285 elem.npeel = npeel;
1286 slot = peeling_htab->find (&elem);
1287 if (slot)
1288 slot->count++;
1289 else
1291 slot = XNEW (struct _vect_peel_info);
1292 slot->npeel = npeel;
1293 slot->dr = dr;
1294 slot->count = 1;
1295 new_slot = peeling_htab->find_slot (slot, INSERT);
1296 *new_slot = slot;
1299 if (!supportable_dr_alignment
1300 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1301 slot->count += VECT_MAX_COST;
1305 /* Traverse peeling hash table to find peeling option that aligns maximum
1306 number of data accesses. */
1308 int
1309 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1310 _vect_peel_extended_info *max)
1312 vect_peel_info elem = *slot;
1314 if (elem->count > max->peel_info.count
1315 || (elem->count == max->peel_info.count
1316 && max->peel_info.npeel > elem->npeel))
1318 max->peel_info.npeel = elem->npeel;
1319 max->peel_info.count = elem->count;
1320 max->peel_info.dr = elem->dr;
1323 return 1;
1326 /* Get the costs of peeling NPEEL iterations checking data access costs
1327 for all data refs. If UNKNOWN_MISALIGNMENT is true, we assume DR0's
1328 misalignment will be zero after peeling. */
1330 static void
1331 vect_get_peeling_costs_all_drs (vec<data_reference_p> datarefs,
1332 struct data_reference *dr0,
1333 unsigned int *inside_cost,
1334 unsigned int *outside_cost,
1335 stmt_vector_for_cost *body_cost_vec,
1336 unsigned int npeel,
1337 bool unknown_misalignment)
1339 unsigned i;
1340 data_reference *dr;
1342 FOR_EACH_VEC_ELT (datarefs, i, dr)
1344 gimple *stmt = DR_STMT (dr);
1345 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1346 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1347 continue;
1349 /* For interleaving, only the alignment of the first access
1350 matters. */
1351 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1352 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1353 continue;
1355 /* Strided accesses perform only component accesses, alignment is
1356 irrelevant for them. */
1357 if (STMT_VINFO_STRIDED_P (stmt_info)
1358 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1359 continue;
1361 int save_misalignment;
1362 save_misalignment = DR_MISALIGNMENT (dr);
1363 if (npeel == 0)
1365 else if (unknown_misalignment && dr == dr0)
1366 SET_DR_MISALIGNMENT (dr, 0);
1367 else
1368 vect_update_misalignment_for_peel (dr, dr0, npeel);
1369 vect_get_data_access_cost (dr, inside_cost, outside_cost,
1370 body_cost_vec);
1371 SET_DR_MISALIGNMENT (dr, save_misalignment);
1375 /* Traverse peeling hash table and calculate cost for each peeling option.
1376 Find the one with the lowest cost. */
1378 int
1379 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1380 _vect_peel_extended_info *min)
1382 vect_peel_info elem = *slot;
1383 int dummy;
1384 unsigned int inside_cost = 0, outside_cost = 0;
1385 gimple *stmt = DR_STMT (elem->dr);
1386 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1387 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1388 stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1389 epilogue_cost_vec;
1391 prologue_cost_vec.create (2);
1392 body_cost_vec.create (2);
1393 epilogue_cost_vec.create (2);
1395 vect_get_peeling_costs_all_drs (LOOP_VINFO_DATAREFS (loop_vinfo),
1396 elem->dr, &inside_cost, &outside_cost,
1397 &body_cost_vec, elem->npeel, false);
1399 body_cost_vec.release ();
1401 outside_cost += vect_get_known_peeling_cost
1402 (loop_vinfo, elem->npeel, &dummy,
1403 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1404 &prologue_cost_vec, &epilogue_cost_vec);
1406 /* Prologue and epilogue costs are added to the target model later.
1407 These costs depend only on the scalar iteration cost, the
1408 number of peeling iterations finally chosen, and the number of
1409 misaligned statements. So discard the information found here. */
1410 prologue_cost_vec.release ();
1411 epilogue_cost_vec.release ();
1413 if (inside_cost < min->inside_cost
1414 || (inside_cost == min->inside_cost
1415 && outside_cost < min->outside_cost))
1417 min->inside_cost = inside_cost;
1418 min->outside_cost = outside_cost;
1419 min->peel_info.dr = elem->dr;
1420 min->peel_info.npeel = elem->npeel;
1421 min->peel_info.count = elem->count;
1424 return 1;
1428 /* Choose best peeling option by traversing peeling hash table and either
1429 choosing an option with the lowest cost (if cost model is enabled) or the
1430 option that aligns as many accesses as possible. */
1432 static struct _vect_peel_extended_info
1433 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1434 loop_vec_info loop_vinfo)
1436 struct _vect_peel_extended_info res;
1438 res.peel_info.dr = NULL;
1440 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1442 res.inside_cost = INT_MAX;
1443 res.outside_cost = INT_MAX;
1444 peeling_htab->traverse <_vect_peel_extended_info *,
1445 vect_peeling_hash_get_lowest_cost> (&res);
1447 else
1449 res.peel_info.count = 0;
1450 peeling_htab->traverse <_vect_peel_extended_info *,
1451 vect_peeling_hash_get_most_frequent> (&res);
1452 res.inside_cost = 0;
1453 res.outside_cost = 0;
1456 return res;
1459 /* Return true if the new peeling NPEEL is supported. */
1461 static bool
1462 vect_peeling_supportable (loop_vec_info loop_vinfo, struct data_reference *dr0,
1463 unsigned npeel)
1465 unsigned i;
1466 struct data_reference *dr = NULL;
1467 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1468 gimple *stmt;
1469 stmt_vec_info stmt_info;
1470 enum dr_alignment_support supportable_dr_alignment;
1472 /* Ensure that all data refs can be vectorized after the peel. */
1473 FOR_EACH_VEC_ELT (datarefs, i, dr)
1475 int save_misalignment;
1477 if (dr == dr0)
1478 continue;
1480 stmt = DR_STMT (dr);
1481 stmt_info = vinfo_for_stmt (stmt);
1482 /* For interleaving, only the alignment of the first access
1483 matters. */
1484 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1485 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1486 continue;
1488 /* Strided accesses perform only component accesses, alignment is
1489 irrelevant for them. */
1490 if (STMT_VINFO_STRIDED_P (stmt_info)
1491 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1492 continue;
1494 save_misalignment = DR_MISALIGNMENT (dr);
1495 vect_update_misalignment_for_peel (dr, dr0, npeel);
1496 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1497 SET_DR_MISALIGNMENT (dr, save_misalignment);
1499 if (!supportable_dr_alignment)
1500 return false;
1503 return true;
1506 /* Function vect_enhance_data_refs_alignment
1508 This pass will use loop versioning and loop peeling in order to enhance
1509 the alignment of data references in the loop.
1511 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1512 original loop is to be vectorized. Any other loops that are created by
1513 the transformations performed in this pass - are not supposed to be
1514 vectorized. This restriction will be relaxed.
1516 This pass will require a cost model to guide it whether to apply peeling
1517 or versioning or a combination of the two. For example, the scheme that
1518 Intel uses when given a loop with several memory accesses, is as follows:
1519 choose one memory access ('p') whose alignment you want to force by doing
1520 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1521 other accesses are not necessarily aligned, or (2) use loop versioning to
1522 generate one loop in which all accesses are aligned, and another loop in
1523 which only 'p' is necessarily aligned.
1525 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1526 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1527 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1529 Devising a cost model is the most critical aspect of this work. It will
1530 guide us on which access to peel for, whether to use loop versioning, how
1531 many versions to create, etc. The cost model will probably consist of
1532 generic considerations as well as target specific considerations (on
1533 powerpc for example, misaligned stores are more painful than misaligned
1534 loads).
1536 Here are the general steps involved in alignment enhancements:
1538 -- original loop, before alignment analysis:
1539 for (i=0; i<N; i++){
1540 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1541 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1544 -- After vect_compute_data_refs_alignment:
1545 for (i=0; i<N; i++){
1546 x = q[i]; # DR_MISALIGNMENT(q) = 3
1547 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1550 -- Possibility 1: we do loop versioning:
1551 if (p is aligned) {
1552 for (i=0; i<N; i++){ # loop 1A
1553 x = q[i]; # DR_MISALIGNMENT(q) = 3
1554 p[i] = y; # DR_MISALIGNMENT(p) = 0
1557 else {
1558 for (i=0; i<N; i++){ # loop 1B
1559 x = q[i]; # DR_MISALIGNMENT(q) = 3
1560 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1564 -- Possibility 2: we do loop peeling:
1565 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1566 x = q[i];
1567 p[i] = y;
1569 for (i = 3; i < N; i++){ # loop 2A
1570 x = q[i]; # DR_MISALIGNMENT(q) = 0
1571 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1574 -- Possibility 3: combination of loop peeling and versioning:
1575 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1576 x = q[i];
1577 p[i] = y;
1579 if (p is aligned) {
1580 for (i = 3; i<N; i++){ # loop 3A
1581 x = q[i]; # DR_MISALIGNMENT(q) = 0
1582 p[i] = y; # DR_MISALIGNMENT(p) = 0
1585 else {
1586 for (i = 3; i<N; i++){ # loop 3B
1587 x = q[i]; # DR_MISALIGNMENT(q) = 0
1588 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1592 These loops are later passed to loop_transform to be vectorized. The
1593 vectorizer will use the alignment information to guide the transformation
1594 (whether to generate regular loads/stores, or with special handling for
1595 misalignment). */
1597 bool
1598 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1600 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1601 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1602 enum dr_alignment_support supportable_dr_alignment;
1603 struct data_reference *dr0 = NULL, *first_store = NULL;
1604 struct data_reference *dr;
1605 unsigned int i, j;
1606 bool do_peeling = false;
1607 bool do_versioning = false;
1608 bool stat;
1609 gimple *stmt;
1610 stmt_vec_info stmt_info;
1611 unsigned int npeel = 0;
1612 bool one_misalignment_known = false;
1613 bool one_misalignment_unknown = false;
1614 bool one_dr_unsupportable = false;
1615 struct data_reference *unsupportable_dr = NULL;
1616 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1617 unsigned possible_npeel_number = 1;
1618 tree vectype;
1619 unsigned int mis, same_align_drs_max = 0;
1620 hash_table<peel_info_hasher> peeling_htab (1);
1622 if (dump_enabled_p ())
1623 dump_printf_loc (MSG_NOTE, vect_location,
1624 "=== vect_enhance_data_refs_alignment ===\n");
1626 /* Reset data so we can safely be called multiple times. */
1627 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1628 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1630 /* While cost model enhancements are expected in the future, the high level
1631 view of the code at this time is as follows:
1633 A) If there is a misaligned access then see if peeling to align
1634 this access can make all data references satisfy
1635 vect_supportable_dr_alignment. If so, update data structures
1636 as needed and return true.
1638 B) If peeling wasn't possible and there is a data reference with an
1639 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1640 then see if loop versioning checks can be used to make all data
1641 references satisfy vect_supportable_dr_alignment. If so, update
1642 data structures as needed and return true.
1644 C) If neither peeling nor versioning were successful then return false if
1645 any data reference does not satisfy vect_supportable_dr_alignment.
1647 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1649 Note, Possibility 3 above (which is peeling and versioning together) is not
1650 being done at this time. */
1652 /* (1) Peeling to force alignment. */
1654 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1655 Considerations:
1656 + How many accesses will become aligned due to the peeling
1657 - How many accesses will become unaligned due to the peeling,
1658 and the cost of misaligned accesses.
1659 - The cost of peeling (the extra runtime checks, the increase
1660 in code size). */
1662 FOR_EACH_VEC_ELT (datarefs, i, dr)
1664 stmt = DR_STMT (dr);
1665 stmt_info = vinfo_for_stmt (stmt);
1667 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1668 continue;
1670 /* For interleaving, only the alignment of the first access
1671 matters. */
1672 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1673 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1674 continue;
1676 /* For invariant accesses there is nothing to enhance. */
1677 if (integer_zerop (DR_STEP (dr)))
1678 continue;
1680 /* Strided accesses perform only component accesses, alignment is
1681 irrelevant for them. */
1682 if (STMT_VINFO_STRIDED_P (stmt_info)
1683 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1684 continue;
1686 supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1687 do_peeling = vector_alignment_reachable_p (dr);
1688 if (do_peeling)
1690 if (known_alignment_for_access_p (dr))
1692 unsigned int npeel_tmp = 0;
1693 bool negative = tree_int_cst_compare (DR_STEP (dr),
1694 size_zero_node) < 0;
1696 vectype = STMT_VINFO_VECTYPE (stmt_info);
1697 unsigned int target_align = DR_TARGET_ALIGNMENT (dr);
1698 unsigned int dr_size = vect_get_scalar_dr_size (dr);
1699 mis = (negative ? DR_MISALIGNMENT (dr) : -DR_MISALIGNMENT (dr));
1700 if (DR_MISALIGNMENT (dr) != 0)
1701 npeel_tmp = (mis & (target_align - 1)) / dr_size;
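	      /* For example (illustrative numbers): with TARGET_ALIGN == 16,
		 DR_SIZE == 4 and a forward access misaligned by 12 bytes,
		 MIS == -12, so NPEEL_TMP == (MIS & 15) / 4 == 1, i.e. one
		 peeled scalar iteration aligns the access.  */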
1703 /* For multiple types, it is possible that the bigger type access
1704 will have more than one peeling option. E.g., a loop with two
1705 types: one of size (vector size / 4), and the other one of
1706 size (vector size / 8). The vectorization factor will be 8. If both
1707 accesses are misaligned by 3, the first one needs one scalar
1708 iteration to be aligned, and the second one needs 5. But the
1709 first one will be aligned also by peeling 5 scalar
1710 iterations, and in that case both accesses will be aligned.
1711 Hence, besides the immediate peeling amount, we also want
1712 to try adding full vector sizes, as long as we don't exceed
1713 the vectorization factor.
1714 We do this automatically for cost model, since we calculate
1715 cost for every peeling option. */
1716 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1718 poly_uint64 nscalars = (STMT_SLP_TYPE (stmt_info)
1719 ? vf * GROUP_SIZE (stmt_info) : vf);
1720 possible_npeel_number
1721 = vect_get_num_vectors (nscalars, vectype);
1723 /* NPEEL_TMP is 0 when there is no misalignment, but also
1724 allow peeling NELEMENTS. */
1725 if (DR_MISALIGNMENT (dr) == 0)
1726 possible_npeel_number++;
1729 /* Save info about DR in the hash table. Also include peeling
1730 amounts according to the explanation above. */
1731 for (j = 0; j < possible_npeel_number; j++)
1733 vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1734 dr, npeel_tmp);
1735 npeel_tmp += target_align / dr_size;
1738 one_misalignment_known = true;
1740 else
1742 /* If we don't know any misalignment values, we prefer
1743 peeling for the data-ref that has the maximum number of data-refs
1744 with the same alignment, unless the target prefers to align
1745 stores over loads. */
1746 unsigned same_align_drs
1747 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1748 if (!dr0
1749 || same_align_drs_max < same_align_drs)
1751 same_align_drs_max = same_align_drs;
1752 dr0 = dr;
1754 /* For data-refs with the same number of related
1755 accesses prefer the one where the misalign
1756 computation will be invariant in the outermost loop. */
1757 else if (same_align_drs_max == same_align_drs)
1759 struct loop *ivloop0, *ivloop;
1760 ivloop0 = outermost_invariant_loop_for_expr
1761 (loop, DR_BASE_ADDRESS (dr0));
1762 ivloop = outermost_invariant_loop_for_expr
1763 (loop, DR_BASE_ADDRESS (dr));
1764 if ((ivloop && !ivloop0)
1765 || (ivloop && ivloop0
1766 && flow_loop_nested_p (ivloop, ivloop0)))
1767 dr0 = dr;
1770 one_misalignment_unknown = true;
1772 /* Check for data refs with unsupportable alignment that
1773 can be peeled. */
1774 if (!supportable_dr_alignment)
1776 one_dr_unsupportable = true;
1777 unsupportable_dr = dr;
1780 if (!first_store && DR_IS_WRITE (dr))
1781 first_store = dr;
1784 else
1786 if (!aligned_access_p (dr))
1788 if (dump_enabled_p ())
1789 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1790 "vector alignment may not be reachable\n");
1791 break;
1796 /* Check if we can possibly peel the loop. */
1797 if (!vect_can_advance_ivs_p (loop_vinfo)
1798 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1799 || loop->inner)
1800 do_peeling = false;
1802 struct _vect_peel_extended_info peel_for_known_alignment;
1803 struct _vect_peel_extended_info peel_for_unknown_alignment;
1804 struct _vect_peel_extended_info best_peel;
1806 peel_for_unknown_alignment.inside_cost = INT_MAX;
1807 peel_for_unknown_alignment.outside_cost = INT_MAX;
1808 peel_for_unknown_alignment.peel_info.count = 0;
1810 if (do_peeling
1811 && one_misalignment_unknown)
1813 /* Check whether the target prefers aligning stores over loads, i.e.,
1814 whether misaligned stores are more expensive than misaligned loads
1815 (taking drs with the same alignment into account). */
1816 unsigned int load_inside_cost = 0;
1817 unsigned int load_outside_cost = 0;
1818 unsigned int store_inside_cost = 0;
1819 unsigned int store_outside_cost = 0;
1820 unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;
1822 stmt_vector_for_cost dummy;
1823 dummy.create (2);
1824 vect_get_peeling_costs_all_drs (datarefs, dr0,
1825 &load_inside_cost,
1826 &load_outside_cost,
1827 &dummy, estimated_npeels, true);
1828 dummy.release ();
1830 if (first_store)
1832 dummy.create (2);
1833 vect_get_peeling_costs_all_drs (datarefs, first_store,
1834 &store_inside_cost,
1835 &store_outside_cost,
1836 &dummy, estimated_npeels, true);
1837 dummy.release ();
1839 else
1841 store_inside_cost = INT_MAX;
1842 store_outside_cost = INT_MAX;
1845 if (load_inside_cost > store_inside_cost
1846 || (load_inside_cost == store_inside_cost
1847 && load_outside_cost > store_outside_cost))
1849 dr0 = first_store;
1850 peel_for_unknown_alignment.inside_cost = store_inside_cost;
1851 peel_for_unknown_alignment.outside_cost = store_outside_cost;
1853 else
1855 peel_for_unknown_alignment.inside_cost = load_inside_cost;
1856 peel_for_unknown_alignment.outside_cost = load_outside_cost;
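/* Editor's illustration (hypothetical costs): if peeling for the chosen
   load would leave an aggregate inside-loop cost of 12 while peeling for
   the first store leaves 8, DR0 is switched to that store above.  */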
1859 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1860 prologue_cost_vec.create (2);
1861 epilogue_cost_vec.create (2);
1863 int dummy2;
1864 peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
1865 (loop_vinfo, estimated_npeels, &dummy2,
1866 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1867 &prologue_cost_vec, &epilogue_cost_vec);
1869 prologue_cost_vec.release ();
1870 epilogue_cost_vec.release ();
1872 peel_for_unknown_alignment.peel_info.count = 1
1873 + STMT_VINFO_SAME_ALIGN_REFS
1874 (vinfo_for_stmt (DR_STMT (dr0))).length ();
1877 peel_for_unknown_alignment.peel_info.npeel = 0;
1878 peel_for_unknown_alignment.peel_info.dr = dr0;
1880 best_peel = peel_for_unknown_alignment;
1882 peel_for_known_alignment.inside_cost = INT_MAX;
1883 peel_for_known_alignment.outside_cost = INT_MAX;
1884 peel_for_known_alignment.peel_info.count = 0;
1885 peel_for_known_alignment.peel_info.dr = NULL;
1887 if (do_peeling && one_misalignment_known)
1889 /* Peeling is possible, and no data access is supportable only when
1890 aligned. So we try to choose the best possible peeling from
1891 the hash table. */
1892 peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
1893 (&peeling_htab, loop_vinfo);
1896 /* Compare costs of peeling for known and unknown alignment. */
1897 if (peel_for_known_alignment.peel_info.dr != NULL
1898 && peel_for_unknown_alignment.inside_cost
1899 >= peel_for_known_alignment.inside_cost)
1901 best_peel = peel_for_known_alignment;
1903 /* If the best peeling for known alignment has NPEEL == 0, perform no
1904 peeling at all except if there is an unsupportable dr that we can
1905 align. */
1906 if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
1907 do_peeling = false;
1910 /* If there is an unsupportable data ref, prefer this over all choices so far
1911 since we'd have to discard a chosen peeling except when it accidentally
1912 aligned the unsupportable data ref. */
1913 if (one_dr_unsupportable)
1914 dr0 = unsupportable_dr;
1915 else if (do_peeling)
1917 /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
1918 TODO: Use nopeel_outside_cost or get rid of it? */
1919 unsigned nopeel_inside_cost = 0;
1920 unsigned nopeel_outside_cost = 0;
1922 stmt_vector_for_cost dummy;
1923 dummy.create (2);
1924 vect_get_peeling_costs_all_drs (datarefs, NULL, &nopeel_inside_cost,
1925 &nopeel_outside_cost, &dummy, 0, false);
1926 dummy.release ();
1928 /* Add epilogue costs. As we do not peel for alignment here, no prologue
1929 costs will be recorded. */
1930 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1931 prologue_cost_vec.create (2);
1932 epilogue_cost_vec.create (2);
1934 int dummy2;
1935 nopeel_outside_cost += vect_get_known_peeling_cost
1936 (loop_vinfo, 0, &dummy2,
1937 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1938 &prologue_cost_vec, &epilogue_cost_vec);
1940 prologue_cost_vec.release ();
1941 epilogue_cost_vec.release ();
1943 npeel = best_peel.peel_info.npeel;
1944 dr0 = best_peel.peel_info.dr;
1946 /* If not peeling is no more expensive than the best peeling found
1947 so far, don't perform any peeling. */
1948 if (nopeel_inside_cost <= best_peel.inside_cost)
1949 do_peeling = false;
1952 if (do_peeling)
1954 stmt = DR_STMT (dr0);
1955 stmt_info = vinfo_for_stmt (stmt);
1956 vectype = STMT_VINFO_VECTYPE (stmt_info);
1958 if (known_alignment_for_access_p (dr0))
1960 bool negative = tree_int_cst_compare (DR_STEP (dr0),
1961 size_zero_node) < 0;
1962 if (!npeel)
1964 /* Since it's known at compile time, compute the number of
1965 iterations in the peeled loop (the peeling factor) for use in
1966 updating DR_MISALIGNMENT values. The peeling factor is the
1967 vectorization factor minus the misalignment as an element
1968 count. */
1969 mis = negative ? DR_MISALIGNMENT (dr0) : -DR_MISALIGNMENT (dr0);
1970 unsigned int target_align = DR_TARGET_ALIGNMENT (dr0);
1971 npeel = ((mis & (target_align - 1))
1972 / vect_get_scalar_dr_size (dr0));
1975 /* For interleaved data access every iteration accesses all the
1976 members of the group, therefore we divide the number of iterations
1977 by the group size. */
1978 stmt_info = vinfo_for_stmt (DR_STMT (dr0));
1979 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1980 npeel /= GROUP_SIZE (stmt_info);
1982 if (dump_enabled_p ())
1983 dump_printf_loc (MSG_NOTE, vect_location,
1984 "Try peeling by %d\n", npeel);
1987 /* Ensure that all datarefs can be vectorized after the peel. */
1988 if (!vect_peeling_supportable (loop_vinfo, dr0, npeel))
1989 do_peeling = false;
1991 /* Check if all datarefs are supportable and log. */
1992 if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
1994 stat = vect_verify_datarefs_alignment (loop_vinfo);
1995 if (!stat)
1996 do_peeling = false;
1997 else
1998 return stat;
2001 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
2002 if (do_peeling)
2004 unsigned max_allowed_peel
2005 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
2006 if (max_allowed_peel != (unsigned)-1)
2008 unsigned max_peel = npeel;
2009 if (max_peel == 0)
2011 unsigned int target_align = DR_TARGET_ALIGNMENT (dr0);
2012 max_peel = target_align / vect_get_scalar_dr_size (dr0) - 1;
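/* E.g. (editor's illustration) TARGET_ALIGN = 16 with 4-byte elements
   gives a worst-case MAX_PEEL of 3 scalar iterations when the exact
   peeling amount is only known at run time.  */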
2014 if (max_peel > max_allowed_peel)
2016 do_peeling = false;
2017 if (dump_enabled_p ())
2018 dump_printf_loc (MSG_NOTE, vect_location,
2019 "Disable peeling, max peels reached: %d\n", max_peel);
2024 /* Cost model #2 - if peeling may result in a remaining loop not
2025 iterating enough to be vectorized then do not peel. Since this
2026 is a cost heuristic rather than a correctness decision, use the
2027 most likely runtime value for variable vectorization factors. */
2028 if (do_peeling
2029 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2031 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2032 unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
2033 if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
2034 < assumed_vf + max_peel)
2035 do_peeling = false;
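/* Editor's illustration (hypothetical values): with ASSUMED_VF = 4 and
   NPEEL = 0 (so MAX_PEEL = 3), a loop known to run only 6 iterations is
   rejected here, because 6 < 4 + 3 leaves too little work for the
   vectorized loop.  */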
2038 if (do_peeling)
2040 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2041 If the misalignment of DR_i is identical to that of dr0 then set
2042 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
2043 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2044 by the peeling factor times the element size of DR_i (MOD the
2045 vectorization factor times the size). Otherwise, the
2046 misalignment of DR_i must be set to unknown. */
2047 FOR_EACH_VEC_ELT (datarefs, i, dr)
2048 if (dr != dr0)
2050 /* Strided accesses perform only component accesses, alignment
2051 is irrelevant for them. */
2052 stmt_info = vinfo_for_stmt (DR_STMT (dr));
2053 if (STMT_VINFO_STRIDED_P (stmt_info)
2054 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2055 continue;
2057 vect_update_misalignment_for_peel (dr, dr0, npeel);
2060 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
2061 if (npeel)
2062 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2063 else
2064 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2065 = DR_MISALIGNMENT (dr0);
2066 SET_DR_MISALIGNMENT (dr0, 0);
2067 if (dump_enabled_p ())
2069 dump_printf_loc (MSG_NOTE, vect_location,
2070 "Alignment of access forced using peeling.\n");
2071 dump_printf_loc (MSG_NOTE, vect_location,
2072 "Peeling for alignment will be applied.\n");
2075 /* The inside-loop cost will be accounted for in vectorizable_load
2076 and vectorizable_store correctly with adjusted alignments.
2077 Drop the body_cst_vec on the floor here. */
2078 stat = vect_verify_datarefs_alignment (loop_vinfo);
2079 gcc_assert (stat);
2080 return stat;
2084 /* (2) Versioning to force alignment. */
2086 /* Try versioning if:
2087 1) optimize loop for speed
2088 2) there is at least one unsupported misaligned data ref with an unknown
2089 misalignment, and
2090 3) all misaligned data refs with a known misalignment are supported, and
2091 4) the number of runtime alignment checks is within reason. */
2093 do_versioning =
2094 optimize_loop_nest_for_speed_p (loop)
2095 && (!loop->inner); /* FORNOW */
2097 if (do_versioning)
2099 FOR_EACH_VEC_ELT (datarefs, i, dr)
2101 stmt = DR_STMT (dr);
2102 stmt_info = vinfo_for_stmt (stmt);
2104 /* For interleaving, only the alignment of the first access
2105 matters. */
2106 if (aligned_access_p (dr)
2107 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2108 && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
2109 continue;
2111 if (STMT_VINFO_STRIDED_P (stmt_info))
2113 /* Strided loads perform only component accesses, alignment is
2114 irrelevant for them. */
2115 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
2116 continue;
2117 do_versioning = false;
2118 break;
2121 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
2123 if (!supportable_dr_alignment)
2125 gimple *stmt;
2126 int mask;
2127 tree vectype;
2129 if (known_alignment_for_access_p (dr)
2130 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2131 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
2133 do_versioning = false;
2134 break;
2137 stmt = DR_STMT (dr);
2138 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
2139 gcc_assert (vectype);
2141 /* The rightmost bits of an aligned address must be zeros.
2142 Construct the mask needed for this test. For example,
2143 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2144 mask must be 15 = 0xf. */
2145 mask = GET_MODE_SIZE (TYPE_MODE (vectype)) - 1;
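/* Sketch of the intent (editor's note): the versioned loop's runtime
   test built from this mask effectively checks that (address & MASK) == 0
   for each reference pushed onto LOOP_VINFO_MAY_MISALIGN_STMTS below.  */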
2147 /* FORNOW: use the same mask to test all potentially unaligned
2148 references in the loop. The vectorizer currently supports
2149 a single vector size, see the reference to
2150 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
2151 vectorization factor is computed. */
2152 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
2153 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
2154 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2155 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
2156 DR_STMT (dr));
2160 /* Versioning requires at least one misaligned data reference. */
2161 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2162 do_versioning = false;
2163 else if (!do_versioning)
2164 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2167 if (do_versioning)
2169 vec<gimple *> may_misalign_stmts
2170 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2171 gimple *stmt;
2173 /* It can now be assumed that the data references in the statements
2174 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2175 of the loop being vectorized. */
2176 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
2178 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2179 dr = STMT_VINFO_DATA_REF (stmt_info);
2180 SET_DR_MISALIGNMENT (dr, 0);
2181 if (dump_enabled_p ())
2182 dump_printf_loc (MSG_NOTE, vect_location,
2183 "Alignment of access forced using versioning.\n");
2186 if (dump_enabled_p ())
2187 dump_printf_loc (MSG_NOTE, vect_location,
2188 "Versioning for alignment will be applied.\n");
2190 /* Peeling and versioning can't be done together at this time. */
2191 gcc_assert (! (do_peeling && do_versioning));
2193 stat = vect_verify_datarefs_alignment (loop_vinfo);
2194 gcc_assert (stat);
2195 return stat;
2198 /* This point is reached if neither peeling nor versioning is being done. */
2199 gcc_assert (! (do_peeling || do_versioning));
2201 stat = vect_verify_datarefs_alignment (loop_vinfo);
2202 return stat;
2206 /* Function vect_find_same_alignment_drs.
2208 Update group and alignment relations according to the chosen
2209 vectorization factor. */
2211 static void
2212 vect_find_same_alignment_drs (struct data_dependence_relation *ddr)
2214 struct data_reference *dra = DDR_A (ddr);
2215 struct data_reference *drb = DDR_B (ddr);
2216 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2217 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2219 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
2220 return;
2222 if (dra == drb)
2223 return;
2225 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)
2226 || !operand_equal_p (DR_OFFSET (dra), DR_OFFSET (drb), 0)
2227 || !operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2228 return;
2230 /* Two references with distance zero have the same alignment. */
2231 offset_int diff = (wi::to_offset (DR_INIT (dra))
2232 - wi::to_offset (DR_INIT (drb)));
2233 if (diff != 0)
2235 /* Get the wider of the two alignments. */
2236 unsigned int align_a = (vect_calculate_target_alignment (dra)
2237 / BITS_PER_UNIT);
2238 unsigned int align_b = (vect_calculate_target_alignment (drb)
2239 / BITS_PER_UNIT);
2240 unsigned int max_align = MAX (align_a, align_b);
2242 /* Require the gap to be a multiple of the larger vector alignment. */
2243 if (!wi::multiple_of_p (diff, max_align, SIGNED))
2244 return;
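/* Editor's illustration: with a 16-byte target alignment, two accesses
   whose DR_INITs differ by 32 bytes are recorded below as having the
   same alignment, whereas a difference of 20 bytes is rejected above.  */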
2247 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2248 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
2249 if (dump_enabled_p ())
2251 dump_printf_loc (MSG_NOTE, vect_location,
2252 "accesses have the same alignment: ");
2253 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2254 dump_printf (MSG_NOTE, " and ");
2255 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2256 dump_printf (MSG_NOTE, "\n");
2261 /* Function vect_analyze_data_refs_alignment
2263 Analyze the alignment of the data-references in the loop.
2264 Return FALSE if a data reference is found that cannot be vectorized. */
2266 bool
2267 vect_analyze_data_refs_alignment (loop_vec_info vinfo)
2269 if (dump_enabled_p ())
2270 dump_printf_loc (MSG_NOTE, vect_location,
2271 "=== vect_analyze_data_refs_alignment ===\n");
2273 /* Mark groups of data references with same alignment using
2274 data dependence information. */
2275 vec<ddr_p> ddrs = vinfo->ddrs;
2276 struct data_dependence_relation *ddr;
2277 unsigned int i;
2279 FOR_EACH_VEC_ELT (ddrs, i, ddr)
2280 vect_find_same_alignment_drs (ddr);
2282 vec<data_reference_p> datarefs = vinfo->datarefs;
2283 struct data_reference *dr;
2285 vect_record_base_alignments (vinfo);
2286 FOR_EACH_VEC_ELT (datarefs, i, dr)
2288 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
2289 if (STMT_VINFO_VECTORIZABLE (stmt_info)
2290 && !vect_compute_data_ref_alignment (dr))
2292 /* Strided accesses perform only component accesses, misalignment
2293 information is irrelevant for them. */
2294 if (STMT_VINFO_STRIDED_P (stmt_info)
2295 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2296 continue;
2298 if (dump_enabled_p ())
2299 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2300 "not vectorized: can't calculate alignment "
2301 "for data ref.\n");
2303 return false;
2307 return true;
2311 /* Analyze alignment of DRs of stmts in NODE. */
2313 static bool
2314 vect_slp_analyze_and_verify_node_alignment (slp_tree node)
2316 /* We vectorize from the first scalar stmt in the node unless
2317 the node is permuted in which case we start from the first
2318 element in the group. */
2319 gimple *first_stmt = SLP_TREE_SCALAR_STMTS (node)[0];
2320 data_reference_p first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2321 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2322 first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_stmt));
2324 data_reference_p dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2325 if (! vect_compute_data_ref_alignment (dr)
2326 /* For creating the data-ref pointer we need alignment of the
2327 first element anyway. */
2328 || (dr != first_dr
2329 && ! vect_compute_data_ref_alignment (first_dr))
2330 || ! verify_data_ref_alignment (dr))
2332 if (dump_enabled_p ())
2333 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2334 "not vectorized: bad data alignment in basic "
2335 "block.\n");
2336 return false;
2339 return true;
2342 /* Function vect_slp_analyze_instance_alignment
2344 Analyze the alignment of the data-references in the SLP instance.
2345 Return FALSE if a data reference is found that cannot be vectorized. */
2347 bool
2348 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
2350 if (dump_enabled_p ())
2351 dump_printf_loc (MSG_NOTE, vect_location,
2352 "=== vect_slp_analyze_and_verify_instance_alignment ===\n");
2354 slp_tree node;
2355 unsigned i;
2356 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2357 if (! vect_slp_analyze_and_verify_node_alignment (node))
2358 return false;
2360 node = SLP_INSTANCE_TREE (instance);
2361 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]))
2362 && ! vect_slp_analyze_and_verify_node_alignment
2363 (SLP_INSTANCE_TREE (instance)))
2364 return false;
2366 return true;
2370 /* Analyze groups of accesses: check that DR belongs to a group of
2371 accesses of legal size, step, etc. Detect gaps, single element
2372 interleaving, and other special cases. Set grouped access info.
2373 Collect groups of strided stores for further use in SLP analysis.
2374 Worker for vect_analyze_group_access. */
2376 static bool
2377 vect_analyze_group_access_1 (struct data_reference *dr)
2379 tree step = DR_STEP (dr);
2380 tree scalar_type = TREE_TYPE (DR_REF (dr));
2381 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2382 gimple *stmt = DR_STMT (dr);
2383 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2384 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2385 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2386 HOST_WIDE_INT dr_step = -1;
2387 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2388 bool slp_impossible = false;
2390 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2391 size of the interleaving group (including gaps). */
2392 if (tree_fits_shwi_p (step))
2394 dr_step = tree_to_shwi (step);
2395 /* Check that STEP is a multiple of type size. Otherwise there is
2396 a non-element-sized gap at the end of the group which we
2397 cannot represent in GROUP_GAP or GROUP_SIZE.
2398 ??? As we can handle non-constant step fine here we should
2399 simply remove uses of GROUP_GAP between the last and first
2400 element and instead rely on DR_STEP. GROUP_SIZE then would
2401 simply not include that gap. */
2402 if ((dr_step % type_size) != 0)
2404 if (dump_enabled_p ())
2406 dump_printf_loc (MSG_NOTE, vect_location,
2407 "Step ");
2408 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2409 dump_printf (MSG_NOTE,
2410 " is not a multiple of the element size for ");
2411 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2412 dump_printf (MSG_NOTE, "\n");
2414 return false;
2416 groupsize = absu_hwi (dr_step) / type_size;
2418 else
2419 groupsize = 0;
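/* E.g. (editor's illustration) a constant step of 32 bytes over 4-byte
   elements yields a GROUPSIZE of 8; a non-constant step leaves it at 0.  */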
2421 /* A non-consecutive access is possible only as part of an interleaving group. */
2422 if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2424 /* Check whether this DR is part of an interleaving group, and is a
2425 single element of the group that is accessed in the loop. */
2427 /* Gaps are supported only for loads. STEP must be a multiple of the type
2428 size. The size of the group must be a power of 2. */
2429 if (DR_IS_READ (dr)
2430 && (dr_step % type_size) == 0
2431 && groupsize > 0
2432 && pow2p_hwi (groupsize))
2434 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2435 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2436 GROUP_GAP (stmt_info) = groupsize - 1;
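/* Editor's illustration: a load of a[4*i] with 4-byte elements has a
   step of 16 bytes, so GROUPSIZE is 4 and the single accessed member is
   followed by a gap of 3 elements.  */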
2437 if (dump_enabled_p ())
2439 dump_printf_loc (MSG_NOTE, vect_location,
2440 "Detected single element interleaving ");
2441 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2442 dump_printf (MSG_NOTE, " step ");
2443 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2444 dump_printf (MSG_NOTE, "\n");
2447 return true;
2450 if (dump_enabled_p ())
2452 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2453 "not consecutive access ");
2454 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2457 if (bb_vinfo)
2459 /* Mark the statement as unvectorizable. */
2460 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2461 return true;
2464 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2465 STMT_VINFO_STRIDED_P (stmt_info) = true;
2466 return true;
2469 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2471 /* First stmt in the interleaving chain. Check the chain. */
2472 gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2473 struct data_reference *data_ref = dr;
2474 unsigned int count = 1;
2475 tree prev_init = DR_INIT (data_ref);
2476 gimple *prev = stmt;
2477 HOST_WIDE_INT diff, gaps = 0;
2479 while (next)
2481 /* Skip identical data-refs. If two or more stmts share a
2482 data-ref (supported only for loads), we vectorize only the first
2483 stmt, and the rest get their vectorized loads from the first
2484 one. */
2485 if (!tree_int_cst_compare (DR_INIT (data_ref),
2486 DR_INIT (STMT_VINFO_DATA_REF (
2487 vinfo_for_stmt (next)))))
2489 if (DR_IS_WRITE (data_ref))
2491 if (dump_enabled_p ())
2492 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2493 "Two store stmts share the same dr.\n");
2494 return false;
2497 if (dump_enabled_p ())
2498 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2499 "Two or more load stmts share the same dr.\n");
2501 /* For load use the same data-ref load. */
2502 GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2504 prev = next;
2505 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2506 continue;
2509 prev = next;
2510 data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2512 /* All group members have the same STEP by construction. */
2513 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2515 /* Check that the distance between two accesses is equal to the type
2516 size. Otherwise, we have gaps. */
2517 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2518 - TREE_INT_CST_LOW (prev_init)) / type_size;
2519 if (diff != 1)
2521 /* FORNOW: SLP of accesses with gaps is not supported. */
2522 slp_impossible = true;
2523 if (DR_IS_WRITE (data_ref))
2525 if (dump_enabled_p ())
2526 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2527 "interleaved store with gaps\n");
2528 return false;
2531 gaps += diff - 1;
2534 last_accessed_element += diff;
2536 /* Store the gap from the previous member of the group. If there is no
2537 gap in the access, GROUP_GAP is always 1. */
2538 GROUP_GAP (vinfo_for_stmt (next)) = diff;
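/* For example (editor's illustration), loads at a[8*i] and a[8*i + 2]
   with 4-byte elements give DIFF = 2 here: GROUP_GAP of the second
   member becomes 2 and one element of gap is accumulated in GAPS.  */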
2540 prev_init = DR_INIT (data_ref);
2541 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2542 /* Count the number of data-refs in the chain. */
2543 count++;
2546 if (groupsize == 0)
2547 groupsize = count + gaps;
2549 /* This could be UINT_MAX but as we are generating code in a very
2550 inefficient way we have to cap earlier. See PR78699 for example. */
2551 if (groupsize > 4096)
2553 if (dump_enabled_p ())
2554 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2555 "group is too large\n");
2556 return false;
2559 /* Check that the size of the interleaving is equal to count for stores,
2560 i.e., that there are no gaps. */
2561 if (groupsize != count
2562 && !DR_IS_READ (dr))
2564 if (dump_enabled_p ())
2565 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2566 "interleaved store with gaps\n");
2567 return false;
2570 /* If there is a gap after the last load in the group it is the
2571 difference between the groupsize and the last accessed
2572 element.
2573 When there is no gap, this difference should be 0. */
2574 GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - last_accessed_element;
2576 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2577 if (dump_enabled_p ())
2579 dump_printf_loc (MSG_NOTE, vect_location,
2580 "Detected interleaving ");
2581 if (DR_IS_READ (dr))
2582 dump_printf (MSG_NOTE, "load ");
2583 else
2584 dump_printf (MSG_NOTE, "store ");
2585 dump_printf (MSG_NOTE, "of size %u starting with ",
2586 (unsigned)groupsize);
2587 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
2588 if (GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
2589 dump_printf_loc (MSG_NOTE, vect_location,
2590 "There is a gap of %u elements after the group\n",
2591 GROUP_GAP (vinfo_for_stmt (stmt)));
2594 /* SLP: create an SLP data structure for every interleaving group of
2595 stores for further analysis in vect_analyze_slp. */
2596 if (DR_IS_WRITE (dr) && !slp_impossible)
2598 if (loop_vinfo)
2599 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2600 if (bb_vinfo)
2601 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2605 return true;
2608 /* Analyze groups of accesses: check that DR belongs to a group of
2609 accesses of legal size, step, etc. Detect gaps, single element
2610 interleaving, and other special cases. Set grouped access info.
2611 Collect groups of strided stores for further use in SLP analysis. */
2613 static bool
2614 vect_analyze_group_access (struct data_reference *dr)
2616 if (!vect_analyze_group_access_1 (dr))
2618 /* Dissolve the group if present. */
2619 gimple *next;
2620 gimple *stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dr)));
2621 while (stmt)
2623 stmt_vec_info vinfo = vinfo_for_stmt (stmt);
2624 next = GROUP_NEXT_ELEMENT (vinfo);
2625 GROUP_FIRST_ELEMENT (vinfo) = NULL;
2626 GROUP_NEXT_ELEMENT (vinfo) = NULL;
2627 stmt = next;
2629 return false;
2631 return true;
2634 /* Analyze the access pattern of the data-reference DR.
2635 In case of non-consecutive accesses call vect_analyze_group_access() to
2636 analyze groups of accesses. */
2638 static bool
2639 vect_analyze_data_ref_access (struct data_reference *dr)
2641 tree step = DR_STEP (dr);
2642 tree scalar_type = TREE_TYPE (DR_REF (dr));
2643 gimple *stmt = DR_STMT (dr);
2644 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2645 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2646 struct loop *loop = NULL;
2648 if (loop_vinfo)
2649 loop = LOOP_VINFO_LOOP (loop_vinfo);
2651 if (loop_vinfo && !step)
2653 if (dump_enabled_p ())
2654 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2655 "bad data-ref access in loop\n");
2656 return false;
2659 /* Allow loads with zero step in inner-loop vectorization. */
2660 if (loop_vinfo && integer_zerop (step))
2662 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2663 if (!nested_in_vect_loop_p (loop, stmt))
2664 return DR_IS_READ (dr);
2665 /* Allow references with zero step for outer loops marked
2666 with pragma omp simd only - it guarantees absence of
2667 loop-carried dependencies between inner loop iterations. */
2668 if (!loop->force_vectorize)
2670 if (dump_enabled_p ())
2671 dump_printf_loc (MSG_NOTE, vect_location,
2672 "zero step in inner loop of nest\n");
2673 return false;
2677 if (loop && nested_in_vect_loop_p (loop, stmt))
2679 /* Interleaved accesses are not yet supported within outer-loop
2680 vectorization for references in the inner-loop. */
2681 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2683 /* For the rest of the analysis we use the outer-loop step. */
2684 step = STMT_VINFO_DR_STEP (stmt_info);
2685 if (integer_zerop (step))
2687 if (dump_enabled_p ())
2688 dump_printf_loc (MSG_NOTE, vect_location,
2689 "zero step in outer loop.\n");
2690 return DR_IS_READ (dr);
2694 /* Consecutive? */
2695 if (TREE_CODE (step) == INTEGER_CST)
2697 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2698 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2699 || (dr_step < 0
2700 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2702 /* Mark that it is not interleaving. */
2703 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2704 return true;
2708 if (loop && nested_in_vect_loop_p (loop, stmt))
2710 if (dump_enabled_p ())
2711 dump_printf_loc (MSG_NOTE, vect_location,
2712 "grouped access in outer loop.\n");
2713 return false;
2717 /* Assume this is a DR handled by non-constant strided load case. */
2718 if (TREE_CODE (step) != INTEGER_CST)
2719 return (STMT_VINFO_STRIDED_P (stmt_info)
2720 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2721 || vect_analyze_group_access (dr)));
2723 /* Non-consecutive access - check whether it is part of an interleaving group. */
2724 return vect_analyze_group_access (dr);
2727 /* Compare two data-references DRA and DRB so that they can be sorted
2728 into chunks suitable for grouping. */
2730 static int
2731 dr_group_sort_cmp (const void *dra_, const void *drb_)
2733 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2734 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2735 int cmp;
2737 /* Stabilize sort. */
2738 if (dra == drb)
2739 return 0;
2741 /* DRs in different loops never belong to the same group. */
2742 loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father;
2743 loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father;
2744 if (loopa != loopb)
2745 return loopa->num < loopb->num ? -1 : 1;
2747 /* Ordering of DRs according to base. */
2748 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2749 DR_BASE_ADDRESS (drb));
2750 if (cmp != 0)
2751 return cmp;
2753 /* And according to DR_OFFSET. */
2754 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2755 if (cmp != 0)
2756 return cmp;
2758 /* Put reads before writes. */
2759 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2760 return DR_IS_READ (dra) ? -1 : 1;
2762 /* Then sort by access size. */
2763 cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2764 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2765 if (cmp != 0)
2766 return cmp;
2768 /* And by step. */
2769 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2770 if (cmp != 0)
2771 return cmp;
2773 /* Then sort by DR_INIT. In case of identical DRs, sort by stmt UID. */
2774 cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
2775 if (cmp == 0)
2776 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2777 return cmp;
2780 /* Function vect_analyze_data_ref_accesses.
2782 Analyze the access pattern of all the data references in the loop.
2784 FORNOW: the only access pattern that is considered vectorizable is a
2785 simple step 1 (consecutive) access.
2787 FORNOW: handle only arrays and pointer accesses. */
2789 bool
2790 vect_analyze_data_ref_accesses (vec_info *vinfo)
2792 unsigned int i;
2793 vec<data_reference_p> datarefs = vinfo->datarefs;
2794 struct data_reference *dr;
2796 if (dump_enabled_p ())
2797 dump_printf_loc (MSG_NOTE, vect_location,
2798 "=== vect_analyze_data_ref_accesses ===\n");
2800 if (datarefs.is_empty ())
2801 return true;
2803 /* Sort the array of datarefs to make building the interleaving chains
2804 linear. Don't modify the original vector's order, it is needed for
2805 determining what dependencies are reversed. */
2806 vec<data_reference_p> datarefs_copy = datarefs.copy ();
2807 datarefs_copy.qsort (dr_group_sort_cmp);
2809 /* Build the interleaving chains. */
2810 for (i = 0; i < datarefs_copy.length () - 1;)
2812 data_reference_p dra = datarefs_copy[i];
2813 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2814 stmt_vec_info lastinfo = NULL;
2815 if (! STMT_VINFO_VECTORIZABLE (stmtinfo_a))
2817 ++i;
2818 continue;
2820 for (i = i + 1; i < datarefs_copy.length (); ++i)
2822 data_reference_p drb = datarefs_copy[i];
2823 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2824 if (! STMT_VINFO_VECTORIZABLE (stmtinfo_b))
2825 break;
2827 /* ??? Imperfect sorting (non-compatible types, non-modulo
2828 accesses, same accesses) can lead to a group being artificially
2829 split here as we don't just skip over those. If it really
2830 matters we can push those to a worklist and re-iterate
2831 over them. Then we can just skip ahead to the next DR here. */
2833 /* DRs in a different loop should not be put into the same
2834 interleaving group. */
2835 if (gimple_bb (DR_STMT (dra))->loop_father
2836 != gimple_bb (DR_STMT (drb))->loop_father)
2837 break;
2839 /* Check that the data-refs have the same first location (except init)
2840 and that they are both either stores or loads (not one load and one
2841 store, and not masked loads or stores). */
2842 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2843 || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2844 DR_BASE_ADDRESS (drb)) != 0
2845 || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
2846 || !gimple_assign_single_p (DR_STMT (dra))
2847 || !gimple_assign_single_p (DR_STMT (drb)))
2848 break;
2850 /* Check that the data-refs have the same constant size. */
2851 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2852 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2853 if (!tree_fits_uhwi_p (sza)
2854 || !tree_fits_uhwi_p (szb)
2855 || !tree_int_cst_equal (sza, szb))
2856 break;
2858 /* Check that the data-refs have the same step. */
2859 if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
2860 break;
2862 /* Check the types are compatible.
2863 ??? We don't distinguish this during sorting. */
2864 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2865 TREE_TYPE (DR_REF (drb))))
2866 break;
2868 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
2869 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2870 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2871 HOST_WIDE_INT init_prev
2872 = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]));
2873 gcc_assert (init_a <= init_b
2874 && init_a <= init_prev
2875 && init_prev <= init_b);
2877 /* Do not place the same access in the interleaving chain twice. */
2878 if (init_b == init_prev)
2880 gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]))
2881 < gimple_uid (DR_STMT (drb)));
2882 /* ??? For now we simply "drop" the later reference which is
2883 otherwise the same rather than finishing off this group.
2884 In the end we'd want to re-process duplicates forming
2885 multiple groups from the refs, likely by just collecting
2886 all candidates (including duplicates and split points
2887 below) in a vector and then processing them together. */
2888 continue;
2891 /* If init_b == init_a + the size of the type * k, we have an
2892 interleaving, and DRA is accessed before DRB. */
2893 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
2894 if (type_size_a == 0
2895 || (init_b - init_a) % type_size_a != 0)
2896 break;
2898 /* If we have a store, the accesses are adjacent. This splits
2899 groups into chunks we support (we don't support vectorization
2900 of stores with gaps). */
2901 if (!DR_IS_READ (dra) && init_b - init_prev != type_size_a)
2902 break;
2904 /* Requiring the step (when it is constant and nonzero) to be greater
2905 than the difference between the data-refs' inits splits groups into
2906 suitable sizes. */
2907 if (tree_fits_shwi_p (DR_STEP (dra)))
2909 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
2910 if (step != 0 && step <= (init_b - init_a))
2911 break;
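/* Editor's illustration: 4-byte accesses with DR_INITs 0 and 16 and a
   constant step of 16 are not grouped, because the step does not span
   both accesses within a single iteration.  */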
2914 if (dump_enabled_p ())
2916 dump_printf_loc (MSG_NOTE, vect_location,
2917 "Detected interleaving ");
2918 if (DR_IS_READ (dra))
2919 dump_printf (MSG_NOTE, "load ");
2920 else
2921 dump_printf (MSG_NOTE, "store ");
2922 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2923 dump_printf (MSG_NOTE, " and ");
2924 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2925 dump_printf (MSG_NOTE, "\n");
2928 /* Link the found element into the group list. */
2929 if (!GROUP_FIRST_ELEMENT (stmtinfo_a))
2931 GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
2932 lastinfo = stmtinfo_a;
2934 GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
2935 GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
2936 lastinfo = stmtinfo_b;
2940 FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
2941 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
2942 && !vect_analyze_data_ref_access (dr))
2944 if (dump_enabled_p ())
2945 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2946 "not vectorized: complicated access pattern.\n");
2948 if (is_a <bb_vec_info> (vinfo))
2950 /* Mark the statement as not vectorizable. */
2951 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2952 continue;
2954 else
2956 datarefs_copy.release ();
2957 return false;
2961 datarefs_copy.release ();
2962 return true;
2965 /* Function vect_vfa_segment_size.
2967 Create an expression that computes the size of the segment
2968 that will be accessed for a data reference. The function takes into
2969 account that realignment loads may access one more vector.
2971 Input:
2972 DR: The data reference.
2973 LENGTH_FACTOR: segment length to consider.
2975 Return an expression whose value is the size of segment which will be
2976 accessed by DR. */
2978 static tree
2979 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
2981 tree segment_length;
2983 if (integer_zerop (DR_STEP (dr)))
2984 segment_length = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
2985 else
2986 segment_length = size_binop (MULT_EXPR,
2987 fold_convert (sizetype, DR_STEP (dr)),
2988 fold_convert (sizetype, length_factor));
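/* Editor's illustration: DR_STEP = 4 and LENGTH_FACTOR = 8 give a
   32-byte segment; the realignment case below adds one more vector.  */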
2990 if (vect_supportable_dr_alignment (dr, false)
2991 == dr_explicit_realign_optimized)
2993 tree vector_size = TYPE_SIZE_UNIT
2994 (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))));
2996 segment_length = size_binop (PLUS_EXPR, segment_length, vector_size);
2998 return segment_length;
3001 /* Function vect_no_alias_p.
3003 Given data references A and B with equal base and offset, for which
3004 the alias relation can be decided at compilation time, return TRUE if
3005 they do not alias each other and FALSE otherwise. SEGMENT_LENGTH_A
3006 and SEGMENT_LENGTH_B are the memory lengths accessed by A and B
3007 respectively. */
3009 static bool
3010 vect_no_alias_p (struct data_reference *a, struct data_reference *b,
3011 tree segment_length_a, tree segment_length_b)
3013 gcc_assert (TREE_CODE (DR_INIT (a)) == INTEGER_CST
3014 && TREE_CODE (DR_INIT (b)) == INTEGER_CST);
3015 if (tree_int_cst_equal (DR_INIT (a), DR_INIT (b)))
3016 return false;
3018 tree seg_a_min = DR_INIT (a);
3019 tree seg_a_max = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_a_min),
3020 seg_a_min, segment_length_a);
3021 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3022 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3023 [a, a+12) */
3024 if (tree_int_cst_compare (DR_STEP (a), size_zero_node) < 0)
3026 tree unit_size = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (a)));
3027 seg_a_min = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_a_max),
3028 seg_a_max, unit_size);
3029 seg_a_max = fold_build2 (PLUS_EXPR, TREE_TYPE (DR_INIT (a)),
3030 DR_INIT (a), unit_size);
3032 tree seg_b_min = DR_INIT (b);
3033 tree seg_b_max = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_b_min),
3034 seg_b_min, segment_length_b);
3035 if (tree_int_cst_compare (DR_STEP (b), size_zero_node) < 0)
3037 tree unit_size = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (b)));
3038 seg_b_min = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_b_max),
3039 seg_b_max, unit_size);
3040 seg_b_max = fold_build2 (PLUS_EXPR, TREE_TYPE (DR_INIT (b)),
3041 DR_INIT (b), unit_size);
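/* Editor's illustration: byte segments [0, 16) and [16, 32) do not
   overlap, so the comparison below proves independence at compile
   time and the runtime alias check can be dropped.  */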
3044 if (tree_int_cst_le (seg_a_max, seg_b_min)
3045 || tree_int_cst_le (seg_b_max, seg_a_min))
3046 return true;
3048 return false;
3051 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3052 in DDR is >= VF. */
3054 static bool
3055 dependence_distance_ge_vf (data_dependence_relation *ddr,
3056 unsigned int loop_depth, poly_uint64 vf)
3058 if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3059 || DDR_NUM_DIST_VECTS (ddr) == 0)
3060 return false;
3062 /* If the dependence is exact, we should have limited the VF instead. */
3063 gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3065 unsigned int i;
3066 lambda_vector dist_v;
3067 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3069 HOST_WIDE_INT dist = dist_v[loop_depth];
3070 if (dist != 0
3071 && !(dist > 0 && DDR_REVERSED_P (ddr))
3072 && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
3073 return false;
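/* Editor's illustration: a dependence distance of 8 iterations is fine
   for VF = 4, whereas a distance of 2 would have made this function
   return false just above.  */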
3076 if (dump_enabled_p ())
3078 dump_printf_loc (MSG_NOTE, vect_location,
3079 "dependence distance between ");
3080 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
3081 dump_printf (MSG_NOTE, " and ");
3082 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
3083 dump_printf (MSG_NOTE, " is >= VF\n");
3086 return true;
3089 /* Function vect_prune_runtime_alias_test_list.
3091 Prune a list of ddrs to be tested at run-time by versioning for alias.
3092 Merge several alias checks into one if possible.
3093 Return FALSE if the resulting list of ddrs is longer than allowed by
3094 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
3096 bool
3097 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3099 typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3100 hash_set <tree_pair_hash> compared_objects;
3102 vec<ddr_p> may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3103 vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3104 = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3105 vec<vec_object_pair> &check_unequal_addrs
3106 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3107 poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3108 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3110 ddr_p ddr;
3111 unsigned int i;
3112 tree length_factor;
3114 if (dump_enabled_p ())
3115 dump_printf_loc (MSG_NOTE, vect_location,
3116 "=== vect_prune_runtime_alias_test_list ===\n");
3118 if (may_alias_ddrs.is_empty ())
3119 return true;
3121 comp_alias_ddrs.create (may_alias_ddrs.length ());
3123 unsigned int loop_depth
3124 = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3125 LOOP_VINFO_LOOP_NEST (loop_vinfo));
3127 /* First, we collect all data ref pairs for aliasing checks. */
3128 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3130 int comp_res;
3131 struct data_reference *dr_a, *dr_b;
3132 gimple *dr_group_first_a, *dr_group_first_b;
3133 tree segment_length_a, segment_length_b;
3134 gimple *stmt_a, *stmt_b;
3136 /* Ignore the alias if the VF we chose ended up being no greater
3137 than the dependence distance. */
3138 if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3139 continue;
3141 if (DDR_OBJECT_A (ddr))
3143 vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3144 if (!compared_objects.add (new_pair))
3146 if (dump_enabled_p ())
3148 dump_printf_loc (MSG_NOTE, vect_location, "checking that ");
3149 dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.first);
3150 dump_printf (MSG_NOTE, " and ");
3151 dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.second);
3152 dump_printf (MSG_NOTE, " have different addresses\n");
3154 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3156 continue;
3159 dr_a = DDR_A (ddr);
3160 stmt_a = DR_STMT (DDR_A (ddr));
3161 dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
3162 if (dr_group_first_a)
3164 stmt_a = dr_group_first_a;
3165 dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
3168 dr_b = DDR_B (ddr);
3169 stmt_b = DR_STMT (DDR_B (ddr));
3170 dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
3171 if (dr_group_first_b)
3173 stmt_b = dr_group_first_b;
3174 dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
3177 if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
3178 length_factor = scalar_loop_iters;
3179 else
3180 length_factor = size_int (vect_factor);
3181 segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
3182 segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
3184 comp_res = data_ref_compare_tree (DR_BASE_ADDRESS (dr_a),
3185 DR_BASE_ADDRESS (dr_b));
3186 if (comp_res == 0)
3187 comp_res = data_ref_compare_tree (DR_OFFSET (dr_a),
3188 DR_OFFSET (dr_b));
3190 /* Alias is known at compilation time. */
3191 if (comp_res == 0
3192 && TREE_CODE (DR_STEP (dr_a)) == INTEGER_CST
3193 && TREE_CODE (DR_STEP (dr_b)) == INTEGER_CST
3194 && TREE_CODE (segment_length_a) == INTEGER_CST
3195 && TREE_CODE (segment_length_b) == INTEGER_CST)
3197 if (vect_no_alias_p (dr_a, dr_b, segment_length_a, segment_length_b))
3198 continue;
3200 if (dump_enabled_p ())
3201 dump_printf_loc (MSG_NOTE, vect_location,
3202 "not vectorized: compilation time alias.\n");
3204 return false;
3207 dr_with_seg_len_pair_t dr_with_seg_len_pair
3208 (dr_with_seg_len (dr_a, segment_length_a),
3209 dr_with_seg_len (dr_b, segment_length_b));
3211 /* Canonicalize pairs by sorting the two DR members. */
3212 if (comp_res > 0)
3213 std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
3215 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3218 prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);
3220 unsigned int count = (comp_alias_ddrs.length ()
3221 + check_unequal_addrs.length ());
3222 dump_printf_loc (MSG_NOTE, vect_location,
3223 "improved number of alias checks from %d to %d\n",
3224 may_alias_ddrs.length (), count);
3225 if ((int) count > PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
3227 if (dump_enabled_p ())
3228 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3229 "number of versioning for alias "
3230 "run-time tests exceeds %d "
3231 "(--param vect-max-version-for-alias-checks)\n",
3232 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
3233 return false;
3236 return true;
3239 /* Return true if a non-affine read or write in STMT is suitable for a
3240 gather load or scatter store. Describe the operation in *INFO if so. */
3242 bool
3243 vect_check_gather_scatter (gimple *stmt, loop_vec_info loop_vinfo,
3244 gather_scatter_info *info)
3246 HOST_WIDE_INT scale = 1;
3247 poly_int64 pbitpos, pbitsize;
3248 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3249 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3250 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3251 tree offtype = NULL_TREE;
3252 tree decl, base, off;
3253 machine_mode pmode;
3254 int punsignedp, reversep, pvolatilep = 0;
3256 base = DR_REF (dr);
3257 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3258 see if we can use the def stmt of the address. */
3259 if (is_gimple_call (stmt)
3260 && gimple_call_internal_p (stmt)
3261 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
3262 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
3263 && TREE_CODE (base) == MEM_REF
3264 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3265 && integer_zerop (TREE_OPERAND (base, 1))
3266 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3268 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3269 if (is_gimple_assign (def_stmt)
3270 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3271 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3274 /* The gather and scatter builtins need an address of the form
3275 loop_invariant + vector * {1, 2, 4, 8}
3276 or
3277 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3278 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3279 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3280 multiplications and additions in it. To get a vector, we need
3281 a single SSA_NAME that will be defined in the loop and will
3282 contain everything that is not loop invariant and that can be
3283 vectorized. The following code attempts to find such a preexisting
3284 SSA_NAME OFF and put the loop invariants into a tree BASE
3285 that can be gimplified before the loop. */
3286 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3287 &punsignedp, &reversep, &pvolatilep);
3288 gcc_assert (base && !reversep);
3289 poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
3291 if (TREE_CODE (base) == MEM_REF)
3293 if (!integer_zerop (TREE_OPERAND (base, 1)))
3295 if (off == NULL_TREE)
3296 off = wide_int_to_tree (sizetype, mem_ref_offset (base));
3297 else
3298 off = size_binop (PLUS_EXPR, off,
3299 fold_convert (sizetype, TREE_OPERAND (base, 1)));
3301 base = TREE_OPERAND (base, 0);
3303 else
3304 base = build_fold_addr_expr (base);
3306 if (off == NULL_TREE)
3307 off = size_zero_node;
3309 /* If BASE is not loop invariant then, if OFF is 0, we start with just
3310 the constant offset in the loop-invariant BASE and continue with the
3311 old base as OFF; otherwise give up.
3312 We could handle that case by gimplifying the addition of base + off
3313 into some SSA_NAME and using that as OFF, but for now punt. */
3314 if (!expr_invariant_in_loop_p (loop, base))
3316 if (!integer_zerop (off))
3317 return false;
3318 off = base;
3319 base = size_int (pbytepos);
3321 /* Otherwise put base + constant offset into the loop invariant BASE
3322 and continue with OFF. */
3323 else
3325 base = fold_convert (sizetype, base);
3326 base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
3329 /* OFF at this point may be either an SSA_NAME or some tree expression
3330 from get_inner_reference. Try to peel off loop invariants from it
3331 into BASE as long as possible. */
3332 STRIP_NOPS (off);
3333 while (offtype == NULL_TREE)
3335 enum tree_code code;
3336 tree op0, op1, add = NULL_TREE;
3338 if (TREE_CODE (off) == SSA_NAME)
3340 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3342 if (expr_invariant_in_loop_p (loop, off))
3343 return false;
3345 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3346 break;
3348 op0 = gimple_assign_rhs1 (def_stmt);
3349 code = gimple_assign_rhs_code (def_stmt);
3350 op1 = gimple_assign_rhs2 (def_stmt);
3352 else
3354 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3355 return false;
3356 code = TREE_CODE (off);
3357 extract_ops_from_tree (off, &code, &op0, &op1);
3359 switch (code)
3361 case POINTER_PLUS_EXPR:
3362 case PLUS_EXPR:
3363 if (expr_invariant_in_loop_p (loop, op0))
3365 add = op0;
3366 off = op1;
3367 do_add:
3368 add = fold_convert (sizetype, add);
3369 if (scale != 1)
3370 add = size_binop (MULT_EXPR, add, size_int (scale));
3371 base = size_binop (PLUS_EXPR, base, add);
3372 continue;
3374 if (expr_invariant_in_loop_p (loop, op1))
3376 add = op1;
3377 off = op0;
3378 goto do_add;
3380 break;
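/* Sketch of the effect (editor's note, hypothetical expression): for
   OFF = i + 3 after a MULT_EXPR has set SCALE = 4, the invariant 3 is
   folded into BASE as 12 and the walk continues with OFF = i.  */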
3381 case MINUS_EXPR:
3382 if (expr_invariant_in_loop_p (loop, op1))
3384 add = fold_convert (sizetype, op1);
3385 add = size_binop (MINUS_EXPR, size_zero_node, add);
3386 off = op0;
3387 goto do_add;
3389 break;
3390 case MULT_EXPR:
3391 if (scale == 1 && tree_fits_shwi_p (op1))
3393 scale = tree_to_shwi (op1);
3394 off = op0;
3395 continue;
3397 break;
3398 case SSA_NAME:
3399 off = op0;
3400 continue;
3401 CASE_CONVERT:
3402 if (!POINTER_TYPE_P (TREE_TYPE (op0))
3403 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3404 break;
3405 if (TYPE_PRECISION (TREE_TYPE (op0))
3406 == TYPE_PRECISION (TREE_TYPE (off)))
3408 off = op0;
3409 continue;
3411 if (TYPE_PRECISION (TREE_TYPE (op0))
3412 < TYPE_PRECISION (TREE_TYPE (off)))
3414 off = op0;
3415 offtype = TREE_TYPE (off);
3416 STRIP_NOPS (off);
3417 continue;
3419 break;
3420 default:
3421 break;
3423 break;
3426 /* If at the end OFF still isn't a SSA_NAME or isn't
3427 defined in the loop, punt. */
3428 if (TREE_CODE (off) != SSA_NAME
3429 || expr_invariant_in_loop_p (loop, off))
3430 return false;
3432 if (offtype == NULL_TREE)
3433 offtype = TREE_TYPE (off);
3435 if (DR_IS_READ (dr))
3436 decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info),
3437 offtype, scale);
3438 else
3439 decl = targetm.vectorize.builtin_scatter (STMT_VINFO_VECTYPE (stmt_info),
3440 offtype, scale);
3442 if (decl == NULL_TREE)
3443 return false;
3445 info->decl = decl;
3446 info->base = base;
3447 info->offset = off;
3448 info->offset_dt = vect_unknown_def_type;
3449 info->offset_vectype = NULL_TREE;
3450 info->scale = scale;
3451 return true;
3454 /* Function vect_analyze_data_refs.
3456 Find all the data references in the loop or basic block.
3458 The general structure of the analysis of data refs in the vectorizer is as
3459 follows:
3460 1- vect_analyze_data_refs(loop/bb): call
3461 compute_data_dependences_for_loop/bb to find and analyze all data-refs
3462 in the loop/bb and their dependences.
3463 2- vect_analyze_dependences(): apply dependence testing using ddrs.
3464 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
3465 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.  */
3469 bool
3470 vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf)
3472 struct loop *loop = NULL;
3473 unsigned int i;
3474 struct data_reference *dr;
3475 tree scalar_type;
3477 if (dump_enabled_p ())
3478 dump_printf_loc (MSG_NOTE, vect_location,
3479 "=== vect_analyze_data_refs ===\n");
3481 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3482 loop = LOOP_VINFO_LOOP (loop_vinfo);
3484 /* Go through the data-refs, check that the analysis succeeded. Update
3485 pointer from stmt_vec_info struct to DR and vectype. */
3487 vec<data_reference_p> datarefs = vinfo->datarefs;
3488 FOR_EACH_VEC_ELT (datarefs, i, dr)
3490 gimple *stmt;
3491 stmt_vec_info stmt_info;
3492 tree base, offset, init;
3493 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
3494 bool simd_lane_access = false;
3495 poly_uint64 vf;
3497 again:
3498 if (!dr || !DR_REF (dr))
3500 if (dump_enabled_p ())
3501 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3502 "not vectorized: unhandled data-ref\n");
3503 return false;
3506 stmt = DR_STMT (dr);
3507 stmt_info = vinfo_for_stmt (stmt);
3509 /* Discard clobbers from the dataref vector. We will remove
3510 clobber stmts during vectorization. */
3511 if (gimple_clobber_p (stmt))
3513 free_data_ref (dr);
3514 if (i == datarefs.length () - 1)
3516 datarefs.pop ();
3517 break;
3519 datarefs.ordered_remove (i);
3520 dr = datarefs[i];
3521 goto again;
3524 /* Check that analysis of the data-ref succeeded. */
3525 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
3526 || !DR_STEP (dr))
3528 bool maybe_gather
3529 = DR_IS_READ (dr)
3530 && !TREE_THIS_VOLATILE (DR_REF (dr))
3531 && targetm.vectorize.builtin_gather != NULL;
3532 bool maybe_scatter
3533 = DR_IS_WRITE (dr)
3534 && !TREE_THIS_VOLATILE (DR_REF (dr))
3535 && targetm.vectorize.builtin_scatter != NULL;
3536 bool maybe_simd_lane_access
3537 = is_a <loop_vec_info> (vinfo) && loop->simduid;
3539 /* If target supports vector gather loads or scatter stores, or if
3540 this might be a SIMD lane access, see if they can be used. */
3541 if (is_a <loop_vec_info> (vinfo)
3542 && (maybe_gather || maybe_scatter || maybe_simd_lane_access)
3543 && !nested_in_vect_loop_p (loop, stmt))
3545 struct data_reference *newdr
3546 = create_data_ref (NULL, loop_containing_stmt (stmt),
3547 DR_REF (dr), stmt, !maybe_scatter,
3548 DR_IS_CONDITIONAL_IN_STMT (dr));
3549 gcc_assert (newdr != NULL && DR_REF (newdr));
3550 if (DR_BASE_ADDRESS (newdr)
3551 && DR_OFFSET (newdr)
3552 && DR_INIT (newdr)
3553 && DR_STEP (newdr)
3554 && integer_zerop (DR_STEP (newdr)))
3556 if (maybe_simd_lane_access)
3558 tree off = DR_OFFSET (newdr);
3559 STRIP_NOPS (off);
3560 if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
3561 && TREE_CODE (off) == MULT_EXPR
3562 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
3564 tree step = TREE_OPERAND (off, 1);
3565 off = TREE_OPERAND (off, 0);
3566 STRIP_NOPS (off);
3567 if (CONVERT_EXPR_P (off)
3568 && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
3569 0)))
3570 < TYPE_PRECISION (TREE_TYPE (off)))
3571 off = TREE_OPERAND (off, 0);
3572 if (TREE_CODE (off) == SSA_NAME)
3574 gimple *def = SSA_NAME_DEF_STMT (off);
3575 tree reft = TREE_TYPE (DR_REF (newdr));
3576 if (is_gimple_call (def)
3577 && gimple_call_internal_p (def)
3578 && (gimple_call_internal_fn (def)
3579 == IFN_GOMP_SIMD_LANE))
3581 tree arg = gimple_call_arg (def, 0);
3582 gcc_assert (TREE_CODE (arg) == SSA_NAME);
3583 arg = SSA_NAME_VAR (arg);
3584 if (arg == loop->simduid
3585 /* For now. */
3586 && tree_int_cst_equal
3587 (TYPE_SIZE_UNIT (reft),
3588 step))
3590 DR_OFFSET (newdr) = ssize_int (0);
3591 DR_STEP (newdr) = step;
3592 DR_OFFSET_ALIGNMENT (newdr)
3593 = BIGGEST_ALIGNMENT;
3594 DR_STEP_ALIGNMENT (newdr)
3595 = highest_pow2_factor (step);
3596 dr = newdr;
3597 simd_lane_access = true;
3603 if (!simd_lane_access && (maybe_gather || maybe_scatter))
3605 dr = newdr;
3606 if (maybe_gather)
3607 gatherscatter = GATHER;
3608 else
3609 gatherscatter = SCATTER;
3612 if (gatherscatter == SG_NONE && !simd_lane_access)
3613 free_data_ref (newdr);
3616 if (gatherscatter == SG_NONE && !simd_lane_access)
3618 if (dump_enabled_p ())
3620 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3621 "not vectorized: data ref analysis "
3622 "failed ");
3623 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3626 if (is_a <bb_vec_info> (vinfo))
3627 break;
3629 return false;
3633 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
3635 if (dump_enabled_p ())
3636 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3637 "not vectorized: base addr of dr is a "
3638 "constant\n");
3640 if (is_a <bb_vec_info> (vinfo))
3641 break;
3643 if (gatherscatter != SG_NONE || simd_lane_access)
3644 free_data_ref (dr);
3645 return false;
3648 if (TREE_THIS_VOLATILE (DR_REF (dr)))
3650 if (dump_enabled_p ())
3652 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3653 "not vectorized: volatile type ");
3654 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3657 if (is_a <bb_vec_info> (vinfo))
3658 break;
3660 return false;
3663 if (stmt_can_throw_internal (stmt))
3665 if (dump_enabled_p ())
3667 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3668 "not vectorized: statement can throw an "
3669 "exception ");
3670 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3673 if (is_a <bb_vec_info> (vinfo))
3674 break;
3676 if (gatherscatter != SG_NONE || simd_lane_access)
3677 free_data_ref (dr);
3678 return false;
3681 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
3682 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
3684 if (dump_enabled_p ())
3686 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3687 "not vectorized: statement is bitfield "
3688 "access ");
3689 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3692 if (is_a <bb_vec_info> (vinfo))
3693 break;
3695 if (gatherscatter != SG_NONE || simd_lane_access)
3696 free_data_ref (dr);
3697 return false;
3700 base = unshare_expr (DR_BASE_ADDRESS (dr));
3701 offset = unshare_expr (DR_OFFSET (dr));
3702 init = unshare_expr (DR_INIT (dr));
3704 if (is_gimple_call (stmt)
3705 && (!gimple_call_internal_p (stmt)
3706 || (gimple_call_internal_fn (stmt) != IFN_MASK_LOAD
3707 && gimple_call_internal_fn (stmt) != IFN_MASK_STORE)))
3709 if (dump_enabled_p ())
3711 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3712 "not vectorized: dr in a call ");
3713 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3716 if (is_a <bb_vec_info> (vinfo))
3717 break;
3719 if (gatherscatter != SG_NONE || simd_lane_access)
3720 free_data_ref (dr);
3721 return false;
3724 /* Update DR field in stmt_vec_info struct. */
3726 /* If the dataref is in an inner-loop of the loop that is considered
3727 for vectorization, we also want to analyze the access relative to
3728 the outer-loop (DR contains information only relative to the
3729 inner-most enclosing loop). We do that by building a reference to the
3730 first location accessed by the inner-loop, and analyze it relative to
3731 the outer-loop. */
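/* An illustrative sketch (array and bounds are made up): for

     int a[N][M];
     for (i = 0; i < N; i++)      <-- LOOP being vectorized
       for (j = 0; j < M; j++)
	 ... = a[i][j];

   the innermost analysis of a[i][j] records a step of 4 bytes (the j
   stride), whereas the reference built below, *(&a[i][0]), is analyzed
   relative to the i-loop and so records a step of 4 * M bytes, assuming
   4-byte elements.  */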
3732 if (loop && nested_in_vect_loop_p (loop, stmt))
3734 /* Build a reference to the first location accessed by the
3735 inner loop: *(BASE + INIT + OFFSET). By construction,
3736 this address must be invariant in the inner loop, so we
3737 can consider it as being used in the outer loop. */
3738 tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
3739 init, offset);
3740 tree init_addr = fold_build_pointer_plus (base, init_offset);
3741 tree init_ref = build_fold_indirect_ref (init_addr);
3743 if (dump_enabled_p ())
3745 dump_printf_loc (MSG_NOTE, vect_location,
3746 "analyze in outer loop: ");
3747 dump_generic_expr (MSG_NOTE, TDF_SLIM, init_ref);
3748 dump_printf (MSG_NOTE, "\n");
3751 if (!dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
3752 init_ref, loop))
3753 /* dr_analyze_innermost already explained the failure. */
3754 return false;
3756 if (dump_enabled_p ())
3758 dump_printf_loc (MSG_NOTE, vect_location,
3759 "\touter base_address: ");
3760 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3761 STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3762 dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
3763 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3764 STMT_VINFO_DR_OFFSET (stmt_info));
3765 dump_printf (MSG_NOTE,
3766 "\n\touter constant offset from base address: ");
3767 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3768 STMT_VINFO_DR_INIT (stmt_info));
3769 dump_printf (MSG_NOTE, "\n\touter step: ");
3770 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3771 STMT_VINFO_DR_STEP (stmt_info));
3772 dump_printf (MSG_NOTE, "\n\touter base alignment: %d\n",
3773 STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info));
3774 dump_printf (MSG_NOTE, "\n\touter base misalignment: %d\n",
3775 STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info));
3776 dump_printf (MSG_NOTE, "\n\touter offset alignment: %d\n",
3777 STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info));
3778 dump_printf (MSG_NOTE, "\n\touter step alignment: %d\n",
3779 STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
3783 if (STMT_VINFO_DATA_REF (stmt_info))
3785 if (dump_enabled_p ())
3787 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3788 "not vectorized: more than one data ref "
3789 "in stmt: ");
3790 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3793 if (is_a <bb_vec_info> (vinfo))
3794 break;
3796 if (gatherscatter != SG_NONE || simd_lane_access)
3797 free_data_ref (dr);
3798 return false;
3801 STMT_VINFO_DATA_REF (stmt_info) = dr;
3802 if (simd_lane_access)
3804 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
3805 free_data_ref (datarefs[i]);
3806 datarefs[i] = dr;
3809 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == ADDR_EXPR
3810 && VAR_P (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0))
3811 && DECL_NONALIASED (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0)))
3813 if (dump_enabled_p ())
3815 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3816 "not vectorized: base object not addressable "
3817 "for stmt: ");
3818 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3820 if (is_a <bb_vec_info> (vinfo))
3822 /* In BB vectorization the ref can still participate
3823 in dependence analysis; we just can't vectorize it. */
3824 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
3825 continue;
3827 return false;
3830 /* Set vectype for STMT. */
3831 scalar_type = TREE_TYPE (DR_REF (dr));
3832 STMT_VINFO_VECTYPE (stmt_info)
3833 = get_vectype_for_scalar_type (scalar_type);
3834 if (!STMT_VINFO_VECTYPE (stmt_info))
3836 if (dump_enabled_p ())
3838 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3839 "not vectorized: no vectype for stmt: ");
3840 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3841 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
3842 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
3843 scalar_type);
3844 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3847 if (is_a <bb_vec_info> (vinfo))
3849 /* Having no vector type is fine; the ref can still participate
3850 in dependence analysis, we just can't vectorize it. */
3851 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
3852 continue;
3855 if (gatherscatter != SG_NONE || simd_lane_access)
3857 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3858 if (gatherscatter != SG_NONE)
3859 free_data_ref (dr);
3861 return false;
3863 else
3865 if (dump_enabled_p ())
3867 dump_printf_loc (MSG_NOTE, vect_location,
3868 "got vectype for stmt: ");
3869 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3870 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3871 STMT_VINFO_VECTYPE (stmt_info));
3872 dump_printf (MSG_NOTE, "\n");
3876 /* Adjust the minimal vectorization factor according to the
3877 vector type. */
3878 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
3879 *min_vf = upper_bound (*min_vf, vf);
3881 if (gatherscatter != SG_NONE)
3883 gather_scatter_info gs_info;
3884 if (!vect_check_gather_scatter (stmt, as_a <loop_vec_info> (vinfo),
3885 &gs_info)
3886 || !get_vectype_for_scalar_type (TREE_TYPE (gs_info.offset)))
3888 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3889 free_data_ref (dr);
3890 if (dump_enabled_p ())
3892 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3893 (gatherscatter == GATHER) ?
3894 "not vectorized: not suitable for gather "
3895 "load " :
3896 "not vectorized: not suitable for scatter "
3897 "store ");
3898 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3900 return false;
3903 free_data_ref (datarefs[i]);
3904 datarefs[i] = dr;
3905 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
3908 else if (is_a <loop_vec_info> (vinfo)
3909 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
3911 if (nested_in_vect_loop_p (loop, stmt))
3913 if (dump_enabled_p ())
3915 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3916 "not vectorized: not suitable for strided "
3917 "load ");
3918 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3920 return false;
3922 STMT_VINFO_STRIDED_P (stmt_info) = true;
3926 /* If, when trying to vectorize a basic block, we stopped analysis at
3927 the first dataref we could not analyze, mark the rest of the datarefs
3928 as not vectorizable and truncate the vector of datarefs. That avoids
3929 spending useless time analyzing their dependences. */
3930 if (i != datarefs.length ())
3932 gcc_assert (is_a <bb_vec_info> (vinfo));
3933 for (unsigned j = i; j < datarefs.length (); ++j)
3935 data_reference_p dr = datarefs[j];
3936 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
3937 free_data_ref (dr);
3939 datarefs.truncate (i);
3942 return true;
3946 /* Function vect_get_new_vect_var.
3948 Returns a new variable of type TYPE. The current naming scheme prepends
3949 a prefix ("vect", "stmp", "mask" or "vectp", depending on the value of
3950 VAR_KIND) and an underscore to NAME if NAME is provided, and uses the
3951 prefix alone otherwise. */
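/* For example (purely illustrative; ptr_type stands for some pointer
   type), a call such as

     vect_get_new_vect_var (ptr_type, vect_pointer_var, "a")

   creates a temporary named "vectp_a", while a NULL NAME would give a
   temporary named just "vectp".  */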
3953 tree
3954 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
3956 const char *prefix;
3957 tree new_vect_var;
3959 switch (var_kind)
3961 case vect_simple_var:
3962 prefix = "vect";
3963 break;
3964 case vect_scalar_var:
3965 prefix = "stmp";
3966 break;
3967 case vect_mask_var:
3968 prefix = "mask";
3969 break;
3970 case vect_pointer_var:
3971 prefix = "vectp";
3972 break;
3973 default:
3974 gcc_unreachable ();
3977 if (name)
3979 char* tmp = concat (prefix, "_", name, NULL);
3980 new_vect_var = create_tmp_reg (type, tmp);
3981 free (tmp);
3983 else
3984 new_vect_var = create_tmp_reg (type, prefix);
3986 return new_vect_var;
3989 /* Like vect_get_new_vect_var but return an SSA name. */
3991 tree
3992 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
3994 const char *prefix;
3995 tree new_vect_var;
3997 switch (var_kind)
3999 case vect_simple_var:
4000 prefix = "vect";
4001 break;
4002 case vect_scalar_var:
4003 prefix = "stmp";
4004 break;
4005 case vect_pointer_var:
4006 prefix = "vectp";
4007 break;
4008 default:
4009 gcc_unreachable ();
4012 if (name)
4014 char* tmp = concat (prefix, "_", name, NULL);
4015 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4016 free (tmp);
4018 else
4019 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4021 return new_vect_var;
4024 /* Duplicate ptr info and set alignment/misalignment on NAME from DR. */
4026 static void
4027 vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr)
4029 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr));
4030 int misalign = DR_MISALIGNMENT (dr);
4031 if (misalign == DR_MISALIGNMENT_UNKNOWN)
4032 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4033 else
4034 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name),
4035 DR_TARGET_ALIGNMENT (dr), misalign);
4038 /* Function vect_create_addr_base_for_vector_ref.
4040 Create an expression that computes the address of the first memory location
4041 that will be accessed for a data reference.
4043 Input:
4044 STMT: The statement containing the data reference.
4045 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4046 OFFSET: Optional. If supplied, it is added to the initial address.
4047 LOOP: Specify relative to which loop-nest should the address be computed.
4048 For example, when the dataref is in an inner-loop nested in an
4049 outer-loop that is now being vectorized, LOOP can be either the
4050 outer-loop, or the inner-loop. The first memory location accessed
4051 by the following dataref ('in' points to short):
4053 for (i=0; i<N; i++)
4054 for (j=0; j<M; j++)
4055 s += in[i+j]
4057 is as follows:
4058 if LOOP=i_loop: &in (relative to i_loop)
4059 if LOOP=j_loop: &in+i*2B (relative to j_loop)
4060 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
4061 initial address. Unlike OFFSET, which is a number of elements to
4062 be added, BYTE_OFFSET is measured in bytes.
4064 Output:
4065 1. Return an SSA_NAME whose value is the address of the memory location of
4066 the first vector of the data reference.
4067 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4068 these statement(s) which define the returned SSA_NAME.
4070 FORNOW: We are only handling array accesses with step 1. */
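/* A small worked example (a sketch only): for a data reference whose
   elements are "short" (TYPE_SIZE_UNIT == 2), passing OFFSET == 3 adds
   3 * 2 == 6 bytes to the returned address, whereas passing
   BYTE_OFFSET == 3 adds exactly 3 bytes; both end up folded into the
   single gimplified offset that is added to the base address.  */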
4072 tree
4073 vect_create_addr_base_for_vector_ref (gimple *stmt,
4074 gimple_seq *new_stmt_list,
4075 tree offset,
4076 tree byte_offset)
4078 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4079 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4080 const char *base_name;
4081 tree addr_base;
4082 tree dest;
4083 gimple_seq seq = NULL;
4084 tree vect_ptr_type;
4085 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
4086 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4087 innermost_loop_behavior *drb = vect_dr_behavior (dr);
4089 tree data_ref_base = unshare_expr (drb->base_address);
4090 tree base_offset = unshare_expr (drb->offset);
4091 tree init = unshare_expr (drb->init);
4093 if (loop_vinfo)
4094 base_name = get_name (data_ref_base);
4095 else
4097 base_offset = ssize_int (0);
4098 init = ssize_int (0);
4099 base_name = get_name (DR_REF (dr));
4102 /* Create base_offset */
4103 base_offset = size_binop (PLUS_EXPR,
4104 fold_convert (sizetype, base_offset),
4105 fold_convert (sizetype, init));
4107 if (offset)
4109 offset = fold_build2 (MULT_EXPR, sizetype,
4110 fold_convert (sizetype, offset), step);
4111 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4112 base_offset, offset);
4114 if (byte_offset)
4116 byte_offset = fold_convert (sizetype, byte_offset);
4117 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4118 base_offset, byte_offset);
4121 /* base + base_offset */
4122 if (loop_vinfo)
4123 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4124 else
4126 addr_base = build1 (ADDR_EXPR,
4127 build_pointer_type (TREE_TYPE (DR_REF (dr))),
4128 unshare_expr (DR_REF (dr)));
4131 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4132 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4133 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4134 gimple_seq_add_seq (new_stmt_list, seq);
4136 if (DR_PTR_INFO (dr)
4137 && TREE_CODE (addr_base) == SSA_NAME
4138 && !SSA_NAME_PTR_INFO (addr_base))
4140 vect_duplicate_ssa_name_ptr_info (addr_base, dr);
4141 if (offset || byte_offset)
4142 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4145 if (dump_enabled_p ())
4147 dump_printf_loc (MSG_NOTE, vect_location, "created ");
4148 dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
4149 dump_printf (MSG_NOTE, "\n");
4152 return addr_base;
4156 /* Function vect_create_data_ref_ptr.
4158 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4159 location accessed in the loop by STMT, along with the def-use update
4160 chain to appropriately advance the pointer through the loop iterations.
4161 Also set aliasing information for the pointer. This pointer is used by
4162 the callers to this function to create a memory reference expression for
4163 vector load/store access.
4165 Input:
4166 1. STMT: a stmt that references memory. Expected to be of the form
4167 GIMPLE_ASSIGN <name, data-ref> or
4168 GIMPLE_ASSIGN <data-ref, name>.
4169 2. AGGR_TYPE: the type of the reference, which should be either a vector
4170 or an array.
4171 3. AT_LOOP: the loop where the vector memref is to be created.
4172 4. OFFSET (optional): an offset to be added to the initial address accessed
4173 by the data-ref in STMT.
4174 5. BSI: location where the new stmts are to be placed if there is no loop
4175 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4176 pointing to the initial address.
4177 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4178 to the initial address accessed by the data-ref in STMT. This is
4179 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4180 in bytes.
4182 Output:
4183 1. Declare a new ptr to vector_type, and have it point to the base of the
4184 data reference (initial address accessed by the data reference).
4185 For example, for vector of type V8HI, the following code is generated:
4187 v8hi *ap;
4188 ap = (v8hi *)initial_address;
4190 if OFFSET is not supplied:
4191 initial_address = &a[init];
4192 if OFFSET is supplied:
4193 initial_address = &a[init + OFFSET];
4194 if BYTE_OFFSET is supplied:
4195 initial_address = &a[init] + BYTE_OFFSET;
4197 Return the initial_address in INITIAL_ADDRESS.
4199 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4200 update the pointer in each iteration of the loop.
4202 Return the increment stmt that updates the pointer in PTR_INCR.
4204 3. Set INV_P to true if the access pattern of the data reference in the
4205 vectorized loop is invariant. Set it to false otherwise.
4207 4. Return the pointer. */
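/* A minimal sketch (names are illustrative; assumes AGGR_TYPE is a
   16-byte vector, a positive step, ONLY_INIT false and no nested inner
   loop) of the IL that the steps above produce:

     vectp_a = &base[init];		     <-- preheader, step (2)
   LOOP:
     vectp_a_1 = PHI <vectp_a, vectp_a_2>    <-- created by create_iv
     ... callers dereference vectp_a_1 ...
     vectp_a_2 = vectp_a_1 + 16;
     goto LOOP;  */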
4209 tree
4210 vect_create_data_ref_ptr (gimple *stmt, tree aggr_type, struct loop *at_loop,
4211 tree offset, tree *initial_address,
4212 gimple_stmt_iterator *gsi, gimple **ptr_incr,
4213 bool only_init, bool *inv_p, tree byte_offset)
4215 const char *base_name;
4216 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4217 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4218 struct loop *loop = NULL;
4219 bool nested_in_vect_loop = false;
4220 struct loop *containing_loop = NULL;
4221 tree aggr_ptr_type;
4222 tree aggr_ptr;
4223 tree new_temp;
4224 gimple_seq new_stmt_list = NULL;
4225 edge pe = NULL;
4226 basic_block new_bb;
4227 tree aggr_ptr_init;
4228 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4229 tree aptr;
4230 gimple_stmt_iterator incr_gsi;
4231 bool insert_after;
4232 tree indx_before_incr, indx_after_incr;
4233 gimple *incr;
4234 tree step;
4235 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4237 gcc_assert (TREE_CODE (aggr_type) == ARRAY_TYPE
4238 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4240 if (loop_vinfo)
4242 loop = LOOP_VINFO_LOOP (loop_vinfo);
4243 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4244 containing_loop = (gimple_bb (stmt))->loop_father;
4245 pe = loop_preheader_edge (loop);
4247 else
4249 gcc_assert (bb_vinfo);
4250 only_init = true;
4251 *ptr_incr = NULL;
4254 /* Check the step (evolution) of the load in LOOP, and record
4255 whether it's invariant. */
4256 step = vect_dr_behavior (dr)->step;
4257 if (integer_zerop (step))
4258 *inv_p = true;
4259 else
4260 *inv_p = false;
4262 /* Create an expression for the first address accessed by this load
4263 in LOOP. */
4264 base_name = get_name (DR_BASE_ADDRESS (dr));
4266 if (dump_enabled_p ())
4268 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4269 dump_printf_loc (MSG_NOTE, vect_location,
4270 "create %s-pointer variable to type: ",
4271 get_tree_code_name (TREE_CODE (aggr_type)));
4272 dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4273 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4274 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4275 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4276 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4277 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4278 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4279 else
4280 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4281 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4282 dump_printf (MSG_NOTE, "\n");
4285 /* (1) Create the new aggregate-pointer variable.
4286 Vector and array types inherit the alias set of their component
4287 type by default so we need to use a ref-all pointer if the data
4288 reference does not conflict with the created aggregated data
4289 reference because it is not addressable. */
4290 bool need_ref_all = false;
4291 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4292 get_alias_set (DR_REF (dr))))
4293 need_ref_all = true;
4294 /* Likewise for any of the data references in the stmt group. */
4295 else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
4297 gimple *orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
4300 stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
4301 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4302 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4303 get_alias_set (DR_REF (sdr))))
4305 need_ref_all = true;
4306 break;
4308 orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);
4310 while (orig_stmt);
4312 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4313 need_ref_all);
4314 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4317 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4318 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4319 def-use update cycles for the pointer: one relative to the outer-loop
4320 (LOOP), which is what steps (3) and (4) below do. The other is relative
4321 to the inner-loop (which is the inner-most loop containing the dataref),
4322 and this is done by step (5) below.
4324 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4325 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4326 redundant. Steps (3),(4) create the following:
4328 vp0 = &base_addr;
4329 LOOP: vp1 = phi(vp0,vp2)
4332 vp2 = vp1 + step
4333 goto LOOP
4335 If there is an inner-loop nested in loop, then step (5) will also be
4336 applied, and an additional update in the inner-loop will be created:
4338 vp0 = &base_addr;
4339 LOOP: vp1 = phi(vp0,vp2)
4341 inner: vp3 = phi(vp1,vp4)
4342 vp4 = vp3 + inner_step
4343 if () goto inner
4345 vp2 = vp1 + step
4346 if () goto LOOP */
4348 /* (2) Calculate the initial address of the aggregate-pointer, and set
4349 the aggregate-pointer to point to it before the loop. */
4351 /* Create: &(base[init_val+offset]+byte_offset) in the loop preheader. */
4353 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
4354 offset, byte_offset);
4355 if (new_stmt_list)
4357 if (pe)
4359 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4360 gcc_assert (!new_bb);
4362 else
4363 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4366 *initial_address = new_temp;
4367 aggr_ptr_init = new_temp;
4369 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4370 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4371 inner-loop nested in LOOP (during outer-loop vectorization). */
4373 /* No update in loop is required. */
4374 if (only_init && (!loop_vinfo || at_loop == loop))
4375 aptr = aggr_ptr_init;
4376 else
4378 /* The step of the aggregate pointer is the type size. */
4379 tree iv_step = TYPE_SIZE_UNIT (aggr_type);
4380 /* One exception to the above is when the scalar step of the load in
4381 LOOP is zero. In this case the step here is also zero. */
4382 if (*inv_p)
4383 iv_step = size_zero_node;
4384 else if (tree_int_cst_sgn (step) == -1)
4385 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4387 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4389 create_iv (aggr_ptr_init,
4390 fold_convert (aggr_ptr_type, iv_step),
4391 aggr_ptr, loop, &incr_gsi, insert_after,
4392 &indx_before_incr, &indx_after_incr);
4393 incr = gsi_stmt (incr_gsi);
4394 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4396 /* Copy the points-to information if it exists. */
4397 if (DR_PTR_INFO (dr))
4399 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr);
4400 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr);
4402 if (ptr_incr)
4403 *ptr_incr = incr;
4405 aptr = indx_before_incr;
4408 if (!nested_in_vect_loop || only_init)
4409 return aptr;
4412 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4413 nested in LOOP, if it exists. */
4415 gcc_assert (nested_in_vect_loop);
4416 if (!only_init)
4418 standard_iv_increment_position (containing_loop, &incr_gsi,
4419 &insert_after);
4420 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4421 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4422 &indx_after_incr);
4423 incr = gsi_stmt (incr_gsi);
4424 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4426 /* Copy the points-to information if it exists. */
4427 if (DR_PTR_INFO (dr))
4429 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr);
4430 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr);
4432 if (ptr_incr)
4433 *ptr_incr = incr;
4435 return indx_before_incr;
4437 else
4438 gcc_unreachable ();
4442 /* Function bump_vector_ptr
4444 Increment a pointer (to a vector type) by vector-size. If requested,
4445 i.e. if PTR-INCR is given, then also connect the new increment stmt
4446 to the existing def-use update-chain of the pointer, by modifying
4447 the PTR_INCR as illustrated below:
4449 The pointer def-use update-chain before this function:
4450 DATAREF_PTR = phi (p_0, p_2)
4451 ....
4452 PTR_INCR: p_2 = DATAREF_PTR + step
4454 The pointer def-use update-chain after this function:
4455 DATAREF_PTR = phi (p_0, p_2)
4456 ....
4457 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4458 ....
4459 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4461 Input:
4462 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4463 in the loop.
4464 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4465 the loop. The increment amount across iterations is expected
4466 to be vector_size.
4467 BSI - location where the new update stmt is to be placed.
4468 STMT - the original scalar memory-access stmt that is being vectorized.
4469 BUMP - optional. The offset by which to bump the pointer. If not given,
4470 the offset is assumed to be vector_size.
4472 Output: Return NEW_DATAREF_PTR as illustrated above.
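/* A hedged usage sketch (illustrative only): a caller that has consumed
   one vector's worth of data through DATAREF_PTR can do

     new_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, gsi, stmt, NULL_TREE);

   to obtain NEW_DATAREF_PTR = DATAREF_PTR + vector_size, with PTR_INCR
   rewritten so that it increments from NEW_DATAREF_PTR instead.  */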
4476 tree
4477 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4478 gimple *stmt, tree bump)
4480 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4481 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4482 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4483 tree update = TYPE_SIZE_UNIT (vectype);
4484 gassign *incr_stmt;
4485 ssa_op_iter iter;
4486 use_operand_p use_p;
4487 tree new_dataref_ptr;
4489 if (bump)
4490 update = bump;
4492 if (TREE_CODE (dataref_ptr) == SSA_NAME)
4493 new_dataref_ptr = copy_ssa_name (dataref_ptr);
4494 else
4495 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4496 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4497 dataref_ptr, update);
4498 vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4500 /* Copy the points-to information if it exists. */
4501 if (DR_PTR_INFO (dr))
4503 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4504 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4507 if (!ptr_incr)
4508 return new_dataref_ptr;
4510 /* Update the vector-pointer's cross-iteration increment. */
4511 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4513 tree use = USE_FROM_PTR (use_p);
4515 if (use == dataref_ptr)
4516 SET_USE (use_p, new_dataref_ptr);
4517 else
4518 gcc_assert (tree_int_cst_compare (use, update) == 0);
4521 return new_dataref_ptr;
4525 /* Function vect_create_destination_var.
4527 Create a new temporary of type VECTYPE. */
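/* For instance (a sketch; actual names depend on the input IL), if the
   scalar destination is the SSA name x_7 and VECTYPE is a non-boolean
   vector type, the new temporary is named "vect_x_7"; with a NULL
   VECTYPE the result is a scalar temporary named "stmp_x_7".  */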
4529 tree
4530 vect_create_destination_var (tree scalar_dest, tree vectype)
4532 tree vec_dest;
4533 const char *name;
4534 char *new_name;
4535 tree type;
4536 enum vect_var_kind kind;
4538 kind = vectype
4539 ? VECTOR_BOOLEAN_TYPE_P (vectype)
4540 ? vect_mask_var
4541 : vect_simple_var
4542 : vect_scalar_var;
4543 type = vectype ? vectype : TREE_TYPE (scalar_dest);
4545 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
4547 name = get_name (scalar_dest);
4548 if (name)
4549 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
4550 else
4551 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
4552 vec_dest = vect_get_new_vect_var (type, kind, new_name);
4553 free (new_name);
4555 return vec_dest;
4558 /* Function vect_grouped_store_supported.
4560 Returns TRUE if interleave high and interleave low permutations
4561 are supported, and FALSE otherwise. */
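/* As a concrete illustration (assuming an 8-element vector mode such as
   V8HI), the two permutations checked for a power-of-two group are
   { 0, 8, 1, 9, 2, 10, 3, 11 } (interleave high) and
   { 4, 12, 5, 13, 6, 14, 7, 15 } (interleave low).  */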
4563 bool
4564 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
4566 machine_mode mode = TYPE_MODE (vectype);
4568 /* vect_permute_store_chain requires the group size to be equal to 3 or
4569 be a power of two. */
4570 if (count != 3 && exact_log2 (count) == -1)
4572 if (dump_enabled_p ())
4573 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4574 "the size of the group of accesses"
4575 " is not a power of 2 or not eqaul to 3\n");
4576 return false;
4579 /* Check that the permutation is supported. */
4580 if (VECTOR_MODE_P (mode))
4582 unsigned int i, nelt = GET_MODE_NUNITS (mode);
4583 if (count == 3)
4585 unsigned int j0 = 0, j1 = 0, j2 = 0;
4586 unsigned int i, j;
4588 vec_perm_builder sel (nelt, nelt, 1);
4589 sel.quick_grow (nelt);
4590 vec_perm_indices indices;
4591 for (j = 0; j < 3; j++)
4593 int nelt0 = ((3 - j) * nelt) % 3;
4594 int nelt1 = ((3 - j) * nelt + 1) % 3;
4595 int nelt2 = ((3 - j) * nelt + 2) % 3;
4596 for (i = 0; i < nelt; i++)
4598 if (3 * i + nelt0 < nelt)
4599 sel[3 * i + nelt0] = j0++;
4600 if (3 * i + nelt1 < nelt)
4601 sel[3 * i + nelt1] = nelt + j1++;
4602 if (3 * i + nelt2 < nelt)
4603 sel[3 * i + nelt2] = 0;
4605 indices.new_vector (sel, 2, nelt);
4606 if (!can_vec_perm_const_p (mode, indices))
4608 if (dump_enabled_p ())
4609 dump_printf (MSG_MISSED_OPTIMIZATION,
4610 "permutation op not supported by target.\n");
4611 return false;
4614 for (i = 0; i < nelt; i++)
4616 if (3 * i + nelt0 < nelt)
4617 sel[3 * i + nelt0] = 3 * i + nelt0;
4618 if (3 * i + nelt1 < nelt)
4619 sel[3 * i + nelt1] = 3 * i + nelt1;
4620 if (3 * i + nelt2 < nelt)
4621 sel[3 * i + nelt2] = nelt + j2++;
4623 indices.new_vector (sel, 2, nelt);
4624 if (!can_vec_perm_const_p (mode, indices))
4626 if (dump_enabled_p ())
4627 dump_printf (MSG_MISSED_OPTIMIZATION,
4628 "permutation op not supported by target.\n");
4629 return false;
4632 return true;
4634 else
4636 /* If length is not equal to 3 then only power of 2 is supported. */
4637 gcc_assert (pow2p_hwi (count));
4639 /* The encoding has 2 interleaved stepped patterns. */
4640 vec_perm_builder sel (nelt, 2, 3);
4641 sel.quick_grow (6);
4642 for (i = 0; i < 3; i++)
4644 sel[i * 2] = i;
4645 sel[i * 2 + 1] = i + nelt;
4647 vec_perm_indices indices (sel, 2, nelt);
4648 if (can_vec_perm_const_p (mode, indices))
4650 for (i = 0; i < 6; i++)
4651 sel[i] += nelt / 2;
4652 indices.new_vector (sel, 2, nelt);
4653 if (can_vec_perm_const_p (mode, indices))
4654 return true;
4659 if (dump_enabled_p ())
4660 dump_printf (MSG_MISSED_OPTIMIZATION,
4661 "permutaion op not supported by target.\n");
4662 return false;
4666 /* Return TRUE if vec_store_lanes is available for COUNT vectors of
4667 type VECTYPE. */
4669 bool
4670 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
4672 return vect_lanes_optab_supported_p ("vec_store_lanes",
4673 vec_store_lanes_optab,
4674 vectype, count);
4678 /* Function vect_permute_store_chain.
4680 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4681 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
4682 the data correctly for the stores. Return the final references for stores
4683 in RESULT_CHAIN.
4685 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4686 The input is 4 vectors each containing 8 elements. We assign a number to
4687 each element, the input sequence is:
4689 1st vec: 0 1 2 3 4 5 6 7
4690 2nd vec: 8 9 10 11 12 13 14 15
4691 3rd vec: 16 17 18 19 20 21 22 23
4692 4th vec: 24 25 26 27 28 29 30 31
4694 The output sequence should be:
4696 1st vec: 0 8 16 24 1 9 17 25
4697 2nd vec: 2 10 18 26 3 11 19 27
4698 3rd vec: 4 12 20 28 5 13 21 29
4699 4th vec: 6 14 22 30 7 15 23 31
4701 i.e., we interleave the contents of the four vectors in their order.
4703 We use interleave_high/low instructions to create such output. The input of
4704 each interleave_high/low operation is two vectors:
4705 1st vec 2nd vec
4706 0 1 2 3 4 5 6 7
4707 the even elements of the result vector are obtained left-to-right from the
4708 high/low elements of the first vector. The odd elements of the result are
4709 obtained left-to-right from the high/low elements of the second vector.
4710 The output of interleave_high will be: 0 4 1 5
4711 and of interleave_low: 2 6 3 7
4714 The permutation is done in log LENGTH stages. In each stage interleave_high
4715 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4716 where the first argument is taken from the first half of DR_CHAIN and the
4717 second argument from its second half.
4718 In our example,
4720 I1: interleave_high (1st vec, 3rd vec)
4721 I2: interleave_low (1st vec, 3rd vec)
4722 I3: interleave_high (2nd vec, 4th vec)
4723 I4: interleave_low (2nd vec, 4th vec)
4725 The output for the first stage is:
4727 I1: 0 16 1 17 2 18 3 19
4728 I2: 4 20 5 21 6 22 7 23
4729 I3: 8 24 9 25 10 26 11 27
4730 I4: 12 28 13 29 14 30 15 31
4732 The output of the second stage, i.e. the final result is:
4734 I1: 0 8 16 24 1 9 17 25
4735 I2: 2 10 18 26 3 11 19 27
4736 I3: 4 12 20 28 5 13 21 29
4737 I4: 6 14 22 30 7 15 23 31. */
4739 void
4740 vect_permute_store_chain (vec<tree> dr_chain,
4741 unsigned int length,
4742 gimple *stmt,
4743 gimple_stmt_iterator *gsi,
4744 vec<tree> *result_chain)
4746 tree vect1, vect2, high, low;
4747 gimple *perm_stmt;
4748 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4749 tree perm_mask_low, perm_mask_high;
4750 tree data_ref;
4751 tree perm3_mask_low, perm3_mask_high;
4752 unsigned int i, n, log_length = exact_log2 (length);
4753 unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype);
4755 result_chain->quick_grow (length);
4756 memcpy (result_chain->address (), dr_chain.address (),
4757 length * sizeof (tree));
4759 if (length == 3)
4761 unsigned int j0 = 0, j1 = 0, j2 = 0;
4763 vec_perm_builder sel (nelt, nelt, 1);
4764 sel.quick_grow (nelt);
4765 vec_perm_indices indices;
4766 for (j = 0; j < 3; j++)
4768 int nelt0 = ((3 - j) * nelt) % 3;
4769 int nelt1 = ((3 - j) * nelt + 1) % 3;
4770 int nelt2 = ((3 - j) * nelt + 2) % 3;
4772 for (i = 0; i < nelt; i++)
4774 if (3 * i + nelt0 < nelt)
4775 sel[3 * i + nelt0] = j0++;
4776 if (3 * i + nelt1 < nelt)
4777 sel[3 * i + nelt1] = nelt + j1++;
4778 if (3 * i + nelt2 < nelt)
4779 sel[3 * i + nelt2] = 0;
4781 indices.new_vector (sel, 2, nelt);
4782 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
4784 for (i = 0; i < nelt; i++)
4786 if (3 * i + nelt0 < nelt)
4787 sel[3 * i + nelt0] = 3 * i + nelt0;
4788 if (3 * i + nelt1 < nelt)
4789 sel[3 * i + nelt1] = 3 * i + nelt1;
4790 if (3 * i + nelt2 < nelt)
4791 sel[3 * i + nelt2] = nelt + j2++;
4793 indices.new_vector (sel, 2, nelt);
4794 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
4796 vect1 = dr_chain[0];
4797 vect2 = dr_chain[1];
4799 /* Create interleaving stmt:
4800 low = VEC_PERM_EXPR <vect1, vect2,
4801 {j, nelt, *, j + 1, nelt + j + 1, *,
4802 j + 2, nelt + j + 2, *, ...}> */
4803 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
4804 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4805 vect2, perm3_mask_low);
4806 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4808 vect1 = data_ref;
4809 vect2 = dr_chain[2];
4810 /* Create interleaving stmt:
4811 low = VEC_PERM_EXPR <vect1, vect2,
4812 {0, 1, nelt + j, 3, 4, nelt + j + 1,
4813 6, 7, nelt + j + 2, ...}> */
4814 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
4815 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4816 vect2, perm3_mask_high);
4817 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4818 (*result_chain)[j] = data_ref;
4821 else
4823 /* If length is not equal to 3 then only power of 2 is supported. */
4824 gcc_assert (pow2p_hwi (length));
4826 /* The encoding has 2 interleaved stepped patterns. */
4827 vec_perm_builder sel (nelt, 2, 3);
4828 sel.quick_grow (6);
4829 for (i = 0; i < 3; i++)
4831 sel[i * 2] = i;
4832 sel[i * 2 + 1] = i + nelt;
4834 vec_perm_indices indices (sel, 2, nelt);
4835 perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
4837 for (i = 0; i < 6; i++)
4838 sel[i] += nelt / 2;
4839 indices.new_vector (sel, 2, nelt);
4840 perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
4842 for (i = 0, n = log_length; i < n; i++)
4844 for (j = 0; j < length/2; j++)
4846 vect1 = dr_chain[j];
4847 vect2 = dr_chain[j+length/2];
4849 /* Create interleaving stmt:
4850 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
4851 ...}> */
4852 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
4853 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
4854 vect2, perm_mask_high);
4855 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4856 (*result_chain)[2*j] = high;
4858 /* Create interleaving stmt:
4859 low = VEC_PERM_EXPR <vect1, vect2,
4860 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
4861 ...}> */
4862 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
4863 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
4864 vect2, perm_mask_low);
4865 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4866 (*result_chain)[2*j+1] = low;
4868 memcpy (dr_chain.address (), result_chain->address (),
4869 length * sizeof (tree));
4874 /* Function vect_setup_realignment
4876 This function is called when vectorizing an unaligned load using
4877 the dr_explicit_realign[_optimized] scheme.
4878 This function generates the following code at the loop prolog:
4880 p = initial_addr;
4881 x msq_init = *(floor(p)); # prolog load
4882 realignment_token = call target_builtin;
4883 loop:
4884 x msq = phi (msq_init, ---)
4886 The stmts marked with x are generated only for the case of
4887 dr_explicit_realign_optimized.
4889 The code above sets up a new (vector) pointer, pointing to the first
4890 location accessed by STMT, and a "floor-aligned" load using that pointer.
4891 It also generates code to compute the "realignment-token" (if the relevant
4892 target hook was defined), and creates a phi-node at the loop-header bb
4893 whose arguments are the result of the prolog-load (created by this
4894 function) and the result of a load that takes place in the loop (to be
4895 created by the caller to this function).
4897 For the case of dr_explicit_realign_optimized:
4898 The caller to this function uses the phi-result (msq) to create the
4899 realignment code inside the loop, and sets up the missing phi argument,
4900 as follows:
4901 loop:
4902 msq = phi (msq_init, lsq)
4903 lsq = *(floor(p')); # load in loop
4904 result = realign_load (msq, lsq, realignment_token);
4906 For the case of dr_explicit_realign:
4907 loop:
4908 msq = *(floor(p)); # load in loop
4909 p' = p + (VS-1);
4910 lsq = *(floor(p')); # load in loop
4911 result = realign_load (msq, lsq, realignment_token);
4913 Input:
4914 STMT - (scalar) load stmt to be vectorized. This load accesses
4915 a memory location that may be unaligned.
4916 BSI - place where new code is to be inserted.
4917 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
4918 is used.
4920 Output:
4921 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
4922 target hook, if defined.
4923 Return value - the result of the loop-header phi node. */
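/* A hedged illustration (16-byte vectors, an access starting 4 bytes past
   an aligned boundary): floor(p) loads the aligned vector holding the
   first 12 wanted bytes, floor(p + VS - 1) loads the aligned vector
   holding the remaining 4, and REALIGN_LOAD, guided by the realignment
   token, merges the two as if an unaligned load from p had been done.  */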
4925 tree
4926 vect_setup_realignment (gimple *stmt, gimple_stmt_iterator *gsi,
4927 tree *realignment_token,
4928 enum dr_alignment_support alignment_support_scheme,
4929 tree init_addr,
4930 struct loop **at_loop)
4932 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4933 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4934 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4935 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4936 struct loop *loop = NULL;
4937 edge pe = NULL;
4938 tree scalar_dest = gimple_assign_lhs (stmt);
4939 tree vec_dest;
4940 gimple *inc;
4941 tree ptr;
4942 tree data_ref;
4943 basic_block new_bb;
4944 tree msq_init = NULL_TREE;
4945 tree new_temp;
4946 gphi *phi_stmt;
4947 tree msq = NULL_TREE;
4948 gimple_seq stmts = NULL;
4949 bool inv_p;
4950 bool compute_in_loop = false;
4951 bool nested_in_vect_loop = false;
4952 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
4953 struct loop *loop_for_initial_load = NULL;
4955 if (loop_vinfo)
4957 loop = LOOP_VINFO_LOOP (loop_vinfo);
4958 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4961 gcc_assert (alignment_support_scheme == dr_explicit_realign
4962 || alignment_support_scheme == dr_explicit_realign_optimized);
4964 /* We need to generate three things:
4965 1. the misalignment computation
4966 2. the extra vector load (for the optimized realignment scheme).
4967 3. the phi node for the two vectors from which the realignment is
4968 done (for the optimized realignment scheme). */
4970 /* 1. Determine where to generate the misalignment computation.
4972 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
4973 calculation will be generated by this function, outside the loop (in the
4974 preheader). Otherwise, INIT_ADDR had already been computed for us by the
4975 caller, inside the loop.
4977 Background: If the misalignment remains fixed throughout the iterations of
4978 the loop, then both realignment schemes are applicable, and also the
4979 misalignment computation can be done outside LOOP. This is because we are
4980 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
4981 are a multiple of VS (the Vector Size), and therefore the misalignment in
4982 different vectorized LOOP iterations is always the same.
4983 The problem arises only if the memory access is in an inner-loop nested
4984 inside LOOP, which is now being vectorized using outer-loop vectorization.
4985 This is the only case when the misalignment of the memory access may not
4986 remain fixed throughout the iterations of the inner-loop (as explained in
4987 detail in vect_supportable_dr_alignment). In this case, not only is the
4988 optimized realignment scheme not applicable, but also the misalignment
4989 computation (and generation of the realignment token that is passed to
4990 REALIGN_LOAD) have to be done inside the loop.
4992 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
4993 or not, which in turn determines if the misalignment is computed inside
4994 the inner-loop, or outside LOOP. */
4996 if (init_addr != NULL_TREE || !loop_vinfo)
4998 compute_in_loop = true;
4999 gcc_assert (alignment_support_scheme == dr_explicit_realign);
5003 /* 2. Determine where to generate the extra vector load.
5005 For the optimized realignment scheme, instead of generating two vector
5006 loads in each iteration, we generate a single extra vector load in the
5007 preheader of the loop, and in each iteration reuse the result of the
5008 vector load from the previous iteration. In case the memory access is in
5009 an inner-loop nested inside LOOP, which is now being vectorized using
5010 outer-loop vectorization, we need to determine whether this initial vector
5011 load should be generated at the preheader of the inner-loop, or can be
5012 generated at the preheader of LOOP. If the memory access has no evolution
5013 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5014 to be generated inside LOOP (in the preheader of the inner-loop). */
5016 if (nested_in_vect_loop)
5018 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5019 bool invariant_in_outerloop =
5020 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5021 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5023 else
5024 loop_for_initial_load = loop;
5025 if (at_loop)
5026 *at_loop = loop_for_initial_load;
5028 if (loop_for_initial_load)
5029 pe = loop_preheader_edge (loop_for_initial_load);
5031 /* 3. For the case of the optimized realignment, create the first vector
5032 load at the loop preheader. */
5034 if (alignment_support_scheme == dr_explicit_realign_optimized)
5036 /* Create msq_init = *(floor(p1)) in the loop preheader */
5037 gassign *new_stmt;
5039 gcc_assert (!compute_in_loop);
5040 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5041 ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
5042 NULL_TREE, &init_addr, NULL, &inc,
5043 true, &inv_p);
5044 if (TREE_CODE (ptr) == SSA_NAME)
5045 new_temp = copy_ssa_name (ptr);
5046 else
5047 new_temp = make_ssa_name (TREE_TYPE (ptr));
5048 unsigned int align = DR_TARGET_ALIGNMENT (dr);
5049 new_stmt = gimple_build_assign
5050 (new_temp, BIT_AND_EXPR, ptr,
5051 build_int_cst (TREE_TYPE (ptr), -(HOST_WIDE_INT) align));
5052 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5053 gcc_assert (!new_bb);
5054 data_ref
5055 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5056 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5057 new_stmt = gimple_build_assign (vec_dest, data_ref);
5058 new_temp = make_ssa_name (vec_dest, new_stmt);
5059 gimple_assign_set_lhs (new_stmt, new_temp);
5060 if (pe)
5062 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5063 gcc_assert (!new_bb);
5065 else
5066 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5068 msq_init = gimple_assign_lhs (new_stmt);
5071 /* 4. Create realignment token using a target builtin, if available.
5072 It is done either inside the containing loop, or before LOOP (as
5073 determined above). */
5075 if (targetm.vectorize.builtin_mask_for_load)
5077 gcall *new_stmt;
5078 tree builtin_decl;
5080 /* Compute INIT_ADDR - the initial address accessed by this memref. */
5081 if (!init_addr)
5083 /* Generate the INIT_ADDR computation outside LOOP. */
5084 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
5085 NULL_TREE);
5086 if (loop)
5088 pe = loop_preheader_edge (loop);
5089 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5090 gcc_assert (!new_bb);
5092 else
5093 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5096 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5097 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5098 vec_dest =
5099 vect_create_destination_var (scalar_dest,
5100 gimple_call_return_type (new_stmt));
5101 new_temp = make_ssa_name (vec_dest, new_stmt);
5102 gimple_call_set_lhs (new_stmt, new_temp);
5104 if (compute_in_loop)
5105 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5106 else
5108 /* Generate the misalignment computation outside LOOP. */
5109 pe = loop_preheader_edge (loop);
5110 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5111 gcc_assert (!new_bb);
5114 *realignment_token = gimple_call_lhs (new_stmt);
5116 /* The result of the CALL_EXPR to this builtin is determined from
5117 the value of the parameter and no global variables are touched
5118 which makes the builtin a "const" function. Requiring the
5119 builtin to have the "const" attribute makes it unnecessary
5120 to call mark_call_clobbered. */
5121 gcc_assert (TREE_READONLY (builtin_decl));
5124 if (alignment_support_scheme == dr_explicit_realign)
5125 return msq;
5127 gcc_assert (!compute_in_loop);
5128 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5131 /* 5. Create msq = phi <msq_init, lsq> in loop */
5133 pe = loop_preheader_edge (containing_loop);
5134 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5135 msq = make_ssa_name (vec_dest);
5136 phi_stmt = create_phi_node (msq, containing_loop->header);
5137 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5139 return msq;
5143 /* Function vect_grouped_load_supported.
5145 COUNT is the size of the load group (the number of statements plus the
5146 number of gaps). SINGLE_ELEMENT_P is true if there is actually
5147 only one statement, with a gap of COUNT - 1.
5149 Returns true if a suitable permute exists. */
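/* As a concrete illustration (assuming an 8-element vector mode), the
   power-of-two case checks the extract-even mask
   { 0, 2, 4, 6, 8, 10, 12, 14 } and the extract-odd mask
   { 1, 3, 5, 7, 9, 11, 13, 15 }.  */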
5151 bool
5152 vect_grouped_load_supported (tree vectype, bool single_element_p,
5153 unsigned HOST_WIDE_INT count)
5155 machine_mode mode = TYPE_MODE (vectype);
5157 /* If this is single-element interleaving with an element distance
5158 that leaves unused vector loads around, punt - we at least create
5159 very sub-optimal code in that case (and blow up memory,
5160 see PR65518). */
5161 if (single_element_p && count > TYPE_VECTOR_SUBPARTS (vectype))
5163 if (dump_enabled_p ())
5164 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5165 "single-element interleaving not supported "
5166 "for not adjacent vector loads\n");
5167 return false;
5170 /* vect_permute_load_chain requires the group size to be equal to 3 or
5171 be a power of two. */
5172 if (count != 3 && exact_log2 (count) == -1)
5174 if (dump_enabled_p ())
5175 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5176 "the size of the group of accesses"
5177 " is not a power of 2 or not equal to 3\n");
5178 return false;
5181 /* Check that the permutation is supported. */
5182 if (VECTOR_MODE_P (mode))
5184 unsigned int i, j, nelt = GET_MODE_NUNITS (mode);
5186 if (count == 3)
5188 vec_perm_builder sel (nelt, nelt, 1);
5189 sel.quick_grow (nelt);
5190 vec_perm_indices indices;
5191 unsigned int k;
5192 for (k = 0; k < 3; k++)
5194 for (i = 0; i < nelt; i++)
5195 if (3 * i + k < 2 * nelt)
5196 sel[i] = 3 * i + k;
5197 else
5198 sel[i] = 0;
5199 indices.new_vector (sel, 2, nelt);
5200 if (!can_vec_perm_const_p (mode, indices))
5202 if (dump_enabled_p ())
5203 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5204 "shuffle of 3 loads is not supported by"
5205 " target\n");
5206 return false;
5208 for (i = 0, j = 0; i < nelt; i++)
5209 if (3 * i + k < 2 * nelt)
5210 sel[i] = i;
5211 else
5212 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5213 indices.new_vector (sel, 2, nelt);
5214 if (!can_vec_perm_const_p (mode, indices))
5216 if (dump_enabled_p ())
5217 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5218 "shuffle of 3 loads is not supported by"
5219 " target\n");
5220 return false;
5223 return true;
5225 else
5227 /* If length is not equal to 3 then only power of 2 is supported. */
5228 gcc_assert (pow2p_hwi (count));
5230 /* The encoding has a single stepped pattern. */
5231 vec_perm_builder sel (nelt, 1, 3);
5232 sel.quick_grow (3);
5233 for (i = 0; i < 3; i++)
5234 sel[i] = i * 2;
5235 vec_perm_indices indices (sel, 2, nelt);
5236 if (can_vec_perm_const_p (mode, indices))
5238 for (i = 0; i < 3; i++)
5239 sel[i] = i * 2 + 1;
5240 indices.new_vector (sel, 2, nelt);
5241 if (can_vec_perm_const_p (mode, indices))
5242 return true;
5247 if (dump_enabled_p ())
5248 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5249 "extract even/odd not supported by target\n");
5250 return false;
5253 /* Return TRUE if vec_load_lanes is available for COUNT vectors of
5254 type VECTYPE. */
5256 bool
5257 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
5259 return vect_lanes_optab_supported_p ("vec_load_lanes",
5260 vec_load_lanes_optab,
5261 vectype, count);
5264 /* Function vect_permute_load_chain.
5266 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5267 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5268 the input data correctly. Return the final references for loads in
5269 RESULT_CHAIN.
5271 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5272 The input is 4 vectors each containing 8 elements. We assign a number to each
5273 element, the input sequence is:
5275 1st vec: 0 1 2 3 4 5 6 7
5276 2nd vec: 8 9 10 11 12 13 14 15
5277 3rd vec: 16 17 18 19 20 21 22 23
5278 4th vec: 24 25 26 27 28 29 30 31
5280 The output sequence should be:
5282 1st vec: 0 4 8 12 16 20 24 28
5283 2nd vec: 1 5 9 13 17 21 25 29
5284 3rd vec: 2 6 10 14 18 22 26 30
5285 4th vec: 3 7 11 15 19 23 27 31
5287 i.e., the first output vector should contain the first elements of each
5288 interleaving group, etc.
5290 We use extract_even/odd instructions to create such output. The input of
5291 each extract_even/odd operation is two vectors
5292 1st vec 2nd vec
5293 0 1 2 3 4 5 6 7
5295 and the output is the vector of extracted even/odd elements. The output of
5296 extract_even will be: 0 2 4 6
5297 and of extract_odd: 1 3 5 7
5300 The permutation is done in log LENGTH stages. In each stage extract_even
5301 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5302 their order. In our example,
5304 E1: extract_even (1st vec, 2nd vec)
5305 E2: extract_odd (1st vec, 2nd vec)
5306 E3: extract_even (3rd vec, 4th vec)
5307 E4: extract_odd (3rd vec, 4th vec)
5309 The output for the first stage will be:
5311 E1: 0 2 4 6 8 10 12 14
5312 E2: 1 3 5 7 9 11 13 15
5313 E3: 16 18 20 22 24 26 28 30
5314 E4: 17 19 21 23 25 27 29 31
5316 In order to proceed and create the correct sequence for the next stage (or
5317 for the correct output, if the second stage is the last one, as in our
5318 example), we first put the output of extract_even operation and then the
5319 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5320 The input for the second stage is:
5322 1st vec (E1): 0 2 4 6 8 10 12 14
5323 2nd vec (E3): 16 18 20 22 24 26 28 30
5324 3rd vec (E2): 1 3 5 7 9 11 13 15
5325 4th vec (E4): 17 19 21 23 25 27 29 31
5327 The output of the second stage:
5329 E1: 0 4 8 12 16 20 24 28
5330 E2: 2 6 10 14 18 22 26 30
5331 E3: 1 5 9 13 17 21 25 29
5332 E4: 3 7 11 15 19 23 27 31
5334 And RESULT_CHAIN after reordering:
5336 1st vec (E1): 0 4 8 12 16 20 24 28
5337 2nd vec (E3): 1 5 9 13 17 21 25 29
5338 3rd vec (E2): 2 6 10 14 18 22 26 30
5339 4th vec (E4): 3 7 11 15 19 23 27 31. */
5341 static void
5342 vect_permute_load_chain (vec<tree> dr_chain,
5343 unsigned int length,
5344 gimple *stmt,
5345 gimple_stmt_iterator *gsi,
5346 vec<tree> *result_chain)
5348 tree data_ref, first_vect, second_vect;
5349 tree perm_mask_even, perm_mask_odd;
5350 tree perm3_mask_low, perm3_mask_high;
5351 gimple *perm_stmt;
5352 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5353 unsigned int i, j, log_length = exact_log2 (length);
5354 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5356 result_chain->quick_grow (length);
5357 memcpy (result_chain->address (), dr_chain.address (),
5358 length * sizeof (tree));
5360 if (length == 3)
5362 unsigned int k;
5364 vec_perm_builder sel (nelt, nelt, 1);
5365 sel.quick_grow (nelt);
5366 vec_perm_indices indices;
5367 for (k = 0; k < 3; k++)
5369 for (i = 0; i < nelt; i++)
5370 if (3 * i + k < 2 * nelt)
5371 sel[i] = 3 * i + k;
5372 else
5373 sel[i] = 0;
5374 indices.new_vector (sel, 2, nelt);
5375 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5377 for (i = 0, j = 0; i < nelt; i++)
5378 if (3 * i + k < 2 * nelt)
5379 sel[i] = i;
5380 else
5381 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5382 indices.new_vector (sel, 2, nelt);
5383 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5385 first_vect = dr_chain[0];
5386 second_vect = dr_chain[1];
5388 /* Create interleaving stmt (low part of):
5389 low = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5390 ...}> */
5391 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5392 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5393 second_vect, perm3_mask_low);
5394 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5396 /* Create interleaving stmt (high part of):
5397 high = VEC_PERM_EXPR <first_vect, second_vect, perm3_mask_high>,
5398 combining the low-part result with the third input vector. */
5399 first_vect = data_ref;
5400 second_vect = dr_chain[2];
5401 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5402 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5403 second_vect, perm3_mask_high);
5404 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5405 (*result_chain)[k] = data_ref;
5408 else
5410 /* If length is not equal to 3 then only a power of 2 is supported. */
5411 gcc_assert (pow2p_hwi (length));
5413 /* The encoding has a single stepped pattern. */
5414 vec_perm_builder sel (nelt, 1, 3);
5415 sel.quick_grow (3);
5416 for (i = 0; i < 3; ++i)
5417 sel[i] = i * 2;
5418 vec_perm_indices indices (sel, 2, nelt);
5419 perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);
5421 for (i = 0; i < 3; ++i)
5422 sel[i] = i * 2 + 1;
5423 indices.new_vector (sel, 2, nelt);
5424 perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
5426 for (i = 0; i < log_length; i++)
5428 for (j = 0; j < length; j += 2)
5430 first_vect = dr_chain[j];
5431 second_vect = dr_chain[j+1];
5433 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5434 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5435 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5436 first_vect, second_vect,
5437 perm_mask_even);
5438 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5439 (*result_chain)[j/2] = data_ref;
5441 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5442 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5443 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5444 first_vect, second_vect,
5445 perm_mask_odd);
5446 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5447 (*result_chain)[j/2+length/2] = data_ref;
5449 memcpy (dr_chain.address (), result_chain->address (),
5450 length * sizeof (tree));
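/* Editorial illustration, not part of the original source: a stand-alone
   sketch of one even/odd extraction step from the function above, written
   with plain arrays instead of VEC_PERM_EXPR.  The helper name and the int
   element type are made up for this example.  */

static void
demo_extract_even_odd (const int *first, const int *second,
		       int *even, int *odd, unsigned int nelt)
{
  unsigned int i;

  /* Selector 2*i picks the even elements of the concatenation
     {first, second}; selector 2*i + 1 picks the odd ones.  */
  for (i = 0; i < nelt; i++)
    {
      unsigned int e = 2 * i, o = 2 * i + 1;
      even[i] = e < nelt ? first[e] : second[e - nelt];
      odd[i] = o < nelt ? first[o] : second[o - nelt];
    }
}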
5455 /* Function vect_shift_permute_load_chain.
5457 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate a
5458 sequence of stmts to reorder the input data accordingly.
5459 Return the final references for loads in RESULT_CHAIN.
5460 Return true if it succeeded, false otherwise.
5462 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5463 The input is 3 vectors each containing 8 elements. We assign a
5464 number to each element; the input sequence is:
5466 1st vec: 0 1 2 3 4 5 6 7
5467 2nd vec: 8 9 10 11 12 13 14 15
5468 3rd vec: 16 17 18 19 20 21 22 23
5470 The output sequence should be:
5472 1st vec: 0 3 6 9 12 15 18 21
5473 2nd vec: 1 4 7 10 13 16 19 22
5474 3rd vec: 2 5 8 11 14 17 20 23
5476 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
5478 First we shuffle all 3 vectors to get correct elements order:
5480 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
5481 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
5482 3rd vec: (16 19 22) (17 20 23) (18 21)
5484 Next we unite and shift vector 3 times:
5486 1st step:
5487 shift right by 6 the concatenation of:
5488 "1st vec" and "2nd vec"
5489 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
5490 "2nd vec" and "3rd vec"
5491 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
5492 "3rd vec" and "1st vec"
5493 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
5494 | New vectors |
5496 So that now new vectors are:
5498 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
5499 2nd vec: (10 13) (16 19 22) (17 20 23)
5500 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
5502 2nd step:
5503 shift right by 5 the concatenation of:
5504 "1st vec" and "3rd vec"
5505 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
5506 "2nd vec" and "1st vec"
5507 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
5508 "3rd vec" and "2nd vec"
5509 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
5510 | New vectors |
5512 So that now new vectors are:
5514 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
5515 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
5516 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
5518 3rd step:
5519 shift right by 5 the concatenation of:
5520 "1st vec" and "1st vec"
5521 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
5522 shift right by 3 the concatenation of:
5523 "2nd vec" and "2nd vec"
5524 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
5525 | New vectors |
5527 So that now all vectors are READY:
5528 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
5529 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
5530 3rd vec: ( 1 4 7) (10 13) (16 19 22)
5532 This algorithm is faster than the one in vect_permute_load_chain if:
5533 1. "shift of a concatenation" is faster than a general permutation.
5534 This is usually so.
5535 2. The TARGET machine can't execute vector instructions in parallel.
5536 This is because each step of the algorithm depends on the previous one.
5537 The algorithm in vect_permute_load_chain is much more parallel.
5539 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
5542 static bool
5543 vect_shift_permute_load_chain (vec<tree> dr_chain,
5544 unsigned int length,
5545 gimple *stmt,
5546 gimple_stmt_iterator *gsi,
5547 vec<tree> *result_chain)
5549 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
5550 tree perm2_mask1, perm2_mask2, perm3_mask;
5551 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
5552 gimple *perm_stmt;
5554 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5555 unsigned int i;
5556 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5557 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5558 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5560 unsigned HOST_WIDE_INT vf;
5561 if (!LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
5562 /* Not supported for variable-length vectors. */
5563 return false;
5565 vec_perm_builder sel (nelt, nelt, 1);
5566 sel.quick_grow (nelt);
5568 result_chain->quick_grow (length);
5569 memcpy (result_chain->address (), dr_chain.address (),
5570 length * sizeof (tree));
5572 if (pow2p_hwi (length) && vf > 4)
5574 unsigned int j, log_length = exact_log2 (length);
5575 for (i = 0; i < nelt / 2; ++i)
5576 sel[i] = i * 2;
5577 for (i = 0; i < nelt / 2; ++i)
5578 sel[nelt / 2 + i] = i * 2 + 1;
5579 vec_perm_indices indices (sel, 2, nelt);
5580 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5582 if (dump_enabled_p ())
5583 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5584 "shuffle of 2 fields structure is not \
5585 supported by target\n");
5586 return false;
5588 perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);
5590 for (i = 0; i < nelt / 2; ++i)
5591 sel[i] = i * 2 + 1;
5592 for (i = 0; i < nelt / 2; ++i)
5593 sel[nelt / 2 + i] = i * 2;
5594 indices.new_vector (sel, 2, nelt);
5595 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5597 if (dump_enabled_p ())
5598 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5599 "shuffle of 2 fields structure is not \
5600 supported by target\n");
5601 return false;
5603 perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);
5605 /* Generating permutation constant to shift all elements.
5606 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
5607 for (i = 0; i < nelt; i++)
5608 sel[i] = nelt / 2 + i;
5609 indices.new_vector (sel, 2, nelt);
5610 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5612 if (dump_enabled_p ())
5613 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5614 "shift permutation is not supported by target\n");
5615 return false;
5617 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
5619 /* Generating permutation constant to select a vector from 2 vectors.
5620 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
5621 for (i = 0; i < nelt / 2; i++)
5622 sel[i] = i;
5623 for (i = nelt / 2; i < nelt; i++)
5624 sel[i] = nelt + i;
5625 indices.new_vector (sel, 2, nelt);
5626 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5628 if (dump_enabled_p ())
5629 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5630 "select is not supported by target\n");
5631 return false;
5633 select_mask = vect_gen_perm_mask_checked (vectype, indices);
5635 for (i = 0; i < log_length; i++)
5637 for (j = 0; j < length; j += 2)
5639 first_vect = dr_chain[j];
5640 second_vect = dr_chain[j + 1];
5642 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5643 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5644 first_vect, first_vect,
5645 perm2_mask1);
5646 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5647 vect[0] = data_ref;
5649 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5650 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5651 second_vect, second_vect,
5652 perm2_mask2);
5653 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5654 vect[1] = data_ref;
5656 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
5657 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5658 vect[0], vect[1], shift1_mask);
5659 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5660 (*result_chain)[j/2 + length/2] = data_ref;
5662 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
5663 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5664 vect[0], vect[1], select_mask);
5665 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5666 (*result_chain)[j/2] = data_ref;
5668 memcpy (dr_chain.address (), result_chain->address (),
5669 length * sizeof (tree));
5671 return true;
5673 if (length == 3 && vf > 2)
5675 unsigned int k = 0, l = 0;
5677 /* Generating permutation constant to get all elements in right order.
5678 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
5679 for (i = 0; i < nelt; i++)
5681 if (3 * k + (l % 3) >= nelt)
5683 k = 0;
5684 l += (3 - (nelt % 3));
5686 sel[i] = 3 * k + (l % 3);
5687 k++;
5689 vec_perm_indices indices (sel, 2, nelt);
5690 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5692 if (dump_enabled_p ())
5693 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5694 "shuffle of 3 fields structure is not \
5695 supported by target\n");
5696 return false;
5698 perm3_mask = vect_gen_perm_mask_checked (vectype, indices);
5700 /* Generating permutation constant to shift all elements.
5701 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
5702 for (i = 0; i < nelt; i++)
5703 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
5704 indices.new_vector (sel, 2, nelt);
5705 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5707 if (dump_enabled_p ())
5708 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5709 "shift permutation is not supported by target\n");
5710 return false;
5712 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
5714 /* Generating permutation constant to shift all elements.
5715 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5716 for (i = 0; i < nelt; i++)
5717 sel[i] = 2 * (nelt / 3) + 1 + i;
5718 indices.new_vector (sel, 2, nelt);
5719 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5721 if (dump_enabled_p ())
5722 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5723 "shift permutation is not supported by target\n");
5724 return false;
5726 shift2_mask = vect_gen_perm_mask_checked (vectype, indices);
5728 /* Generating permutation constant to shift all elements.
5729 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
5730 for (i = 0; i < nelt; i++)
5731 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
5732 indices.new_vector (sel, 2, nelt);
5733 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5735 if (dump_enabled_p ())
5736 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5737 "shift permutation is not supported by target\n");
5738 return false;
5740 shift3_mask = vect_gen_perm_mask_checked (vectype, indices);
5742 /* Generating permutation constant to shift all elements.
5743 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5744 for (i = 0; i < nelt; i++)
5745 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
5746 indices.new_vector (sel, 2, nelt);
5747 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5749 if (dump_enabled_p ())
5750 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5751 "shift permutation is not supported by target\n");
5752 return false;
5754 shift4_mask = vect_gen_perm_mask_checked (vectype, indices);
5756 for (k = 0; k < 3; k++)
5758 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
5759 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5760 dr_chain[k], dr_chain[k],
5761 perm3_mask);
5762 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5763 vect[k] = data_ref;
5766 for (k = 0; k < 3; k++)
5768 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
5769 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5770 vect[k % 3], vect[(k + 1) % 3],
5771 shift1_mask);
5772 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5773 vect_shift[k] = data_ref;
5776 for (k = 0; k < 3; k++)
5778 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
5779 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5780 vect_shift[(4 - k) % 3],
5781 vect_shift[(3 - k) % 3],
5782 shift2_mask);
5783 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5784 vect[k] = data_ref;
5787 (*result_chain)[3 - (nelt % 3)] = vect[2];
5789 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
5790 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
5791 vect[0], shift3_mask);
5792 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5793 (*result_chain)[nelt % 3] = data_ref;
5795 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
5796 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
5797 vect[1], shift4_mask);
5798 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5799 (*result_chain)[0] = data_ref;
5800 return true;
5802 return false;
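/* Editorial illustration, not part of the original source: the "shift of a
   concatenation" used above is a two-input VEC_PERM_EXPR whose selector is
   a run of consecutive indices starting at the shift amount (compare
   shift1_mask and friends).  A stand-alone plain-array sketch; the helper
   name is made up for this example.  */

static void
demo_shift_concat (const int *lo, const int *hi, int *out,
		   unsigned int nelt, unsigned int shift)
{
  unsigned int i;

  for (i = 0; i < nelt; i++)
    {
      unsigned int idx = shift + i;	/* Selector value shift + i.  */
      out[i] = idx < nelt ? lo[idx] : hi[idx - nelt];
    }
}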
5805 /* Function vect_transform_grouped_load.
5807 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
5808 to perform their permutation and ascribe the resulting vectorized statements
5809 to the scalar statements.
5812 void
5813 vect_transform_grouped_load (gimple *stmt, vec<tree> dr_chain, int size,
5814 gimple_stmt_iterator *gsi)
5816 machine_mode mode;
5817 vec<tree> result_chain = vNULL;
5819 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
5820 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
5821 vectors, that are ready for vector computation. */
5822 result_chain.create (size);
5824 /* If the reassociation width for the vector type is 2 or greater, the target
5825 machine can execute 2 or more vector instructions in parallel. Otherwise try
5826 to get the chain for the load group using vect_shift_permute_load_chain. */
5827 mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
5828 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
5829 || pow2p_hwi (size)
5830 || !vect_shift_permute_load_chain (dr_chain, size, stmt,
5831 gsi, &result_chain))
5832 vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
5833 vect_record_grouped_load_vectors (stmt, result_chain);
5834 result_chain.release ();
5837 /* RESULT_CHAIN contains the output of a group of grouped loads that were
5838 generated as part of the vectorization of STMT. Assign the statement
5839 for each vector to the associated scalar statement. */
5841 void
5842 vect_record_grouped_load_vectors (gimple *stmt, vec<tree> result_chain)
5844 gimple *first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
5845 gimple *next_stmt, *new_stmt;
5846 unsigned int i, gap_count;
5847 tree tmp_data_ref;
5849 /* Put a permuted data-ref in the VECTORIZED_STMT field.
5850 Since we scan the chain starting from its first node, their order
5851 corresponds to the order of data-refs in RESULT_CHAIN. */
5852 next_stmt = first_stmt;
5853 gap_count = 1;
5854 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
5856 if (!next_stmt)
5857 break;
5859 /* Skip the gaps. Loads created for the gaps will be removed by the dead
5860 code elimination pass later. No need to check for the first stmt in
5861 the group, since it always exists.
5862 GROUP_GAP is the number of steps in elements from the previous
5863 access (if there is no gap GROUP_GAP is 1). We skip loads that
5864 correspond to the gaps. */
5865 if (next_stmt != first_stmt
5866 && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
5868 gap_count++;
5869 continue;
5872 while (next_stmt)
5874 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
5875 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
5876 copies, and we put the new vector statement in the first available
5877 RELATED_STMT. */
5878 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
5879 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
5880 else
5882 if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5884 gimple *prev_stmt =
5885 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
5886 gimple *rel_stmt =
5887 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
5888 while (rel_stmt)
5890 prev_stmt = rel_stmt;
5891 rel_stmt =
5892 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
5895 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
5896 new_stmt;
5900 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
5901 gap_count = 1;
5902 /* If NEXT_STMT accesses the same DR as the previous statement,
5903 put the same TMP_DATA_REF as its vectorized statement; otherwise
5904 get the next data-ref from RESULT_CHAIN. */
5905 if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5906 break;
5911 /* Function vect_can_force_dr_alignment_p.
5913 Returns whether the alignment of a DECL can be forced to be aligned
5914 on an ALIGNMENT-bit boundary. */
5916 bool
5917 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
5919 if (!VAR_P (decl))
5920 return false;
5922 if (decl_in_symtab_p (decl)
5923 && !symtab_node::get (decl)->can_increase_alignment_p ())
5924 return false;
5926 if (TREE_STATIC (decl))
5927 return (alignment <= MAX_OFILE_ALIGNMENT);
5928 else
5929 return (alignment <= MAX_STACK_ALIGNMENT);
5933 /* Return whether the data reference DR is supported with respect to its
5934 alignment.
5935 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
5936 if it is aligned, i.e., check if it is possible to vectorize it with a
5937 different alignment. */
5939 enum dr_alignment_support
5940 vect_supportable_dr_alignment (struct data_reference *dr,
5941 bool check_aligned_accesses)
5943 gimple *stmt = DR_STMT (dr);
5944 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5945 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5946 machine_mode mode = TYPE_MODE (vectype);
5947 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5948 struct loop *vect_loop = NULL;
5949 bool nested_in_vect_loop = false;
5951 if (aligned_access_p (dr) && !check_aligned_accesses)
5952 return dr_aligned;
5954 /* For now assume all conditional loads/stores support unaligned
5955 access without any special code. */
5956 if (is_gimple_call (stmt)
5957 && gimple_call_internal_p (stmt)
5958 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
5959 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
5960 return dr_unaligned_supported;
5962 if (loop_vinfo)
5964 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
5965 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
5968 /* Possibly unaligned access. */
5970 /* We can choose between using the implicit realignment scheme (generating
5971 a misaligned_move stmt) and the explicit realignment scheme (generating
5972 aligned loads with a REALIGN_LOAD). There are two variants of the
5973 explicit realignment scheme: optimized and unoptimized.
5974 We can optimize the realignment only if the step between consecutive
5975 vector loads is equal to the vector size. Since the vector memory
5976 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
5977 is guaranteed that the misalignment amount remains the same throughout the
5978 execution of the vectorized loop. Therefore, we can create the
5979 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
5980 at the loop preheader.
5982 However, in the case of outer-loop vectorization, when vectorizing a
5983 memory access in the inner-loop nested within the LOOP that is now being
5984 vectorized, while it is guaranteed that the misalignment of the
5985 vectorized memory access will remain the same in different outer-loop
5986 iterations, it is *not* guaranteed that it will remain the same throughout
5987 the execution of the inner-loop. This is because the inner-loop advances
5988 with the original scalar step (and not in steps of VS). If the inner-loop
5989 step happens to be a multiple of VS, then the misalignment remains fixed
5990 and we can use the optimized realignment scheme. For example:
5992 for (i=0; i<N; i++)
5993 for (j=0; j<M; j++)
5994 s += a[i+j];
5996 When vectorizing the i-loop in the above example, the step between
5997 consecutive vector loads is 1, and so the misalignment does not remain
5998 fixed across the execution of the inner-loop, and the realignment cannot
5999 be optimized (as illustrated in the following pseudo vectorized loop):
6001 for (i=0; i<N; i+=4)
6002 for (j=0; j<M; j++){
6003 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
6004 // when j is {0,1,2,3,4,5,6,7,...} respectively.
6005 // (assuming that we start from an aligned address).
6008 We therefore have to use the unoptimized realignment scheme:
6010 for (i=0; i<N; i+=4)
6011 for (j=k; j<M; j+=4)
6012 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
6013 // that the misalignment of the initial address is
6014 // 0).
6016 The loop can then be vectorized as follows:
6018 for (k=0; k<4; k++){
6019 rt = get_realignment_token (&vp[k]);
6020 for (i=0; i<N; i+=4){
6021 v1 = vp[i+k];
6022 for (j=k; j<M; j+=4){
6023 v2 = vp[i+j+VS-1];
6024 va = REALIGN_LOAD <v1,v2,rt>;
6025 vs += va;
6026 v1 = v2;
6029 } */
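  /* Editorial illustration, not part of the original source: assuming the
     realignment token rt encodes a misalignment of MIS elements and each
     vector holds NELT elements, the REALIGN_LOAD above behaves like

	for (e = 0; e < NELT; e++)
	  va[e] = e + MIS < NELT ? v1[e + MIS] : v2[e + MIS - NELT];

     i.e. it selects NELT consecutive elements, starting at offset MIS, from
     the concatenation of the two aligned loads v1 and v2.  */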
6031 if (DR_IS_READ (dr))
6033 bool is_packed = false;
6034 tree type = (TREE_TYPE (DR_REF (dr)));
6036 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
6037 && (!targetm.vectorize.builtin_mask_for_load
6038 || targetm.vectorize.builtin_mask_for_load ()))
6040 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6042 /* If we are doing SLP then the accesses need not have the
6043 same alignment; instead it depends on the SLP group size. */
6044 if (loop_vinfo
6045 && STMT_SLP_TYPE (stmt_info)
6046 && !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6047 * GROUP_SIZE (vinfo_for_stmt
6048 (GROUP_FIRST_ELEMENT (stmt_info))),
6049 TYPE_VECTOR_SUBPARTS (vectype)))
6051 else if (!loop_vinfo
6052 || (nested_in_vect_loop
6053 && (TREE_INT_CST_LOW (DR_STEP (dr))
6054 != GET_MODE_SIZE (TYPE_MODE (vectype)))))
6055 return dr_explicit_realign;
6056 else
6057 return dr_explicit_realign_optimized;
6059 if (!known_alignment_for_access_p (dr))
6060 is_packed = not_size_aligned (DR_REF (dr));
6062 if (targetm.vectorize.support_vector_misalignment
6063 (mode, type, DR_MISALIGNMENT (dr), is_packed))
6064 /* Can't software pipeline the loads, but can at least do them. */
6065 return dr_unaligned_supported;
6067 else
6069 bool is_packed = false;
6070 tree type = (TREE_TYPE (DR_REF (dr)));
6072 if (!known_alignment_for_access_p (dr))
6073 is_packed = not_size_aligned (DR_REF (dr));
6075 if (targetm.vectorize.support_vector_misalignment
6076 (mode, type, DR_MISALIGNMENT (dr), is_packed))
6077 return dr_unaligned_supported;
6080 /* Unsupported. */
6081 return dr_unaligned_unsupported;