gcc/tree-vect-data-refs.c (official-gcc.git, blob 4de0864774d0738092de436111f9806760b552c6, commit "poly_int: GET_MODE_NUNITS")
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2018 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "memmodel.h"
32 #include "tm_p.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "cgraph.h"
36 #include "dumpfile.h"
37 #include "alias.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "tree-eh.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop.h"
47 #include "cfgloop.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
50 #include "expr.h"
51 #include "builtins.h"
52 #include "params.h"
53 #include "tree-cfg.h"
54 #include "tree-hash-traits.h"
55 #include "vec-perm-indices.h"
57 /* Return true if load- or store-lanes optab OPTAB is implemented for
58 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
60 static bool
61 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
62 tree vectype, unsigned HOST_WIDE_INT count)
64 machine_mode mode;
65 scalar_int_mode array_mode;
66 bool limit_p;
68 mode = TYPE_MODE (vectype);
69 limit_p = !targetm.array_mode_supported_p (mode, count);
70 if (!int_mode_for_size (count * GET_MODE_BITSIZE (mode),
71 limit_p).exists (&array_mode))
73 if (dump_enabled_p ())
74 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
75 "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC "]\n",
76 GET_MODE_NAME (mode), count);
77 return false;
80 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
82 if (dump_enabled_p ())
83 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
84 "cannot use %s<%s><%s>\n", name,
85 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
86 return false;
89 if (dump_enabled_p ())
90 dump_printf_loc (MSG_NOTE, vect_location,
91 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
92 GET_MODE_NAME (mode));
94 return true;
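/* For example, a query for 3-vector load-lanes on V4SImode first asks
   int_mode_for_size for a 384-bit integer "array" mode (modes wider than
   MAX_FIXED_MODE_SIZE are allowed only when targetm.array_mode_supported_p
   vouches for the array mode) and then requires a vec_load_lanes handler
   that converts this array mode to V4SImode.  The callers later in this
   file, vect_load_lanes_supported and vect_store_lanes_supported, pass the
   corresponding optab and its name.  */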
98 /* Return the smallest scalar part of STMT.
99 This is used to determine the vectype of the stmt. We generally set the
100 vectype according to the type of the result (lhs). For stmts whose
101 result-type is different than the type of the arguments (e.g., demotion,
102 promotion), vectype will be reset appropriately (later). Note that we have
103 to visit the smallest datatype in this function, because that determines the
104 VF. If the smallest datatype in the loop is present only as the rhs of a
105 promotion operation - we'd miss it.
106 Such a case, where a variable of this datatype does not appear in the lhs
107 anywhere in the loop, can only occur if it's an invariant: e.g.:
108 'int_x = (int) short_inv', which we'd expect to have been optimized away by
109 invariant motion. However, we cannot rely on invariant motion to always
110 take invariants out of the loop, and so in the case of promotion we also
111 have to check the rhs.
112 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
113 types. */
115 tree
116 vect_get_smallest_scalar_type (gimple *stmt, HOST_WIDE_INT *lhs_size_unit,
117 HOST_WIDE_INT *rhs_size_unit)
119 tree scalar_type = gimple_expr_type (stmt);
120 HOST_WIDE_INT lhs, rhs;
122 /* During the analysis phase, this function is called on arbitrary
123 statements that might not have scalar results. */
124 if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
125 return scalar_type;
127 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
129 if (is_gimple_assign (stmt)
130 && (gimple_assign_cast_p (stmt)
131 || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
132 || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
133 || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
135 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
137 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
138 if (rhs < lhs)
139 scalar_type = rhs_type;
142 *lhs_size_unit = lhs;
143 *rhs_size_unit = rhs;
144 return scalar_type;
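/* For example, for the promotion 'int_x = (int) short_y' this returns the
   short type, with *LHS_SIZE_UNIT == 4 and *RHS_SIZE_UNIT == 2, so the
   vectorization factor is derived from the narrower type.  */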
148 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
149 tested at run-time. Return TRUE if DDR was successfully inserted.
150 Return false if versioning is not supported. */
152 static bool
153 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
155 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
157 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
158 return false;
160 if (!runtime_alias_check_p (ddr, loop,
161 optimize_loop_nest_for_speed_p (loop)))
162 return false;
164 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
165 return true;
169 /* A subroutine of vect_analyze_data_ref_dependence. Handle
170 DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
171 distances. These distances are conservatively correct but they don't
172 reflect a guaranteed dependence.
174 Return true if this function does all the work necessary to avoid
175 an alias or false if the caller should use the dependence distances
176 to limit the vectorization factor in the usual way. LOOP_DEPTH is
177 the depth of the loop described by LOOP_VINFO and the other arguments
178 are as for vect_analyze_data_ref_dependence. */
180 static bool
181 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
182 loop_vec_info loop_vinfo,
183 int loop_depth, unsigned int *max_vf)
185 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
186 lambda_vector dist_v;
187 unsigned int i;
188 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
190 int dist = dist_v[loop_depth];
191 if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
193 /* If the user asserted safelen >= DIST consecutive iterations
194 can be executed concurrently, assume independence.
196 ??? An alternative would be to add the alias check even
197 in this case, and vectorize the fallback loop with the
198 maximum VF set to safelen. However, if the user has
199 explicitly given a length, it's less likely that that
200 would be a win. */
201 if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
203 if ((unsigned int) loop->safelen < *max_vf)
204 *max_vf = loop->safelen;
205 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
206 continue;
209 /* For dependence distances of 2 or more, we have the option
210 of limiting VF or checking for an alias at runtime.
211 Prefer to check at runtime if we can, to avoid limiting
212 the VF unnecessarily when the bases are in fact independent.
214 Note that the alias checks will be removed if the VF ends up
215 being small enough. */
216 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
219 return true;
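/* LOOP->safelen typically comes from '#pragma omp simd safelen(N)' (or
   '#pragma GCC ivdep', which sets it to INT_MAX), i.e. the user has
   promised that N consecutive iterations may execute concurrently.  */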
223 /* Function vect_analyze_data_ref_dependence.
225 Return TRUE if there (might) exist a dependence between a memory-reference
226 DRA and a memory-reference DRB. When versioning for alias may check a
227 dependence at run-time, return FALSE. Adjust *MAX_VF according to
228 the data dependence. */
230 static bool
231 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
232 loop_vec_info loop_vinfo,
233 unsigned int *max_vf)
235 unsigned int i;
236 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
237 struct data_reference *dra = DDR_A (ddr);
238 struct data_reference *drb = DDR_B (ddr);
239 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
240 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
241 lambda_vector dist_v;
242 unsigned int loop_depth;
244 /* In loop analysis all data references should be vectorizable. */
245 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
246 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
247 gcc_unreachable ();
249 /* Independent data accesses. */
250 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
251 return false;
253 if (dra == drb
254 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
255 return false;
257 /* We do not have to consider dependences between accesses that belong
258 to the same group. */
259 if (GROUP_FIRST_ELEMENT (stmtinfo_a)
260 && GROUP_FIRST_ELEMENT (stmtinfo_a) == GROUP_FIRST_ELEMENT (stmtinfo_b))
261 return false;
263 /* Even if we have an anti-dependence then, as the vectorized loop covers at
264 least two scalar iterations, there is always also a true dependence.
265 As the vectorizer does not re-order loads and stores we can ignore
266 the anti-dependence if TBAA can disambiguate both DRs similar to the
267 case with known negative distance anti-dependences (positive
268 distance anti-dependences would violate TBAA constraints). */
269 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
270 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
271 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
272 get_alias_set (DR_REF (drb))))
273 return false;
275 /* Unknown data dependence. */
276 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
278 /* If user asserted safelen consecutive iterations can be
279 executed concurrently, assume independence. */
280 if (loop->safelen >= 2)
282 if ((unsigned int) loop->safelen < *max_vf)
283 *max_vf = loop->safelen;
284 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
285 return false;
288 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
289 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
291 if (dump_enabled_p ())
293 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
294 "versioning for alias not supported for: "
295 "can't determine dependence between ");
296 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
297 DR_REF (dra));
298 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
299 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
300 DR_REF (drb));
301 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
303 return true;
306 if (dump_enabled_p ())
308 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
309 "versioning for alias required: "
310 "can't determine dependence between ");
311 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
312 DR_REF (dra));
313 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
314 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
315 DR_REF (drb));
316 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
319 /* Add to list of ddrs that need to be tested at run-time. */
320 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
323 /* Known data dependence. */
324 if (DDR_NUM_DIST_VECTS (ddr) == 0)
326 /* If user asserted safelen consecutive iterations can be
327 executed concurrently, assume independence. */
328 if (loop->safelen >= 2)
330 if ((unsigned int) loop->safelen < *max_vf)
331 *max_vf = loop->safelen;
332 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
333 return false;
336 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
337 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
339 if (dump_enabled_p ())
341 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
342 "versioning for alias not supported for: "
343 "bad dist vector for ");
344 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
345 DR_REF (dra));
346 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
347 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
348 DR_REF (drb));
349 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
351 return true;
354 if (dump_enabled_p ())
356 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
357 "versioning for alias required: "
358 "bad dist vector for ");
359 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
360 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
361 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
362 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
364 /* Add to list of ddrs that need to be tested at run-time. */
365 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
368 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
370 if (DDR_COULD_BE_INDEPENDENT_P (ddr)
371 && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
372 loop_depth, max_vf))
373 return false;
375 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
377 int dist = dist_v[loop_depth];
379 if (dump_enabled_p ())
380 dump_printf_loc (MSG_NOTE, vect_location,
381 "dependence distance = %d.\n", dist);
383 if (dist == 0)
385 if (dump_enabled_p ())
387 dump_printf_loc (MSG_NOTE, vect_location,
388 "dependence distance == 0 between ");
389 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
390 dump_printf (MSG_NOTE, " and ");
391 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
392 dump_printf (MSG_NOTE, "\n");
395 /* When we perform grouped accesses and perform implicit CSE
396 by detecting equal accesses and doing disambiguation with
397 runtime alias tests like for
398 .. = a[i];
399 .. = a[i+1];
400 a[i] = ..;
401 a[i+1] = ..;
402 *p = ..;
403 .. = a[i];
404 .. = a[i+1];
405 where we will end up loading { a[i], a[i+1] } once, make
406 sure that inserting group loads before the first load and
407 stores after the last store will do the right thing.
408 Similar for groups like
409 a[i] = ...;
410 ... = a[i];
411 a[i+1] = ...;
412 where loads from the group interleave with the store. */
413 if (STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
414 || STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
416 gimple *earlier_stmt;
417 earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
418 if (DR_IS_WRITE
419 (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
421 if (dump_enabled_p ())
422 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
423 "READ_WRITE dependence in interleaving."
424 "\n");
425 return true;
429 continue;
432 if (dist > 0 && DDR_REVERSED_P (ddr))
434 /* If DDR_REVERSED_P the order of the data-refs in DDR was
435 reversed (to make distance vector positive), and the actual
436 distance is negative. */
437 if (dump_enabled_p ())
438 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
439 "dependence distance negative.\n");
440 /* Record a negative dependence distance to later limit the
441 amount of stmt copying / unrolling we can perform.
442 Only need to handle read-after-write dependence. */
443 if (DR_IS_READ (drb)
444 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
445 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
446 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
447 continue;
450 unsigned int abs_dist = abs (dist);
451 if (abs_dist >= 2 && abs_dist < *max_vf)
453 /* The dependence distance requires reduction of the maximal
454 vectorization factor. */
455 *max_vf = abs (dist);
456 if (dump_enabled_p ())
457 dump_printf_loc (MSG_NOTE, vect_location,
458 "adjusting maximal vectorization factor to %i\n",
459 *max_vf);
462 if (abs_dist >= *max_vf)
464 /* Dependence distance does not create dependence, as far as
465 vectorization is concerned, in this case. */
466 if (dump_enabled_p ())
467 dump_printf_loc (MSG_NOTE, vect_location,
468 "dependence distance >= VF.\n");
469 continue;
472 if (dump_enabled_p ())
474 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
475 "not vectorized, possible dependence "
476 "between data-refs ");
477 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
478 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
479 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
480 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
483 return true;
486 return false;
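/* For instance, for the loop body 'a[i+3] = a[i] + 1' the dependence
   distance is 3; the code above then lowers *MAX_VF to 3, and the loop can
   still be vectorized with a vectorization factor of at most 3.  */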
489 /* Function vect_analyze_data_ref_dependences.
491 Examine all the data references in the loop, and make sure there do not
492 exist any data dependences between them. Set *MAX_VF according to
493 the maximum vectorization factor the data dependences allow. */
495 bool
496 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
497 unsigned int *max_vf)
499 unsigned int i;
500 struct data_dependence_relation *ddr;
502 if (dump_enabled_p ())
503 dump_printf_loc (MSG_NOTE, vect_location,
504 "=== vect_analyze_data_ref_dependences ===\n");
506 LOOP_VINFO_DDRS (loop_vinfo)
507 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
508 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
509 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
510 /* We need read-read dependences to compute STMT_VINFO_SAME_ALIGN_REFS. */
511 if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
512 &LOOP_VINFO_DDRS (loop_vinfo),
513 LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
514 return false;
516 /* For epilogues we either have no aliases or alias versioning
517 was applied to original loop. Therefore we may just get max_vf
518 using VF of original loop. */
519 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
520 *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
521 else
522 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
523 if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
524 return false;
526 return true;
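/* Note that the DDR vector above is created with room for every ordered
   pair of data references, i.e. its reserved size is quadratic in the
   number of datarefs in the loop.  */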
530 /* Function vect_slp_analyze_data_ref_dependence.
532 Return TRUE if there (might) exist a dependence between a memory-reference
533 DRA and a memory-reference DRB of DDR.  Unlike the loop variant above, no
534 runtime alias versioning is attempted here and there is no maximum
535 vectorization factor to adjust.  */
537 static bool
538 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
540 struct data_reference *dra = DDR_A (ddr);
541 struct data_reference *drb = DDR_B (ddr);
543 /* We need to check dependences of statements marked as unvectorizable
544 as well, they still can prohibit vectorization. */
546 /* Independent data accesses. */
547 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
548 return false;
550 if (dra == drb)
551 return false;
553 /* Read-read is OK. */
554 if (DR_IS_READ (dra) && DR_IS_READ (drb))
555 return false;
557 /* If dra and drb are part of the same interleaving chain consider
558 them independent. */
559 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
560 && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
561 == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
562 return false;
564 /* Unknown data dependence. */
565 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
567 if (dump_enabled_p ())
569 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
570 "can't determine dependence between ");
571 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
572 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
573 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
574 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
577 else if (dump_enabled_p ())
579 dump_printf_loc (MSG_NOTE, vect_location,
580 "determined dependence between ");
581 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
582 dump_printf (MSG_NOTE, " and ");
583 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
584 dump_printf (MSG_NOTE, "\n");
587 return true;
591 /* Analyze dependences involved in the transform of SLP NODE. STORES
592 contain the vector of scalar stores of this instance if we are
593 disambiguating the loads. */
595 static bool
596 vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
597 vec<gimple *> stores, gimple *last_store)
599 /* This walks over all stmts involved in the SLP load/store done
600 in NODE verifying we can sink them up to the last stmt in the
601 group. */
602 gimple *last_access = vect_find_last_scalar_stmt_in_slp (node);
603 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
605 gimple *access = SLP_TREE_SCALAR_STMTS (node)[k];
606 if (access == last_access)
607 continue;
608 data_reference *dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (access));
609 for (gimple_stmt_iterator gsi = gsi_for_stmt (access);
610 gsi_stmt (gsi) != last_access; gsi_next (&gsi))
612 gimple *stmt = gsi_stmt (gsi);
613 if (! gimple_vuse (stmt)
614 || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
615 continue;
617 /* If we couldn't record a (single) data reference for this
618 stmt we have to give up. */
619 /* ??? Here and below if dependence analysis fails we can resort
620 to the alias oracle which can handle more kinds of stmts. */
621 data_reference *dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
622 if (!dr_b)
623 return false;
625 bool dependent = false;
626 /* If we run into a store of this same instance (we've just
627 marked those) then delay dependence checking until we run
628 into the last store because this is where it will have
629 been sunk to (and we verify if we can do that as well). */
630 if (gimple_visited_p (stmt))
632 if (stmt != last_store)
633 continue;
634 unsigned i;
635 gimple *store;
636 FOR_EACH_VEC_ELT (stores, i, store)
638 data_reference *store_dr
639 = STMT_VINFO_DATA_REF (vinfo_for_stmt (store));
640 ddr_p ddr = initialize_data_dependence_relation
641 (dr_a, store_dr, vNULL);
642 dependent = vect_slp_analyze_data_ref_dependence (ddr);
643 free_dependence_relation (ddr);
644 if (dependent)
645 break;
648 else
650 ddr_p ddr = initialize_data_dependence_relation (dr_a,
651 dr_b, vNULL);
652 dependent = vect_slp_analyze_data_ref_dependence (ddr);
653 free_dependence_relation (ddr);
655 if (dependent)
656 return false;
659 return true;
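/* In other words: the vectorized access will be emitted at the position of
   the last scalar stmt of the group, so every stmt between a group member
   and that last stmt is tested above for an aliasing access that would
   make such sinking invalid.  */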
663 /* Function vect_slp_analyze_instance_dependence.
665 Examine all the data references in the SLP instance INSTANCE and make
666 sure there do not exist any data dependences between them that would
667 prevent vectorizing the instance.  */
669 bool
670 vect_slp_analyze_instance_dependence (slp_instance instance)
672 if (dump_enabled_p ())
673 dump_printf_loc (MSG_NOTE, vect_location,
674 "=== vect_slp_analyze_instance_dependence ===\n");
676 /* The stores of this instance are at the root of the SLP tree. */
677 slp_tree store = SLP_INSTANCE_TREE (instance);
678 if (! STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (store)[0])))
679 store = NULL;
681 /* Verify we can sink stores to the vectorized stmt insert location. */
682 gimple *last_store = NULL;
683 if (store)
685 if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
686 return false;
688 /* Mark stores in this instance and remember the last one. */
689 last_store = vect_find_last_scalar_stmt_in_slp (store);
690 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
691 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], true);
694 bool res = true;
696 /* Verify we can sink loads to the vectorized stmt insert location,
697 special-casing stores of this instance. */
698 slp_tree load;
699 unsigned int i;
700 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
701 if (! vect_slp_analyze_node_dependences (instance, load,
702 store
703 ? SLP_TREE_SCALAR_STMTS (store)
704 : vNULL, last_store))
706 res = false;
707 break;
710 /* Unset the visited flag. */
711 if (store)
712 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
713 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], false);
715 return res;
718 /* Record in VINFO the base alignment guarantee given by DRB. STMT is
719 the statement that contains DRB, which is useful for recording in the
720 dump file. */
722 static void
723 vect_record_base_alignment (vec_info *vinfo, gimple *stmt,
724 innermost_loop_behavior *drb)
726 bool existed;
727 innermost_loop_behavior *&entry
728 = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
729 if (!existed || entry->base_alignment < drb->base_alignment)
731 entry = drb;
732 if (dump_enabled_p ())
734 dump_printf_loc (MSG_NOTE, vect_location,
735 "recording new base alignment for ");
736 dump_generic_expr (MSG_NOTE, TDF_SLIM, drb->base_address);
737 dump_printf (MSG_NOTE, "\n");
738 dump_printf_loc (MSG_NOTE, vect_location,
739 " alignment: %d\n", drb->base_alignment);
740 dump_printf_loc (MSG_NOTE, vect_location,
741 " misalignment: %d\n", drb->base_misalignment);
742 dump_printf_loc (MSG_NOTE, vect_location,
743 " based on: ");
744 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
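/* E.g. if one reference guarantees 16-byte alignment of base B and a later
   one guarantees 32 bytes, the entry for B ends up recording the 32-byte
   guarantee, since the code above keeps the largest base_alignment seen.  */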
749 /* If the region we're going to vectorize is reached, all unconditional
750 data references occur at least once. We can therefore pool the base
751 alignment guarantees from each unconditional reference. Do this by
752 going through all the data references in VINFO and checking whether
753 the containing statement makes the reference unconditionally. If so,
754 record the alignment of the base address in VINFO so that it can be
755 used for all other references with the same base. */
757 void
758 vect_record_base_alignments (vec_info *vinfo)
760 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
761 struct loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
762 data_reference *dr;
763 unsigned int i;
764 FOR_EACH_VEC_ELT (vinfo->datarefs, i, dr)
765 if (!DR_IS_CONDITIONAL_IN_STMT (dr))
767 gimple *stmt = DR_STMT (dr);
768 vect_record_base_alignment (vinfo, stmt, &DR_INNERMOST (dr));
770 /* If DR is nested in the loop that is being vectorized, we can also
771 record the alignment of the base wrt the outer loop. */
772 if (loop && nested_in_vect_loop_p (loop, stmt))
774 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
775 vect_record_base_alignment
776 (vinfo, stmt, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
781 /* Return the target alignment for the vectorized form of DR. */
783 static unsigned int
784 vect_calculate_target_alignment (struct data_reference *dr)
786 gimple *stmt = DR_STMT (dr);
787 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
788 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
789 return targetm.vectorize.preferred_vector_alignment (vectype);
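/* The default implementation of this hook simply returns TYPE_ALIGN of
   VECTYPE; targets may return a smaller value (for example the element
   alignment) when unaligned vector accesses are cheap.  */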
792 /* Function vect_compute_data_ref_alignment
794 Compute the misalignment of the data reference DR.
796 Output:
797 1. If during the misalignment computation it is found that the data reference
798 cannot be vectorized then false is returned.
799 2. DR_MISALIGNMENT (DR) is defined.
801 FOR NOW: No analysis is actually performed. Misalignment is calculated
802 only for trivial cases. TODO. */
804 bool
805 vect_compute_data_ref_alignment (struct data_reference *dr)
807 gimple *stmt = DR_STMT (dr);
808 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
809 vec_base_alignments *base_alignments = &stmt_info->vinfo->base_alignments;
810 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
811 struct loop *loop = NULL;
812 tree ref = DR_REF (dr);
813 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
815 if (dump_enabled_p ())
816 dump_printf_loc (MSG_NOTE, vect_location,
817 "vect_compute_data_ref_alignment:\n");
819 if (loop_vinfo)
820 loop = LOOP_VINFO_LOOP (loop_vinfo);
822 /* Initialize misalignment to unknown. */
823 SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
825 innermost_loop_behavior *drb = vect_dr_behavior (dr);
826 bool step_preserves_misalignment_p;
828 unsigned HOST_WIDE_INT vector_alignment
829 = vect_calculate_target_alignment (dr) / BITS_PER_UNIT;
830 DR_TARGET_ALIGNMENT (dr) = vector_alignment;
832 /* No step for BB vectorization. */
833 if (!loop)
835 gcc_assert (integer_zerop (drb->step));
836 step_preserves_misalignment_p = true;
839 /* In case the dataref is in an inner-loop of the loop that is being
840 vectorized (LOOP), we use the base and misalignment information
841 relative to the outer-loop (LOOP). This is ok only if the misalignment
842 stays the same throughout the execution of the inner-loop, which is why
843 we have to check that the step of the dataref in the inner-loop is a
844 multiple of the vector alignment. */
845 else if (nested_in_vect_loop_p (loop, stmt))
847 step_preserves_misalignment_p
848 = (DR_STEP_ALIGNMENT (dr) % vector_alignment) == 0;
850 if (dump_enabled_p ())
852 if (step_preserves_misalignment_p)
853 dump_printf_loc (MSG_NOTE, vect_location,
854 "inner step divides the vector alignment.\n");
855 else
856 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
857 "inner step doesn't divide the vector"
858 " alignment.\n");
862 /* Similarly we can only use base and misalignment information relative to
863 an innermost loop if the misalignment stays the same throughout the
864 execution of the loop. As above, this is the case if the step of the
865 dataref times the vectorization factor is a multiple of the alignment. */
866 else
868 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
869 step_preserves_misalignment_p
870 = multiple_p (DR_STEP_ALIGNMENT (dr) * vf, vector_alignment);
872 if (!step_preserves_misalignment_p && dump_enabled_p ())
873 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
874 "step doesn't divide the vector alignment.\n");
877 unsigned int base_alignment = drb->base_alignment;
878 unsigned int base_misalignment = drb->base_misalignment;
880 /* Calculate the maximum of the pooled base address alignment and the
881 alignment that we can compute for DR itself. */
882 innermost_loop_behavior **entry = base_alignments->get (drb->base_address);
883 if (entry && base_alignment < (*entry)->base_alignment)
885 base_alignment = (*entry)->base_alignment;
886 base_misalignment = (*entry)->base_misalignment;
889 if (drb->offset_alignment < vector_alignment
890 || !step_preserves_misalignment_p
891 /* We need to know whether the step wrt the vectorized loop is
892 negative when computing the starting misalignment below. */
893 || TREE_CODE (drb->step) != INTEGER_CST)
895 if (dump_enabled_p ())
897 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
898 "Unknown alignment for access: ");
899 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
900 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
902 return true;
905 if (base_alignment < vector_alignment)
907 tree base = drb->base_address;
908 if (TREE_CODE (base) == ADDR_EXPR)
909 base = TREE_OPERAND (base, 0);
910 if (!vect_can_force_dr_alignment_p (base,
911 vector_alignment * BITS_PER_UNIT))
913 if (dump_enabled_p ())
915 dump_printf_loc (MSG_NOTE, vect_location,
916 "can't force alignment of ref: ");
917 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
918 dump_printf (MSG_NOTE, "\n");
920 return true;
923 if (DECL_USER_ALIGN (base))
925 if (dump_enabled_p ())
927 dump_printf_loc (MSG_NOTE, vect_location,
928 "not forcing alignment of user-aligned "
929 "variable: ");
930 dump_generic_expr (MSG_NOTE, TDF_SLIM, base);
931 dump_printf (MSG_NOTE, "\n");
933 return true;
936 /* Force the alignment of the decl.
937 NOTE: This is the only change to the code we make during
938 the analysis phase, before deciding to vectorize the loop. */
939 if (dump_enabled_p ())
941 dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
942 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
943 dump_printf (MSG_NOTE, "\n");
946 DR_VECT_AUX (dr)->base_decl = base;
947 DR_VECT_AUX (dr)->base_misaligned = true;
948 base_misalignment = 0;
950 poly_int64 misalignment
951 = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();
953 /* If this is a backward running DR then the first access in the larger
954 vectype actually is N-1 elements before the address in the DR.
955 Adjust misalign accordingly. */
956 if (tree_int_cst_sgn (drb->step) < 0)
957 /* PLUS because STEP is negative. */
958 misalignment += ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
959 * TREE_INT_CST_LOW (drb->step));
961 unsigned int const_misalignment;
962 if (!known_misalignment (misalignment, vector_alignment,
963 &const_misalignment))
965 if (dump_enabled_p ())
967 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
968 "Non-constant misalignment for access: ");
969 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
970 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
972 return true;
975 SET_DR_MISALIGNMENT (dr, const_misalignment);
977 if (dump_enabled_p ())
979 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
980 "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
981 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
982 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
985 return true;
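/* Worked example: with a 16-byte target alignment, a base misalignment of 0
   and DR_INIT equal to 20, the code above records DR_MISALIGNMENT == 4
   (20 mod 16).  A variable step, or an offset alignment below the target
   alignment, instead leaves the misalignment as DR_MISALIGNMENT_UNKNOWN.  */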
988 /* Function vect_update_misalignment_for_peel.
989 Sets DR's misalignment
990 - to 0 if it has the same alignment as DR_PEEL,
991 - to the misalignment computed using NPEEL if DR's misalignment is known,
992 - to -1 (unknown) otherwise.
994 DR - the data reference whose misalignment is to be adjusted.
995 DR_PEEL - the data reference whose misalignment is being made
996 zero in the vector loop by the peel.
997 NPEEL - the number of iterations in the peel loop if the misalignment
998 of DR_PEEL is known at compile time. */
1000 static void
1001 vect_update_misalignment_for_peel (struct data_reference *dr,
1002 struct data_reference *dr_peel, int npeel)
1004 unsigned int i;
1005 vec<dr_p> same_aligned_drs;
1006 struct data_reference *current_dr;
1007 int dr_size = vect_get_scalar_dr_size (dr);
1008 int dr_peel_size = vect_get_scalar_dr_size (dr_peel);
1009 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
1010 stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));
1012 /* For interleaved data accesses the step in the loop must be multiplied by
1013 the size of the interleaving group. */
1014 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1015 dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
1016 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
1017 dr_peel_size *= GROUP_SIZE (peel_stmt_info);
1019 /* It can be assumed that the data refs with the same alignment as dr_peel
1020 are aligned in the vector loop. */
1021 same_aligned_drs
1022 = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
1023 FOR_EACH_VEC_ELT (same_aligned_drs, i, current_dr)
1025 if (current_dr != dr)
1026 continue;
1027 gcc_assert (!known_alignment_for_access_p (dr)
1028 || !known_alignment_for_access_p (dr_peel)
1029 || (DR_MISALIGNMENT (dr) / dr_size
1030 == DR_MISALIGNMENT (dr_peel) / dr_peel_size));
1031 SET_DR_MISALIGNMENT (dr, 0);
1032 return;
1035 if (known_alignment_for_access_p (dr)
1036 && known_alignment_for_access_p (dr_peel))
1038 bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
1039 int misal = DR_MISALIGNMENT (dr);
1040 misal += negative ? -npeel * dr_size : npeel * dr_size;
1041 misal &= DR_TARGET_ALIGNMENT (dr) - 1;
1042 SET_DR_MISALIGNMENT (dr, misal);
1043 return;
1046 if (dump_enabled_p ())
1047 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1048 "to unknown (-1).\n");
1049 SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
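/* Example: peeling NPEEL == 3 iterations for a 4-byte element reference
   with a 16-byte target alignment turns a known misalignment of 4 into
   (4 + 3 * 4) & 15 == 0, whereas a reference whose alignment is unknown
   simply stays DR_MISALIGNMENT_UNKNOWN.  */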
1053 /* Function verify_data_ref_alignment
1055 Return TRUE if DR can be handled with respect to alignment. */
1057 static bool
1058 verify_data_ref_alignment (data_reference_p dr)
1060 enum dr_alignment_support supportable_dr_alignment
1061 = vect_supportable_dr_alignment (dr, false);
1062 if (!supportable_dr_alignment)
1064 if (dump_enabled_p ())
1066 if (DR_IS_READ (dr))
1067 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1068 "not vectorized: unsupported unaligned load.");
1069 else
1070 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1071 "not vectorized: unsupported unaligned "
1072 "store.");
1074 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
1075 DR_REF (dr));
1076 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1078 return false;
1081 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
1082 dump_printf_loc (MSG_NOTE, vect_location,
1083 "Vectorizing an unaligned access.\n");
1085 return true;
1088 /* Function vect_verify_datarefs_alignment
1090 Return TRUE if all data references in the loop can be
1091 handled with respect to alignment. */
1093 bool
1094 vect_verify_datarefs_alignment (loop_vec_info vinfo)
1096 vec<data_reference_p> datarefs = vinfo->datarefs;
1097 struct data_reference *dr;
1098 unsigned int i;
1100 FOR_EACH_VEC_ELT (datarefs, i, dr)
1102 gimple *stmt = DR_STMT (dr);
1103 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1105 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1106 continue;
1108 /* For interleaving, only the alignment of the first access matters. */
1109 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1110 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1111 continue;
1113 /* Strided accesses perform only component accesses, alignment is
1114 irrelevant for them. */
1115 if (STMT_VINFO_STRIDED_P (stmt_info)
1116 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1117 continue;
1119 if (! verify_data_ref_alignment (dr))
1120 return false;
1123 return true;
1126 /* Given a memory reference EXP, return whether its alignment is less
1127 than its size. */
1129 static bool
1130 not_size_aligned (tree exp)
1132 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1133 return true;
1135 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1136 > get_object_alignment (exp));
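/* E.g. a 4-byte int access through a pointer that is only known to be
   2-byte aligned is "not size aligned"; callers treat such references as
   packed accesses.  */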
1139 /* Function vector_alignment_reachable_p
1141 Return true if vector alignment for DR is reachable by peeling
1142 a few loop iterations. Return false otherwise. */
1144 static bool
1145 vector_alignment_reachable_p (struct data_reference *dr)
1147 gimple *stmt = DR_STMT (dr);
1148 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1149 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1151 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1153 /* For interleaved accesses we peel only if the number of iterations in
1154 the prologue loop (VF - misalignment, in elements) is a multiple of the
1155 interleaving group size. */
1156 int elem_size, mis_in_elements;
1158 /* FORNOW: handle only known alignment. */
1159 if (!known_alignment_for_access_p (dr))
1160 return false;
1162 poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1163 poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
1164 elem_size = vector_element_size (vector_size, nelements);
1165 mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
1167 if (!multiple_p (nelements - mis_in_elements, GROUP_SIZE (stmt_info)))
1168 return false;
1171 /* If the misalignment is known at compile time then allow peeling
1172 only if natural alignment is reachable through peeling. */
1173 if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
1175 HOST_WIDE_INT elmsize =
1176 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1177 if (dump_enabled_p ())
1179 dump_printf_loc (MSG_NOTE, vect_location,
1180 "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1181 dump_printf (MSG_NOTE,
1182 ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
1184 if (DR_MISALIGNMENT (dr) % elmsize)
1186 if (dump_enabled_p ())
1187 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1188 "data size does not divide the misalignment.\n");
1189 return false;
1193 if (!known_alignment_for_access_p (dr))
1195 tree type = TREE_TYPE (DR_REF (dr));
1196 bool is_packed = not_size_aligned (DR_REF (dr));
1197 if (dump_enabled_p ())
1198 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1199 "Unknown misalignment, %snaturally aligned\n",
1200 is_packed ? "not " : "");
1201 return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1204 return true;
1208 /* Calculate the cost of the memory access represented by DR. */
1210 static void
1211 vect_get_data_access_cost (struct data_reference *dr,
1212 unsigned int *inside_cost,
1213 unsigned int *outside_cost,
1214 stmt_vector_for_cost *body_cost_vec)
1216 gimple *stmt = DR_STMT (dr);
1217 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1218 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1219 int ncopies;
1221 if (PURE_SLP_STMT (stmt_info))
1222 ncopies = 1;
1223 else
1224 ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1226 if (DR_IS_READ (dr))
1227 vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1228 NULL, body_cost_vec, false);
1229 else
1230 vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
1232 if (dump_enabled_p ())
1233 dump_printf_loc (MSG_NOTE, vect_location,
1234 "vect_get_data_access_cost: inside_cost = %d, "
1235 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1239 typedef struct _vect_peel_info
1241 struct data_reference *dr;
1242 int npeel;
1243 unsigned int count;
1244 } *vect_peel_info;
1246 typedef struct _vect_peel_extended_info
1248 struct _vect_peel_info peel_info;
1249 unsigned int inside_cost;
1250 unsigned int outside_cost;
1251 } *vect_peel_extended_info;
1254 /* Peeling hashtable helpers. */
1256 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1258 static inline hashval_t hash (const _vect_peel_info *);
1259 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1262 inline hashval_t
1263 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1265 return (hashval_t) peel_info->npeel;
1268 inline bool
1269 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1271 return (a->npeel == b->npeel);
1275 /* Insert DR into peeling hash table with NPEEL as key. */
1277 static void
1278 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1279 loop_vec_info loop_vinfo, struct data_reference *dr,
1280 int npeel)
1282 struct _vect_peel_info elem, *slot;
1283 _vect_peel_info **new_slot;
1284 bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1286 elem.npeel = npeel;
1287 slot = peeling_htab->find (&elem);
1288 if (slot)
1289 slot->count++;
1290 else
1292 slot = XNEW (struct _vect_peel_info);
1293 slot->npeel = npeel;
1294 slot->dr = dr;
1295 slot->count = 1;
1296 new_slot = peeling_htab->find_slot (slot, INSERT);
1297 *new_slot = slot;
1300 if (!supportable_dr_alignment
1301 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1302 slot->count += VECT_MAX_COST;
1306 /* Traverse the peeling hash table to find the peeling option that aligns
1307 the maximum number of data accesses. */
1309 static int
1310 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1311 _vect_peel_extended_info *max)
1313 vect_peel_info elem = *slot;
1315 if (elem->count > max->peel_info.count
1316 || (elem->count == max->peel_info.count
1317 && max->peel_info.npeel > elem->npeel))
1319 max->peel_info.npeel = elem->npeel;
1320 max->peel_info.count = elem->count;
1321 max->peel_info.dr = elem->dr;
1324 return 1;
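/* Both this callback and vect_peeling_hash_get_lowest_cost below return 1
   (nonzero) so that hash_table::traverse visits every slot; returning 0
   would stop the traversal early.  */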
1327 /* Get the costs of peeling NPEEL iterations checking data access costs
1328 for all data refs. If UNKNOWN_MISALIGNMENT is true, we assume DR0's
1329 misalignment will be zero after peeling. */
1331 static void
1332 vect_get_peeling_costs_all_drs (vec<data_reference_p> datarefs,
1333 struct data_reference *dr0,
1334 unsigned int *inside_cost,
1335 unsigned int *outside_cost,
1336 stmt_vector_for_cost *body_cost_vec,
1337 unsigned int npeel,
1338 bool unknown_misalignment)
1340 unsigned i;
1341 data_reference *dr;
1343 FOR_EACH_VEC_ELT (datarefs, i, dr)
1345 gimple *stmt = DR_STMT (dr);
1346 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1347 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1348 continue;
1350 /* For interleaving, only the alignment of the first access
1351 matters. */
1352 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1353 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1354 continue;
1356 /* Strided accesses perform only component accesses, alignment is
1357 irrelevant for them. */
1358 if (STMT_VINFO_STRIDED_P (stmt_info)
1359 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1360 continue;
1362 int save_misalignment;
1363 save_misalignment = DR_MISALIGNMENT (dr);
1364 if (npeel == 0)
1366 else if (unknown_misalignment && dr == dr0)
1367 SET_DR_MISALIGNMENT (dr, 0);
1368 else
1369 vect_update_misalignment_for_peel (dr, dr0, npeel);
1370 vect_get_data_access_cost (dr, inside_cost, outside_cost,
1371 body_cost_vec);
1372 SET_DR_MISALIGNMENT (dr, save_misalignment);
1376 /* Traverse the peeling hash table and calculate the cost for each peeling
1377 option. Find the one with the lowest cost. */
1379 static int
1380 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1381 _vect_peel_extended_info *min)
1383 vect_peel_info elem = *slot;
1384 int dummy;
1385 unsigned int inside_cost = 0, outside_cost = 0;
1386 gimple *stmt = DR_STMT (elem->dr);
1387 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1388 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1389 stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1390 epilogue_cost_vec;
1392 prologue_cost_vec.create (2);
1393 body_cost_vec.create (2);
1394 epilogue_cost_vec.create (2);
1396 vect_get_peeling_costs_all_drs (LOOP_VINFO_DATAREFS (loop_vinfo),
1397 elem->dr, &inside_cost, &outside_cost,
1398 &body_cost_vec, elem->npeel, false);
1400 body_cost_vec.release ();
1402 outside_cost += vect_get_known_peeling_cost
1403 (loop_vinfo, elem->npeel, &dummy,
1404 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1405 &prologue_cost_vec, &epilogue_cost_vec);
1407 /* Prologue and epilogue costs are added to the target model later.
1408 These costs depend only on the scalar iteration cost, the
1409 number of peeling iterations finally chosen, and the number of
1410 misaligned statements. So discard the information found here. */
1411 prologue_cost_vec.release ();
1412 epilogue_cost_vec.release ();
1414 if (inside_cost < min->inside_cost
1415 || (inside_cost == min->inside_cost
1416 && outside_cost < min->outside_cost))
1418 min->inside_cost = inside_cost;
1419 min->outside_cost = outside_cost;
1420 min->peel_info.dr = elem->dr;
1421 min->peel_info.npeel = elem->npeel;
1422 min->peel_info.count = elem->count;
1425 return 1;
1429 /* Choose best peeling option by traversing peeling hash table and either
1430 choosing an option with the lowest cost (if cost model is enabled) or the
1431 option that aligns as many accesses as possible. */
1433 static struct _vect_peel_extended_info
1434 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1435 loop_vec_info loop_vinfo)
1437 struct _vect_peel_extended_info res;
1439 res.peel_info.dr = NULL;
1441 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1443 res.inside_cost = INT_MAX;
1444 res.outside_cost = INT_MAX;
1445 peeling_htab->traverse <_vect_peel_extended_info *,
1446 vect_peeling_hash_get_lowest_cost> (&res);
1448 else
1450 res.peel_info.count = 0;
1451 peeling_htab->traverse <_vect_peel_extended_info *,
1452 vect_peeling_hash_get_most_frequent> (&res);
1453 res.inside_cost = 0;
1454 res.outside_cost = 0;
1457 return res;
1460 /* Return true if the new peeling NPEEL is supported. */
1462 static bool
1463 vect_peeling_supportable (loop_vec_info loop_vinfo, struct data_reference *dr0,
1464 unsigned npeel)
1466 unsigned i;
1467 struct data_reference *dr = NULL;
1468 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1469 gimple *stmt;
1470 stmt_vec_info stmt_info;
1471 enum dr_alignment_support supportable_dr_alignment;
1473 /* Ensure that all data refs can be vectorized after the peel. */
1474 FOR_EACH_VEC_ELT (datarefs, i, dr)
1476 int save_misalignment;
1478 if (dr == dr0)
1479 continue;
1481 stmt = DR_STMT (dr);
1482 stmt_info = vinfo_for_stmt (stmt);
1483 /* For interleaving, only the alignment of the first access
1484 matters. */
1485 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1486 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1487 continue;
1489 /* Strided accesses perform only component accesses, alignment is
1490 irrelevant for them. */
1491 if (STMT_VINFO_STRIDED_P (stmt_info)
1492 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1493 continue;
1495 save_misalignment = DR_MISALIGNMENT (dr);
1496 vect_update_misalignment_for_peel (dr, dr0, npeel);
1497 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1498 SET_DR_MISALIGNMENT (dr, save_misalignment);
1500 if (!supportable_dr_alignment)
1501 return false;
1504 return true;
1507 /* Function vect_enhance_data_refs_alignment
1509 This pass will use loop versioning and loop peeling in order to enhance
1510 the alignment of data references in the loop.
1512 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1513 original loop is to be vectorized. Any other loops that are created by
1514 the transformations performed in this pass - are not supposed to be
1515 vectorized. This restriction will be relaxed.
1517 This pass will require a cost model to guide it whether to apply peeling
1518 or versioning or a combination of the two. For example, the scheme that
1519 intel uses when given a loop with several memory accesses, is as follows:
1520 choose one memory access ('p') which alignment you want to force by doing
1521 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1522 other accesses are not necessarily aligned, or (2) use loop versioning to
1523 generate one loop in which all accesses are aligned, and another loop in
1524 which only 'p' is necessarily aligned.
1526 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1527 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1528 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1530 Devising a cost model is the most critical aspect of this work. It will
1531 guide us on which access to peel for, whether to use loop versioning, how
1532 many versions to create, etc. The cost model will probably consist of
1533 generic considerations as well as target specific considerations (on
1534 powerpc for example, misaligned stores are more painful than misaligned
1535 loads).
1537 Here are the general steps involved in alignment enhancements:
1539 -- original loop, before alignment analysis:
1540 for (i=0; i<N; i++){
1541 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1542 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1545 -- After vect_compute_data_refs_alignment:
1546 for (i=0; i<N; i++){
1547 x = q[i]; # DR_MISALIGNMENT(q) = 3
1548 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1551 -- Possibility 1: we do loop versioning:
1552 if (p is aligned) {
1553 for (i=0; i<N; i++){ # loop 1A
1554 x = q[i]; # DR_MISALIGNMENT(q) = 3
1555 p[i] = y; # DR_MISALIGNMENT(p) = 0
1558 else {
1559 for (i=0; i<N; i++){ # loop 1B
1560 x = q[i]; # DR_MISALIGNMENT(q) = 3
1561 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1565 -- Possibility 2: we do loop peeling:
1566 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1567 x = q[i];
1568 p[i] = y;
1570 for (i = 3; i < N; i++){ # loop 2A
1571 x = q[i]; # DR_MISALIGNMENT(q) = 0
1572 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1575 -- Possibility 3: combination of loop peeling and versioning:
1576 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1577 x = q[i];
1578 p[i] = y;
1580 if (p is aligned) {
1581 for (i = 3; i<N; i++){ # loop 3A
1582 x = q[i]; # DR_MISALIGNMENT(q) = 0
1583 p[i] = y; # DR_MISALIGNMENT(p) = 0
1586 else {
1587 for (i = 3; i<N; i++){ # loop 3B
1588 x = q[i]; # DR_MISALIGNMENT(q) = 0
1589 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1593 These loops are later passed to loop_transform to be vectorized. The
1594 vectorizer will use the alignment information to guide the transformation
1595 (whether to generate regular loads/stores, or with special handling for
1596 misalignment). */
1598 bool
1599 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1601 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1602 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1603 enum dr_alignment_support supportable_dr_alignment;
1604 struct data_reference *dr0 = NULL, *first_store = NULL;
1605 struct data_reference *dr;
1606 unsigned int i, j;
1607 bool do_peeling = false;
1608 bool do_versioning = false;
1609 bool stat;
1610 gimple *stmt;
1611 stmt_vec_info stmt_info;
1612 unsigned int npeel = 0;
1613 bool one_misalignment_known = false;
1614 bool one_misalignment_unknown = false;
1615 bool one_dr_unsupportable = false;
1616 struct data_reference *unsupportable_dr = NULL;
1617 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1618 unsigned possible_npeel_number = 1;
1619 tree vectype;
1620 unsigned int mis, same_align_drs_max = 0;
1621 hash_table<peel_info_hasher> peeling_htab (1);
1623 if (dump_enabled_p ())
1624 dump_printf_loc (MSG_NOTE, vect_location,
1625 "=== vect_enhance_data_refs_alignment ===\n");
1627 /* Reset data so we can safely be called multiple times. */
1628 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1629 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1631 /* While cost model enhancements are expected in the future, the high level
1632 view of the code at this time is as follows:
1634 A) If there is a misaligned access then see if peeling to align
1635 this access can make all data references satisfy
1636 vect_supportable_dr_alignment. If so, update data structures
1637 as needed and return true.
1639 B) If peeling wasn't possible and there is a data reference with an
1640 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1641 then see if loop versioning checks can be used to make all data
1642 references satisfy vect_supportable_dr_alignment. If so, update
1643 data structures as needed and return true.
1645 C) If neither peeling nor versioning were successful then return false if
1646 any data reference does not satisfy vect_supportable_dr_alignment.
1648 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1650 Note, Possibility 3 above (which is peeling and versioning together) is not
1651 being done at this time. */
1653 /* (1) Peeling to force alignment. */
1655 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1656 Considerations:
1657 + How many accesses will become aligned due to the peeling
1658 - How many accesses will become unaligned due to the peeling,
1659 and the cost of misaligned accesses.
1660 - The cost of peeling (the extra runtime checks, the increase
1661 in code size). */
1663 FOR_EACH_VEC_ELT (datarefs, i, dr)
1665 stmt = DR_STMT (dr);
1666 stmt_info = vinfo_for_stmt (stmt);
1668 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1669 continue;
1671 /* For interleaving, only the alignment of the first access
1672 matters. */
1673 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1674 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1675 continue;
1677 /* For invariant accesses there is nothing to enhance. */
1678 if (integer_zerop (DR_STEP (dr)))
1679 continue;
1681 /* Strided accesses perform only component accesses, alignment is
1682 irrelevant for them. */
1683 if (STMT_VINFO_STRIDED_P (stmt_info)
1684 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1685 continue;
1687 supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1688 do_peeling = vector_alignment_reachable_p (dr);
1689 if (do_peeling)
1691 if (known_alignment_for_access_p (dr))
1693 unsigned int npeel_tmp = 0;
1694 bool negative = tree_int_cst_compare (DR_STEP (dr),
1695 size_zero_node) < 0;
1697 vectype = STMT_VINFO_VECTYPE (stmt_info);
1698 unsigned int target_align = DR_TARGET_ALIGNMENT (dr);
1699 unsigned int dr_size = vect_get_scalar_dr_size (dr);
1700 mis = (negative ? DR_MISALIGNMENT (dr) : -DR_MISALIGNMENT (dr));
1701 if (DR_MISALIGNMENT (dr) != 0)
1702 npeel_tmp = (mis & (target_align - 1)) / dr_size;
1704 /* For multiple types, it is possible that the bigger type access
1705 will have more than one peeling option. E.g., a loop with two
1706 types: one of size (vector size / 4), and the other one of
1707 size (vector size / 8). The vectorization factor will be 8. If both
1708 accesses are misaligned by 3, the first one needs one scalar
1709 iteration to be aligned, and the second one needs 5. But the
1710 first one will be aligned also by peeling 5 scalar
1711 iterations, and in that case both accesses will be aligned.
1712 Hence, except for the immediate peeling amount, we also want
1713 to try to add full vector size, while we don't exceed
1714 vectorization factor.
1715 We do this automatically for cost model, since we calculate
1716 cost for every peeling option. */
1717 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1719 poly_uint64 nscalars = (STMT_SLP_TYPE (stmt_info)
1720 ? vf * GROUP_SIZE (stmt_info) : vf);
1721 possible_npeel_number
1722 = vect_get_num_vectors (nscalars, vectype);
1724 /* NPEEL_TMP is 0 when there is no misalignment, but also
1725 allow peeling NELEMENTS. */
1726 if (DR_MISALIGNMENT (dr) == 0)
1727 possible_npeel_number++;
1730 /* Save info about DR in the hash table. Also include peeling
1731 amounts according to the explanation above. */
1732 for (j = 0; j < possible_npeel_number; j++)
1734 vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1735 dr, npeel_tmp);
1736 npeel_tmp += target_align / dr_size;
1739 one_misalignment_known = true;
1741 else
1743 /* If we don't know any misalignment values, we prefer
1744 peeling for the data-ref that has the maximum number of data-refs
1745 with the same alignment, unless the target prefers to align
1746 stores over loads. */
1747 unsigned same_align_drs
1748 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1749 if (!dr0
1750 || same_align_drs_max < same_align_drs)
1752 same_align_drs_max = same_align_drs;
1753 dr0 = dr;
1755 /* For data-refs with the same number of related
1756 accesses prefer the one where the misalign
1757 computation will be invariant in the outermost loop. */
1758 else if (same_align_drs_max == same_align_drs)
1760 struct loop *ivloop0, *ivloop;
1761 ivloop0 = outermost_invariant_loop_for_expr
1762 (loop, DR_BASE_ADDRESS (dr0));
1763 ivloop = outermost_invariant_loop_for_expr
1764 (loop, DR_BASE_ADDRESS (dr));
1765 if ((ivloop && !ivloop0)
1766 || (ivloop && ivloop0
1767 && flow_loop_nested_p (ivloop, ivloop0)))
1768 dr0 = dr;
1771 one_misalignment_unknown = true;
1773 /* Check for data refs with unsupportable alignment that
1774 can be peeled. */
1775 if (!supportable_dr_alignment)
1777 one_dr_unsupportable = true;
1778 unsupportable_dr = dr;
1781 if (!first_store && DR_IS_WRITE (dr))
1782 first_store = dr;
1785 else
1787 if (!aligned_access_p (dr))
1789 if (dump_enabled_p ())
1790 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1791 "vector alignment may not be reachable\n");
1792 break;
1797 /* Check if we can possibly peel the loop. */
1798 if (!vect_can_advance_ivs_p (loop_vinfo)
1799 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1800 || loop->inner)
1801 do_peeling = false;
1803 struct _vect_peel_extended_info peel_for_known_alignment;
1804 struct _vect_peel_extended_info peel_for_unknown_alignment;
1805 struct _vect_peel_extended_info best_peel;
1807 peel_for_unknown_alignment.inside_cost = INT_MAX;
1808 peel_for_unknown_alignment.outside_cost = INT_MAX;
1809 peel_for_unknown_alignment.peel_info.count = 0;
1811 if (do_peeling
1812 && one_misalignment_unknown)
1814 /* Check whether the target prefers to align stores over loads, i.e., if
1815 misaligned stores are more expensive than misaligned loads (taking
1816 drs with same alignment into account). */
1817 unsigned int load_inside_cost = 0;
1818 unsigned int load_outside_cost = 0;
1819 unsigned int store_inside_cost = 0;
1820 unsigned int store_outside_cost = 0;
1821 unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;
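/* ESTIMATED_NPEELS is a guess at the number of iterations that will be
peeled when the misalignment is unknown: half of the vectorization
factor used for costing (e.g. 4 peeled iterations for a costing VF
of 8).  */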
1823 stmt_vector_for_cost dummy;
1824 dummy.create (2);
1825 vect_get_peeling_costs_all_drs (datarefs, dr0,
1826 &load_inside_cost,
1827 &load_outside_cost,
1828 &dummy, estimated_npeels, true);
1829 dummy.release ();
1831 if (first_store)
1833 dummy.create (2);
1834 vect_get_peeling_costs_all_drs (datarefs, first_store,
1835 &store_inside_cost,
1836 &store_outside_cost,
1837 &dummy, estimated_npeels, true);
1838 dummy.release ();
1840 else
1842 store_inside_cost = INT_MAX;
1843 store_outside_cost = INT_MAX;
1846 if (load_inside_cost > store_inside_cost
1847 || (load_inside_cost == store_inside_cost
1848 && load_outside_cost > store_outside_cost))
1850 dr0 = first_store;
1851 peel_for_unknown_alignment.inside_cost = store_inside_cost;
1852 peel_for_unknown_alignment.outside_cost = store_outside_cost;
1854 else
1856 peel_for_unknown_alignment.inside_cost = load_inside_cost;
1857 peel_for_unknown_alignment.outside_cost = load_outside_cost;
1860 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1861 prologue_cost_vec.create (2);
1862 epilogue_cost_vec.create (2);
1864 int dummy2;
1865 peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
1866 (loop_vinfo, estimated_npeels, &dummy2,
1867 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1868 &prologue_cost_vec, &epilogue_cost_vec);
1870 prologue_cost_vec.release ();
1871 epilogue_cost_vec.release ();
1873 peel_for_unknown_alignment.peel_info.count = 1
1874 + STMT_VINFO_SAME_ALIGN_REFS
1875 (vinfo_for_stmt (DR_STMT (dr0))).length ();
1878 peel_for_unknown_alignment.peel_info.npeel = 0;
1879 peel_for_unknown_alignment.peel_info.dr = dr0;
1881 best_peel = peel_for_unknown_alignment;
1883 peel_for_known_alignment.inside_cost = INT_MAX;
1884 peel_for_known_alignment.outside_cost = INT_MAX;
1885 peel_for_known_alignment.peel_info.count = 0;
1886 peel_for_known_alignment.peel_info.dr = NULL;
1888 if (do_peeling && one_misalignment_known)
1890 /* Peeling is possible, but no data access requires alignment in order
1891 to be supported. So we try to choose the best possible peeling from
1892 the hash table. */
1893 peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
1894 (&peeling_htab, loop_vinfo);
1897 /* Compare costs of peeling for known and unknown alignment. */
1898 if (peel_for_known_alignment.peel_info.dr != NULL
1899 && peel_for_unknown_alignment.inside_cost
1900 >= peel_for_known_alignment.inside_cost)
1902 best_peel = peel_for_known_alignment;
1904 /* If the best peeling for known alignment has NPEEL == 0, perform no
1905 peeling at all except if there is an unsupportable dr that we can
1906 align. */
1907 if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
1908 do_peeling = false;
1911 /* If there is an unsupportable data ref, prefer it over all choices so far,
1912 since we'd have to discard the chosen peeling anyway unless it happened
1913 to align the unsupportable data ref. */
1914 if (one_dr_unsupportable)
1915 dr0 = unsupportable_dr;
1916 else if (do_peeling)
1918 /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
1919 TODO: Use nopeel_outside_cost or get rid of it? */
1920 unsigned nopeel_inside_cost = 0;
1921 unsigned nopeel_outside_cost = 0;
1923 stmt_vector_for_cost dummy;
1924 dummy.create (2);
1925 vect_get_peeling_costs_all_drs (datarefs, NULL, &nopeel_inside_cost,
1926 &nopeel_outside_cost, &dummy, 0, false);
1927 dummy.release ();
1929 /* Add epilogue costs. As we do not peel for alignment here, no prologue
1930 costs will be recorded. */
1931 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1932 prologue_cost_vec.create (2);
1933 epilogue_cost_vec.create (2);
1935 int dummy2;
1936 nopeel_outside_cost += vect_get_known_peeling_cost
1937 (loop_vinfo, 0, &dummy2,
1938 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1939 &prologue_cost_vec, &epilogue_cost_vec);
1941 prologue_cost_vec.release ();
1942 epilogue_cost_vec.release ();
1944 npeel = best_peel.peel_info.npeel;
1945 dr0 = best_peel.peel_info.dr;
1947 /* If not peeling is no more expensive than the best peeling we
1948 have found so far, don't perform any peeling. */
1949 if (nopeel_inside_cost <= best_peel.inside_cost)
1950 do_peeling = false;
1953 if (do_peeling)
1955 stmt = DR_STMT (dr0);
1956 stmt_info = vinfo_for_stmt (stmt);
1957 vectype = STMT_VINFO_VECTYPE (stmt_info);
1959 if (known_alignment_for_access_p (dr0))
1961 bool negative = tree_int_cst_compare (DR_STEP (dr0),
1962 size_zero_node) < 0;
1963 if (!npeel)
1965 /* Since it's known at compile time, compute the number of
1966 iterations in the peeled loop (the peeling factor) for use in
1967 updating DR_MISALIGNMENT values. The peeling factor is the
1968 vectorization factor minus the misalignment as an element
1969 count. */
1970 mis = negative ? DR_MISALIGNMENT (dr0) : -DR_MISALIGNMENT (dr0);
1971 unsigned int target_align = DR_TARGET_ALIGNMENT (dr0);
1972 npeel = ((mis & (target_align - 1))
1973 / vect_get_scalar_dr_size (dr0));
1976 /* For interleaved data access every iteration accesses all the
1977 members of the group, therefore we divide the number of iterations
1978 by the group size. */
1979 stmt_info = vinfo_for_stmt (DR_STMT (dr0));
1980 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1981 npeel /= GROUP_SIZE (stmt_info);
1983 if (dump_enabled_p ())
1984 dump_printf_loc (MSG_NOTE, vect_location,
1985 "Try peeling by %d\n", npeel);
1988 /* Ensure that all datarefs can be vectorized after the peel. */
1989 if (!vect_peeling_supportable (loop_vinfo, dr0, npeel))
1990 do_peeling = false;
1992 /* Check if all datarefs are supportable and log. */
1993 if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
1995 stat = vect_verify_datarefs_alignment (loop_vinfo);
1996 if (!stat)
1997 do_peeling = false;
1998 else
1999 return stat;
2002 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
2003 if (do_peeling)
2005 unsigned max_allowed_peel
2006 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
2007 if (max_allowed_peel != (unsigned)-1)
2009 unsigned max_peel = npeel;
2010 if (max_peel == 0)
2012 unsigned int target_align = DR_TARGET_ALIGNMENT (dr0);
2013 max_peel = target_align / vect_get_scalar_dr_size (dr0) - 1;
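/* With unknown misalignment the worst case is peeling one element less
than a full vector, e.g. 16 / 4 - 1 = 3 iterations for a 16-byte
target alignment and 4-byte elements.  */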
2015 if (max_peel > max_allowed_peel)
2017 do_peeling = false;
2018 if (dump_enabled_p ())
2019 dump_printf_loc (MSG_NOTE, vect_location,
2020 "Disable peeling, max peels reached: %d\n", max_peel);
2025 /* Cost model #2 - if peeling may result in a remaining loop not
2026 iterating enough to be vectorized then do not peel. Since this
2027 is a cost heuristic rather than a correctness decision, use the
2028 most likely runtime value for variable vectorization factors. */
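/* For example, with an assumed vectorization factor of 4 and NPEEL of 0
(so a worst-case peel of 3), the loop needs at least 4 + 3 = 7 known
iterations for peeling to stay enabled.  */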
2029 if (do_peeling
2030 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2032 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2033 unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
2034 if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
2035 < assumed_vf + max_peel)
2036 do_peeling = false;
2039 if (do_peeling)
2041 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2042 If the misalignment of DR_i is identical to that of dr0 then set
2043 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
2044 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2045 by the peeling factor times the element size of DR_i (MOD the
2046 vectorization factor times the size). Otherwise, the
2047 misalignment of DR_i must be set to unknown. */
2048 FOR_EACH_VEC_ELT (datarefs, i, dr)
2049 if (dr != dr0)
2051 /* Strided accesses perform only component accesses, alignment
2052 is irrelevant for them. */
2053 stmt_info = vinfo_for_stmt (DR_STMT (dr));
2054 if (STMT_VINFO_STRIDED_P (stmt_info)
2055 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2056 continue;
2058 vect_update_misalignment_for_peel (dr, dr0, npeel);
2061 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
2062 if (npeel)
2063 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2064 else
2065 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2066 = DR_MISALIGNMENT (dr0);
2067 SET_DR_MISALIGNMENT (dr0, 0);
2068 if (dump_enabled_p ())
2070 dump_printf_loc (MSG_NOTE, vect_location,
2071 "Alignment of access forced using peeling.\n");
2072 dump_printf_loc (MSG_NOTE, vect_location,
2073 "Peeling for alignment will be applied.\n");
2076 /* The inside-loop cost will be accounted for in vectorizable_load
2077 and vectorizable_store correctly with adjusted alignments.
2078 Drop the body_cst_vec on the floor here. */
2079 stat = vect_verify_datarefs_alignment (loop_vinfo);
2080 gcc_assert (stat);
2081 return stat;
2085 /* (2) Versioning to force alignment. */
2087 /* Try versioning if:
2088 1) optimize loop for speed
2089 2) there is at least one unsupported misaligned data ref with an unknown
2090 misalignment, and
2091 3) all misaligned data refs with a known misalignment are supported, and
2092 4) the number of runtime alignment checks is within reason. */
2094 do_versioning =
2095 optimize_loop_nest_for_speed_p (loop)
2096 && (!loop->inner); /* FORNOW */
2098 if (do_versioning)
2100 FOR_EACH_VEC_ELT (datarefs, i, dr)
2102 stmt = DR_STMT (dr);
2103 stmt_info = vinfo_for_stmt (stmt);
2105 /* For interleaving, only the alignment of the first access
2106 matters. */
2107 if (aligned_access_p (dr)
2108 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2109 && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
2110 continue;
2112 if (STMT_VINFO_STRIDED_P (stmt_info))
2114 /* Strided loads perform only component accesses, alignment is
2115 irrelevant for them. */
2116 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
2117 continue;
2118 do_versioning = false;
2119 break;
2122 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
2124 if (!supportable_dr_alignment)
2126 gimple *stmt;
2127 int mask;
2128 tree vectype;
2130 if (known_alignment_for_access_p (dr)
2131 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2132 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
2134 do_versioning = false;
2135 break;
2138 stmt = DR_STMT (dr);
2139 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
2140 gcc_assert (vectype);
2142 /* The rightmost bits of an aligned address must be zeros.
2143 Construct the mask needed for this test. For example,
2144 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2145 mask must be 15 = 0xf. */
2146 mask = GET_MODE_SIZE (TYPE_MODE (vectype)) - 1;
2148 /* FORNOW: use the same mask to test all potentially unaligned
2149 references in the loop. The vectorizer currently supports
2150 a single vector size, see the reference to
2151 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
2152 vectorization factor is computed. */
2153 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
2154 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
2155 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2156 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
2157 DR_STMT (dr));
2161 /* Versioning requires at least one misaligned data reference. */
2162 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2163 do_versioning = false;
2164 else if (!do_versioning)
2165 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2168 if (do_versioning)
2170 vec<gimple *> may_misalign_stmts
2171 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2172 gimple *stmt;
2174 /* It can now be assumed that the data references in the statements
2175 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2176 of the loop being vectorized. */
2177 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
2179 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2180 dr = STMT_VINFO_DATA_REF (stmt_info);
2181 SET_DR_MISALIGNMENT (dr, 0);
2182 if (dump_enabled_p ())
2183 dump_printf_loc (MSG_NOTE, vect_location,
2184 "Alignment of access forced using versioning.\n");
2187 if (dump_enabled_p ())
2188 dump_printf_loc (MSG_NOTE, vect_location,
2189 "Versioning for alignment will be applied.\n");
2191 /* Peeling and versioning can't be done together at this time. */
2192 gcc_assert (! (do_peeling && do_versioning));
2194 stat = vect_verify_datarefs_alignment (loop_vinfo);
2195 gcc_assert (stat);
2196 return stat;
2199 /* This point is reached if neither peeling nor versioning is being done. */
2200 gcc_assert (! (do_peeling || do_versioning));
2202 stat = vect_verify_datarefs_alignment (loop_vinfo);
2203 return stat;
2207 /* Function vect_find_same_alignment_drs.
2209 Update group and alignment relations according to the chosen
2210 vectorization factor. */
2212 static void
2213 vect_find_same_alignment_drs (struct data_dependence_relation *ddr)
2215 struct data_reference *dra = DDR_A (ddr);
2216 struct data_reference *drb = DDR_B (ddr);
2217 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2218 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2220 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
2221 return;
2223 if (dra == drb)
2224 return;
2226 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)
2227 || !operand_equal_p (DR_OFFSET (dra), DR_OFFSET (drb), 0)
2228 || !operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2229 return;
2231 /* Two references with distance zero have the same alignment. */
2232 offset_int diff = (wi::to_offset (DR_INIT (dra))
2233 - wi::to_offset (DR_INIT (drb)));
2234 if (diff != 0)
2236 /* Get the wider of the two alignments. */
2237 unsigned int align_a = (vect_calculate_target_alignment (dra)
2238 / BITS_PER_UNIT);
2239 unsigned int align_b = (vect_calculate_target_alignment (drb)
2240 / BITS_PER_UNIT);
2241 unsigned int max_align = MAX (align_a, align_b);
2243 /* Require the gap to be a multiple of the larger vector alignment. */
2244 if (!wi::multiple_of_p (diff, max_align, SIGNED))
2245 return;
2248 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2249 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
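/* At this point DRA and DRB are known to have the same alignment.  For
example, with 4-byte elements and a 16-byte vector alignment, accesses
to a[i] and a[i+4] have a gap of 16 bytes, which is a multiple of the
larger alignment, so they end up in each other's same-align lists.  */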
2250 if (dump_enabled_p ())
2252 dump_printf_loc (MSG_NOTE, vect_location,
2253 "accesses have the same alignment: ");
2254 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2255 dump_printf (MSG_NOTE, " and ");
2256 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2257 dump_printf (MSG_NOTE, "\n");
2262 /* Function vect_analyze_data_refs_alignment
2264 Analyze the alignment of the data-references in the loop.
2265 Return FALSE if a data reference is found that cannot be vectorized. */
2267 bool
2268 vect_analyze_data_refs_alignment (loop_vec_info vinfo)
2270 if (dump_enabled_p ())
2271 dump_printf_loc (MSG_NOTE, vect_location,
2272 "=== vect_analyze_data_refs_alignment ===\n");
2274 /* Mark groups of data references with same alignment using
2275 data dependence information. */
2276 vec<ddr_p> ddrs = vinfo->ddrs;
2277 struct data_dependence_relation *ddr;
2278 unsigned int i;
2280 FOR_EACH_VEC_ELT (ddrs, i, ddr)
2281 vect_find_same_alignment_drs (ddr);
2283 vec<data_reference_p> datarefs = vinfo->datarefs;
2284 struct data_reference *dr;
2286 vect_record_base_alignments (vinfo);
2287 FOR_EACH_VEC_ELT (datarefs, i, dr)
2289 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
2290 if (STMT_VINFO_VECTORIZABLE (stmt_info)
2291 && !vect_compute_data_ref_alignment (dr))
2293 /* Strided accesses perform only component accesses, misalignment
2294 information is irrelevant for them. */
2295 if (STMT_VINFO_STRIDED_P (stmt_info)
2296 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2297 continue;
2299 if (dump_enabled_p ())
2300 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2301 "not vectorized: can't calculate alignment "
2302 "for data ref.\n");
2304 return false;
2308 return true;
2312 /* Analyze alignment of DRs of stmts in NODE. */
2314 static bool
2315 vect_slp_analyze_and_verify_node_alignment (slp_tree node)
2317 /* We vectorize from the first scalar stmt in the node unless
2318 the node is permuted, in which case we start from the first
2319 element in the group. */
2320 gimple *first_stmt = SLP_TREE_SCALAR_STMTS (node)[0];
2321 data_reference_p first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2322 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2323 first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_stmt));
2325 data_reference_p dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2326 if (! vect_compute_data_ref_alignment (dr)
2327 /* For creating the data-ref pointer we need alignment of the
2328 first element anyway. */
2329 || (dr != first_dr
2330 && ! vect_compute_data_ref_alignment (first_dr))
2331 || ! verify_data_ref_alignment (dr))
2333 if (dump_enabled_p ())
2334 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2335 "not vectorized: bad data alignment in basic "
2336 "block.\n");
2337 return false;
2340 return true;
2343 /* Function vect_slp_analyze_instance_alignment
2345 Analyze the alignment of the data-references in the SLP instance.
2346 Return FALSE if a data reference is found that cannot be vectorized. */
2348 bool
2349 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
2351 if (dump_enabled_p ())
2352 dump_printf_loc (MSG_NOTE, vect_location,
2353 "=== vect_slp_analyze_and_verify_instance_alignment ===\n");
2355 slp_tree node;
2356 unsigned i;
2357 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2358 if (! vect_slp_analyze_and_verify_node_alignment (node))
2359 return false;
2361 node = SLP_INSTANCE_TREE (instance);
2362 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]))
2363 && ! vect_slp_analyze_and_verify_node_alignment
2364 (SLP_INSTANCE_TREE (instance)))
2365 return false;
2367 return true;
2371 /* Analyze groups of accesses: check that DR belongs to a group of
2372 accesses of legal size, step, etc. Detect gaps, single element
2373 interleaving, and other special cases. Set grouped access info.
2374 Collect groups of strided stores for further use in SLP analysis.
2375 Worker for vect_analyze_group_access. */
2377 static bool
2378 vect_analyze_group_access_1 (struct data_reference *dr)
2380 tree step = DR_STEP (dr);
2381 tree scalar_type = TREE_TYPE (DR_REF (dr));
2382 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2383 gimple *stmt = DR_STMT (dr);
2384 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2385 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2386 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2387 HOST_WIDE_INT dr_step = -1;
2388 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2389 bool slp_impossible = false;
2391 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2392 size of the interleaving group (including gaps). */
2393 if (tree_fits_shwi_p (step))
2395 dr_step = tree_to_shwi (step);
2396 /* Check that STEP is a multiple of type size. Otherwise there is
2397 a non-element-sized gap at the end of the group which we
2398 cannot represent in GROUP_GAP or GROUP_SIZE.
2399 ??? As we can handle non-constant step fine here we should
2400 simply remove uses of GROUP_GAP between the last and first
2401 element and instead rely on DR_STEP. GROUP_SIZE then would
2402 simply not include that gap. */
2403 if ((dr_step % type_size) != 0)
2405 if (dump_enabled_p ())
2407 dump_printf_loc (MSG_NOTE, vect_location,
2408 "Step ");
2409 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2410 dump_printf (MSG_NOTE,
2411 " is not a multiple of the element size for ");
2412 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2413 dump_printf (MSG_NOTE, "\n");
2415 return false;
2417 groupsize = absu_hwi (dr_step) / type_size;
2419 else
2420 groupsize = 0;
2422 /* A non-consecutive access is possible only if it is part of an interleaving group. */
2423 if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2425 /* Check whether this DR is part of an interleaving group, and is a single
2426 element of the group that is accessed in the loop. */
2428 /* Gaps are supported only for loads. STEP must be a multiple of the type
2429 size. The size of the group must be a power of 2. */
2430 if (DR_IS_READ (dr)
2431 && (dr_step % type_size) == 0
2432 && groupsize > 0
2433 && pow2p_hwi (groupsize))
2435 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2436 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2437 GROUP_GAP (stmt_info) = groupsize - 1;
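/* For example, a load of a[4*i] with 4-byte elements has a step of 16
bytes, so GROUPSIZE is 4 and GROUP_GAP is 3 (the three elements
skipped after each access).  */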
2438 if (dump_enabled_p ())
2440 dump_printf_loc (MSG_NOTE, vect_location,
2441 "Detected single element interleaving ");
2442 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2443 dump_printf (MSG_NOTE, " step ");
2444 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2445 dump_printf (MSG_NOTE, "\n");
2448 return true;
2451 if (dump_enabled_p ())
2453 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2454 "not consecutive access ");
2455 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2458 if (bb_vinfo)
2460 /* Mark the statement as unvectorizable. */
2461 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2462 return true;
2465 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2466 STMT_VINFO_STRIDED_P (stmt_info) = true;
2467 return true;
2470 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2472 /* First stmt in the interleaving chain. Check the chain. */
2473 gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2474 struct data_reference *data_ref = dr;
2475 unsigned int count = 1;
2476 tree prev_init = DR_INIT (data_ref);
2477 gimple *prev = stmt;
2478 HOST_WIDE_INT diff, gaps = 0;
2480 while (next)
2482 /* Skip identical data-refs. In case two or more stmts share a
2483 data-ref (supported only for loads), we vectorize only the first
2484 stmt, and the rest get their vectorized loads from the first
2485 one. */
2486 if (!tree_int_cst_compare (DR_INIT (data_ref),
2487 DR_INIT (STMT_VINFO_DATA_REF (
2488 vinfo_for_stmt (next)))))
2490 if (DR_IS_WRITE (data_ref))
2492 if (dump_enabled_p ())
2493 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2494 "Two store stmts share the same dr.\n");
2495 return false;
2498 if (dump_enabled_p ())
2499 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2500 "Two or more load stmts share the same dr.\n");
2502 /* For loads, use the same data-ref load. */
2503 GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2505 prev = next;
2506 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2507 continue;
2510 prev = next;
2511 data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2513 /* All group members have the same STEP by construction. */
2514 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2516 /* Check that the distance between two accesses is equal to the type
2517 size. Otherwise, we have gaps. */
2518 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2519 - TREE_INT_CST_LOW (prev_init)) / type_size;
2520 if (diff != 1)
2522 /* FORNOW: SLP of accesses with gaps is not supported. */
2523 slp_impossible = true;
2524 if (DR_IS_WRITE (data_ref))
2526 if (dump_enabled_p ())
2527 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2528 "interleaved store with gaps\n");
2529 return false;
2532 gaps += diff - 1;
2535 last_accessed_element += diff;
2537 /* Store the gap from the previous member of the group. If there is no
2538 gap in the access, GROUP_GAP is always 1. */
2539 GROUP_GAP (vinfo_for_stmt (next)) = diff;
2541 prev_init = DR_INIT (data_ref);
2542 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2543 /* Count the number of data-refs in the chain. */
2544 count++;
2547 if (groupsize == 0)
2548 groupsize = count + gaps;
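/* For example, if the step is not constant and the group's loads sit at
byte offsets 0, 4 and 12 (4-byte elements), the diffs are 1 and 2, so
COUNT is 3, GAPS is 1 and GROUPSIZE becomes 4.  */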
2550 /* This could be UINT_MAX but as we are generating code in a very
2551 inefficient way we have to cap earlier. See PR78699 for example. */
2552 if (groupsize > 4096)
2554 if (dump_enabled_p ())
2555 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2556 "group is too large\n");
2557 return false;
2560 /* Check that the size of the interleaving is equal to count for stores,
2561 i.e., that there are no gaps. */
2562 if (groupsize != count
2563 && !DR_IS_READ (dr))
2565 if (dump_enabled_p ())
2566 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2567 "interleaved store with gaps\n");
2568 return false;
2571 /* If there is a gap after the last load in the group it is the
2572 difference between the groupsize and the last accessed
2573 element.
2574 When there is no gap, this difference should be 0. */
2575 GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - last_accessed_element;
2577 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2578 if (dump_enabled_p ())
2580 dump_printf_loc (MSG_NOTE, vect_location,
2581 "Detected interleaving ");
2582 if (DR_IS_READ (dr))
2583 dump_printf (MSG_NOTE, "load ");
2584 else
2585 dump_printf (MSG_NOTE, "store ");
2586 dump_printf (MSG_NOTE, "of size %u starting with ",
2587 (unsigned)groupsize);
2588 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
2589 if (GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
2590 dump_printf_loc (MSG_NOTE, vect_location,
2591 "There is a gap of %u elements after the group\n",
2592 GROUP_GAP (vinfo_for_stmt (stmt)));
2595 /* SLP: create an SLP data structure for every interleaving group of
2596 stores for further analysis in vect_analyze_slp. */
2597 if (DR_IS_WRITE (dr) && !slp_impossible)
2599 if (loop_vinfo)
2600 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2601 if (bb_vinfo)
2602 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2606 return true;
2609 /* Analyze groups of accesses: check that DR belongs to a group of
2610 accesses of legal size, step, etc. Detect gaps, single element
2611 interleaving, and other special cases. Set grouped access info.
2612 Collect groups of strided stores for further use in SLP analysis. */
2614 static bool
2615 vect_analyze_group_access (struct data_reference *dr)
2617 if (!vect_analyze_group_access_1 (dr))
2619 /* Dissolve the group if present. */
2620 gimple *next;
2621 gimple *stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dr)));
2622 while (stmt)
2624 stmt_vec_info vinfo = vinfo_for_stmt (stmt);
2625 next = GROUP_NEXT_ELEMENT (vinfo);
2626 GROUP_FIRST_ELEMENT (vinfo) = NULL;
2627 GROUP_NEXT_ELEMENT (vinfo) = NULL;
2628 stmt = next;
2630 return false;
2632 return true;
2635 /* Analyze the access pattern of the data-reference DR.
2636 In case of non-consecutive accesses call vect_analyze_group_access() to
2637 analyze groups of accesses. */
2639 static bool
2640 vect_analyze_data_ref_access (struct data_reference *dr)
2642 tree step = DR_STEP (dr);
2643 tree scalar_type = TREE_TYPE (DR_REF (dr));
2644 gimple *stmt = DR_STMT (dr);
2645 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2646 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2647 struct loop *loop = NULL;
2649 if (loop_vinfo)
2650 loop = LOOP_VINFO_LOOP (loop_vinfo);
2652 if (loop_vinfo && !step)
2654 if (dump_enabled_p ())
2655 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2656 "bad data-ref access in loop\n");
2657 return false;
2660 /* Allow loads with zero step in inner-loop vectorization. */
2661 if (loop_vinfo && integer_zerop (step))
2663 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2664 if (!nested_in_vect_loop_p (loop, stmt))
2665 return DR_IS_READ (dr);
2666 /* Allow references with zero step only for outer loops marked
2667 with pragma omp simd - this guarantees the absence of
2668 loop-carried dependencies between inner loop iterations. */
2669 if (!loop->force_vectorize)
2671 if (dump_enabled_p ())
2672 dump_printf_loc (MSG_NOTE, vect_location,
2673 "zero step in inner loop of nest\n");
2674 return false;
2678 if (loop && nested_in_vect_loop_p (loop, stmt))
2680 /* Interleaved accesses are not yet supported within outer-loop
2681 vectorization for references in the inner-loop. */
2682 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2684 /* For the rest of the analysis we use the outer-loop step. */
2685 step = STMT_VINFO_DR_STEP (stmt_info);
2686 if (integer_zerop (step))
2688 if (dump_enabled_p ())
2689 dump_printf_loc (MSG_NOTE, vect_location,
2690 "zero step in outer loop.\n");
2691 return DR_IS_READ (dr);
2695 /* Consecutive? */
2696 if (TREE_CODE (step) == INTEGER_CST)
2698 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2699 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2700 || (dr_step < 0
2701 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2703 /* Mark that it is not interleaving. */
2704 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2705 return true;
2709 if (loop && nested_in_vect_loop_p (loop, stmt))
2711 if (dump_enabled_p ())
2712 dump_printf_loc (MSG_NOTE, vect_location,
2713 "grouped access in outer loop.\n");
2714 return false;
2718 /* Assume this is a DR handled by the non-constant strided load case. */
2719 if (TREE_CODE (step) != INTEGER_CST)
2720 return (STMT_VINFO_STRIDED_P (stmt_info)
2721 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2722 || vect_analyze_group_access (dr)));
2724 /* Not a consecutive access - check whether it's part of an interleaving group. */
2725 return vect_analyze_group_access (dr);
2728 /* Compare two data-references DRA and DRB to sort them into chunks
2729 suitable for forming interleaving groups. */
2731 static int
2732 dr_group_sort_cmp (const void *dra_, const void *drb_)
2734 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2735 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2736 int cmp;
2738 /* Stabilize sort. */
2739 if (dra == drb)
2740 return 0;
2742 /* DRs in different loops never belong to the same group. */
2743 loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father;
2744 loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father;
2745 if (loopa != loopb)
2746 return loopa->num < loopb->num ? -1 : 1;
2748 /* Ordering of DRs according to base. */
2749 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2750 DR_BASE_ADDRESS (drb));
2751 if (cmp != 0)
2752 return cmp;
2754 /* And according to DR_OFFSET. */
2755 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2756 if (cmp != 0)
2757 return cmp;
2759 /* Put reads before writes. */
2760 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2761 return DR_IS_READ (dra) ? -1 : 1;
2763 /* Then sort by access size. */
2764 cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2765 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2766 if (cmp != 0)
2767 return cmp;
2769 /* And by step. */
2770 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2771 if (cmp != 0)
2772 return cmp;
2774 /* Then sort by DR_INIT. In case of identical DRs sort by stmt UID. */
2775 cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
2776 if (cmp == 0)
2777 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2778 return cmp;
2781 /* Function vect_analyze_data_ref_accesses.
2783 Analyze the access pattern of all the data references in the loop.
2785 FORNOW: the only access pattern that is considered vectorizable is a
2786 simple step 1 (consecutive) access.
2788 FORNOW: handle only arrays and pointer accesses. */
2790 bool
2791 vect_analyze_data_ref_accesses (vec_info *vinfo)
2793 unsigned int i;
2794 vec<data_reference_p> datarefs = vinfo->datarefs;
2795 struct data_reference *dr;
2797 if (dump_enabled_p ())
2798 dump_printf_loc (MSG_NOTE, vect_location,
2799 "=== vect_analyze_data_ref_accesses ===\n");
2801 if (datarefs.is_empty ())
2802 return true;
2804 /* Sort the array of datarefs to make building the interleaving chains
2805 linear. Don't modify the original vector's order, it is needed for
2806 determining what dependencies are reversed. */
2807 vec<data_reference_p> datarefs_copy = datarefs.copy ();
2808 datarefs_copy.qsort (dr_group_sort_cmp);
2810 /* Build the interleaving chains. */
2811 for (i = 0; i < datarefs_copy.length () - 1;)
2813 data_reference_p dra = datarefs_copy[i];
2814 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2815 stmt_vec_info lastinfo = NULL;
2816 if (! STMT_VINFO_VECTORIZABLE (stmtinfo_a))
2818 ++i;
2819 continue;
2821 for (i = i + 1; i < datarefs_copy.length (); ++i)
2823 data_reference_p drb = datarefs_copy[i];
2824 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2825 if (! STMT_VINFO_VECTORIZABLE (stmtinfo_b))
2826 break;
2828 /* ??? Imperfect sorting (non-compatible types, non-modulo
2829 accesses, same accesses) can lead to a group being artificially
2830 split here, as we don't just skip over those. If it really
2831 matters we can push those to a worklist and re-iterate
2832 over them. Then we can just skip ahead to the next DR here. */
2834 /* DRs in a different loop should not be put into the same
2835 interleaving group. */
2836 if (gimple_bb (DR_STMT (dra))->loop_father
2837 != gimple_bb (DR_STMT (drb))->loop_father)
2838 break;
2840 /* Check that the data-refs have same first location (except init)
2841 and they are both either store or load (not load and store,
2842 not masked loads or stores). */
2843 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2844 || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2845 DR_BASE_ADDRESS (drb)) != 0
2846 || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
2847 || !gimple_assign_single_p (DR_STMT (dra))
2848 || !gimple_assign_single_p (DR_STMT (drb)))
2849 break;
2851 /* Check that the data-refs have the same constant size. */
2852 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2853 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2854 if (!tree_fits_uhwi_p (sza)
2855 || !tree_fits_uhwi_p (szb)
2856 || !tree_int_cst_equal (sza, szb))
2857 break;
2859 /* Check that the data-refs have the same step. */
2860 if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
2861 break;
2863 /* Check the types are compatible.
2864 ??? We don't distinguish this during sorting. */
2865 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2866 TREE_TYPE (DR_REF (drb))))
2867 break;
2869 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
2870 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2871 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2872 HOST_WIDE_INT init_prev
2873 = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]));
2874 gcc_assert (init_a <= init_b
2875 && init_a <= init_prev
2876 && init_prev <= init_b);
2878 /* Do not place the same access in the interleaving chain twice. */
2879 if (init_b == init_prev)
2881 gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]))
2882 < gimple_uid (DR_STMT (drb)));
2883 /* ??? For now we simply "drop" the later reference which is
2884 otherwise the same rather than finishing off this group.
2885 In the end we'd want to re-process duplicates forming
2886 multiple groups from the refs, likely by just collecting
2887 all candidates (including duplicates and split points
2888 below) in a vector and then process them together. */
2889 continue;
2892 /* If init_b == init_a + the size of the type * k, we have an
2893 interleaving, and DRA is accessed before DRB. */
2894 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
2895 if (type_size_a == 0
2896 || (init_b - init_a) % type_size_a != 0)
2897 break;
2899 /* If we have a store, the accesses must be adjacent. This splits
2900 groups into chunks we support (we don't support vectorization
2901 of stores with gaps). */
2902 if (!DR_IS_READ (dra) && init_b - init_prev != type_size_a)
2903 break;
2905 /* If the step (when constant and nonzero) is not greater than the
2906 difference between the data-refs' inits, start a new group here;
2907 this splits groups into suitable sizes. */
2908 if (tree_fits_shwi_p (DR_STEP (dra)))
2910 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
2911 if (step != 0 && step <= (init_b - init_a))
2912 break;
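/* At this point DRA and DRB belong to the same interleaving chain.  For
example, loads of a[2*i] and a[2*i+1] (4-byte elements, step 8, inits
0 and 4) pass all of the checks above.  */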
2915 if (dump_enabled_p ())
2917 dump_printf_loc (MSG_NOTE, vect_location,
2918 "Detected interleaving ");
2919 if (DR_IS_READ (dra))
2920 dump_printf (MSG_NOTE, "load ");
2921 else
2922 dump_printf (MSG_NOTE, "store ");
2923 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2924 dump_printf (MSG_NOTE, " and ");
2925 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2926 dump_printf (MSG_NOTE, "\n");
2929 /* Link the found element into the group list. */
2930 if (!GROUP_FIRST_ELEMENT (stmtinfo_a))
2932 GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
2933 lastinfo = stmtinfo_a;
2935 GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
2936 GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
2937 lastinfo = stmtinfo_b;
2941 FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
2942 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
2943 && !vect_analyze_data_ref_access (dr))
2945 if (dump_enabled_p ())
2946 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2947 "not vectorized: complicated access pattern.\n");
2949 if (is_a <bb_vec_info> (vinfo))
2951 /* Mark the statement as not vectorizable. */
2952 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2953 continue;
2955 else
2957 datarefs_copy.release ();
2958 return false;
2962 datarefs_copy.release ();
2963 return true;
2966 /* Function vect_vfa_segment_size.
2968 Create an expression that computes the size of the segment
2969 that will be accessed for a data reference. The function takes into
2970 account that realignment loads may access one more vector.
2972 Input:
2973 DR: The data reference.
2974 LENGTH_FACTOR: segment length to consider.
2976 Return an expression whose value is the size of segment which will be
2977 accessed by DR. */
2979 static tree
2980 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
2982 tree segment_length;
2984 if (integer_zerop (DR_STEP (dr)))
2985 segment_length = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
2986 else
2987 segment_length = size_binop (MULT_EXPR,
2988 fold_convert (sizetype, DR_STEP (dr)),
2989 fold_convert (sizetype, length_factor));
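/* For example, a data reference with a 4-byte step and a LENGTH_FACTOR
equal to a vectorization factor of 8 gets a 32-byte segment; the
realignment case below adds one more vector to that.  */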
2991 if (vect_supportable_dr_alignment (dr, false)
2992 == dr_explicit_realign_optimized)
2994 tree vector_size = TYPE_SIZE_UNIT
2995 (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))));
2997 segment_length = size_binop (PLUS_EXPR, segment_length, vector_size);
2999 return segment_length;
3002 /* Function vect_compile_time_alias.
3004 Given data references A and B with equal base and offset, see whether
3005 the alias relation can be decided at compilation time. Return 1 if
3006 it can and the references alias, 0 if it can and the references do
3007 not alias, and -1 if we cannot decide at compile time. SEGMENT_LENGTH_A
3008 and SEGMENT_LENGTH_B are the memory lengths accessed by A and B
3009 respectively. */
3011 static int
3012 vect_compile_time_alias (struct data_reference *a, struct data_reference *b,
3013 tree segment_length_a, tree segment_length_b)
3015 poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a));
3016 poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b));
3017 poly_uint64 const_length_a;
3018 poly_uint64 const_length_b;
3020 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3021 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3022 [a, a+12) */
3023 if (tree_int_cst_compare (DR_STEP (a), size_zero_node) < 0)
3025 const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
3026 offset_a = (offset_a + vect_get_scalar_dr_size (a)) - const_length_a;
3028 else
3029 const_length_a = tree_to_poly_uint64 (segment_length_a);
3030 if (tree_int_cst_compare (DR_STEP (b), size_zero_node) < 0)
3032 const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
3033 offset_b = (offset_b + vect_get_scalar_dr_size (b)) - const_length_b;
3035 else
3036 const_length_b = tree_to_poly_uint64 (segment_length_b);
3038 if (ranges_known_overlap_p (offset_a, const_length_a,
3039 offset_b, const_length_b))
3040 return 1;
3042 if (!ranges_maybe_overlap_p (offset_a, const_length_a,
3043 offset_b, const_length_b))
3044 return 0;
3046 return -1;
3049 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3050 in DDR is >= VF. */
3052 static bool
3053 dependence_distance_ge_vf (data_dependence_relation *ddr,
3054 unsigned int loop_depth, poly_uint64 vf)
3056 if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3057 || DDR_NUM_DIST_VECTS (ddr) == 0)
3058 return false;
3060 /* If the dependence is exact, we should have limited the VF instead. */
3061 gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3063 unsigned int i;
3064 lambda_vector dist_v;
3065 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3067 HOST_WIDE_INT dist = dist_v[loop_depth];
3068 if (dist != 0
3069 && !(dist > 0 && DDR_REVERSED_P (ddr))
3070 && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
3071 return false;
3074 if (dump_enabled_p ())
3076 dump_printf_loc (MSG_NOTE, vect_location,
3077 "dependence distance between ");
3078 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
3079 dump_printf (MSG_NOTE, " and ");
3080 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
3081 dump_printf (MSG_NOTE, " is >= VF\n");
3084 return true;
3087 /* Function vect_prune_runtime_alias_test_list.
3089 Prune a list of ddrs to be tested at run-time by versioning for alias.
3090 Merge several alias checks into one if possible.
3091 Return FALSE if the resulting list of ddrs is longer than allowed by
3092 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
3094 bool
3095 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3097 typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3098 hash_set <tree_pair_hash> compared_objects;
3100 vec<ddr_p> may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3101 vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3102 = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3103 vec<vec_object_pair> &check_unequal_addrs
3104 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3105 poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3106 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3108 ddr_p ddr;
3109 unsigned int i;
3110 tree length_factor;
3112 if (dump_enabled_p ())
3113 dump_printf_loc (MSG_NOTE, vect_location,
3114 "=== vect_prune_runtime_alias_test_list ===\n");
3116 if (may_alias_ddrs.is_empty ())
3117 return true;
3119 comp_alias_ddrs.create (may_alias_ddrs.length ());
3121 unsigned int loop_depth
3122 = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3123 LOOP_VINFO_LOOP_NEST (loop_vinfo));
3125 /* First, we collect all data ref pairs for aliasing checks. */
3126 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3128 int comp_res;
3129 struct data_reference *dr_a, *dr_b;
3130 gimple *dr_group_first_a, *dr_group_first_b;
3131 tree segment_length_a, segment_length_b;
3132 gimple *stmt_a, *stmt_b;
3134 /* Ignore the alias if the VF we chose ended up being no greater
3135 than the dependence distance. */
3136 if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3137 continue;
3139 if (DDR_OBJECT_A (ddr))
3141 vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3142 if (!compared_objects.add (new_pair))
3144 if (dump_enabled_p ())
3146 dump_printf_loc (MSG_NOTE, vect_location, "checking that ");
3147 dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.first);
3148 dump_printf (MSG_NOTE, " and ");
3149 dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.second);
3150 dump_printf (MSG_NOTE, " have different addresses\n");
3152 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3154 continue;
3157 dr_a = DDR_A (ddr);
3158 stmt_a = DR_STMT (DDR_A (ddr));
3159 dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
3160 if (dr_group_first_a)
3162 stmt_a = dr_group_first_a;
3163 dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
3166 dr_b = DDR_B (ddr);
3167 stmt_b = DR_STMT (DDR_B (ddr));
3168 dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
3169 if (dr_group_first_b)
3171 stmt_b = dr_group_first_b;
3172 dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
3175 if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
3176 length_factor = scalar_loop_iters;
3177 else
3178 length_factor = size_int (vect_factor);
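/* With equal steps the gap between the two references is loop-invariant,
so it is enough to check a segment covering VECT_FACTOR scalar
iterations' worth of accesses; with different steps the gap changes
every iteration, so the whole scalar iteration count is used.  */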
3179 segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
3180 segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
3182 comp_res = data_ref_compare_tree (DR_BASE_ADDRESS (dr_a),
3183 DR_BASE_ADDRESS (dr_b));
3184 if (comp_res == 0)
3185 comp_res = data_ref_compare_tree (DR_OFFSET (dr_a),
3186 DR_OFFSET (dr_b));
3188 /* See whether the alias is known at compilation time. */
3189 if (comp_res == 0
3190 && TREE_CODE (DR_STEP (dr_a)) == INTEGER_CST
3191 && TREE_CODE (DR_STEP (dr_b)) == INTEGER_CST
3192 && poly_int_tree_p (segment_length_a)
3193 && poly_int_tree_p (segment_length_b))
3195 int res = vect_compile_time_alias (dr_a, dr_b,
3196 segment_length_a,
3197 segment_length_b);
3198 if (res == 0)
3199 continue;
3201 if (res == 1)
3203 if (dump_enabled_p ())
3204 dump_printf_loc (MSG_NOTE, vect_location,
3205 "not vectorized: compilation time alias.\n");
3206 return false;
3210 dr_with_seg_len_pair_t dr_with_seg_len_pair
3211 (dr_with_seg_len (dr_a, segment_length_a),
3212 dr_with_seg_len (dr_b, segment_length_b));
3214 /* Canonicalize pairs by sorting the two DR members. */
3215 if (comp_res > 0)
3216 std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
3218 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3221 prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);
3223 unsigned int count = (comp_alias_ddrs.length ()
3224 + check_unequal_addrs.length ());
3225 dump_printf_loc (MSG_NOTE, vect_location,
3226 "improved number of alias checks from %d to %d\n",
3227 may_alias_ddrs.length (), count);
3228 if ((int) count > PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
3230 if (dump_enabled_p ())
3231 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3232 "number of versioning for alias "
3233 "run-time tests exceeds %d "
3234 "(--param vect-max-version-for-alias-checks)\n",
3235 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
3236 return false;
3239 return true;
3242 /* Return true if a non-affine read or write in STMT is suitable for a
3243 gather load or scatter store. Describe the operation in *INFO if so. */
3245 bool
3246 vect_check_gather_scatter (gimple *stmt, loop_vec_info loop_vinfo,
3247 gather_scatter_info *info)
3249 HOST_WIDE_INT scale = 1;
3250 poly_int64 pbitpos, pbitsize;
3251 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3252 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3253 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3254 tree offtype = NULL_TREE;
3255 tree decl, base, off;
3256 machine_mode pmode;
3257 int punsignedp, reversep, pvolatilep = 0;
3259 base = DR_REF (dr);
3260 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3261 see if we can use the def stmt of the address. */
3262 if (is_gimple_call (stmt)
3263 && gimple_call_internal_p (stmt)
3264 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
3265 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
3266 && TREE_CODE (base) == MEM_REF
3267 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3268 && integer_zerop (TREE_OPERAND (base, 1))
3269 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3271 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3272 if (is_gimple_assign (def_stmt)
3273 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3274 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3277 /* The gather and scatter builtins need address of the form
3278 loop_invariant + vector * {1, 2, 4, 8}
3280 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3281 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3282 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3283 multiplications and additions in it. To get a vector, we need
3284 a single SSA_NAME that will be defined in the loop and will
3285 contain everything that is not loop invariant and that can be
3286 vectorized. The following code attempts to find such a preexisting
3287 SSA_NAME OFF and put the loop invariants into a tree BASE
3288 that can be gimplified before the loop. */
3289 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3290 &punsignedp, &reversep, &pvolatilep);
3291 gcc_assert (base && !reversep);
3292 poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
3294 if (TREE_CODE (base) == MEM_REF)
3296 if (!integer_zerop (TREE_OPERAND (base, 1)))
3298 if (off == NULL_TREE)
3299 off = wide_int_to_tree (sizetype, mem_ref_offset (base));
3300 else
3301 off = size_binop (PLUS_EXPR, off,
3302 fold_convert (sizetype, TREE_OPERAND (base, 1)));
3304 base = TREE_OPERAND (base, 0);
3306 else
3307 base = build_fold_addr_expr (base);
3309 if (off == NULL_TREE)
3310 off = size_zero_node;
3312 /* If base is not loop invariant, then if off is 0 we start with just
3313 the constant offset in the loop invariant BASE and continue with base
3314 as OFF; otherwise give up.
3315 We could handle that case by gimplifying the addition of base + off
3316 into some SSA_NAME and using that as off, but for now punt. */
3317 if (!expr_invariant_in_loop_p (loop, base))
3319 if (!integer_zerop (off))
3320 return false;
3321 off = base;
3322 base = size_int (pbytepos);
3324 /* Otherwise put base + constant offset into the loop invariant BASE
3325 and continue with OFF. */
3326 else
3328 base = fold_convert (sizetype, base);
3329 base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
3332 /* OFF at this point may be either an SSA_NAME or some tree expression
3333 from get_inner_reference. Try to peel off loop invariants from it
3334 into BASE as long as possible. */
3335 STRIP_NOPS (off);
3336 while (offtype == NULL_TREE)
3338 enum tree_code code;
3339 tree op0, op1, add = NULL_TREE;
3341 if (TREE_CODE (off) == SSA_NAME)
3343 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3345 if (expr_invariant_in_loop_p (loop, off))
3346 return false;
3348 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3349 break;
3351 op0 = gimple_assign_rhs1 (def_stmt);
3352 code = gimple_assign_rhs_code (def_stmt);
3353 op1 = gimple_assign_rhs2 (def_stmt);
3355 else
3357 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3358 return false;
3359 code = TREE_CODE (off);
3360 extract_ops_from_tree (off, &code, &op0, &op1);
3362 switch (code)
3364 case POINTER_PLUS_EXPR:
3365 case PLUS_EXPR:
3366 if (expr_invariant_in_loop_p (loop, op0))
3368 add = op0;
3369 off = op1;
3370 do_add:
3371 add = fold_convert (sizetype, add);
3372 if (scale != 1)
3373 add = size_binop (MULT_EXPR, add, size_int (scale));
3374 base = size_binop (PLUS_EXPR, base, add);
3375 continue;
3377 if (expr_invariant_in_loop_p (loop, op1))
3379 add = op1;
3380 off = op0;
3381 goto do_add;
3383 break;
3384 case MINUS_EXPR:
3385 if (expr_invariant_in_loop_p (loop, op1))
3387 add = fold_convert (sizetype, op1);
3388 add = size_binop (MINUS_EXPR, size_zero_node, add);
3389 off = op0;
3390 goto do_add;
3392 break;
3393 case MULT_EXPR:
3394 if (scale == 1 && tree_fits_shwi_p (op1))
3396 scale = tree_to_shwi (op1);
3397 off = op0;
3398 continue;
3400 break;
3401 case SSA_NAME:
3402 off = op0;
3403 continue;
3404 CASE_CONVERT:
3405 if (!POINTER_TYPE_P (TREE_TYPE (op0))
3406 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3407 break;
3408 if (TYPE_PRECISION (TREE_TYPE (op0))
3409 == TYPE_PRECISION (TREE_TYPE (off)))
3411 off = op0;
3412 continue;
3414 if (TYPE_PRECISION (TREE_TYPE (op0))
3415 < TYPE_PRECISION (TREE_TYPE (off)))
3417 off = op0;
3418 offtype = TREE_TYPE (off);
3419 STRIP_NOPS (off);
3420 continue;
3422 break;
3423 default:
3424 break;
3426 break;
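/* For instance, for an offset expression I * 4 + 32 with I an SSA_NAME
defined in the loop, the walk above peels the invariants off: SCALE
becomes 4, BASE grows by 32 and OFF ends up as I.  */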
3429 /* If at the end OFF still isn't an SSA_NAME or isn't
3430 defined in the loop, punt. */
3431 if (TREE_CODE (off) != SSA_NAME
3432 || expr_invariant_in_loop_p (loop, off))
3433 return false;
3435 if (offtype == NULL_TREE)
3436 offtype = TREE_TYPE (off);
3438 if (DR_IS_READ (dr))
3439 decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info),
3440 offtype, scale);
3441 else
3442 decl = targetm.vectorize.builtin_scatter (STMT_VINFO_VECTYPE (stmt_info),
3443 offtype, scale);
3445 if (decl == NULL_TREE)
3446 return false;
3448 info->decl = decl;
3449 info->base = base;
3450 info->offset = off;
3451 info->offset_dt = vect_unknown_def_type;
3452 info->offset_vectype = NULL_TREE;
3453 info->scale = scale;
3454 return true;
3457 /* Function vect_analyze_data_refs.
3459 Find all the data references in the loop or basic block.
3461 The general structure of the analysis of data refs in the vectorizer is as
3462 follows:
3463 1- vect_analyze_data_refs(loop/bb): call
3464 compute_data_dependences_for_loop/bb to find and analyze all data-refs
3465 in the loop/bb and their dependences.
3466 2- vect_analyze_dependences(): apply dependence testing using ddrs.
3467 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
3468 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
3472 bool
3473 vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf)
3475 struct loop *loop = NULL;
3476 unsigned int i;
3477 struct data_reference *dr;
3478 tree scalar_type;
3480 if (dump_enabled_p ())
3481 dump_printf_loc (MSG_NOTE, vect_location,
3482 "=== vect_analyze_data_refs ===\n");
3484 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3485 loop = LOOP_VINFO_LOOP (loop_vinfo);
3487 /* Go through the data-refs, check that the analysis succeeded. Update
3488 pointer from stmt_vec_info struct to DR and vectype. */
3490 vec<data_reference_p> datarefs = vinfo->datarefs;
3491 FOR_EACH_VEC_ELT (datarefs, i, dr)
3493 gimple *stmt;
3494 stmt_vec_info stmt_info;
3495 tree base, offset, init;
3496 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
3497 bool simd_lane_access = false;
3498 poly_uint64 vf;
3500 again:
3501 if (!dr || !DR_REF (dr))
3503 if (dump_enabled_p ())
3504 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3505 "not vectorized: unhandled data-ref\n");
3506 return false;
3509 stmt = DR_STMT (dr);
3510 stmt_info = vinfo_for_stmt (stmt);
3512 /* Discard clobbers from the dataref vector. We will remove
3513 clobber stmts during vectorization. */
3514 if (gimple_clobber_p (stmt))
3516 free_data_ref (dr);
3517 if (i == datarefs.length () - 1)
3519 datarefs.pop ();
3520 break;
3522 datarefs.ordered_remove (i);
3523 dr = datarefs[i];
3524 goto again;
3527 /* Check that analysis of the data-ref succeeded. */
3528 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
3529 || !DR_STEP (dr))
3531 bool maybe_gather
3532 = DR_IS_READ (dr)
3533 && !TREE_THIS_VOLATILE (DR_REF (dr))
3534 && targetm.vectorize.builtin_gather != NULL;
3535 bool maybe_scatter
3536 = DR_IS_WRITE (dr)
3537 && !TREE_THIS_VOLATILE (DR_REF (dr))
3538 && targetm.vectorize.builtin_scatter != NULL;
3539 bool maybe_simd_lane_access
3540 = is_a <loop_vec_info> (vinfo) && loop->simduid;
3542 	  /* If the target supports vector gather loads or scatter stores,
3543 	     or if this might be a SIMD lane access, see if they can be used.  */
3544 if (is_a <loop_vec_info> (vinfo)
3545 && (maybe_gather || maybe_scatter || maybe_simd_lane_access)
3546 && !nested_in_vect_loop_p (loop, stmt))
3548 struct data_reference *newdr
3549 = create_data_ref (NULL, loop_containing_stmt (stmt),
3550 DR_REF (dr), stmt, !maybe_scatter,
3551 DR_IS_CONDITIONAL_IN_STMT (dr));
3552 gcc_assert (newdr != NULL && DR_REF (newdr));
3553 if (DR_BASE_ADDRESS (newdr)
3554 && DR_OFFSET (newdr)
3555 && DR_INIT (newdr)
3556 && DR_STEP (newdr)
3557 && integer_zerop (DR_STEP (newdr)))
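	      /* A SIMD lane access is expected to have a zero step and an
		 offset of the form lane * element_size, where lane is the
		 result of an IFN_GOMP_SIMD_LANE call for this loop's
		 simduid (possibly via a widening conversion).  */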
3559 if (maybe_simd_lane_access)
3561 tree off = DR_OFFSET (newdr);
3562 STRIP_NOPS (off);
3563 if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
3564 && TREE_CODE (off) == MULT_EXPR
3565 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
3567 tree step = TREE_OPERAND (off, 1);
3568 off = TREE_OPERAND (off, 0);
3569 STRIP_NOPS (off);
3570 if (CONVERT_EXPR_P (off)
3571 && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
3572 0)))
3573 < TYPE_PRECISION (TREE_TYPE (off)))
3574 off = TREE_OPERAND (off, 0);
3575 if (TREE_CODE (off) == SSA_NAME)
3577 gimple *def = SSA_NAME_DEF_STMT (off);
3578 tree reft = TREE_TYPE (DR_REF (newdr));
3579 if (is_gimple_call (def)
3580 && gimple_call_internal_p (def)
3581 && (gimple_call_internal_fn (def)
3582 == IFN_GOMP_SIMD_LANE))
3584 tree arg = gimple_call_arg (def, 0);
3585 gcc_assert (TREE_CODE (arg) == SSA_NAME);
3586 arg = SSA_NAME_VAR (arg);
3587 if (arg == loop->simduid
3588 /* For now. */
3589 && tree_int_cst_equal
3590 (TYPE_SIZE_UNIT (reft),
3591 step))
3593 DR_OFFSET (newdr) = ssize_int (0);
3594 DR_STEP (newdr) = step;
3595 DR_OFFSET_ALIGNMENT (newdr)
3596 = BIGGEST_ALIGNMENT;
3597 DR_STEP_ALIGNMENT (newdr)
3598 = highest_pow2_factor (step);
3599 dr = newdr;
3600 simd_lane_access = true;
3606 if (!simd_lane_access && (maybe_gather || maybe_scatter))
3608 dr = newdr;
3609 if (maybe_gather)
3610 gatherscatter = GATHER;
3611 else
3612 gatherscatter = SCATTER;
3615 if (gatherscatter == SG_NONE && !simd_lane_access)
3616 free_data_ref (newdr);
3619 if (gatherscatter == SG_NONE && !simd_lane_access)
3621 if (dump_enabled_p ())
3623 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3624 "not vectorized: data ref analysis "
3625 "failed ");
3626 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3629 if (is_a <bb_vec_info> (vinfo))
3630 break;
3632 return false;
3636 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
3638 if (dump_enabled_p ())
3639 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3640 "not vectorized: base addr of dr is a "
3641 "constant\n");
3643 if (is_a <bb_vec_info> (vinfo))
3644 break;
3646 if (gatherscatter != SG_NONE || simd_lane_access)
3647 free_data_ref (dr);
3648 return false;
3651 if (TREE_THIS_VOLATILE (DR_REF (dr)))
3653 if (dump_enabled_p ())
3655 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3656 "not vectorized: volatile type ");
3657 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3660 if (is_a <bb_vec_info> (vinfo))
3661 break;
3663 return false;
3666 if (stmt_can_throw_internal (stmt))
3668 if (dump_enabled_p ())
3670 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3671 "not vectorized: statement can throw an "
3672 "exception ");
3673 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3676 if (is_a <bb_vec_info> (vinfo))
3677 break;
3679 if (gatherscatter != SG_NONE || simd_lane_access)
3680 free_data_ref (dr);
3681 return false;
3684 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
3685 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
3687 if (dump_enabled_p ())
3689 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3690 "not vectorized: statement is bitfield "
3691 "access ");
3692 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3695 if (is_a <bb_vec_info> (vinfo))
3696 break;
3698 if (gatherscatter != SG_NONE || simd_lane_access)
3699 free_data_ref (dr);
3700 return false;
3703 base = unshare_expr (DR_BASE_ADDRESS (dr));
3704 offset = unshare_expr (DR_OFFSET (dr));
3705 init = unshare_expr (DR_INIT (dr));
3707 if (is_gimple_call (stmt)
3708 && (!gimple_call_internal_p (stmt)
3709 || (gimple_call_internal_fn (stmt) != IFN_MASK_LOAD
3710 && gimple_call_internal_fn (stmt) != IFN_MASK_STORE)))
3712 if (dump_enabled_p ())
3714 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3715 "not vectorized: dr in a call ");
3716 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3719 if (is_a <bb_vec_info> (vinfo))
3720 break;
3722 if (gatherscatter != SG_NONE || simd_lane_access)
3723 free_data_ref (dr);
3724 return false;
3727 /* Update DR field in stmt_vec_info struct. */
3729   /* If the dataref is in an inner-loop of the loop that is considered
3730      for vectorization, we also want to analyze the access relative to
3731 the outer-loop (DR contains information only relative to the
3732 inner-most enclosing loop). We do that by building a reference to the
3733 first location accessed by the inner-loop, and analyze it relative to
3734 the outer-loop. */
3735 if (loop && nested_in_vect_loop_p (loop, stmt))
3737 /* Build a reference to the first location accessed by the
3738 inner loop: *(BASE + INIT + OFFSET). By construction,
3739 this address must be invariant in the inner loop, so we
3740 can consider it as being used in the outer loop. */
3741 tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
3742 init, offset);
3743 tree init_addr = fold_build_pointer_plus (base, init_offset);
3744 tree init_ref = build_fold_indirect_ref (init_addr);
3746 if (dump_enabled_p ())
3748 dump_printf_loc (MSG_NOTE, vect_location,
3749 "analyze in outer loop: ");
3750 dump_generic_expr (MSG_NOTE, TDF_SLIM, init_ref);
3751 dump_printf (MSG_NOTE, "\n");
3754 if (!dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
3755 init_ref, loop))
3756 /* dr_analyze_innermost already explained the failure. */
3757 return false;
3759 if (dump_enabled_p ())
3761 dump_printf_loc (MSG_NOTE, vect_location,
3762 "\touter base_address: ");
3763 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3764 STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3765 dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
3766 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3767 STMT_VINFO_DR_OFFSET (stmt_info));
3768 dump_printf (MSG_NOTE,
3769 "\n\touter constant offset from base address: ");
3770 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3771 STMT_VINFO_DR_INIT (stmt_info));
3772 dump_printf (MSG_NOTE, "\n\touter step: ");
3773 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3774 STMT_VINFO_DR_STEP (stmt_info));
3775 dump_printf (MSG_NOTE, "\n\touter base alignment: %d\n",
3776 STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info));
3777 dump_printf (MSG_NOTE, "\n\touter base misalignment: %d\n",
3778 STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info));
3779 dump_printf (MSG_NOTE, "\n\touter offset alignment: %d\n",
3780 STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info));
3781 dump_printf (MSG_NOTE, "\n\touter step alignment: %d\n",
3782 STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
3786 if (STMT_VINFO_DATA_REF (stmt_info))
3788 if (dump_enabled_p ())
3790 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3791 "not vectorized: more than one data ref "
3792 "in stmt: ");
3793 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3796 if (is_a <bb_vec_info> (vinfo))
3797 break;
3799 if (gatherscatter != SG_NONE || simd_lane_access)
3800 free_data_ref (dr);
3801 return false;
3804 STMT_VINFO_DATA_REF (stmt_info) = dr;
3805 if (simd_lane_access)
3807 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
3808 free_data_ref (datarefs[i]);
3809 datarefs[i] = dr;
3812 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == ADDR_EXPR
3813 && VAR_P (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0))
3814 && DECL_NONALIASED (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0)))
3816 if (dump_enabled_p ())
3818 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3819 "not vectorized: base object not addressable "
3820 "for stmt: ");
3821 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3823 if (is_a <bb_vec_info> (vinfo))
3825 /* In BB vectorization the ref can still participate
3826 in dependence analysis, we just can't vectorize it. */
3827 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
3828 continue;
3830 return false;
3833 /* Set vectype for STMT. */
3834 scalar_type = TREE_TYPE (DR_REF (dr));
3835 STMT_VINFO_VECTYPE (stmt_info)
3836 = get_vectype_for_scalar_type (scalar_type);
3837 if (!STMT_VINFO_VECTYPE (stmt_info))
3839 if (dump_enabled_p ())
3841 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3842 "not vectorized: no vectype for stmt: ");
3843 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3844 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
3845 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
3846 scalar_type);
3847 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3850 if (is_a <bb_vec_info> (vinfo))
3852 /* No vector type is fine, the ref can still participate
3853 in dependence analysis, we just can't vectorize it. */
3854 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
3855 continue;
3858 if (gatherscatter != SG_NONE || simd_lane_access)
3860 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3861 if (gatherscatter != SG_NONE)
3862 free_data_ref (dr);
3864 return false;
3866 else
3868 if (dump_enabled_p ())
3870 dump_printf_loc (MSG_NOTE, vect_location,
3871 "got vectype for stmt: ");
3872 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3873 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3874 STMT_VINFO_VECTYPE (stmt_info));
3875 dump_printf (MSG_NOTE, "\n");
3879 /* Adjust the minimal vectorization factor according to the
3880 vector type. */
3881 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
3882 *min_vf = upper_bound (*min_vf, vf);
3884 if (gatherscatter != SG_NONE)
3886 gather_scatter_info gs_info;
3887 if (!vect_check_gather_scatter (stmt, as_a <loop_vec_info> (vinfo),
3888 &gs_info)
3889 || !get_vectype_for_scalar_type (TREE_TYPE (gs_info.offset)))
3891 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3892 free_data_ref (dr);
3893 if (dump_enabled_p ())
3895 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3896 (gatherscatter == GATHER) ?
3897 "not vectorized: not suitable for gather "
3898 "load " :
3899 "not vectorized: not suitable for scatter "
3900 "store ");
3901 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3903 return false;
3906 free_data_ref (datarefs[i]);
3907 datarefs[i] = dr;
3908 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
3911 else if (is_a <loop_vec_info> (vinfo)
3912 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
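	  /* A non-constant step can only be handled as a strided access,
	     and that is not supported for a reference nested in an inner
	     loop of the loop being vectorized.  */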
3914 if (nested_in_vect_loop_p (loop, stmt))
3916 if (dump_enabled_p ())
3918 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3919 "not vectorized: not suitable for strided "
3920 "load ");
3921 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3923 return false;
3925 STMT_VINFO_STRIDED_P (stmt_info) = true;
3929 /* If we stopped analysis at the first dataref we could not analyze
3930 when trying to vectorize a basic-block mark the rest of the datarefs
3931 as not vectorizable and truncate the vector of datarefs. That
3932 avoids spending useless time in analyzing their dependence. */
3933 if (i != datarefs.length ())
3935 gcc_assert (is_a <bb_vec_info> (vinfo));
3936 for (unsigned j = i; j < datarefs.length (); ++j)
3938 data_reference_p dr = datarefs[j];
3939 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
3940 free_data_ref (dr);
3942 datarefs.truncate (i);
3945 return true;
3949 /* Function vect_get_new_vect_var.
3951    Returns a name for a new variable.  The current naming scheme prefixes
3952    vectorizer-generated variable names with "vect_", "stmp_", "mask_" or
3953    "vectp_" (depending on the value of VAR_KIND), and appends NAME to that
3954    prefix if NAME is provided.  */
3956 tree
3957 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
3959 const char *prefix;
3960 tree new_vect_var;
3962 switch (var_kind)
3964 case vect_simple_var:
3965 prefix = "vect";
3966 break;
3967 case vect_scalar_var:
3968 prefix = "stmp";
3969 break;
3970 case vect_mask_var:
3971 prefix = "mask";
3972 break;
3973 case vect_pointer_var:
3974 prefix = "vectp";
3975 break;
3976 default:
3977 gcc_unreachable ();
3980 if (name)
3982 char* tmp = concat (prefix, "_", name, NULL);
3983 new_vect_var = create_tmp_reg (type, tmp);
3984 free (tmp);
3986 else
3987 new_vect_var = create_tmp_reg (type, prefix);
3989 return new_vect_var;
3992 /* Like vect_get_new_vect_var but return an SSA name. */
3994 tree
3995 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
3997 const char *prefix;
3998 tree new_vect_var;
4000 switch (var_kind)
4002 case vect_simple_var:
4003 prefix = "vect";
4004 break;
4005 case vect_scalar_var:
4006 prefix = "stmp";
4007 break;
4008 case vect_pointer_var:
4009 prefix = "vectp";
4010 break;
4011 default:
4012 gcc_unreachable ();
4015 if (name)
4017 char* tmp = concat (prefix, "_", name, NULL);
4018 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4019 free (tmp);
4021 else
4022 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4024 return new_vect_var;
4027 /* Duplicate ptr info and set alignment/misalignment on NAME from DR.  */
4029 static void
4030 vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr)
4032 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr));
4033 int misalign = DR_MISALIGNMENT (dr);
4034 if (misalign == DR_MISALIGNMENT_UNKNOWN)
4035 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4036 else
4037 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name),
4038 DR_TARGET_ALIGNMENT (dr), misalign);
4041 /* Function vect_create_addr_base_for_vector_ref.
4043 Create an expression that computes the address of the first memory location
4044 that will be accessed for a data reference.
4046 Input:
4047 STMT: The statement containing the data reference.
4048 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4049    OFFSET: Optional.  If supplied, it is added to the initial address.
4050    LOOP: Specify relative to which loop-nest the address should be computed.
4051 For example, when the dataref is in an inner-loop nested in an
4052 outer-loop that is now being vectorized, LOOP can be either the
4053 outer-loop, or the inner-loop. The first memory location accessed
4054 by the following dataref ('in' points to short):
4056 for (i=0; i<N; i++)
4057 for (j=0; j<M; j++)
4058 s += in[i+j]
4060 is as follows:
4061 if LOOP=i_loop: &in (relative to i_loop)
4062 if LOOP=j_loop: &in+i*2B (relative to j_loop)
4063 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
4064 	     initial address.  Unlike OFFSET, which is the number of elements to
4065 be added, BYTE_OFFSET is measured in bytes.
4067 Output:
4068 1. Return an SSA_NAME whose value is the address of the memory location of
4069 the first vector of the data reference.
4070    2. If NEW_STMT_LIST is not NULL_TREE after return, the caller must insert
4071       the statement(s) that define the returned SSA_NAME.
4073 FORNOW: We are only handling array accesses with step 1. */
4075 tree
4076 vect_create_addr_base_for_vector_ref (gimple *stmt,
4077 gimple_seq *new_stmt_list,
4078 tree offset,
4079 tree byte_offset)
4081 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4082 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4083 const char *base_name;
4084 tree addr_base;
4085 tree dest;
4086 gimple_seq seq = NULL;
4087 tree vect_ptr_type;
4088 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
4089 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4090 innermost_loop_behavior *drb = vect_dr_behavior (dr);
4092 tree data_ref_base = unshare_expr (drb->base_address);
4093 tree base_offset = unshare_expr (drb->offset);
4094 tree init = unshare_expr (drb->init);
4096 if (loop_vinfo)
4097 base_name = get_name (data_ref_base);
4098 else
4100 base_offset = ssize_int (0);
4101 init = ssize_int (0);
4102 base_name = get_name (DR_REF (dr));
4105 /* Create base_offset */
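  /* That is: base_offset = DRB->offset + DRB->init; OFFSET * element_size
     and BYTE_OFFSET, when supplied, are added to it below.  */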
4106 base_offset = size_binop (PLUS_EXPR,
4107 fold_convert (sizetype, base_offset),
4108 fold_convert (sizetype, init));
4110 if (offset)
4112 offset = fold_build2 (MULT_EXPR, sizetype,
4113 fold_convert (sizetype, offset), step);
4114 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4115 base_offset, offset);
4117 if (byte_offset)
4119 byte_offset = fold_convert (sizetype, byte_offset);
4120 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4121 base_offset, byte_offset);
4124 /* base + base_offset */
4125 if (loop_vinfo)
4126 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4127 else
4129 addr_base = build1 (ADDR_EXPR,
4130 build_pointer_type (TREE_TYPE (DR_REF (dr))),
4131 unshare_expr (DR_REF (dr)));
4134 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4135 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4136 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4137 gimple_seq_add_seq (new_stmt_list, seq);
4139 if (DR_PTR_INFO (dr)
4140 && TREE_CODE (addr_base) == SSA_NAME
4141 && !SSA_NAME_PTR_INFO (addr_base))
4143 vect_duplicate_ssa_name_ptr_info (addr_base, dr);
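      /* The alignment recorded for DR describes the unadjusted address;
	 once an extra offset has been added we can no longer rely on it.  */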
4144 if (offset || byte_offset)
4145 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4148 if (dump_enabled_p ())
4150 dump_printf_loc (MSG_NOTE, vect_location, "created ");
4151 dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
4152 dump_printf (MSG_NOTE, "\n");
4155 return addr_base;
4159 /* Function vect_create_data_ref_ptr.
4161 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4162 location accessed in the loop by STMT, along with the def-use update
4163 chain to appropriately advance the pointer through the loop iterations.
4164 Also set aliasing information for the pointer. This pointer is used by
4165 the callers to this function to create a memory reference expression for
4166 vector load/store access.
4168 Input:
4169 1. STMT: a stmt that references memory. Expected to be of the form
4170 GIMPLE_ASSIGN <name, data-ref> or
4171 GIMPLE_ASSIGN <data-ref, name>.
4172 2. AGGR_TYPE: the type of the reference, which should be either a vector
4173 or an array.
4174 3. AT_LOOP: the loop where the vector memref is to be created.
4175 4. OFFSET (optional): an offset to be added to the initial address accessed
4176 by the data-ref in STMT.
4177 5. BSI: location where the new stmts are to be placed if there is no loop
4178 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4179 pointing to the initial address.
4180 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4181 to the initial address accessed by the data-ref in STMT. This is
4182 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4183 in bytes.
4185 Output:
4186 1. Declare a new ptr to vector_type, and have it point to the base of the
4187       data reference (initial address accessed by the data reference).
4188 For example, for vector of type V8HI, the following code is generated:
4190 v8hi *ap;
4191 ap = (v8hi *)initial_address;
4193 if OFFSET is not supplied:
4194 initial_address = &a[init];
4195 if OFFSET is supplied:
4196 initial_address = &a[init + OFFSET];
4197 if BYTE_OFFSET is supplied:
4198 initial_address = &a[init] + BYTE_OFFSET;
4200 Return the initial_address in INITIAL_ADDRESS.
4202 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4203 update the pointer in each iteration of the loop.
4205 Return the increment stmt that updates the pointer in PTR_INCR.
4207 3. Set INV_P to true if the access pattern of the data reference in the
4208 vectorized loop is invariant. Set it to false otherwise.
4210 4. Return the pointer. */
4212 tree
4213 vect_create_data_ref_ptr (gimple *stmt, tree aggr_type, struct loop *at_loop,
4214 tree offset, tree *initial_address,
4215 gimple_stmt_iterator *gsi, gimple **ptr_incr,
4216 bool only_init, bool *inv_p, tree byte_offset)
4218 const char *base_name;
4219 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4220 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4221 struct loop *loop = NULL;
4222 bool nested_in_vect_loop = false;
4223 struct loop *containing_loop = NULL;
4224 tree aggr_ptr_type;
4225 tree aggr_ptr;
4226 tree new_temp;
4227 gimple_seq new_stmt_list = NULL;
4228 edge pe = NULL;
4229 basic_block new_bb;
4230 tree aggr_ptr_init;
4231 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4232 tree aptr;
4233 gimple_stmt_iterator incr_gsi;
4234 bool insert_after;
4235 tree indx_before_incr, indx_after_incr;
4236 gimple *incr;
4237 tree step;
4238 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4240 gcc_assert (TREE_CODE (aggr_type) == ARRAY_TYPE
4241 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4243 if (loop_vinfo)
4245 loop = LOOP_VINFO_LOOP (loop_vinfo);
4246 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4247 containing_loop = (gimple_bb (stmt))->loop_father;
4248 pe = loop_preheader_edge (loop);
4250 else
4252 gcc_assert (bb_vinfo);
4253 only_init = true;
4254 *ptr_incr = NULL;
4257 /* Check the step (evolution) of the load in LOOP, and record
4258 whether it's invariant. */
4259 step = vect_dr_behavior (dr)->step;
4260 if (integer_zerop (step))
4261 *inv_p = true;
4262 else
4263 *inv_p = false;
4265 /* Create an expression for the first address accessed by this load
4266 in LOOP. */
4267 base_name = get_name (DR_BASE_ADDRESS (dr));
4269 if (dump_enabled_p ())
4271 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4272 dump_printf_loc (MSG_NOTE, vect_location,
4273 "create %s-pointer variable to type: ",
4274 get_tree_code_name (TREE_CODE (aggr_type)));
4275 dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4276 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4277 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4278 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4279 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4280 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4281 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4282 else
4283 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4284 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4285 dump_printf (MSG_NOTE, "\n");
4288 /* (1) Create the new aggregate-pointer variable.
4289 Vector and array types inherit the alias set of their component
4290      type by default, so we need to use a ref-all pointer if the data
4291      reference does not conflict with the created aggregate data
4292 reference because it is not addressable. */
4293 bool need_ref_all = false;
4294 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4295 get_alias_set (DR_REF (dr))))
4296 need_ref_all = true;
4297 /* Likewise for any of the data references in the stmt group. */
4298 else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
4300 gimple *orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
4303 stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
4304 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4305 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4306 get_alias_set (DR_REF (sdr))))
4308 need_ref_all = true;
4309 break;
4311 orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);
4313 while (orig_stmt);
4315 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4316 need_ref_all);
4317 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4320 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4321 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4322 def-use update cycles for the pointer: one relative to the outer-loop
4323 (LOOP), which is what steps (3) and (4) below do. The other is relative
4324 to the inner-loop (which is the inner-most loop containing the dataref),
4325      and this is done by step (5) below.
4327 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4328 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4329 redundant. Steps (3),(4) create the following:
4331 vp0 = &base_addr;
4332 LOOP: vp1 = phi(vp0,vp2)
4335 vp2 = vp1 + step
4336 goto LOOP
4338 If there is an inner-loop nested in loop, then step (5) will also be
4339 applied, and an additional update in the inner-loop will be created:
4341 vp0 = &base_addr;
4342 LOOP: vp1 = phi(vp0,vp2)
4344 inner: vp3 = phi(vp1,vp4)
4345 vp4 = vp3 + inner_step
4346 if () goto inner
4348 vp2 = vp1 + step
4349 if () goto LOOP */
4351 /* (2) Calculate the initial address of the aggregate-pointer, and set
4352 the aggregate-pointer to point to it before the loop. */
4354 /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader. */
4356 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
4357 offset, byte_offset);
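  /* Emit the statements computing the base address on the loop preheader
     edge when vectorizing a loop, or right before the original statement
     when vectorizing a basic block.  */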
4358 if (new_stmt_list)
4360 if (pe)
4362 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4363 gcc_assert (!new_bb);
4365 else
4366 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4369 *initial_address = new_temp;
4370 aggr_ptr_init = new_temp;
4372 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4373 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4374 inner-loop nested in LOOP (during outer-loop vectorization). */
4376 /* No update in loop is required. */
4377 if (only_init && (!loop_vinfo || at_loop == loop))
4378 aptr = aggr_ptr_init;
4379 else
4381 /* The step of the aggregate pointer is the type size. */
4382 tree iv_step = TYPE_SIZE_UNIT (aggr_type);
4383 /* One exception to the above is when the scalar step of the load in
4384 LOOP is zero. In this case the step here is also zero. */
4385 if (*inv_p)
4386 iv_step = size_zero_node;
4387 else if (tree_int_cst_sgn (step) == -1)
4388 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4390 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4392 create_iv (aggr_ptr_init,
4393 fold_convert (aggr_ptr_type, iv_step),
4394 aggr_ptr, loop, &incr_gsi, insert_after,
4395 &indx_before_incr, &indx_after_incr);
4396 incr = gsi_stmt (incr_gsi);
4397 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4399 /* Copy the points-to information if it exists. */
4400 if (DR_PTR_INFO (dr))
4402 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr);
4403 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr);
4405 if (ptr_incr)
4406 *ptr_incr = incr;
4408 aptr = indx_before_incr;
4411 if (!nested_in_vect_loop || only_init)
4412 return aptr;
4415 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4416      nested in LOOP, if it exists.  */
4418 gcc_assert (nested_in_vect_loop);
4419 if (!only_init)
4421 standard_iv_increment_position (containing_loop, &incr_gsi,
4422 &insert_after);
4423 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4424 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4425 &indx_after_incr);
4426 incr = gsi_stmt (incr_gsi);
4427 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4429 /* Copy the points-to information if it exists. */
4430 if (DR_PTR_INFO (dr))
4432 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr);
4433 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr);
4435 if (ptr_incr)
4436 *ptr_incr = incr;
4438 return indx_before_incr;
4440 else
4441 gcc_unreachable ();
4445 /* Function bump_vector_ptr
4447 Increment a pointer (to a vector type) by vector-size. If requested,
4448    i.e. if PTR_INCR is given, then also connect the new increment stmt
4449 to the existing def-use update-chain of the pointer, by modifying
4450 the PTR_INCR as illustrated below:
4452 The pointer def-use update-chain before this function:
4453 DATAREF_PTR = phi (p_0, p_2)
4454 ....
4455 PTR_INCR: p_2 = DATAREF_PTR + step
4457 The pointer def-use update-chain after this function:
4458 DATAREF_PTR = phi (p_0, p_2)
4459 ....
4460 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4461 ....
4462 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4464 Input:
4465 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4466 in the loop.
4467 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4468 the loop. The increment amount across iterations is expected
4469 to be vector_size.
4470 BSI - location where the new update stmt is to be placed.
4471 STMT - the original scalar memory-access stmt that is being vectorized.
4472 BUMP - optional. The offset by which to bump the pointer. If not given,
4473 the offset is assumed to be vector_size.
4475 Output: Return NEW_DATAREF_PTR as illustrated above.
4479 tree
4480 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4481 gimple *stmt, tree bump)
4483 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4484 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4485 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4486 tree update = TYPE_SIZE_UNIT (vectype);
4487 gassign *incr_stmt;
4488 ssa_op_iter iter;
4489 use_operand_p use_p;
4490 tree new_dataref_ptr;
4492 if (bump)
4493 update = bump;
4495 if (TREE_CODE (dataref_ptr) == SSA_NAME)
4496 new_dataref_ptr = copy_ssa_name (dataref_ptr);
4497 else
4498 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4499 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4500 dataref_ptr, update);
4501 vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4503 /* Copy the points-to information if it exists. */
4504 if (DR_PTR_INFO (dr))
4506 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4507 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4510 if (!ptr_incr)
4511 return new_dataref_ptr;
4513 /* Update the vector-pointer's cross-iteration increment. */
4514 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4516 tree use = USE_FROM_PTR (use_p);
4518 if (use == dataref_ptr)
4519 SET_USE (use_p, new_dataref_ptr);
4520 else
4521 gcc_assert (tree_int_cst_compare (use, update) == 0);
4524 return new_dataref_ptr;
4528 /* Function vect_create_destination_var.
4530 Create a new temporary of type VECTYPE. */
4532 tree
4533 vect_create_destination_var (tree scalar_dest, tree vectype)
4535 tree vec_dest;
4536 const char *name;
4537 char *new_name;
4538 tree type;
4539 enum vect_var_kind kind;
4541 kind = vectype
4542 ? VECTOR_BOOLEAN_TYPE_P (vectype)
4543 ? vect_mask_var
4544 : vect_simple_var
4545 : vect_scalar_var;
4546 type = vectype ? vectype : TREE_TYPE (scalar_dest);
4548 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
4550 name = get_name (scalar_dest);
4551 if (name)
4552 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
4553 else
4554 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
4555 vec_dest = vect_get_new_vect_var (type, kind, new_name);
4556 free (new_name);
4558 return vec_dest;
4561 /* Function vect_grouped_store_supported.
4563 Returns TRUE if interleave high and interleave low permutations
4564 are supported, and FALSE otherwise. */
4566 bool
4567 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
4569 machine_mode mode = TYPE_MODE (vectype);
4571 /* vect_permute_store_chain requires the group size to be equal to 3 or
4572 be a power of two. */
4573 if (count != 3 && exact_log2 (count) == -1)
4575 if (dump_enabled_p ())
4576 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4577 "the size of the group of accesses"
4578 " is not a power of 2 or not eqaul to 3\n");
4579 return false;
4582 /* Check that the permutation is supported. */
4583 if (VECTOR_MODE_P (mode))
4585 unsigned int i;
4586 if (count == 3)
4588 unsigned int j0 = 0, j1 = 0, j2 = 0;
4589 unsigned int i, j;
4591 unsigned int nelt;
4592 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
4594 if (dump_enabled_p ())
4595 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4596 "cannot handle groups of 3 stores for"
4597 " variable-length vectors\n");
4598 return false;
4601 vec_perm_builder sel (nelt, nelt, 1);
4602 sel.quick_grow (nelt);
4603 vec_perm_indices indices;
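	  /* Check the same two permutation masks per output vector that
	     vect_permute_store_chain generates for a group of three
	     stores.  */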
4604 for (j = 0; j < 3; j++)
4606 int nelt0 = ((3 - j) * nelt) % 3;
4607 int nelt1 = ((3 - j) * nelt + 1) % 3;
4608 int nelt2 = ((3 - j) * nelt + 2) % 3;
4609 for (i = 0; i < nelt; i++)
4611 if (3 * i + nelt0 < nelt)
4612 sel[3 * i + nelt0] = j0++;
4613 if (3 * i + nelt1 < nelt)
4614 sel[3 * i + nelt1] = nelt + j1++;
4615 if (3 * i + nelt2 < nelt)
4616 sel[3 * i + nelt2] = 0;
4618 indices.new_vector (sel, 2, nelt);
4619 if (!can_vec_perm_const_p (mode, indices))
4621 if (dump_enabled_p ())
4622 dump_printf (MSG_MISSED_OPTIMIZATION,
4623 "permutation op not supported by target.\n");
4624 return false;
4627 for (i = 0; i < nelt; i++)
4629 if (3 * i + nelt0 < nelt)
4630 sel[3 * i + nelt0] = 3 * i + nelt0;
4631 if (3 * i + nelt1 < nelt)
4632 sel[3 * i + nelt1] = 3 * i + nelt1;
4633 if (3 * i + nelt2 < nelt)
4634 sel[3 * i + nelt2] = nelt + j2++;
4636 indices.new_vector (sel, 2, nelt);
4637 if (!can_vec_perm_const_p (mode, indices))
4639 if (dump_enabled_p ())
4640 dump_printf (MSG_MISSED_OPTIMIZATION,
4641 "permutation op not supported by target.\n");
4642 return false;
4645 return true;
4647 else
4649       /* If the length is not equal to 3, then only a power of 2 is
	  supported.  */
4650 gcc_assert (pow2p_hwi (count));
4651 poly_uint64 nelt = GET_MODE_NUNITS (mode);
4653 /* The encoding has 2 interleaved stepped patterns. */
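	  /* The first mask is the interleave-high selector
	     { 0, nelt, 1, nelt + 1, 2, nelt + 2, ... }, represented by its
	     first three pairs; adding nelt/2 to each element then gives the
	     interleave-low selector.  */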
4654 vec_perm_builder sel (nelt, 2, 3);
4655 sel.quick_grow (6);
4656 for (i = 0; i < 3; i++)
4658 sel[i * 2] = i;
4659 sel[i * 2 + 1] = i + nelt;
4661 vec_perm_indices indices (sel, 2, nelt);
4662 if (can_vec_perm_const_p (mode, indices))
4664 for (i = 0; i < 6; i++)
4665 sel[i] += exact_div (nelt, 2);
4666 indices.new_vector (sel, 2, nelt);
4667 if (can_vec_perm_const_p (mode, indices))
4668 return true;
4673 if (dump_enabled_p ())
4674 dump_printf (MSG_MISSED_OPTIMIZATION,
4675 "permutaion op not supported by target.\n");
4676 return false;
4680 /* Return TRUE if vec_store_lanes is available for COUNT vectors of
4681 type VECTYPE. */
4683 bool
4684 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
4686 return vect_lanes_optab_supported_p ("vec_store_lanes",
4687 vec_store_lanes_optab,
4688 vectype, count);
4692 /* Function vect_permute_store_chain.
4694 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4695 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
4696 the data correctly for the stores. Return the final references for stores
4697 in RESULT_CHAIN.
4699 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4700 The input is 4 vectors each containing 8 elements. We assign a number to
4701 each element, the input sequence is:
4703 1st vec: 0 1 2 3 4 5 6 7
4704 2nd vec: 8 9 10 11 12 13 14 15
4705 3rd vec: 16 17 18 19 20 21 22 23
4706 4th vec: 24 25 26 27 28 29 30 31
4708 The output sequence should be:
4710 1st vec: 0 8 16 24 1 9 17 25
4711 2nd vec: 2 10 18 26 3 11 19 27
4712    3rd vec:   4 12 20 28  5 13 21 29
4713 4th vec: 6 14 22 30 7 15 23 31
4715 i.e., we interleave the contents of the four vectors in their order.
4717 We use interleave_high/low instructions to create such output. The input of
4718 each interleave_high/low operation is two vectors:
4719 1st vec 2nd vec
4720 0 1 2 3 4 5 6 7
4721 the even elements of the result vector are obtained left-to-right from the
4722 high/low elements of the first vector. The odd elements of the result are
4723 obtained left-to-right from the high/low elements of the second vector.
4724 The output of interleave_high will be: 0 4 1 5
4725 and of interleave_low: 2 6 3 7
4728 The permutation is done in log LENGTH stages. In each stage interleave_high
4729 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4730 where the first argument is taken from the first half of DR_CHAIN and the
4731    second argument from its second half.
4732 In our example,
4734 I1: interleave_high (1st vec, 3rd vec)
4735 I2: interleave_low (1st vec, 3rd vec)
4736 I3: interleave_high (2nd vec, 4th vec)
4737 I4: interleave_low (2nd vec, 4th vec)
4739 The output for the first stage is:
4741 I1: 0 16 1 17 2 18 3 19
4742 I2: 4 20 5 21 6 22 7 23
4743 I3: 8 24 9 25 10 26 11 27
4744 I4: 12 28 13 29 14 30 15 31
4746 The output of the second stage, i.e. the final result is:
4748 I1: 0 8 16 24 1 9 17 25
4749 I2: 2 10 18 26 3 11 19 27
4750    I3:  4 12 20 28  5 13 21 29
4751 I4: 6 14 22 30 7 15 23 31. */
4753 void
4754 vect_permute_store_chain (vec<tree> dr_chain,
4755 unsigned int length,
4756 gimple *stmt,
4757 gimple_stmt_iterator *gsi,
4758 vec<tree> *result_chain)
4760 tree vect1, vect2, high, low;
4761 gimple *perm_stmt;
4762 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4763 tree perm_mask_low, perm_mask_high;
4764 tree data_ref;
4765 tree perm3_mask_low, perm3_mask_high;
4766 unsigned int i, n, log_length = exact_log2 (length);
4767 unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype);
4769 result_chain->quick_grow (length);
4770 memcpy (result_chain->address (), dr_chain.address (),
4771 length * sizeof (tree));
4773 if (length == 3)
4775 unsigned int j0 = 0, j1 = 0, j2 = 0;
4777 vec_perm_builder sel (nelt, nelt, 1);
4778 sel.quick_grow (nelt);
4779 vec_perm_indices indices;
4780 for (j = 0; j < 3; j++)
4782 int nelt0 = ((3 - j) * nelt) % 3;
4783 int nelt1 = ((3 - j) * nelt + 1) % 3;
4784 int nelt2 = ((3 - j) * nelt + 2) % 3;
4786 for (i = 0; i < nelt; i++)
4788 if (3 * i + nelt0 < nelt)
4789 sel[3 * i + nelt0] = j0++;
4790 if (3 * i + nelt1 < nelt)
4791 sel[3 * i + nelt1] = nelt + j1++;
4792 if (3 * i + nelt2 < nelt)
4793 sel[3 * i + nelt2] = 0;
4795 indices.new_vector (sel, 2, nelt);
4796 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
4798 for (i = 0; i < nelt; i++)
4800 if (3 * i + nelt0 < nelt)
4801 sel[3 * i + nelt0] = 3 * i + nelt0;
4802 if (3 * i + nelt1 < nelt)
4803 sel[3 * i + nelt1] = 3 * i + nelt1;
4804 if (3 * i + nelt2 < nelt)
4805 sel[3 * i + nelt2] = nelt + j2++;
4807 indices.new_vector (sel, 2, nelt);
4808 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
4810 vect1 = dr_chain[0];
4811 vect2 = dr_chain[1];
4813 /* Create interleaving stmt:
4814 low = VEC_PERM_EXPR <vect1, vect2,
4815 {j, nelt, *, j + 1, nelt + j + 1, *,
4816 j + 2, nelt + j + 2, *, ...}> */
4817 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
4818 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4819 vect2, perm3_mask_low);
4820 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4822 vect1 = data_ref;
4823 vect2 = dr_chain[2];
4824 /* Create interleaving stmt:
4825 low = VEC_PERM_EXPR <vect1, vect2,
4826 {0, 1, nelt + j, 3, 4, nelt + j + 1,
4827 6, 7, nelt + j + 2, ...}> */
4828 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
4829 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4830 vect2, perm3_mask_high);
4831 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4832 (*result_chain)[j] = data_ref;
4835 else
4837       /* If the length is not equal to 3, then only a power of 2 is
	  supported.  */
4838 gcc_assert (pow2p_hwi (length));
4840 /* The encoding has 2 interleaved stepped patterns. */
4841 vec_perm_builder sel (nelt, 2, 3);
4842 sel.quick_grow (6);
4843 for (i = 0; i < 3; i++)
4845 sel[i * 2] = i;
4846 sel[i * 2 + 1] = i + nelt;
4848 vec_perm_indices indices (sel, 2, nelt);
4849 perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
4851 for (i = 0; i < 6; i++)
4852 sel[i] += nelt / 2;
4853 indices.new_vector (sel, 2, nelt);
4854 perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
4856 for (i = 0, n = log_length; i < n; i++)
4858 for (j = 0; j < length/2; j++)
4860 vect1 = dr_chain[j];
4861 vect2 = dr_chain[j+length/2];
4863 /* Create interleaving stmt:
4864 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
4865 ...}> */
4866 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
4867 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
4868 vect2, perm_mask_high);
4869 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4870 (*result_chain)[2*j] = high;
4872 /* Create interleaving stmt:
4873 low = VEC_PERM_EXPR <vect1, vect2,
4874 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
4875 ...}> */
4876 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
4877 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
4878 vect2, perm_mask_low);
4879 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4880 (*result_chain)[2*j+1] = low;
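	  /* The result of this stage becomes the input of the next one.  */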
4882 memcpy (dr_chain.address (), result_chain->address (),
4883 length * sizeof (tree));
4888 /* Function vect_setup_realignment
4890 This function is called when vectorizing an unaligned load using
4891 the dr_explicit_realign[_optimized] scheme.
4892 This function generates the following code at the loop prolog:
4894 p = initial_addr;
4895 x msq_init = *(floor(p)); # prolog load
4896 realignment_token = call target_builtin;
4897 loop:
4898 x msq = phi (msq_init, ---)
4900 The stmts marked with x are generated only for the case of
4901 dr_explicit_realign_optimized.
4903 The code above sets up a new (vector) pointer, pointing to the first
4904 location accessed by STMT, and a "floor-aligned" load using that pointer.
4905 It also generates code to compute the "realignment-token" (if the relevant
4906 target hook was defined), and creates a phi-node at the loop-header bb
4907 whose arguments are the result of the prolog-load (created by this
4908 function) and the result of a load that takes place in the loop (to be
4909 created by the caller to this function).
4911 For the case of dr_explicit_realign_optimized:
4912 The caller to this function uses the phi-result (msq) to create the
4913 realignment code inside the loop, and sets up the missing phi argument,
4914 as follows:
4915 loop:
4916 msq = phi (msq_init, lsq)
4917 lsq = *(floor(p')); # load in loop
4918 result = realign_load (msq, lsq, realignment_token);
4920 For the case of dr_explicit_realign:
4921 loop:
4922 msq = *(floor(p)); # load in loop
4923 p' = p + (VS-1);
4924 lsq = *(floor(p')); # load in loop
4925 result = realign_load (msq, lsq, realignment_token);
4927 Input:
4928 STMT - (scalar) load stmt to be vectorized. This load accesses
4929 a memory location that may be unaligned.
4930 BSI - place where new code is to be inserted.
4931 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
4932 is used.
4934 Output:
4935 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
4936 target hook, if defined.
4937 Return value - the result of the loop-header phi node. */
4939 tree
4940 vect_setup_realignment (gimple *stmt, gimple_stmt_iterator *gsi,
4941 tree *realignment_token,
4942 enum dr_alignment_support alignment_support_scheme,
4943 tree init_addr,
4944 struct loop **at_loop)
4946 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4947 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4948 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4949 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4950 struct loop *loop = NULL;
4951 edge pe = NULL;
4952 tree scalar_dest = gimple_assign_lhs (stmt);
4953 tree vec_dest;
4954 gimple *inc;
4955 tree ptr;
4956 tree data_ref;
4957 basic_block new_bb;
4958 tree msq_init = NULL_TREE;
4959 tree new_temp;
4960 gphi *phi_stmt;
4961 tree msq = NULL_TREE;
4962 gimple_seq stmts = NULL;
4963 bool inv_p;
4964 bool compute_in_loop = false;
4965 bool nested_in_vect_loop = false;
4966 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
4967 struct loop *loop_for_initial_load = NULL;
4969 if (loop_vinfo)
4971 loop = LOOP_VINFO_LOOP (loop_vinfo);
4972 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4975 gcc_assert (alignment_support_scheme == dr_explicit_realign
4976 || alignment_support_scheme == dr_explicit_realign_optimized);
4978 /* We need to generate three things:
4979 1. the misalignment computation
4980 2. the extra vector load (for the optimized realignment scheme).
4981 3. the phi node for the two vectors from which the realignment is
4982 done (for the optimized realignment scheme). */
4984 /* 1. Determine where to generate the misalignment computation.
4986 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
4987 calculation will be generated by this function, outside the loop (in the
4988 preheader). Otherwise, INIT_ADDR had already been computed for us by the
4989 caller, inside the loop.
4991 Background: If the misalignment remains fixed throughout the iterations of
4992 the loop, then both realignment schemes are applicable, and also the
4993 misalignment computation can be done outside LOOP. This is because we are
4994 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
4995 are a multiple of VS (the Vector Size), and therefore the misalignment in
4996 different vectorized LOOP iterations is always the same.
4997 The problem arises only if the memory access is in an inner-loop nested
4998 inside LOOP, which is now being vectorized using outer-loop vectorization.
4999 This is the only case when the misalignment of the memory access may not
5000 remain fixed throughout the iterations of the inner-loop (as explained in
5001 detail in vect_supportable_dr_alignment). In this case, not only is the
5002 optimized realignment scheme not applicable, but also the misalignment
5003 computation (and generation of the realignment token that is passed to
5004 REALIGN_LOAD) have to be done inside the loop.
5006 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5007 or not, which in turn determines if the misalignment is computed inside
5008 the inner-loop, or outside LOOP. */
5010 if (init_addr != NULL_TREE || !loop_vinfo)
5012 compute_in_loop = true;
5013 gcc_assert (alignment_support_scheme == dr_explicit_realign);
5017 /* 2. Determine where to generate the extra vector load.
5019 For the optimized realignment scheme, instead of generating two vector
5020 loads in each iteration, we generate a single extra vector load in the
5021 preheader of the loop, and in each iteration reuse the result of the
5022 vector load from the previous iteration. In case the memory access is in
5023 an inner-loop nested inside LOOP, which is now being vectorized using
5024 outer-loop vectorization, we need to determine whether this initial vector
5025 load should be generated at the preheader of the inner-loop, or can be
5026 generated at the preheader of LOOP. If the memory access has no evolution
5027 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5028 to be generated inside LOOP (in the preheader of the inner-loop). */
5030 if (nested_in_vect_loop)
5032 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5033 bool invariant_in_outerloop =
5034 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5035 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5037 else
5038 loop_for_initial_load = loop;
5039 if (at_loop)
5040 *at_loop = loop_for_initial_load;
5042 if (loop_for_initial_load)
5043 pe = loop_preheader_edge (loop_for_initial_load);
5045 /* 3. For the case of the optimized realignment, create the first vector
5046 load at the loop preheader. */
5048 if (alignment_support_scheme == dr_explicit_realign_optimized)
5050 /* Create msq_init = *(floor(p1)) in the loop preheader */
5051 gassign *new_stmt;
5053 gcc_assert (!compute_in_loop);
5054 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5055 ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
5056 NULL_TREE, &init_addr, NULL, &inc,
5057 true, &inv_p);
5058 if (TREE_CODE (ptr) == SSA_NAME)
5059 new_temp = copy_ssa_name (ptr);
5060 else
5061 new_temp = make_ssa_name (TREE_TYPE (ptr));
5062 unsigned int align = DR_TARGET_ALIGNMENT (dr);
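      /* Compute floor (ptr): clear the low bits of the pointer so that it
	 is aligned to DR_TARGET_ALIGNMENT.  */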
5063 new_stmt = gimple_build_assign
5064 (new_temp, BIT_AND_EXPR, ptr,
5065 build_int_cst (TREE_TYPE (ptr), -(HOST_WIDE_INT) align));
5066 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5067 gcc_assert (!new_bb);
5068 data_ref
5069 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5070 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5071 new_stmt = gimple_build_assign (vec_dest, data_ref);
5072 new_temp = make_ssa_name (vec_dest, new_stmt);
5073 gimple_assign_set_lhs (new_stmt, new_temp);
5074 if (pe)
5076 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5077 gcc_assert (!new_bb);
5079 else
5080 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5082 msq_init = gimple_assign_lhs (new_stmt);
5085 /* 4. Create realignment token using a target builtin, if available.
5086 It is done either inside the containing loop, or before LOOP (as
5087 determined above). */
5089 if (targetm.vectorize.builtin_mask_for_load)
5091 gcall *new_stmt;
5092 tree builtin_decl;
5094       /* Compute INIT_ADDR - the initial address accessed by this memref.  */
5095 if (!init_addr)
5097 /* Generate the INIT_ADDR computation outside LOOP. */
5098 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
5099 NULL_TREE);
5100 if (loop)
5102 pe = loop_preheader_edge (loop);
5103 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5104 gcc_assert (!new_bb);
5106 else
5107 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5110 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5111 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5112 vec_dest =
5113 vect_create_destination_var (scalar_dest,
5114 gimple_call_return_type (new_stmt));
5115 new_temp = make_ssa_name (vec_dest, new_stmt);
5116 gimple_call_set_lhs (new_stmt, new_temp);
5118 if (compute_in_loop)
5119 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5120 else
5122 /* Generate the misalignment computation outside LOOP. */
5123 pe = loop_preheader_edge (loop);
5124 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5125 gcc_assert (!new_bb);
5128 *realignment_token = gimple_call_lhs (new_stmt);
5130 /* The result of the CALL_EXPR to this builtin is determined from
5131 the value of the parameter and no global variables are touched
5132 which makes the builtin a "const" function. Requiring the
5133 builtin to have the "const" attribute makes it unnecessary
5134 to call mark_call_clobbered. */
5135 gcc_assert (TREE_READONLY (builtin_decl));
5138 if (alignment_support_scheme == dr_explicit_realign)
5139 return msq;
5141 gcc_assert (!compute_in_loop);
5142 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5145 /* 5. Create msq = phi <msq_init, lsq> in loop */
5147 pe = loop_preheader_edge (containing_loop);
5148 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5149 msq = make_ssa_name (vec_dest);
5150 phi_stmt = create_phi_node (msq, containing_loop->header);
5151 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5153 return msq;
5157 /* Function vect_grouped_load_supported.
5159 COUNT is the size of the load group (the number of statements plus the
5160 number of gaps). SINGLE_ELEMENT_P is true if there is actually
5161 only one statement, with a gap of COUNT - 1.
5163 Returns true if a suitable permute exists. */
5165 bool
5166 vect_grouped_load_supported (tree vectype, bool single_element_p,
5167 unsigned HOST_WIDE_INT count)
5169 machine_mode mode = TYPE_MODE (vectype);
5171 /* If this is single-element interleaving with an element distance
5172 that leaves unused vector loads around punt - we at least create
5173 very sub-optimal code in that case (and blow up memory,
5174 see PR65518). */
5175 if (single_element_p && count > TYPE_VECTOR_SUBPARTS (vectype))
5177 if (dump_enabled_p ())
5178 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5179 "single-element interleaving not supported "
5180 "for not adjacent vector loads\n");
5181 return false;
5184 /* vect_permute_load_chain requires the group size to be equal to 3 or
5185 be a power of two. */
5186 if (count != 3 && exact_log2 (count) == -1)
5188 if (dump_enabled_p ())
5189 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5190 "the size of the group of accesses"
5191 " is not a power of 2 or not equal to 3\n");
5192 return false;
5195 /* Check that the permutation is supported. */
5196 if (VECTOR_MODE_P (mode))
5198 unsigned int i, j;
5199 if (count == 3)
5201 unsigned int nelt;
5202 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5204 if (dump_enabled_p ())
5205 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5206 "cannot handle groups of 3 loads for"
5207 " variable-length vectors\n");
5208 return false;
5211 vec_perm_builder sel (nelt, nelt, 1);
5212 sel.quick_grow (nelt);
5213 vec_perm_indices indices;
5214 unsigned int k;
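	  /* Check the same permutation masks that vect_permute_load_chain
	     generates for a group of three loads.  */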
5215 for (k = 0; k < 3; k++)
5217 for (i = 0; i < nelt; i++)
5218 if (3 * i + k < 2 * nelt)
5219 sel[i] = 3 * i + k;
5220 else
5221 sel[i] = 0;
5222 indices.new_vector (sel, 2, nelt);
5223 if (!can_vec_perm_const_p (mode, indices))
5225 if (dump_enabled_p ())
5226 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5227 "shuffle of 3 loads is not supported by"
5228 " target\n");
5229 return false;
5231 for (i = 0, j = 0; i < nelt; i++)
5232 if (3 * i + k < 2 * nelt)
5233 sel[i] = i;
5234 else
5235 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5236 indices.new_vector (sel, 2, nelt);
5237 if (!can_vec_perm_const_p (mode, indices))
5239 if (dump_enabled_p ())
5240 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5241 "shuffle of 3 loads is not supported by"
5242 " target\n");
5243 return false;
5246 return true;
5248 else
5250       /* If the length is not equal to 3, then only a power of 2 is
	  supported.  */
5251 gcc_assert (pow2p_hwi (count));
5252 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5254 /* The encoding has a single stepped pattern. */
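	  /* The first mask is the extract-even selector { 0, 2, 4, ... },
	     represented by its first three elements; the second is the
	     extract-odd selector { 1, 3, 5, ... }.  */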
5255 vec_perm_builder sel (nelt, 1, 3);
5256 sel.quick_grow (3);
5257 for (i = 0; i < 3; i++)
5258 sel[i] = i * 2;
5259 vec_perm_indices indices (sel, 2, nelt);
5260 if (can_vec_perm_const_p (mode, indices))
5262 for (i = 0; i < 3; i++)
5263 sel[i] = i * 2 + 1;
5264 indices.new_vector (sel, 2, nelt);
5265 if (can_vec_perm_const_p (mode, indices))
5266 return true;
5271 if (dump_enabled_p ())
5272 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5273 "extract even/odd not supported by target\n");
5274 return false;
5277 /* Return TRUE if vec_load_lanes is available for COUNT vectors of
5278 type VECTYPE. */
5280 bool
5281 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
5283 return vect_lanes_optab_supported_p ("vec_load_lanes",
5284 vec_load_lanes_optab,
5285 vectype, count);
5288 /* Function vect_permute_load_chain.
5290 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5291 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5292 the input data correctly. Return the final references for loads in
5293 RESULT_CHAIN.
5295 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5296 The input is 4 vectors each containing 8 elements. We assign a number to each
5297 element, the input sequence is:
5299 1st vec: 0 1 2 3 4 5 6 7
5300 2nd vec: 8 9 10 11 12 13 14 15
5301 3rd vec: 16 17 18 19 20 21 22 23
5302 4th vec: 24 25 26 27 28 29 30 31
5304 The output sequence should be:
5306 1st vec: 0 4 8 12 16 20 24 28
5307 2nd vec: 1 5 9 13 17 21 25 29
5308 3rd vec: 2 6 10 14 18 22 26 30
5309 4th vec: 3 7 11 15 19 23 27 31
5311 i.e., the first output vector should contain the first elements of each
5312 interleaving group, etc.
5314 We use extract_even/odd instructions to create such output. The input of
5315 each extract_even/odd operation is two vectors
5316 1st vec 2nd vec
5317 0 1 2 3 4 5 6 7
5319 and the output is the vector of extracted even/odd elements. The output of
5320 extract_even will be: 0 2 4 6
5321 and of extract_odd: 1 3 5 7
5324 The permutation is done in log LENGTH stages. In each stage extract_even
5325 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5326 their order. In our example,
5328 E1: extract_even (1st vec, 2nd vec)
5329 E2: extract_odd (1st vec, 2nd vec)
5330 E3: extract_even (3rd vec, 4th vec)
5331 E4: extract_odd (3rd vec, 4th vec)
5333 The output for the first stage will be:
5335 E1: 0 2 4 6 8 10 12 14
5336 E2: 1 3 5 7 9 11 13 15
5337 E3: 16 18 20 22 24 26 28 30
5338 E4: 17 19 21 23 25 27 29 31
5340 In order to proceed and create the correct sequence for the next stage (or
5341 for the correct output, if the second stage is the last one, as in our
5342 example), we first put the output of extract_even operation and then the
5343 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5344 The input for the second stage is:
5346 1st vec (E1): 0 2 4 6 8 10 12 14
5347 2nd vec (E3): 16 18 20 22 24 26 28 30
5348 3rd vec (E2): 1 3 5 7 9 11 13 15
5349 4th vec (E4): 17 19 21 23 25 27 29 31
5351 The output of the second stage:
5353 E1: 0 4 8 12 16 20 24 28
5354 E2: 2 6 10 14 18 22 26 30
5355 E3: 1 5 9 13 17 21 25 29
5356 E4: 3 7 11 15 19 23 27 31
5358 And RESULT_CHAIN after reordering:
5360 1st vec (E1): 0 4 8 12 16 20 24 28
5361 2nd vec (E3): 1 5 9 13 17 21 25 29
5362 3rd vec (E2): 2 6 10 14 18 22 26 30
5363 4th vec (E4): 3 7 11 15 19 23 27 31. */
5365 static void
5366 vect_permute_load_chain (vec<tree> dr_chain,
5367 unsigned int length,
5368 gimple *stmt,
5369 gimple_stmt_iterator *gsi,
5370 vec<tree> *result_chain)
5372 tree data_ref, first_vect, second_vect;
5373 tree perm_mask_even, perm_mask_odd;
5374 tree perm3_mask_low, perm3_mask_high;
5375 gimple *perm_stmt;
5376 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5377 unsigned int i, j, log_length = exact_log2 (length);
5378 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5380 result_chain->quick_grow (length);
5381 memcpy (result_chain->address (), dr_chain.address (),
5382 length * sizeof (tree));
5384 if (length == 3)
5386 unsigned int k;
5388 vec_perm_builder sel (nelt, nelt, 1);
5389 sel.quick_grow (nelt);
5390 vec_perm_indices indices;
5391 for (k = 0; k < 3; k++)
5393 for (i = 0; i < nelt; i++)
5394 if (3 * i + k < 2 * nelt)
5395 sel[i] = 3 * i + k;
5396 else
5397 sel[i] = 0;
5398 indices.new_vector (sel, 2, nelt);
5399 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5401 for (i = 0, j = 0; i < nelt; i++)
5402 if (3 * i + k < 2 * nelt)
5403 sel[i] = i;
5404 else
5405 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5406 indices.new_vector (sel, 2, nelt);
5407 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5409 first_vect = dr_chain[0];
5410 second_vect = dr_chain[1];
5412 /* Create interleaving stmt (low part of):
5413 low = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5414 ...}> */
5415 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5416 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5417 second_vect, perm3_mask_low);
5418 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5420 /* Create interleaving stmt (high part of):
5421 high = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5422 ...}> */
5423 first_vect = data_ref;
5424 second_vect = dr_chain[2];
5425 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5426 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5427 second_vect, perm3_mask_high);
5428 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5429 (*result_chain)[k] = data_ref;
5432 else
5434 /* If length is not equal to 3 then only a power of 2 is supported. */
5435 gcc_assert (pow2p_hwi (length));
5437 /* The encoding has a single stepped pattern. */
5438 vec_perm_builder sel (nelt, 1, 3);
5439 sel.quick_grow (3);
5440 for (i = 0; i < 3; ++i)
5441 sel[i] = i * 2;
5442 vec_perm_indices indices (sel, 2, nelt);
5443 perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);
5445 for (i = 0; i < 3; ++i)
5446 sel[i] = i * 2 + 1;
5447 indices.new_vector (sel, 2, nelt);
5448 perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
5450 for (i = 0; i < log_length; i++)
5452 for (j = 0; j < length; j += 2)
5454 first_vect = dr_chain[j];
5455 second_vect = dr_chain[j+1];
5457 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5458 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5459 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5460 first_vect, second_vect,
5461 perm_mask_even);
5462 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5463 (*result_chain)[j/2] = data_ref;
5465 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5466 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5467 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5468 first_vect, second_vect,
5469 perm_mask_odd);
5470 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5471 (*result_chain)[j/2+length/2] = data_ref;
5473 memcpy (dr_chain.address (), result_chain->address (),
5474 length * sizeof (tree));
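
/* Illustrative only, not part of GCC: a scalar model of the log2 (LENGTH)
   extract_even/extract_odd stages documented before vect_permute_load_chain.
   Element values stand in for vector lanes; with LENGTH == 4 and NELT == 8
   it reproduces the 0 4 8 12 ... / 1 5 9 13 ... reordering from the worked
   example.  The function name model_permute_load_chain and the fixed sizes
   are made up for this sketch.  */

#include <stdio.h>
#include <string.h>

#define NELT 8
#define LENGTH 4

static void
model_permute_load_chain (int chain[LENGTH][NELT])
{
  int result[LENGTH][NELT];

  for (int stage = 0; (1 << stage) < LENGTH; stage++)
    {
      for (int j = 0; j < LENGTH; j += 2)
        {
          /* Even lanes of the concatenation of vectors j and j + 1 go to
             the first half of the result chain...  */
          for (int i = 0; i < NELT; i++)
            result[j / 2][i] = (2 * i < NELT
                                ? chain[j][2 * i]
                                : chain[j + 1][2 * i - NELT]);
          /* ...and the odd lanes to the second half.  */
          for (int i = 0; i < NELT; i++)
            result[j / 2 + LENGTH / 2][i] = (2 * i + 1 < NELT
                                             ? chain[j][2 * i + 1]
                                             : chain[j + 1][2 * i + 1 - NELT]);
        }
      memcpy (chain, result, sizeof (result));
    }
}

int
main (void)
{
  int chain[LENGTH][NELT];

  for (int v = 0; v < LENGTH; v++)
    for (int i = 0; i < NELT; i++)
      chain[v][i] = v * NELT + i;

  model_permute_load_chain (chain);

  /* Prints 0 4 8 ... 28 / 1 5 9 ... 29 / 2 6 10 ... 30 / 3 7 11 ... 31.  */
  for (int v = 0; v < LENGTH; v++)
    {
      for (int i = 0; i < NELT; i++)
        printf (" %2d", chain[v][i]);
      printf ("\n");
    }
  return 0;
}
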
5479 /* Function vect_shift_permute_load_chain.
5481 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
5482 sequence of stmts to reorder the input data accordingly.
5483 Return the final references for loads in RESULT_CHAIN.
5484 Return true if successful, false otherwise.
5486 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5487 The input is 3 vectors each containing 8 elements. We assign a
5488 number to each element, the input sequence is:
5490 1st vec: 0 1 2 3 4 5 6 7
5491 2nd vec: 8 9 10 11 12 13 14 15
5492 3rd vec: 16 17 18 19 20 21 22 23
5494 The output sequence should be:
5496 1st vec: 0 3 6 9 12 15 18 21
5497 2nd vec: 1 4 7 10 13 16 19 22
5498 3rd vec: 2 5 8 11 14 17 20 23
5500 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
5502 First we shuffle all 3 vectors to get correct elements order:
5504 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
5505 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
5506 3rd vec: (16 19 22) (17 20 23) (18 21)
5508 Next we unite and shift vector 3 times:
5510 1st step:
5511 shift right by 6 the concatenation of:
5512 "1st vec" and "2nd vec"
5513 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
5514 "2nd vec" and "3rd vec"
5515 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
5516 "3rd vec" and "1st vec"
5517 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
5518 | New vectors |
5520 So that now new vectors are:
5522 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
5523 2nd vec: (10 13) (16 19 22) (17 20 23)
5524 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
5526 2nd step:
5527 shift right by 5 the concatenation of:
5528 "1st vec" and "3rd vec"
5529 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
5530 "2nd vec" and "1st vec"
5531 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
5532 "3rd vec" and "2nd vec"
5533 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
5534 | New vectors |
5536 So that now new vectors are:
5538 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
5539 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
5540 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
5542 3rd step:
5543 shift right by 5 the concatenation of:
5544 "1st vec" and "1st vec"
5545 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
5546 shift right by 3 the concatenation of:
5547 "2nd vec" and "2nd vec"
5548 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
5549 | New vectors |
5551 So that now all vectors are READY:
5552 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
5553 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
5554 3rd vec: ( 1 4 7) (10 13) (16 19 22)
5556 This algorithm is faster than the one in vect_permute_load_chain if:
5557 1. "shift of a concatenation" is faster than general permutation.
5558 This is usually so.
5559 2. The TARGET machine can't execute vector instructions in parallel.
5560 This is because each step of the algorithm depends on the previous one.
5561 The algorithm in vect_permute_load_chain is much more parallel.
5563 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
5566 static bool
5567 vect_shift_permute_load_chain (vec<tree> dr_chain,
5568 unsigned int length,
5569 gimple *stmt,
5570 gimple_stmt_iterator *gsi,
5571 vec<tree> *result_chain)
5573 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
5574 tree perm2_mask1, perm2_mask2, perm3_mask;
5575 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
5576 gimple *perm_stmt;
5578 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5579 unsigned int i;
5580 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5581 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5582 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5584 unsigned HOST_WIDE_INT vf;
5585 if (!LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
5586 /* Not supported for variable-length vectors. */
5587 return false;
5589 vec_perm_builder sel (nelt, nelt, 1);
5590 sel.quick_grow (nelt);
5592 result_chain->quick_grow (length);
5593 memcpy (result_chain->address (), dr_chain.address (),
5594 length * sizeof (tree));
5596 if (pow2p_hwi (length) && vf > 4)
5598 unsigned int j, log_length = exact_log2 (length);
5599 for (i = 0; i < nelt / 2; ++i)
5600 sel[i] = i * 2;
5601 for (i = 0; i < nelt / 2; ++i)
5602 sel[nelt / 2 + i] = i * 2 + 1;
5603 vec_perm_indices indices (sel, 2, nelt);
5604 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5606 if (dump_enabled_p ())
5607 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5608 "shuffle of 2 fields structure is not \
5609 supported by target\n");
5610 return false;
5612 perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);
5614 for (i = 0; i < nelt / 2; ++i)
5615 sel[i] = i * 2 + 1;
5616 for (i = 0; i < nelt / 2; ++i)
5617 sel[nelt / 2 + i] = i * 2;
5618 indices.new_vector (sel, 2, nelt);
5619 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5621 if (dump_enabled_p ())
5622 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5623 "shuffle of 2 fields structure is not \
5624 supported by target\n");
5625 return false;
5627 perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);
5629 /* Generating permutation constant to shift all elements.
5630 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
5631 for (i = 0; i < nelt; i++)
5632 sel[i] = nelt / 2 + i;
5633 indices.new_vector (sel, 2, nelt);
5634 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5636 if (dump_enabled_p ())
5637 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5638 "shift permutation is not supported by target\n");
5639 return false;
5641 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
5643 /* Generating permutation constant to select vector from 2.
5644 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
5645 for (i = 0; i < nelt / 2; i++)
5646 sel[i] = i;
5647 for (i = nelt / 2; i < nelt; i++)
5648 sel[i] = nelt + i;
5649 indices.new_vector (sel, 2, nelt);
5650 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5652 if (dump_enabled_p ())
5653 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5654 "select is not supported by target\n");
5655 return false;
5657 select_mask = vect_gen_perm_mask_checked (vectype, indices);
5659 for (i = 0; i < log_length; i++)
5661 for (j = 0; j < length; j += 2)
5663 first_vect = dr_chain[j];
5664 second_vect = dr_chain[j + 1];
5666 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5667 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5668 first_vect, first_vect,
5669 perm2_mask1);
5670 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5671 vect[0] = data_ref;
5673 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5674 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5675 second_vect, second_vect,
5676 perm2_mask2);
5677 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5678 vect[1] = data_ref;
5680 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
5681 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5682 vect[0], vect[1], shift1_mask);
5683 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5684 (*result_chain)[j/2 + length/2] = data_ref;
5686 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
5687 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5688 vect[0], vect[1], select_mask);
5689 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5690 (*result_chain)[j/2] = data_ref;
5692 memcpy (dr_chain.address (), result_chain->address (),
5693 length * sizeof (tree));
5695 return true;
5697 if (length == 3 && vf > 2)
5699 unsigned int k = 0, l = 0;
5701 /* Generating permutation constant to get all elements in right order.
5702 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
5703 for (i = 0; i < nelt; i++)
5705 if (3 * k + (l % 3) >= nelt)
5707 k = 0;
5708 l += (3 - (nelt % 3));
5710 sel[i] = 3 * k + (l % 3);
5711 k++;
5713 vec_perm_indices indices (sel, 2, nelt);
5714 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5716 if (dump_enabled_p ())
5717 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5718 "shuffle of 3 fields structure is not \
5719 supported by target\n");
5720 return false;
5722 perm3_mask = vect_gen_perm_mask_checked (vectype, indices);
5724 /* Generating permutation constant to shift all elements.
5725 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
5726 for (i = 0; i < nelt; i++)
5727 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
5728 indices.new_vector (sel, 2, nelt);
5729 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5731 if (dump_enabled_p ())
5732 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5733 "shift permutation is not supported by target\n");
5734 return false;
5736 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
5738 /* Generating permutation constant to shift all elements.
5739 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5740 for (i = 0; i < nelt; i++)
5741 sel[i] = 2 * (nelt / 3) + 1 + i;
5742 indices.new_vector (sel, 2, nelt);
5743 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5745 if (dump_enabled_p ())
5746 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5747 "shift permutation is not supported by target\n");
5748 return false;
5750 shift2_mask = vect_gen_perm_mask_checked (vectype, indices);
5752 /* Generating permutation constant to shift all elements.
5753 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
5754 for (i = 0; i < nelt; i++)
5755 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
5756 indices.new_vector (sel, 2, nelt);
5757 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5759 if (dump_enabled_p ())
5760 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5761 "shift permutation is not supported by target\n");
5762 return false;
5764 shift3_mask = vect_gen_perm_mask_checked (vectype, indices);
5766 /* Generating permutation constant to shift all elements.
5767 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5768 for (i = 0; i < nelt; i++)
5769 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
5770 indices.new_vector (sel, 2, nelt);
5771 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5773 if (dump_enabled_p ())
5774 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5775 "shift permutation is not supported by target\n");
5776 return false;
5778 shift4_mask = vect_gen_perm_mask_checked (vectype, indices);
5780 for (k = 0; k < 3; k++)
5782 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
5783 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5784 dr_chain[k], dr_chain[k],
5785 perm3_mask);
5786 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5787 vect[k] = data_ref;
5790 for (k = 0; k < 3; k++)
5792 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
5793 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5794 vect[k % 3], vect[(k + 1) % 3],
5795 shift1_mask);
5796 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5797 vect_shift[k] = data_ref;
5800 for (k = 0; k < 3; k++)
5802 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
5803 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5804 vect_shift[(4 - k) % 3],
5805 vect_shift[(3 - k) % 3],
5806 shift2_mask);
5807 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5808 vect[k] = data_ref;
5811 (*result_chain)[3 - (nelt % 3)] = vect[2];
5813 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
5814 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
5815 vect[0], shift3_mask);
5816 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5817 (*result_chain)[nelt % 3] = data_ref;
5819 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
5820 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
5821 vect[1], shift4_mask);
5822 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5823 (*result_chain)[0] = data_ref;
5824 return true;
5826 return false;
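
/* Illustrative only, not part of GCC: recompute, for NELT == 8, the
   permutation constants documented in the comments of the length == 3 case
   above, so they can be checked against the worked example (perm3 should be
   {0 3 6 1 4 7 2 5} and the first shift mask {6 7 8 9 10 11 12 13}).  This
   standalone main is made up for the sketch.  */

#include <stdio.h>

int
main (void)
{
  const unsigned nelt = 8;
  unsigned sel[8];
  unsigned i, k = 0, l = 0;

  /* Same index recurrence as in vect_shift_permute_load_chain.  */
  for (i = 0; i < nelt; i++)
    {
      if (3 * k + (l % 3) >= nelt)
        {
          k = 0;
          l += 3 - (nelt % 3);
        }
      sel[i] = 3 * k + (l % 3);
      k++;
    }

  printf ("perm3: ");
  for (i = 0; i < nelt; i++)
    printf (" %u", sel[i]);

  printf ("\nshift1:");
  for (i = 0; i < nelt; i++)
    printf (" %u", 2 * (nelt / 3) + (nelt % 3) + i);
  printf ("\n");
  return 0;
}
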
5829 /* Function vect_transform_grouped_load.
5831 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
5832 to perform their permutation and assign the resulting vectorized statements to
5833 the scalar statements.
5836 void
5837 vect_transform_grouped_load (gimple *stmt, vec<tree> dr_chain, int size,
5838 gimple_stmt_iterator *gsi)
5840 machine_mode mode;
5841 vec<tree> result_chain = vNULL;
5843 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
5844 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
5845 vectors, that are ready for vector computation. */
5846 result_chain.create (size);
5848 /* If the reassociation width for the vector type is 2 or greater, the target
5849 machine can execute 2 or more vector instructions in parallel.  Otherwise try
5850 to get the chain for the load group using vect_shift_permute_load_chain. */
5851 mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
5852 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
5853 || pow2p_hwi (size)
5854 || !vect_shift_permute_load_chain (dr_chain, size, stmt,
5855 gsi, &result_chain))
5856 vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
5857 vect_record_grouped_load_vectors (stmt, result_chain);
5858 result_chain.release ();
5861 /* RESULT_CHAIN contains the output of a group of grouped loads that were
5862 generated as part of the vectorization of STMT. Assign the statement
5863 for each vector to the associated scalar statement. */
5865 void
5866 vect_record_grouped_load_vectors (gimple *stmt, vec<tree> result_chain)
5868 gimple *first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
5869 gimple *next_stmt, *new_stmt;
5870 unsigned int i, gap_count;
5871 tree tmp_data_ref;
5873 /* Put a permuted data-ref in the VECTORIZED_STMT field.
5874 Since we scan the chain starting from its first node, their order
5875 corresponds to the order of data-refs in RESULT_CHAIN. */
5876 next_stmt = first_stmt;
5877 gap_count = 1;
5878 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
5880 if (!next_stmt)
5881 break;
5883 /* Skip the gaps.  Loads created for the gaps will be removed by the dead
5884 code elimination pass later.  No need to check for the first stmt in
5885 the group, since it always exists.
5886 GROUP_GAP is the number of steps in elements from the previous
5887 access (if there is no gap GROUP_GAP is 1). We skip loads that
5888 correspond to the gaps. */
5889 if (next_stmt != first_stmt
5890 && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
5892 gap_count++;
5893 continue;
5896 while (next_stmt)
5898 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
5899 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
5900 copies, and we put the new vector statement in the first available
5901 RELATED_STMT. */
5902 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
5903 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
5904 else
5906 if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5908 gimple *prev_stmt =
5909 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
5910 gimple *rel_stmt =
5911 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
5912 while (rel_stmt)
5914 prev_stmt = rel_stmt;
5915 rel_stmt =
5916 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
5919 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
5920 new_stmt;
5924 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
5925 gap_count = 1;
5926 /* If NEXT_STMT accesses the same DR as the previous statement,
5927 put the same TMP_DATA_REF as its vectorized statement; otherwise
5928 get the next data-ref from RESULT_CHAIN. */
5929 if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5930 break;
5935 /* Function vect_can_force_dr_alignment_p.
5937 Returns whether the alignment of a DECL can be forced to be aligned
5938 on an ALIGNMENT-bit boundary. */
5940 bool
5941 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
5943 if (!VAR_P (decl))
5944 return false;
5946 if (decl_in_symtab_p (decl)
5947 && !symtab_node::get (decl)->can_increase_alignment_p ())
5948 return false;
5950 if (TREE_STATIC (decl))
5951 return (alignment <= MAX_OFILE_ALIGNMENT);
5952 else
5953 return (alignment <= MAX_STACK_ALIGNMENT);
5957 /* Return whether the data reference DR is supported with respect to its
5958 alignment.
5959 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
5960 if it is aligned, i.e., check if it is possible to vectorize it with different
5961 alignment. */
5963 enum dr_alignment_support
5964 vect_supportable_dr_alignment (struct data_reference *dr,
5965 bool check_aligned_accesses)
5967 gimple *stmt = DR_STMT (dr);
5968 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5969 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5970 machine_mode mode = TYPE_MODE (vectype);
5971 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5972 struct loop *vect_loop = NULL;
5973 bool nested_in_vect_loop = false;
5975 if (aligned_access_p (dr) && !check_aligned_accesses)
5976 return dr_aligned;
5978 /* For now assume all conditional loads/stores support unaligned
5979 access without any special code. */
5980 if (is_gimple_call (stmt)
5981 && gimple_call_internal_p (stmt)
5982 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
5983 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
5984 return dr_unaligned_supported;
5986 if (loop_vinfo)
5988 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
5989 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
5992 /* Possibly unaligned access. */
5994 /* We can choose between using the implicit realignment scheme (generating
5995 a misaligned_move stmt) and the explicit realignment scheme (generating
5996 aligned loads with a REALIGN_LOAD).  There are two variants of the
5997 explicit realignment scheme: optimized and unoptimized.
5998 We can optimize the realignment only if the step between consecutive
5999 vector loads is equal to the vector size. Since the vector memory
6000 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
6001 is guaranteed that the misalignment amount remains the same throughout the
6002 execution of the vectorized loop. Therefore, we can create the
6003 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
6004 at the loop preheader.
6006 However, in the case of outer-loop vectorization, when vectorizing a
6007 memory access in the inner-loop nested within the LOOP that is now being
6008 vectorized, while it is guaranteed that the misalignment of the
6009 vectorized memory access will remain the same in different outer-loop
6010 iterations, it is *not* guaranteed that is will remain the same throughout
6011 the execution of the inner-loop. This is because the inner-loop advances
6012 with the original scalar step (and not in steps of VS). If the inner-loop
6013 step happens to be a multiple of VS, then the misalignment remains fixed
6014 and we can use the optimized realignment scheme. For example:
6016 for (i=0; i<N; i++)
6017 for (j=0; j<M; j++)
6018 s += a[i+j];
6020 When vectorizing the i-loop in the above example, the step between
6021 consecutive vector loads is 1, and so the misalignment does not remain
6022 fixed across the execution of the inner-loop, and the realignment cannot
6023 be optimized (as illustrated in the following pseudo vectorized loop):
6025 for (i=0; i<N; i+=4)
6026 for (j=0; j<M; j++){
6027 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
6028 // when j is {0,1,2,3,4,5,6,7,...} respectively.
6029 // (assuming that we start from an aligned address).
6032 We therefore have to use the unoptimized realignment scheme:
6034 for (i=0; i<N; i+=4)
6035 for (j=k; j<M; j+=4)
6036 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
6037 // that the misalignment of the initial address is
6038 // 0).
6040 The loop can then be vectorized as follows:
6042 for (k=0; k<4; k++){
6043 rt = get_realignment_token (&vp[k]);
6044 for (i=0; i<N; i+=4){
6045 v1 = vp[i+k];
6046 for (j=k; j<M; j+=4){
6047 v2 = vp[i+j+VS-1];
6048 va = REALIGN_LOAD <v1,v2,rt>;
6049 vs += va;
6050 v1 = v2;
6053 } */
6055 if (DR_IS_READ (dr))
6057 bool is_packed = false;
6058 tree type = (TREE_TYPE (DR_REF (dr)));
6060 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
6061 && (!targetm.vectorize.builtin_mask_for_load
6062 || targetm.vectorize.builtin_mask_for_load ()))
6064 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6066 /* If we are doing SLP then the accesses need not have the
6067 same alignment, instead it depends on the SLP group size. */
6068 if (loop_vinfo
6069 && STMT_SLP_TYPE (stmt_info)
6070 && !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6071 * GROUP_SIZE (vinfo_for_stmt
6072 (GROUP_FIRST_ELEMENT (stmt_info))),
6073 TYPE_VECTOR_SUBPARTS (vectype)))
6075 else if (!loop_vinfo
6076 || (nested_in_vect_loop
6077 && (TREE_INT_CST_LOW (DR_STEP (dr))
6078 != GET_MODE_SIZE (TYPE_MODE (vectype)))))
6079 return dr_explicit_realign;
6080 else
6081 return dr_explicit_realign_optimized;
6083 if (!known_alignment_for_access_p (dr))
6084 is_packed = not_size_aligned (DR_REF (dr));
6086 if (targetm.vectorize.support_vector_misalignment
6087 (mode, type, DR_MISALIGNMENT (dr), is_packed))
6088 /* Can't software pipeline the loads, but can at least do them. */
6089 return dr_unaligned_supported;
6091 else
6093 bool is_packed = false;
6094 tree type = (TREE_TYPE (DR_REF (dr)));
6096 if (!known_alignment_for_access_p (dr))
6097 is_packed = not_size_aligned (DR_REF (dr));
6099 if (targetm.vectorize.support_vector_misalignment
6100 (mode, type, DR_MISALIGNMENT (dr), is_packed))
6101 return dr_unaligned_supported;
6104 /* Unsupported. */
6105 return dr_unaligned_unsupported;
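
/* Illustrative only, not part of GCC: a scalar model of the explicit
   realignment scheme described in the comment inside
   vect_supportable_dr_alignment.  Two aligned "vector" loads V1 and V2 are
   combined to recover VS consecutive elements starting at a misaligned
   address; the real REALIGN_LOAD uses a target-specific realignment token,
   whereas here the token RT is simply the misalignment in elements.  The
   helper name realign_load and all sizes are made up for this sketch.  */

#include <stdio.h>

#define VS 4

static void
realign_load (const int *v1, const int *v2, unsigned rt, int out[VS])
{
  /* Take the last VS - RT elements of V1 followed by the first RT
     elements of V2.  */
  for (unsigned i = 0; i < VS; i++)
    out[i] = i < VS - rt ? v1[rt + i] : v2[i - (VS - rt)];
}

int
main (void)
{
  int a[3 * VS], out[VS];

  for (int i = 0; i < 3 * VS; i++)
    a[i] = i;

  unsigned rt = 3;	/* Misalignment of the access, in elements.  */

  /* V1 is the aligned load just below the access, V2 the next one.  */
  realign_load (a, a + VS, rt, out);

  for (int i = 0; i < VS; i++)
    printf (" %d", out[i]);	/* Prints 3 4 5 6, i.e. a[3..6].  */
  printf ("\n");
  return 0;
}
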