gcc/tree-vect-data-refs.c
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2017 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "memmodel.h"
32 #include "tm_p.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "cgraph.h"
36 #include "dumpfile.h"
37 #include "alias.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "tree-eh.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop.h"
47 #include "cfgloop.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
50 #include "expr.h"
51 #include "builtins.h"
52 #include "params.h"
53 #include "tree-cfg.h"
54 #include "tree-hash-traits.h"
56 /* Return true if load- or store-lanes optab OPTAB is implemented for
57 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
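/* As an illustration: assume VECTYPE is a 128-bit V4SI vector and COUNT
   is 2.  The query then looks for a 256-bit integer mode to serve as the
   array mode and checks that OPTAB has a handler for that (array mode,
   vector mode) pair; if no suitable integer mode exists the lookup yields
   BLKmode and the function reports the failure in the dump file.  */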
59 static bool
60 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
61 tree vectype, unsigned HOST_WIDE_INT count)
63 machine_mode mode, array_mode;
64 bool limit_p;
66 mode = TYPE_MODE (vectype);
67 limit_p = !targetm.array_mode_supported_p (mode, count);
68 array_mode = mode_for_size (count * GET_MODE_BITSIZE (mode),
69 MODE_INT, limit_p);
71 if (array_mode == BLKmode)
73 if (dump_enabled_p ())
74 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
75 "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC "]\n",
76 GET_MODE_NAME (mode), count);
77 return false;
80 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
82 if (dump_enabled_p ())
83 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
84 "cannot use %s<%s><%s>\n", name,
85 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
86 return false;
89 if (dump_enabled_p ())
90 dump_printf_loc (MSG_NOTE, vect_location,
91 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
92 GET_MODE_NAME (mode));
94 return true;
98 /* Return the smallest scalar part of STMT.
99 This is used to determine the vectype of the stmt. We generally set the
100 vectype according to the type of the result (lhs). For stmts whose
101 result-type is different than the type of the arguments (e.g., demotion,
102 promotion), vectype will be reset appropriately (later). Note that we have
103 to visit the smallest datatype in this function, because that determines the
104 VF. If the smallest datatype in the loop is present only as the rhs of a
105 promotion operation - we'd miss it.
106 Such a case, where a variable of this datatype does not appear in the lhs
107 anywhere in the loop, can only occur if it's an invariant: e.g.:
108 'int_x = (int) short_inv', which we'd expect to have been optimized away by
109 invariant motion. However, we cannot rely on invariant motion to always
110 take invariants out of the loop, and so in the case of promotion we also
111 have to check the rhs.
112 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
113 types. */
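/* For example, for the promotion statement 'int_x = (int) short_inv',
   assuming a target where int is 4 bytes and short int is 2 bytes, the
   lhs size is 4 and the rhs size is 2; the function then returns the
   short type and sets *LHS_SIZE_UNIT to 4 and *RHS_SIZE_UNIT to 2, so
   that the smaller datatype participates in determining the VF.  */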
115 tree
116 vect_get_smallest_scalar_type (gimple *stmt, HOST_WIDE_INT *lhs_size_unit,
117 HOST_WIDE_INT *rhs_size_unit)
119 tree scalar_type = gimple_expr_type (stmt);
120 HOST_WIDE_INT lhs, rhs;
122 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
124 if (is_gimple_assign (stmt)
125 && (gimple_assign_cast_p (stmt)
126 || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
127 || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
128 || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
130 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
132 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
133 if (rhs < lhs)
134 scalar_type = rhs_type;
137 *lhs_size_unit = lhs;
138 *rhs_size_unit = rhs;
139 return scalar_type;
143 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
144 tested at run-time. Return TRUE if DDR was successfully inserted.
145 Return false if versioning is not supported. */
147 static bool
148 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
150 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
152 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
153 return false;
155 if (!runtime_alias_check_p (ddr, loop,
156 optimize_loop_nest_for_speed_p (loop)))
157 return false;
159 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
160 return true;
164 /* A subroutine of vect_analyze_data_ref_dependence. Handle
165 DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
166 distances. These distances are conservatively correct but they don't
167 reflect a guaranteed dependence.
169 Return true if this function does all the work necessary to avoid
170 an alias or false if the caller should use the dependence distances
171 to limit the vectorization factor in the usual way. LOOP_DEPTH is
172 the depth of the loop described by LOOP_VINFO and the other arguments
173 are as for vect_analyze_data_ref_dependence. */
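/* For instance, if the recorded dependence distance is 4 but the user
   annotated the loop with a safelen of 16 (e.g. via an OpenMP 'simd
   safelen(16)' clause), the accesses are treated as independent and
   *MAX_VF is capped at 16; without such an annotation the distance is
   instead handed to the run-time alias check machinery.  */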
175 static bool
176 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
177 loop_vec_info loop_vinfo,
178 int loop_depth, int *max_vf)
180 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
181 lambda_vector dist_v;
182 unsigned int i;
183 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
185 int dist = dist_v[loop_depth];
186 if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
188 /* If the user asserted safelen >= DIST consecutive iterations
189 can be executed concurrently, assume independence.
191 ??? An alternative would be to add the alias check even
192 in this case, and vectorize the fallback loop with the
193 maximum VF set to safelen. However, if the user has
194 explicitly given a length, it's less likely that that
195 would be a win. */
196 if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
198 if (loop->safelen < *max_vf)
199 *max_vf = loop->safelen;
200 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
201 continue;
204 /* For dependence distances of 2 or more, we have the option
205 of limiting VF or checking for an alias at runtime.
206 Prefer to check at runtime if we can, to avoid limiting
207 the VF unnecessarily when the bases are in fact independent.
209 Note that the alias checks will be removed if the VF ends up
210 being small enough. */
211 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
214 return true;
218 /* Function vect_analyze_data_ref_dependence.
220 Return TRUE if there (might) exist a dependence between a memory-reference
221 DRA and a memory-reference DRB. When the dependence can instead be checked
222 at run time by versioning for alias, return FALSE. Adjust *MAX_VF according
223 to the data dependence. */
225 static bool
226 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
227 loop_vec_info loop_vinfo, int *max_vf)
229 unsigned int i;
230 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
231 struct data_reference *dra = DDR_A (ddr);
232 struct data_reference *drb = DDR_B (ddr);
233 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
234 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
235 lambda_vector dist_v;
236 unsigned int loop_depth;
238 /* In loop analysis all data references should be vectorizable. */
239 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
240 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
241 gcc_unreachable ();
243 /* Independent data accesses. */
244 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
245 return false;
247 if (dra == drb
248 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
249 return false;
251 /* We do not have to consider dependences between accesses that belong
252 to the same group. */
253 if (GROUP_FIRST_ELEMENT (stmtinfo_a)
254 && GROUP_FIRST_ELEMENT (stmtinfo_a) == GROUP_FIRST_ELEMENT (stmtinfo_b))
255 return false;
257 /* Even if we have an anti-dependence then, as the vectorized loop covers at
258 least two scalar iterations, there is always also a true dependence.
259 As the vectorizer does not re-order loads and stores we can ignore
260 the anti-dependence if TBAA can disambiguate both DRs similar to the
261 case with known negative distance anti-dependences (positive
262 distance anti-dependences would violate TBAA constraints). */
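/* For instance, if DRA reads an object through a 'float *' and DRB writes
   through an 'int *', strict-aliasing (TBAA) rules imply the accesses
   cannot overlap; alias_sets_conflict_p then returns false and the
   anti-dependence can safely be ignored.  */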
263 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
264 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
265 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
266 get_alias_set (DR_REF (drb))))
267 return false;
269 /* Unknown data dependence. */
270 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
272 /* If user asserted safelen consecutive iterations can be
273 executed concurrently, assume independence. */
274 if (loop->safelen >= 2)
276 if (loop->safelen < *max_vf)
277 *max_vf = loop->safelen;
278 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
279 return false;
282 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
283 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
285 if (dump_enabled_p ())
287 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
288 "versioning for alias not supported for: "
289 "can't determine dependence between ");
290 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
291 DR_REF (dra));
292 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
293 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
294 DR_REF (drb));
295 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
297 return true;
300 if (dump_enabled_p ())
302 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
303 "versioning for alias required: "
304 "can't determine dependence between ");
305 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
306 DR_REF (dra));
307 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
308 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
309 DR_REF (drb));
310 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
313 /* Add to list of ddrs that need to be tested at run-time. */
314 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
317 /* Known data dependence. */
318 if (DDR_NUM_DIST_VECTS (ddr) == 0)
320 /* If user asserted safelen consecutive iterations can be
321 executed concurrently, assume independence. */
322 if (loop->safelen >= 2)
324 if (loop->safelen < *max_vf)
325 *max_vf = loop->safelen;
326 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
327 return false;
330 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
331 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
333 if (dump_enabled_p ())
335 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
336 "versioning for alias not supported for: "
337 "bad dist vector for ");
338 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
339 DR_REF (dra));
340 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
341 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
342 DR_REF (drb));
343 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
345 return true;
348 if (dump_enabled_p ())
350 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
351 "versioning for alias required: "
352 "bad dist vector for ");
353 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
354 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
355 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
356 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
358 /* Add to list of ddrs that need to be tested at run-time. */
359 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
362 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
364 if (DDR_COULD_BE_INDEPENDENT_P (ddr)
365 && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
366 loop_depth, max_vf))
367 return false;
369 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
371 int dist = dist_v[loop_depth];
373 if (dump_enabled_p ())
374 dump_printf_loc (MSG_NOTE, vect_location,
375 "dependence distance = %d.\n", dist);
377 if (dist == 0)
379 if (dump_enabled_p ())
381 dump_printf_loc (MSG_NOTE, vect_location,
382 "dependence distance == 0 between ");
383 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
384 dump_printf (MSG_NOTE, " and ");
385 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
386 dump_printf (MSG_NOTE, "\n");
389 /* When we perform grouped accesses and perform implicit CSE
390 by detecting equal accesses and doing disambiguation with
391 runtime alias tests like for
392 .. = a[i];
393 .. = a[i+1];
394 a[i] = ..;
395 a[i+1] = ..;
396 *p = ..;
397 .. = a[i];
398 .. = a[i+1];
399 where we will end up loading { a[i], a[i+1] } once, make
400 sure that inserting group loads before the first load and
401 stores after the last store will do the right thing.
402 Similar for groups like
403 a[i] = ...;
404 ... = a[i];
405 a[i+1] = ...;
406 where loads from the group interleave with the store. */
407 if (STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
408 || STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
410 gimple *earlier_stmt;
411 earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
412 if (DR_IS_WRITE
413 (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
415 if (dump_enabled_p ())
416 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
417 "READ_WRITE dependence in interleaving."
418 "\n");
419 return true;
423 continue;
426 if (dist > 0 && DDR_REVERSED_P (ddr))
428 /* If DDR_REVERSED_P the order of the data-refs in DDR was
429 reversed (to make distance vector positive), and the actual
430 distance is negative. */
431 if (dump_enabled_p ())
432 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
433 "dependence distance negative.\n");
434 /* Record a negative dependence distance to later limit the
435 amount of stmt copying / unrolling we can perform.
436 Only need to handle read-after-write dependence. */
437 if (DR_IS_READ (drb)
438 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
439 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
440 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
441 continue;
444 if (abs (dist) >= 2
445 && abs (dist) < *max_vf)
447 /* The dependence distance requires reduction of the maximal
448 vectorization factor. */
449 *max_vf = abs (dist);
450 if (dump_enabled_p ())
451 dump_printf_loc (MSG_NOTE, vect_location,
452 "adjusting maximal vectorization factor to %i\n",
453 *max_vf);
456 if (abs (dist) >= *max_vf)
458 /* Dependence distance does not create dependence, as far as
459 vectorization is concerned, in this case. */
460 if (dump_enabled_p ())
461 dump_printf_loc (MSG_NOTE, vect_location,
462 "dependence distance >= VF.\n");
463 continue;
466 if (dump_enabled_p ())
468 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
469 "not vectorized, possible dependence "
470 "between data-refs ");
471 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
472 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
473 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
474 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
477 return true;
480 return false;
483 /* Function vect_analyze_data_ref_dependences.
485 Examine all the data references in the loop, and make sure there do not
486 exist any data dependences between them. Set *MAX_VF according to
487 the maximum vectorization factor the data dependences allow. */
489 bool
490 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo, int *max_vf)
492 unsigned int i;
493 struct data_dependence_relation *ddr;
495 if (dump_enabled_p ())
496 dump_printf_loc (MSG_NOTE, vect_location,
497 "=== vect_analyze_data_ref_dependences ===\n");
499 LOOP_VINFO_DDRS (loop_vinfo)
500 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
501 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
502 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
503 /* We need read-read dependences to compute STMT_VINFO_SAME_ALIGN_REFS. */
504 if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
505 &LOOP_VINFO_DDRS (loop_vinfo),
506 LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
507 return false;
509 /* For epilogues we either have no aliases or alias versioning
510 was applied to the original loop. Therefore we may just get max_vf
511 using the VF of the original loop. */
512 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
513 *max_vf = LOOP_VINFO_ORIG_VECT_FACTOR (loop_vinfo);
514 else
515 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
516 if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
517 return false;
519 return true;
523 /* Function vect_slp_analyze_data_ref_dependence.
525 Return TRUE if there (might) exist a dependence between a memory-reference
526 DRA and a memory-reference DRB of DDR. Unlike the loop variant, no run-time
527 alias check is created here, so a possible dependence simply makes the
528 SLP instance unvectorizable. */
530 static bool
531 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
533 struct data_reference *dra = DDR_A (ddr);
534 struct data_reference *drb = DDR_B (ddr);
536 /* We need to check dependences of statements marked as unvectorizable
537 as well, they still can prohibit vectorization. */
539 /* Independent data accesses. */
540 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
541 return false;
543 if (dra == drb)
544 return false;
546 /* Read-read is OK. */
547 if (DR_IS_READ (dra) && DR_IS_READ (drb))
548 return false;
550 /* If dra and drb are part of the same interleaving chain consider
551 them independent. */
552 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
553 && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
554 == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
555 return false;
557 /* Unknown data dependence. */
558 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
560 if (dump_enabled_p ())
562 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
563 "can't determine dependence between ");
564 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
565 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
566 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
567 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
570 else if (dump_enabled_p ())
572 dump_printf_loc (MSG_NOTE, vect_location,
573 "determined dependence between ");
574 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
575 dump_printf (MSG_NOTE, " and ");
576 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
577 dump_printf (MSG_NOTE, "\n");
580 return true;
584 /* Analyze dependences involved in the transform of SLP NODE. STORES
585 contain the vector of scalar stores of this instance if we are
586 disambiguating the loads. */
588 static bool
589 vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
590 vec<gimple *> stores, gimple *last_store)
592 /* This walks over all stmts involved in the SLP load/store done
593 in NODE verifying we can sink them up to the last stmt in the
594 group. */
595 gimple *last_access = vect_find_last_scalar_stmt_in_slp (node);
596 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
598 gimple *access = SLP_TREE_SCALAR_STMTS (node)[k];
599 if (access == last_access)
600 continue;
601 data_reference *dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (access));
602 for (gimple_stmt_iterator gsi = gsi_for_stmt (access);
603 gsi_stmt (gsi) != last_access; gsi_next (&gsi))
605 gimple *stmt = gsi_stmt (gsi);
606 if (! gimple_vuse (stmt)
607 || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
608 continue;
610 /* If we couldn't record a (single) data reference for this
611 stmt we have to give up. */
612 /* ??? Here and below if dependence analysis fails we can resort
613 to the alias oracle which can handle more kinds of stmts. */
614 data_reference *dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
615 if (!dr_b)
616 return false;
618 bool dependent = false;
619 /* If we run into a store of this same instance (we've just
620 marked those) then delay dependence checking until we run
621 into the last store because this is where it will have
622 been sunk to (and we verify if we can do that as well). */
623 if (gimple_visited_p (stmt))
625 if (stmt != last_store)
626 continue;
627 unsigned i;
628 gimple *store;
629 FOR_EACH_VEC_ELT (stores, i, store)
631 data_reference *store_dr
632 = STMT_VINFO_DATA_REF (vinfo_for_stmt (store));
633 ddr_p ddr = initialize_data_dependence_relation
634 (dr_a, store_dr, vNULL);
635 dependent = vect_slp_analyze_data_ref_dependence (ddr);
636 free_dependence_relation (ddr);
637 if (dependent)
638 break;
641 else
643 ddr_p ddr = initialize_data_dependence_relation (dr_a,
644 dr_b, vNULL);
645 dependent = vect_slp_analyze_data_ref_dependence (ddr);
646 free_dependence_relation (ddr);
648 if (dependent)
649 return false;
652 return true;
656 /* Function vect_slp_analyze_instance_dependence.
658 Examine all the data references in the basic-block SLP instance, and make
659 sure there do not exist any data dependences between them that would
660 prevent the instance from being vectorized. */
662 bool
663 vect_slp_analyze_instance_dependence (slp_instance instance)
665 if (dump_enabled_p ())
666 dump_printf_loc (MSG_NOTE, vect_location,
667 "=== vect_slp_analyze_instance_dependence ===\n");
669 /* The stores of this instance are at the root of the SLP tree. */
670 slp_tree store = SLP_INSTANCE_TREE (instance);
671 if (! STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (store)[0])))
672 store = NULL;
674 /* Verify we can sink stores to the vectorized stmt insert location. */
675 gimple *last_store = NULL;
676 if (store)
678 if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
679 return false;
681 /* Mark stores in this instance and remember the last one. */
682 last_store = vect_find_last_scalar_stmt_in_slp (store);
683 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
684 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], true);
687 bool res = true;
689 /* Verify we can sink loads to the vectorized stmt insert location,
690 special-casing stores of this instance. */
691 slp_tree load;
692 unsigned int i;
693 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
694 if (! vect_slp_analyze_node_dependences (instance, load,
695 store
696 ? SLP_TREE_SCALAR_STMTS (store)
697 : vNULL, last_store))
699 res = false;
700 break;
703 /* Unset the visited flag. */
704 if (store)
705 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
706 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], false);
708 return res;
711 /* Record in VINFO the base alignment guarantee given by DRB. STMT is
712 the statement that contains DRB, which is useful for recording in the
713 dump file. */
715 static void
716 vect_record_base_alignment (vec_info *vinfo, gimple *stmt,
717 innermost_loop_behavior *drb)
719 bool existed;
720 innermost_loop_behavior *&entry
721 = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
722 if (!existed || entry->base_alignment < drb->base_alignment)
724 entry = drb;
725 if (dump_enabled_p ())
727 dump_printf_loc (MSG_NOTE, vect_location,
728 "recording new base alignment for ");
729 dump_generic_expr (MSG_NOTE, TDF_SLIM, drb->base_address);
730 dump_printf (MSG_NOTE, "\n");
731 dump_printf_loc (MSG_NOTE, vect_location,
732 " alignment: %d\n", drb->base_alignment);
733 dump_printf_loc (MSG_NOTE, vect_location,
734 " misalignment: %d\n", drb->base_misalignment);
735 dump_printf_loc (MSG_NOTE, vect_location,
736 " based on: ");
737 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
742 /* If the region we're going to vectorize is reached, all unconditional
743 data references occur at least once. We can therefore pool the base
744 alignment guarantees from each unconditional reference. Do this by
745 going through all the data references in VINFO and checking whether
746 the containing statement makes the reference unconditionally. If so,
747 record the alignment of the base address in VINFO so that it can be
748 used for all other references with the same base. */
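/* As an example, if the loop body contains an unconditional load from
   p[i] and a conditional store to p[i+1], the two references typically
   share the same base address; the alignment proved for the unconditional
   load is recorded for that base and can later be reused when computing
   the misalignment of the conditional store.  */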
750 void
751 vect_record_base_alignments (vec_info *vinfo)
753 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
754 struct loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
755 data_reference *dr;
756 unsigned int i;
757 FOR_EACH_VEC_ELT (vinfo->datarefs, i, dr)
758 if (!DR_IS_CONDITIONAL_IN_STMT (dr))
760 gimple *stmt = DR_STMT (dr);
761 vect_record_base_alignment (vinfo, stmt, &DR_INNERMOST (dr));
763 /* If DR is nested in the loop that is being vectorized, we can also
764 record the alignment of the base wrt the outer loop. */
765 if (loop && nested_in_vect_loop_p (loop, stmt))
767 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
768 vect_record_base_alignment
769 (vinfo, stmt, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
774 /* Function vect_compute_data_ref_alignment
776 Compute the misalignment of the data reference DR.
778 Output:
779 1. If during the misalignment computation it is found that the data reference
780 cannot be vectorized then false is returned.
781 2. DR_MISALIGNMENT (DR) is defined.
783 FOR NOW: No analysis is actually performed. Misalignment is calculated
784 only for trivial cases. TODO. */
786 bool
787 vect_compute_data_ref_alignment (struct data_reference *dr)
789 gimple *stmt = DR_STMT (dr);
790 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
791 vec_base_alignments *base_alignments = &stmt_info->vinfo->base_alignments;
792 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
793 struct loop *loop = NULL;
794 tree ref = DR_REF (dr);
795 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
797 if (dump_enabled_p ())
798 dump_printf_loc (MSG_NOTE, vect_location,
799 "vect_compute_data_ref_alignment:\n");
801 if (loop_vinfo)
802 loop = LOOP_VINFO_LOOP (loop_vinfo);
804 /* Initialize misalignment to unknown. */
805 SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
807 innermost_loop_behavior *drb = vect_dr_behavior (dr);
808 bool step_preserves_misalignment_p;
810 /* No step for BB vectorization. */
811 if (!loop)
813 gcc_assert (integer_zerop (drb->step));
814 step_preserves_misalignment_p = true;
817 /* In case the dataref is in an inner-loop of the loop that is being
818 vectorized (LOOP), we use the base and misalignment information
819 relative to the outer-loop (LOOP). This is ok only if the misalignment
820 stays the same throughout the execution of the inner-loop, which is why
821 we have to check that the stride of the dataref in the inner-loop is
822 divisible by the vector size.
823 else if (nested_in_vect_loop_p (loop, stmt))
825 step_preserves_misalignment_p
826 = (DR_STEP_ALIGNMENT (dr)
827 % GET_MODE_SIZE (TYPE_MODE (vectype))) == 0;
829 if (dump_enabled_p ())
831 if (step_preserves_misalignment_p)
832 dump_printf_loc (MSG_NOTE, vect_location,
833 "inner step divides the vector-size.\n");
834 else
835 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
836 "inner step doesn't divide the vector-size.\n");
840 /* Similarly we can only use base and misalignment information relative to
841 an innermost loop if the misalignment stays the same throughout the
842 execution of the loop. As above, this is the case if the stride of
843 the dataref is divisible by the vector size.
844 else
846 unsigned vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
847 step_preserves_misalignment_p
848 = ((DR_STEP_ALIGNMENT (dr) * vf)
849 % GET_MODE_SIZE (TYPE_MODE (vectype))) == 0;
851 if (!step_preserves_misalignment_p && dump_enabled_p ())
852 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
853 "step doesn't divide the vector-size.\n");
856 unsigned int base_alignment = drb->base_alignment;
857 unsigned int base_misalignment = drb->base_misalignment;
858 unsigned HOST_WIDE_INT vector_alignment = TYPE_ALIGN_UNIT (vectype);
860 /* Calculate the maximum of the pooled base address alignment and the
861 alignment that we can compute for DR itself. */
862 innermost_loop_behavior **entry = base_alignments->get (drb->base_address);
863 if (entry && base_alignment < (*entry)->base_alignment)
865 base_alignment = (*entry)->base_alignment;
866 base_misalignment = (*entry)->base_misalignment;
869 if (drb->offset_alignment < vector_alignment
870 || !step_preserves_misalignment_p
871 /* We need to know whether the step wrt the vectorized loop is
872 negative when computing the starting misalignment below. */
873 || TREE_CODE (drb->step) != INTEGER_CST)
875 if (dump_enabled_p ())
877 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
878 "Unknown alignment for access: ");
879 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
880 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
882 return true;
885 if (base_alignment < vector_alignment)
887 tree base = drb->base_address;
888 if (TREE_CODE (base) == ADDR_EXPR)
889 base = TREE_OPERAND (base, 0);
890 if (!vect_can_force_dr_alignment_p (base,
891 vector_alignment * BITS_PER_UNIT))
893 if (dump_enabled_p ())
895 dump_printf_loc (MSG_NOTE, vect_location,
896 "can't force alignment of ref: ");
897 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
898 dump_printf (MSG_NOTE, "\n");
900 return true;
903 if (DECL_USER_ALIGN (base))
905 if (dump_enabled_p ())
907 dump_printf_loc (MSG_NOTE, vect_location,
908 "not forcing alignment of user-aligned "
909 "variable: ");
910 dump_generic_expr (MSG_NOTE, TDF_SLIM, base);
911 dump_printf (MSG_NOTE, "\n");
913 return true;
916 /* Force the alignment of the decl.
917 NOTE: This is the only change to the code we make during
918 the analysis phase, before deciding to vectorize the loop. */
919 if (dump_enabled_p ())
921 dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
922 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
923 dump_printf (MSG_NOTE, "\n");
926 DR_VECT_AUX (dr)->base_decl = base;
927 DR_VECT_AUX (dr)->base_misaligned = true;
928 base_misalignment = 0;
930 unsigned int misalignment = (base_misalignment
931 + TREE_INT_CST_LOW (drb->init));
933 /* If this is a backward running DR then first access in the larger
934 vectype actually is N-1 elements before the address in the DR.
935 Adjust misalign accordingly. */
936 if (tree_int_cst_sgn (drb->step) < 0)
937 /* PLUS because STEP is negative. */
938 misalignment += ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
939 * TREE_INT_CST_LOW (drb->step));
941 SET_DR_MISALIGNMENT (dr, misalignment & (vector_alignment - 1));
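/* Worked example: on a target where VECTYPE is V4SI with a 16-byte
   alignment requirement, BASE_MISALIGNMENT 4 and INIT 8 give a
   misalignment of 12 bytes (12 & 15).  For a backward access with step
   -4 the first vector element read is 3 elements earlier, so
   12 + 3 * -4 = 0 and the access turns out to be aligned.  */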
943 if (dump_enabled_p ())
945 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
946 "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
947 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
948 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
951 return true;
955 /* Function vect_update_misalignment_for_peel.
956 Sets DR's misalignment
957 - to 0 if it has the same alignment as DR_PEEL,
958 - to the misalignment computed using NPEEL if DR's misalignment is known,
959 - to -1 (unknown) otherwise.
961 DR - the data reference whose misalignment is to be adjusted.
962 DR_PEEL - the data reference whose misalignment is being made
963 zero in the vector loop by the peel.
964 NPEEL - the number of iterations in the peel loop if the misalignment
965 of DR_PEEL is known at compile time. */
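/* Worked example: assume DR is an int access whose misalignment against
   a 16-byte vector alignment is known to be 4, DR_PEEL's misalignment is
   also known, DR is not in DR_PEEL's same-alignment list, and NPEEL is 3.
   Peeling three scalar iterations advances DR by 3 * 4 = 12 bytes, so its
   new misalignment becomes (4 + 12) & 15 = 0.  */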
967 static void
968 vect_update_misalignment_for_peel (struct data_reference *dr,
969 struct data_reference *dr_peel, int npeel)
971 unsigned int i;
972 vec<dr_p> same_aligned_drs;
973 struct data_reference *current_dr;
974 int dr_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
975 int dr_peel_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr_peel))));
976 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
977 stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));
979 /* For interleaved data accesses the step in the loop must be multiplied by
980 the size of the interleaving group. */
981 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
982 dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
983 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
984 dr_peel_size *= GROUP_SIZE (peel_stmt_info);
986 /* It can be assumed that the data refs with the same alignment as dr_peel
987 are aligned in the vector loop. */
988 same_aligned_drs
989 = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
990 FOR_EACH_VEC_ELT (same_aligned_drs, i, current_dr)
992 if (current_dr != dr)
993 continue;
994 gcc_assert (!known_alignment_for_access_p (dr)
995 || !known_alignment_for_access_p (dr_peel)
996 || (DR_MISALIGNMENT (dr) / dr_size
997 == DR_MISALIGNMENT (dr_peel) / dr_peel_size));
998 SET_DR_MISALIGNMENT (dr, 0);
999 return;
1002 if (known_alignment_for_access_p (dr)
1003 && known_alignment_for_access_p (dr_peel))
1005 bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
1006 int misal = DR_MISALIGNMENT (dr);
1007 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1008 misal += negative ? -npeel * dr_size : npeel * dr_size;
1009 misal &= (TYPE_ALIGN (vectype) / BITS_PER_UNIT) - 1;
1010 SET_DR_MISALIGNMENT (dr, misal);
1011 return;
1014 if (dump_enabled_p ())
1015 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1016 "to unknown (-1).\n");
1017 SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
1021 /* Function verify_data_ref_alignment
1023 Return TRUE if DR can be handled with respect to alignment. */
1025 static bool
1026 verify_data_ref_alignment (data_reference_p dr)
1028 enum dr_alignment_support supportable_dr_alignment
1029 = vect_supportable_dr_alignment (dr, false);
1030 if (!supportable_dr_alignment)
1032 if (dump_enabled_p ())
1034 if (DR_IS_READ (dr))
1035 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1036 "not vectorized: unsupported unaligned load.");
1037 else
1038 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1039 "not vectorized: unsupported unaligned "
1040 "store.");
1042 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
1043 DR_REF (dr));
1044 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1046 return false;
1049 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
1050 dump_printf_loc (MSG_NOTE, vect_location,
1051 "Vectorizing an unaligned access.\n");
1053 return true;
1056 /* Function vect_verify_datarefs_alignment
1058 Return TRUE if all data references in the loop can be
1059 handled with respect to alignment. */
1061 bool
1062 vect_verify_datarefs_alignment (loop_vec_info vinfo)
1064 vec<data_reference_p> datarefs = vinfo->datarefs;
1065 struct data_reference *dr;
1066 unsigned int i;
1068 FOR_EACH_VEC_ELT (datarefs, i, dr)
1070 gimple *stmt = DR_STMT (dr);
1071 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1073 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1074 continue;
1076 /* For interleaving, only the alignment of the first access matters. */
1077 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1078 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1079 continue;
1081 /* Strided accesses perform only component accesses, alignment is
1082 irrelevant for them. */
1083 if (STMT_VINFO_STRIDED_P (stmt_info)
1084 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1085 continue;
1087 if (! verify_data_ref_alignment (dr))
1088 return false;
1091 return true;
1094 /* Given a memory reference EXP return whether its alignment is less
1095 than its size. */
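/* E.g. a 32-bit int member of a packed structure may only be guaranteed
   1-byte (8-bit) alignment; since its size (32 bits) exceeds that
   alignment, the reference is considered packed.  */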
1097 static bool
1098 not_size_aligned (tree exp)
1100 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1101 return true;
1103 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1104 > get_object_alignment (exp));
1107 /* Function vector_alignment_reachable_p
1109 Return true if vector alignment for DR is reachable by peeling
1110 a few loop iterations. Return false otherwise. */
1112 static bool
1113 vector_alignment_reachable_p (struct data_reference *dr)
1115 gimple *stmt = DR_STMT (dr);
1116 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1117 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1119 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1121 /* For interleaved access we peel only if the number of iterations in
1122 the prolog loop ({VF - misalignment}) is a multiple of the
1123 number of the interleaved accesses. */
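/* For instance, with 8 elements per vector, a known misalignment of
   3 elements and a group size of 2, the prolog would need 8 - 3 = 5
   scalar iterations, which is not a multiple of 2, so peeling is not
   attempted for this access.  */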
1124 int elem_size, mis_in_elements;
1125 int nelements = TYPE_VECTOR_SUBPARTS (vectype);
1127 /* FORNOW: handle only known alignment. */
1128 if (!known_alignment_for_access_p (dr))
1129 return false;
1131 elem_size = GET_MODE_SIZE (TYPE_MODE (vectype)) / nelements;
1132 mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
1134 if ((nelements - mis_in_elements) % GROUP_SIZE (stmt_info))
1135 return false;
1138 /* If misalignment is known at the compile time then allow peeling
1139 only if natural alignment is reachable through peeling. */
1140 if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
1142 HOST_WIDE_INT elmsize =
1143 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1144 if (dump_enabled_p ())
1146 dump_printf_loc (MSG_NOTE, vect_location,
1147 "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1148 dump_printf (MSG_NOTE,
1149 ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
1151 if (DR_MISALIGNMENT (dr) % elmsize)
1153 if (dump_enabled_p ())
1154 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1155 "data size does not divide the misalignment.\n");
1156 return false;
1160 if (!known_alignment_for_access_p (dr))
1162 tree type = TREE_TYPE (DR_REF (dr));
1163 bool is_packed = not_size_aligned (DR_REF (dr));
1164 if (dump_enabled_p ())
1165 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1166 "Unknown misalignment, %snaturally aligned\n",
1167 is_packed ? "not " : "");
1168 return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1171 return true;
1175 /* Calculate the cost of the memory access represented by DR. */
1177 static void
1178 vect_get_data_access_cost (struct data_reference *dr,
1179 unsigned int *inside_cost,
1180 unsigned int *outside_cost,
1181 stmt_vector_for_cost *body_cost_vec)
1183 gimple *stmt = DR_STMT (dr);
1184 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1185 int nunits = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
1186 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1187 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1188 int ncopies = MAX (1, vf / nunits); /* TODO: Handle SLP properly */
1190 if (DR_IS_READ (dr))
1191 vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1192 NULL, body_cost_vec, false);
1193 else
1194 vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
1196 if (dump_enabled_p ())
1197 dump_printf_loc (MSG_NOTE, vect_location,
1198 "vect_get_data_access_cost: inside_cost = %d, "
1199 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1203 typedef struct _vect_peel_info
1205 struct data_reference *dr;
1206 int npeel;
1207 unsigned int count;
1208 } *vect_peel_info;
1210 typedef struct _vect_peel_extended_info
1212 struct _vect_peel_info peel_info;
1213 unsigned int inside_cost;
1214 unsigned int outside_cost;
1215 } *vect_peel_extended_info;
1218 /* Peeling hashtable helpers. */
1220 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1222 static inline hashval_t hash (const _vect_peel_info *);
1223 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1226 inline hashval_t
1227 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1229 return (hashval_t) peel_info->npeel;
1232 inline bool
1233 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1235 return (a->npeel == b->npeel);
1239 /* Insert DR into peeling hash table with NPEEL as key. */
1241 static void
1242 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1243 loop_vec_info loop_vinfo, struct data_reference *dr,
1244 int npeel)
1246 struct _vect_peel_info elem, *slot;
1247 _vect_peel_info **new_slot;
1248 bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1250 elem.npeel = npeel;
1251 slot = peeling_htab->find (&elem);
1252 if (slot)
1253 slot->count++;
1254 else
1256 slot = XNEW (struct _vect_peel_info);
1257 slot->npeel = npeel;
1258 slot->dr = dr;
1259 slot->count = 1;
1260 new_slot = peeling_htab->find_slot (slot, INSERT);
1261 *new_slot = slot;
1264 if (!supportable_dr_alignment
1265 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1266 slot->count += VECT_MAX_COST;
1270 /* Traverse peeling hash table to find peeling option that aligns maximum
1271 number of data accesses. */
1274 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1275 _vect_peel_extended_info *max)
1277 vect_peel_info elem = *slot;
1279 if (elem->count > max->peel_info.count
1280 || (elem->count == max->peel_info.count
1281 && max->peel_info.npeel > elem->npeel))
1283 max->peel_info.npeel = elem->npeel;
1284 max->peel_info.count = elem->count;
1285 max->peel_info.dr = elem->dr;
1288 return 1;
1291 /* Get the costs of peeling NPEEL iterations checking data access costs
1292 for all data refs. If UNKNOWN_MISALIGNMENT is true, we assume DR0's
1293 misalignment will be zero after peeling. */
1295 static void
1296 vect_get_peeling_costs_all_drs (vec<data_reference_p> datarefs,
1297 struct data_reference *dr0,
1298 unsigned int *inside_cost,
1299 unsigned int *outside_cost,
1300 stmt_vector_for_cost *body_cost_vec,
1301 unsigned int npeel,
1302 bool unknown_misalignment)
1304 unsigned i;
1305 data_reference *dr;
1307 FOR_EACH_VEC_ELT (datarefs, i, dr)
1309 gimple *stmt = DR_STMT (dr);
1310 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1311 /* For interleaving, only the alignment of the first access
1312 matters. */
1313 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1314 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1315 continue;
1317 /* Strided accesses perform only component accesses, alignment is
1318 irrelevant for them. */
1319 if (STMT_VINFO_STRIDED_P (stmt_info)
1320 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1321 continue;
1323 int save_misalignment;
1324 save_misalignment = DR_MISALIGNMENT (dr);
1325 if (npeel == 0)
1327 else if (unknown_misalignment && dr == dr0)
1328 SET_DR_MISALIGNMENT (dr, 0);
1329 else
1330 vect_update_misalignment_for_peel (dr, dr0, npeel);
1331 vect_get_data_access_cost (dr, inside_cost, outside_cost,
1332 body_cost_vec);
1333 SET_DR_MISALIGNMENT (dr, save_misalignment);
1337 /* Traverse peeling hash table and calculate cost for each peeling option.
1338 Find the one with the lowest cost. */
1341 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1342 _vect_peel_extended_info *min)
1344 vect_peel_info elem = *slot;
1345 int dummy;
1346 unsigned int inside_cost = 0, outside_cost = 0;
1347 gimple *stmt = DR_STMT (elem->dr);
1348 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1349 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1350 stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1351 epilogue_cost_vec;
1353 prologue_cost_vec.create (2);
1354 body_cost_vec.create (2);
1355 epilogue_cost_vec.create (2);
1357 vect_get_peeling_costs_all_drs (LOOP_VINFO_DATAREFS (loop_vinfo),
1358 elem->dr, &inside_cost, &outside_cost,
1359 &body_cost_vec, elem->npeel, false);
1361 body_cost_vec.release ();
1363 outside_cost += vect_get_known_peeling_cost
1364 (loop_vinfo, elem->npeel, &dummy,
1365 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1366 &prologue_cost_vec, &epilogue_cost_vec);
1368 /* Prologue and epilogue costs are added to the target model later.
1369 These costs depend only on the scalar iteration cost, the
1370 number of peeling iterations finally chosen, and the number of
1371 misaligned statements. So discard the information found here. */
1372 prologue_cost_vec.release ();
1373 epilogue_cost_vec.release ();
1375 if (inside_cost < min->inside_cost
1376 || (inside_cost == min->inside_cost
1377 && outside_cost < min->outside_cost))
1379 min->inside_cost = inside_cost;
1380 min->outside_cost = outside_cost;
1381 min->peel_info.dr = elem->dr;
1382 min->peel_info.npeel = elem->npeel;
1383 min->peel_info.count = elem->count;
1386 return 1;
1390 /* Choose best peeling option by traversing peeling hash table and either
1391 choosing an option with the lowest cost (if cost model is enabled) or the
1392 option that aligns as many accesses as possible. */
1394 static struct _vect_peel_extended_info
1395 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1396 loop_vec_info loop_vinfo)
1398 struct _vect_peel_extended_info res;
1400 res.peel_info.dr = NULL;
1402 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1404 res.inside_cost = INT_MAX;
1405 res.outside_cost = INT_MAX;
1406 peeling_htab->traverse <_vect_peel_extended_info *,
1407 vect_peeling_hash_get_lowest_cost> (&res);
1409 else
1411 res.peel_info.count = 0;
1412 peeling_htab->traverse <_vect_peel_extended_info *,
1413 vect_peeling_hash_get_most_frequent> (&res);
1414 res.inside_cost = 0;
1415 res.outside_cost = 0;
1418 return res;
1421 /* Return true if the new peeling NPEEL is supported. */
1423 static bool
1424 vect_peeling_supportable (loop_vec_info loop_vinfo, struct data_reference *dr0,
1425 unsigned npeel)
1427 unsigned i;
1428 struct data_reference *dr = NULL;
1429 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1430 gimple *stmt;
1431 stmt_vec_info stmt_info;
1432 enum dr_alignment_support supportable_dr_alignment;
1434 /* Ensure that all data refs can be vectorized after the peel. */
1435 FOR_EACH_VEC_ELT (datarefs, i, dr)
1437 int save_misalignment;
1439 if (dr == dr0)
1440 continue;
1442 stmt = DR_STMT (dr);
1443 stmt_info = vinfo_for_stmt (stmt);
1444 /* For interleaving, only the alignment of the first access
1445 matters. */
1446 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1447 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1448 continue;
1450 /* Strided accesses perform only component accesses, alignment is
1451 irrelevant for them. */
1452 if (STMT_VINFO_STRIDED_P (stmt_info)
1453 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1454 continue;
1456 save_misalignment = DR_MISALIGNMENT (dr);
1457 vect_update_misalignment_for_peel (dr, dr0, npeel);
1458 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1459 SET_DR_MISALIGNMENT (dr, save_misalignment);
1461 if (!supportable_dr_alignment)
1462 return false;
1465 return true;
1468 /* Function vect_enhance_data_refs_alignment
1470 This pass will use loop versioning and loop peeling in order to enhance
1471 the alignment of data references in the loop.
1473 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1474 original loop is to be vectorized. Any other loops that are created by
1475 the transformations performed in this pass - are not supposed to be
1476 vectorized. This restriction will be relaxed.
1478 This pass will require a cost model to guide it whether to apply peeling
1479 or versioning or a combination of the two. For example, the scheme that
1480 Intel uses when given a loop with several memory accesses is as follows:
1481 choose one memory access ('p') whose alignment you want to force by doing
1482 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1483 other accesses are not necessarily aligned, or (2) use loop versioning to
1484 generate one loop in which all accesses are aligned, and another loop in
1485 which only 'p' is necessarily aligned.
1487 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1488 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1489 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1491 Devising a cost model is the most critical aspect of this work. It will
1492 guide us on which access to peel for, whether to use loop versioning, how
1493 many versions to create, etc. The cost model will probably consist of
1494 generic considerations as well as target specific considerations (on
1495 powerpc for example, misaligned stores are more painful than misaligned
1496 loads).
1498 Here are the general steps involved in alignment enhancements:
1500 -- original loop, before alignment analysis:
1501 for (i=0; i<N; i++){
1502 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1503 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1506 -- After vect_compute_data_refs_alignment:
1507 for (i=0; i<N; i++){
1508 x = q[i]; # DR_MISALIGNMENT(q) = 3
1509 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1512 -- Possibility 1: we do loop versioning:
1513 if (p is aligned) {
1514 for (i=0; i<N; i++){ # loop 1A
1515 x = q[i]; # DR_MISALIGNMENT(q) = 3
1516 p[i] = y; # DR_MISALIGNMENT(p) = 0
1519 else {
1520 for (i=0; i<N; i++){ # loop 1B
1521 x = q[i]; # DR_MISALIGNMENT(q) = 3
1522 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1526 -- Possibility 2: we do loop peeling:
1527 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1528 x = q[i];
1529 p[i] = y;
1531 for (i = 3; i < N; i++){ # loop 2A
1532 x = q[i]; # DR_MISALIGNMENT(q) = 0
1533 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1536 -- Possibility 3: combination of loop peeling and versioning:
1537 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1538 x = q[i];
1539 p[i] = y;
1541 if (p is aligned) {
1542 for (i = 3; i<N; i++){ # loop 3A
1543 x = q[i]; # DR_MISALIGNMENT(q) = 0
1544 p[i] = y; # DR_MISALIGNMENT(p) = 0
1547 else {
1548 for (i = 3; i<N; i++){ # loop 3B
1549 x = q[i]; # DR_MISALIGNMENT(q) = 0
1550 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1554 These loops are later passed to loop_transform to be vectorized. The
1555 vectorizer will use the alignment information to guide the transformation
1556 (whether to generate regular loads/stores, or with special handling for
1557 misalignment). */
1559 bool
1560 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1562 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1563 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1564 enum dr_alignment_support supportable_dr_alignment;
1565 struct data_reference *dr0 = NULL, *first_store = NULL;
1566 struct data_reference *dr;
1567 unsigned int i, j;
1568 bool do_peeling = false;
1569 bool do_versioning = false;
1570 bool stat;
1571 gimple *stmt;
1572 stmt_vec_info stmt_info;
1573 unsigned int npeel = 0;
1574 bool one_misalignment_known = false;
1575 bool one_misalignment_unknown = false;
1576 bool one_dr_unsupportable = false;
1577 struct data_reference *unsupportable_dr = NULL;
1578 unsigned int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1579 unsigned possible_npeel_number = 1;
1580 tree vectype;
1581 unsigned int nelements, mis, same_align_drs_max = 0;
1582 hash_table<peel_info_hasher> peeling_htab (1);
1584 if (dump_enabled_p ())
1585 dump_printf_loc (MSG_NOTE, vect_location,
1586 "=== vect_enhance_data_refs_alignment ===\n");
1588 /* Reset data so we can safely be called multiple times. */
1589 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1590 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1592 /* While cost model enhancements are expected in the future, the high level
1593 view of the code at this time is as follows:
1595 A) If there is a misaligned access then see if peeling to align
1596 this access can make all data references satisfy
1597 vect_supportable_dr_alignment. If so, update data structures
1598 as needed and return true.
1600 B) If peeling wasn't possible and there is a data reference with an
1601 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1602 then see if loop versioning checks can be used to make all data
1603 references satisfy vect_supportable_dr_alignment. If so, update
1604 data structures as needed and return true.
1606 C) If neither peeling nor versioning were successful then return false if
1607 any data reference does not satisfy vect_supportable_dr_alignment.
1609 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1611 Note, Possibility 3 above (which is peeling and versioning together) is not
1612 being done at this time. */
1614 /* (1) Peeling to force alignment. */
1616 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1617 Considerations:
1618 + How many accesses will become aligned due to the peeling
1619 - How many accesses will become unaligned due to the peeling,
1620 and the cost of misaligned accesses.
1621 - The cost of peeling (the extra runtime checks, the increase
1622 in code size). */
1624 FOR_EACH_VEC_ELT (datarefs, i, dr)
1626 stmt = DR_STMT (dr);
1627 stmt_info = vinfo_for_stmt (stmt);
1629 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1630 continue;
1632 /* For interleaving, only the alignment of the first access
1633 matters. */
1634 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1635 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1636 continue;
1638 /* For invariant accesses there is nothing to enhance. */
1639 if (integer_zerop (DR_STEP (dr)))
1640 continue;
1642 /* Strided accesses perform only component accesses, alignment is
1643 irrelevant for them. */
1644 if (STMT_VINFO_STRIDED_P (stmt_info)
1645 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1646 continue;
1648 supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1649 do_peeling = vector_alignment_reachable_p (dr);
1650 if (do_peeling)
1652 if (known_alignment_for_access_p (dr))
1654 unsigned int npeel_tmp = 0;
1655 bool negative = tree_int_cst_compare (DR_STEP (dr),
1656 size_zero_node) < 0;
1658 vectype = STMT_VINFO_VECTYPE (stmt_info);
1659 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1660 mis = DR_MISALIGNMENT (dr) / GET_MODE_SIZE (TYPE_MODE (
1661 TREE_TYPE (DR_REF (dr))));
1662 if (DR_MISALIGNMENT (dr) != 0)
1663 npeel_tmp = (negative ? (mis - nelements)
1664 : (nelements - mis)) & (nelements - 1);
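/* For example, with 4-byte elements, 4 elements per vector and a
   misalignment of 8 bytes, MIS is 2 elements and a forward access needs
   npeel_tmp = (4 - 2) & 3 = 2 peeled scalar iterations to become
   aligned.  */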
1666 /* For multiple types, it is possible that the bigger type access
1667 will have more than one peeling option. E.g., a loop with two
1668 types: one of size (vector size / 4), and the other one of
669 size (vector size / 8). Vectorization factor will be 8. If both
1670 accesses are misaligned by 3, the first one needs one scalar
1671 iteration to be aligned, and the second one needs 5. But the
1672 first one will be aligned also by peeling 5 scalar
1673 iterations, and in that case both accesses will be aligned.
1674 Hence, except for the immediate peeling amount, we also want
1675 to try to add full vector size, while we don't exceed
1676 vectorization factor.
1677 We do this automatically for cost model, since we calculate
1678 cost for every peeling option. */
1679 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1681 if (STMT_SLP_TYPE (stmt_info))
1682 possible_npeel_number
1683 = (vf * GROUP_SIZE (stmt_info)) / nelements;
1684 else
1685 possible_npeel_number = vf / nelements;
1687 /* NPEEL_TMP is 0 when there is no misalignment, but also
1688 allow peeling NELEMENTS. */
1689 if (DR_MISALIGNMENT (dr) == 0)
1690 possible_npeel_number++;
1693 /* Save info about DR in the hash table. Also include peeling
1694 amounts according to the explanation above. */
1695 for (j = 0; j < possible_npeel_number; j++)
1697 vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1698 dr, npeel_tmp);
1699 npeel_tmp += nelements;
1702 one_misalignment_known = true;
1704 else
1706 /* If we don't know any misalignment values, we prefer
1707 peeling for the data-ref that has the maximum number of data-refs
1708 with the same alignment, unless the target prefers to align
1709 stores over loads. */
1710 unsigned same_align_drs
1711 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1712 if (!dr0
1713 || same_align_drs_max < same_align_drs)
1715 same_align_drs_max = same_align_drs;
1716 dr0 = dr;
1718 /* For data-refs with the same number of related
1719 accesses prefer the one where the misalign
1720 computation will be invariant in the outermost loop. */
1721 else if (same_align_drs_max == same_align_drs)
1723 struct loop *ivloop0, *ivloop;
1724 ivloop0 = outermost_invariant_loop_for_expr
1725 (loop, DR_BASE_ADDRESS (dr0));
1726 ivloop = outermost_invariant_loop_for_expr
1727 (loop, DR_BASE_ADDRESS (dr));
1728 if ((ivloop && !ivloop0)
1729 || (ivloop && ivloop0
1730 && flow_loop_nested_p (ivloop, ivloop0)))
1731 dr0 = dr;
1734 one_misalignment_unknown = true;
1736 /* Check for data refs with unsupportable alignment that
1737 can be peeled. */
1738 if (!supportable_dr_alignment)
1740 one_dr_unsupportable = true;
1741 unsupportable_dr = dr;
1744 if (!first_store && DR_IS_WRITE (dr))
1745 first_store = dr;
1748 else
1750 if (!aligned_access_p (dr))
1752 if (dump_enabled_p ())
1753 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1754 "vector alignment may not be reachable\n");
1755 break;
1760 /* Check if we can possibly peel the loop. */
1761 if (!vect_can_advance_ivs_p (loop_vinfo)
1762 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1763 || loop->inner)
1764 do_peeling = false;
1766 struct _vect_peel_extended_info peel_for_known_alignment;
1767 struct _vect_peel_extended_info peel_for_unknown_alignment;
1768 struct _vect_peel_extended_info best_peel;
1770 peel_for_unknown_alignment.inside_cost = INT_MAX;
1771 peel_for_unknown_alignment.outside_cost = INT_MAX;
1772 peel_for_unknown_alignment.peel_info.count = 0;
1774 if (do_peeling
1775 && one_misalignment_unknown)
1777 /* Check whether the target prefers aligning stores over loads, i.e., if
1778 misaligned stores are more expensive than misaligned loads (taking
1779 drs with the same alignment into account). */
1780 unsigned int load_inside_cost = 0;
1781 unsigned int load_outside_cost = 0;
1782 unsigned int store_inside_cost = 0;
1783 unsigned int store_outside_cost = 0;
1785 stmt_vector_for_cost dummy;
1786 dummy.create (2);
1787 vect_get_peeling_costs_all_drs (datarefs, dr0,
1788 &load_inside_cost,
1789 &load_outside_cost,
1790 &dummy, vf / 2, true);
1791 dummy.release ();
1793 if (first_store)
1795 dummy.create (2);
1796 vect_get_peeling_costs_all_drs (datarefs, first_store,
1797 &store_inside_cost,
1798 &store_outside_cost,
1799 &dummy, vf / 2, true);
1800 dummy.release ();
1802 else
1804 store_inside_cost = INT_MAX;
1805 store_outside_cost = INT_MAX;
1808 if (load_inside_cost > store_inside_cost
1809 || (load_inside_cost == store_inside_cost
1810 && load_outside_cost > store_outside_cost))
1812 dr0 = first_store;
1813 peel_for_unknown_alignment.inside_cost = store_inside_cost;
1814 peel_for_unknown_alignment.outside_cost = store_outside_cost;
1816 else
1818 peel_for_unknown_alignment.inside_cost = load_inside_cost;
1819 peel_for_unknown_alignment.outside_cost = load_outside_cost;
1822 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1823 prologue_cost_vec.create (2);
1824 epilogue_cost_vec.create (2);
1826 int dummy2;
1827 peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
1828 (loop_vinfo, vf / 2, &dummy2,
1829 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1830 &prologue_cost_vec, &epilogue_cost_vec);
1832 prologue_cost_vec.release ();
1833 epilogue_cost_vec.release ();
1835 peel_for_unknown_alignment.peel_info.count = 1
1836 + STMT_VINFO_SAME_ALIGN_REFS
1837 (vinfo_for_stmt (DR_STMT (dr0))).length ();
1840 peel_for_unknown_alignment.peel_info.npeel = 0;
1841 peel_for_unknown_alignment.peel_info.dr = dr0;
1843 best_peel = peel_for_unknown_alignment;
1845 peel_for_known_alignment.inside_cost = INT_MAX;
1846 peel_for_known_alignment.outside_cost = INT_MAX;
1847 peel_for_known_alignment.peel_info.count = 0;
1848 peel_for_known_alignment.peel_info.dr = NULL;
1850 if (do_peeling && one_misalignment_known)
1852 /* Peeling is possible, but every data access is supported even when it
1853 is misaligned. So we try to choose the best possible peeling from
1854 the hash table. */
1855 peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
1856 (&peeling_htab, loop_vinfo);
1859 /* Compare costs of peeling for known and unknown alignment. */
1860 if (peel_for_known_alignment.peel_info.dr != NULL
1861 && peel_for_unknown_alignment.inside_cost
1862 >= peel_for_known_alignment.inside_cost)
1864 best_peel = peel_for_known_alignment;
1866 /* If the best peeling for known alignment has NPEEL == 0, perform no
1867 peeling at all unless there is an unsupportable dr that we can
1868 align. */
1869 if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
1870 do_peeling = false;
1873 /* If there is an unsupportable data ref, prefer this over all choices so far
1874 since we'd have to discard a chosen peeling except when it accidentally
1875 aligned the unsupportable data ref. */
1876 if (one_dr_unsupportable)
1877 dr0 = unsupportable_dr;
1878 else if (do_peeling)
1880 /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
1881 TODO: Use nopeel_outside_cost or get rid of it? */
1882 unsigned nopeel_inside_cost = 0;
1883 unsigned nopeel_outside_cost = 0;
1885 stmt_vector_for_cost dummy;
1886 dummy.create (2);
1887 vect_get_peeling_costs_all_drs (datarefs, NULL, &nopeel_inside_cost,
1888 &nopeel_outside_cost, &dummy, 0, false);
1889 dummy.release ();
1891 /* Add epilogue costs. As we do not peel for alignment here, no prologue
1892 costs will be recorded. */
1893 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1894 prologue_cost_vec.create (2);
1895 epilogue_cost_vec.create (2);
1897 int dummy2;
1898 nopeel_outside_cost += vect_get_known_peeling_cost
1899 (loop_vinfo, 0, &dummy2,
1900 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1901 &prologue_cost_vec, &epilogue_cost_vec);
1903 prologue_cost_vec.release ();
1904 epilogue_cost_vec.release ();
1906 npeel = best_peel.peel_info.npeel;
1907 dr0 = best_peel.peel_info.dr;
1909 /* If not peeling is no more expensive than the best peeling we
1910 have found so far, don't perform any peeling. */
1911 if (nopeel_inside_cost <= best_peel.inside_cost)
1912 do_peeling = false;
1915 if (do_peeling)
1917 stmt = DR_STMT (dr0);
1918 stmt_info = vinfo_for_stmt (stmt);
1919 vectype = STMT_VINFO_VECTYPE (stmt_info);
1920 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1922 if (known_alignment_for_access_p (dr0))
1924 bool negative = tree_int_cst_compare (DR_STEP (dr0),
1925 size_zero_node) < 0;
1926 if (!npeel)
1928 /* Since it's known at compile time, compute the number of
1929 iterations in the peeled loop (the peeling factor) for use in
1930 updating DR_MISALIGNMENT values. The peeling factor is the
1931 vectorization factor minus the misalignment as an element
1932 count. */
1933 mis = DR_MISALIGNMENT (dr0);
1934 mis /= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr0))));
1935 npeel = ((negative ? mis - nelements : nelements - mis)
1936 & (nelements - 1));
1939 /* For interleaved data access every iteration accesses all the
1940 members of the group; therefore we divide the number of iterations
1941 by the group size. */
1942 stmt_info = vinfo_for_stmt (DR_STMT (dr0));
1943 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1944 npeel /= GROUP_SIZE (stmt_info);
1946 if (dump_enabled_p ())
1947 dump_printf_loc (MSG_NOTE, vect_location,
1948 "Try peeling by %d\n", npeel);
1951 /* Ensure that all datarefs can be vectorized after the peel. */
1952 if (!vect_peeling_supportable (loop_vinfo, dr0, npeel))
1953 do_peeling = false;
1955 /* Check if all datarefs are supportable and log. */
1956 if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
1958 stat = vect_verify_datarefs_alignment (loop_vinfo);
1959 if (!stat)
1960 do_peeling = false;
1961 else
1962 return stat;
1965 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
1966 if (do_peeling)
1968 unsigned max_allowed_peel
1969 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
1970 if (max_allowed_peel != (unsigned)-1)
1972 unsigned max_peel = npeel;
1973 if (max_peel == 0)
1975 gimple *dr_stmt = DR_STMT (dr0);
1976 stmt_vec_info vinfo = vinfo_for_stmt (dr_stmt);
1977 tree vtype = STMT_VINFO_VECTYPE (vinfo);
1978 max_peel = TYPE_VECTOR_SUBPARTS (vtype) - 1;
1980 if (max_peel > max_allowed_peel)
1982 do_peeling = false;
1983 if (dump_enabled_p ())
1984 dump_printf_loc (MSG_NOTE, vect_location,
1985 "Disable peeling, max peels reached: %d\n", max_peel);
1990 /* Cost model #2 - if peeling may result in a remaining loop not
1991 iterating enough to be vectorized then do not peel. */
1992 if (do_peeling
1993 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1995 unsigned max_peel
1996 = npeel == 0 ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1 : npeel;
1997 if (LOOP_VINFO_INT_NITERS (loop_vinfo)
1998 < LOOP_VINFO_VECT_FACTOR (loop_vinfo) + max_peel)
1999 do_peeling = false;
2002 if (do_peeling)
2004 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2005 If the misalignment of DR_i is identical to that of dr0 then set
2006 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
2007 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2008 by the peeling factor times the element size of DR_i (MOD the
2009 vectorization factor times the size). Otherwise, the
2010 misalignment of DR_i must be set to unknown. */
2011 FOR_EACH_VEC_ELT (datarefs, i, dr)
2012 if (dr != dr0)
2014 /* Strided accesses perform only component accesses, alignment
2015 is irrelevant for them. */
2016 stmt_info = vinfo_for_stmt (DR_STMT (dr));
2017 if (STMT_VINFO_STRIDED_P (stmt_info)
2018 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2019 continue;
2021 vect_update_misalignment_for_peel (dr, dr0, npeel);
2024 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
2025 if (npeel)
2026 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2027 else
2028 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2029 = DR_MISALIGNMENT (dr0);
2030 SET_DR_MISALIGNMENT (dr0, 0);
2031 if (dump_enabled_p ())
2033 dump_printf_loc (MSG_NOTE, vect_location,
2034 "Alignment of access forced using peeling.\n");
2035 dump_printf_loc (MSG_NOTE, vect_location,
2036 "Peeling for alignment will be applied.\n");
2039 /* The inside-loop cost will be accounted for in vectorizable_load
2040 and vectorizable_store correctly with adjusted alignments.
2041 Drop the body_cst_vec on the floor here. */
2042 stat = vect_verify_datarefs_alignment (loop_vinfo);
2043 gcc_assert (stat);
2044 return stat;
2048 /* (2) Versioning to force alignment. */
2050 /* Try versioning if:
2051 1) the loop is optimized for speed,
2052 2) there is at least one unsupported misaligned data ref with an unknown
2053 misalignment, and
2054 3) all misaligned data refs with a known misalignment are supported, and
2055 4) the number of runtime alignment checks is within reason. */
2057 do_versioning =
2058 optimize_loop_nest_for_speed_p (loop)
2059 && (!loop->inner); /* FORNOW */
2061 if (do_versioning)
2063 FOR_EACH_VEC_ELT (datarefs, i, dr)
2065 stmt = DR_STMT (dr);
2066 stmt_info = vinfo_for_stmt (stmt);
2068 /* For interleaving, only the alignment of the first access
2069 matters. */
2070 if (aligned_access_p (dr)
2071 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2072 && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
2073 continue;
2075 if (STMT_VINFO_STRIDED_P (stmt_info))
2077 /* Strided loads perform only component accesses, alignment is
2078 irrelevant for them. */
2079 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
2080 continue;
2081 do_versioning = false;
2082 break;
2085 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
2087 if (!supportable_dr_alignment)
2089 gimple *stmt;
2090 int mask;
2091 tree vectype;
2093 if (known_alignment_for_access_p (dr)
2094 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2095 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
2097 do_versioning = false;
2098 break;
2101 stmt = DR_STMT (dr);
2102 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
2103 gcc_assert (vectype);
2105 /* The rightmost bits of an aligned address must be zeros.
2106 Construct the mask needed for this test. For example,
2107 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2108 mask must be 15 = 0xf. */
2109 mask = GET_MODE_SIZE (TYPE_MODE (vectype)) - 1;
2111 /* FORNOW: use the same mask to test all potentially unaligned
2112 references in the loop. The vectorizer currently supports
2113 a single vector size, see the reference to
2114 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
2115 vectorization factor is computed. */
2116 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
2117 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
2118 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2119 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
2120 DR_STMT (dr));
2124 /* Versioning requires at least one misaligned data reference. */
2125 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2126 do_versioning = false;
2127 else if (!do_versioning)
2128 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2131 if (do_versioning)
2133 vec<gimple *> may_misalign_stmts
2134 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2135 gimple *stmt;
2137 /* It can now be assumed that the data references in the statements
2138 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2139 of the loop being vectorized. */
2140 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
2142 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2143 dr = STMT_VINFO_DATA_REF (stmt_info);
2144 SET_DR_MISALIGNMENT (dr, 0);
2145 if (dump_enabled_p ())
2146 dump_printf_loc (MSG_NOTE, vect_location,
2147 "Alignment of access forced using versioning.\n");
2150 if (dump_enabled_p ())
2151 dump_printf_loc (MSG_NOTE, vect_location,
2152 "Versioning for alignment will be applied.\n");
2154 /* Peeling and versioning can't be done together at this time. */
2155 gcc_assert (! (do_peeling && do_versioning));
2157 stat = vect_verify_datarefs_alignment (loop_vinfo);
2158 gcc_assert (stat);
2159 return stat;
2162 /* This point is reached if neither peeling nor versioning is being done. */
2163 gcc_assert (! (do_peeling || do_versioning));
2165 stat = vect_verify_datarefs_alignment (loop_vinfo);
2166 return stat;
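/* Editorial sketch, not part of GCC: a self-contained model of the
   peel-count arithmetic used by the peeling code above.  The function
   name and its parameters are hypothetical; NELEMENTS is assumed to be
   a power of two, as it is for the vector types handled here.  */

static int
example_npeel_for_alignment (int mis_bytes, int elem_size,
                             int nelements, int negative_step)
{
  /* Misalignment counted in elements rather than bytes.  */
  int mis = mis_bytes / elem_size;
  if (mis == 0)
    return 0;
  /* Peeling NELEMENTS - MIS scalar iterations aligns a forward access;
     with a negative step the access walks backwards, so the adjustment
     flips sign.  Masking with NELEMENTS - 1 keeps the result in
     [0, NELEMENTS).  E.g. elem_size 4, mis_bytes 12, nelements 8 gives
     a peel count of 5, matching the comment above about a type with
     vector-size/8 elements misaligned by 3.  */
  return (negative_step ? mis - nelements : nelements - mis)
         & (nelements - 1);
}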
2170 /* Function vect_find_same_alignment_drs.
2172 Update group and alignment relations according to the chosen
2173 vectorization factor. */
2175 static void
2176 vect_find_same_alignment_drs (struct data_dependence_relation *ddr)
2178 struct data_reference *dra = DDR_A (ddr);
2179 struct data_reference *drb = DDR_B (ddr);
2180 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2181 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2183 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
2184 return;
2186 if (dra == drb)
2187 return;
2189 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)
2190 || !operand_equal_p (DR_OFFSET (dra), DR_OFFSET (drb), 0)
2191 || !operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2192 return;
2194 /* Two references with distance zero have the same alignment. */
2195 offset_int diff = (wi::to_offset (DR_INIT (dra))
2196 - wi::to_offset (DR_INIT (drb)));
2197 if (diff != 0)
2199 /* Get the wider of the two alignments. */
2200 unsigned int align_a = TYPE_ALIGN_UNIT (STMT_VINFO_VECTYPE (stmtinfo_a));
2201 unsigned int align_b = TYPE_ALIGN_UNIT (STMT_VINFO_VECTYPE (stmtinfo_b));
2202 unsigned int max_align = MAX (align_a, align_b);
2204 /* Require the gap to be a multiple of the larger vector alignment. */
2205 if (!wi::multiple_of_p (diff, max_align, SIGNED))
2206 return;
2209 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2210 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
2211 if (dump_enabled_p ())
2213 dump_printf_loc (MSG_NOTE, vect_location,
2214 "accesses have the same alignment: ");
2215 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2216 dump_printf (MSG_NOTE, " and ");
2217 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2218 dump_printf (MSG_NOTE, "\n");
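/* Editorial sketch, not part of GCC: the gist of the same-alignment test
   above, on plain integers.  INIT_A/INIT_B stand in for DR_INIT of two
   data-refs that already share base, offset and step; ALIGN_A/ALIGN_B
   stand in for the vector alignments in bytes.  All names are
   hypothetical.  */

static int
example_same_vector_alignment_p (long long init_a, long long init_b,
                                 long long align_a, long long align_b)
{
  long long diff = init_a - init_b;
  if (diff == 0)
    return 1;                   /* distance zero: trivially same alignment */
  long long max_align = align_a > align_b ? align_a : align_b;
  /* Require the gap to be a multiple of the larger vector alignment.  */
  return diff % max_align == 0;
}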
2223 /* Function vect_analyze_data_refs_alignment
2225 Analyze the alignment of the data-references in the loop.
2226 Return FALSE if a data reference is found that cannot be vectorized. */
2228 bool
2229 vect_analyze_data_refs_alignment (loop_vec_info vinfo)
2231 if (dump_enabled_p ())
2232 dump_printf_loc (MSG_NOTE, vect_location,
2233 "=== vect_analyze_data_refs_alignment ===\n");
2235 /* Mark groups of data references with the same alignment using
2236 data dependence information. */
2237 vec<ddr_p> ddrs = vinfo->ddrs;
2238 struct data_dependence_relation *ddr;
2239 unsigned int i;
2241 FOR_EACH_VEC_ELT (ddrs, i, ddr)
2242 vect_find_same_alignment_drs (ddr);
2244 vec<data_reference_p> datarefs = vinfo->datarefs;
2245 struct data_reference *dr;
2247 vect_record_base_alignments (vinfo);
2248 FOR_EACH_VEC_ELT (datarefs, i, dr)
2250 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
2251 if (STMT_VINFO_VECTORIZABLE (stmt_info)
2252 && !vect_compute_data_ref_alignment (dr))
2254 /* Strided accesses perform only component accesses, misalignment
2255 information is irrelevant for them. */
2256 if (STMT_VINFO_STRIDED_P (stmt_info)
2257 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2258 continue;
2260 if (dump_enabled_p ())
2261 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2262 "not vectorized: can't calculate alignment "
2263 "for data ref.\n");
2265 return false;
2269 return true;
2273 /* Analyze alignment of DRs of stmts in NODE. */
2275 static bool
2276 vect_slp_analyze_and_verify_node_alignment (slp_tree node)
2278 /* We vectorize from the first scalar stmt in the node unless
2279 the node is permuted, in which case we start from the first
2280 element in the group. */
2281 gimple *first_stmt = SLP_TREE_SCALAR_STMTS (node)[0];
2282 data_reference_p first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2283 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2284 first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_stmt));
2286 data_reference_p dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2287 if (! vect_compute_data_ref_alignment (dr)
2288 /* For creating the data-ref pointer we need alignment of the
2289 first element anyway. */
2290 || (dr != first_dr
2291 && ! vect_compute_data_ref_alignment (first_dr))
2292 || ! verify_data_ref_alignment (dr))
2294 if (dump_enabled_p ())
2295 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2296 "not vectorized: bad data alignment in basic "
2297 "block.\n");
2298 return false;
2301 return true;
2304 /* Function vect_slp_analyze_instance_alignment
2306 Analyze the alignment of the data-references in the SLP instance.
2307 Return FALSE if a data reference is found that cannot be vectorized. */
2309 bool
2310 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
2312 if (dump_enabled_p ())
2313 dump_printf_loc (MSG_NOTE, vect_location,
2314 "=== vect_slp_analyze_and_verify_instance_alignment ===\n");
2316 slp_tree node;
2317 unsigned i;
2318 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2319 if (! vect_slp_analyze_and_verify_node_alignment (node))
2320 return false;
2322 node = SLP_INSTANCE_TREE (instance);
2323 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]))
2324 && ! vect_slp_analyze_and_verify_node_alignment
2325 (SLP_INSTANCE_TREE (instance)))
2326 return false;
2328 return true;
2332 /* Analyze groups of accesses: check that DR belongs to a group of
2333 accesses of legal size, step, etc. Detect gaps, single element
2334 interleaving, and other special cases. Set grouped access info.
2335 Collect groups of strided stores for further use in SLP analysis.
2336 Worker for vect_analyze_group_access. */
2338 static bool
2339 vect_analyze_group_access_1 (struct data_reference *dr)
2341 tree step = DR_STEP (dr);
2342 tree scalar_type = TREE_TYPE (DR_REF (dr));
2343 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2344 gimple *stmt = DR_STMT (dr);
2345 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2346 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2347 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2348 HOST_WIDE_INT dr_step = -1;
2349 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2350 bool slp_impossible = false;
2352 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2353 size of the interleaving group (including gaps). */
2354 if (tree_fits_shwi_p (step))
2356 dr_step = tree_to_shwi (step);
2357 /* Check that STEP is a multiple of type size. Otherwise there is
2358 a non-element-sized gap at the end of the group which we
2359 cannot represent in GROUP_GAP or GROUP_SIZE.
2360 ??? As we can handle non-constant step fine here we should
2361 simply remove uses of GROUP_GAP between the last and first
2362 element and instead rely on DR_STEP. GROUP_SIZE then would
2363 simply not include that gap. */
2364 if ((dr_step % type_size) != 0)
2366 if (dump_enabled_p ())
2368 dump_printf_loc (MSG_NOTE, vect_location,
2369 "Step ");
2370 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2371 dump_printf (MSG_NOTE,
2372 " is not a multiple of the element size for ");
2373 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2374 dump_printf (MSG_NOTE, "\n");
2376 return false;
2378 groupsize = absu_hwi (dr_step) / type_size;
2380 else
2381 groupsize = 0;
2383 /* A non-consecutive access is possible only if it is part of an interleaving group. */
2384 if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2386 /* Check if this DR is part of an interleaving group, and is a single
2387 element of the group that is accessed in the loop. */
2389 /* Gaps are supported only for loads. STEP must be a multiple of the type
2390 size. The size of the group must be a power of 2. */
2391 if (DR_IS_READ (dr)
2392 && (dr_step % type_size) == 0
2393 && groupsize > 0
2394 && pow2p_hwi (groupsize))
2396 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2397 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2398 GROUP_GAP (stmt_info) = groupsize - 1;
2399 if (dump_enabled_p ())
2401 dump_printf_loc (MSG_NOTE, vect_location,
2402 "Detected single element interleaving ");
2403 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2404 dump_printf (MSG_NOTE, " step ");
2405 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2406 dump_printf (MSG_NOTE, "\n");
2409 return true;
2412 if (dump_enabled_p ())
2414 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2415 "not consecutive access ");
2416 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2419 if (bb_vinfo)
2421 /* Mark the statement as unvectorizable. */
2422 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2423 return true;
2426 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2427 STMT_VINFO_STRIDED_P (stmt_info) = true;
2428 return true;
2431 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2433 /* First stmt in the interleaving chain. Check the chain. */
2434 gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2435 struct data_reference *data_ref = dr;
2436 unsigned int count = 1;
2437 tree prev_init = DR_INIT (data_ref);
2438 gimple *prev = stmt;
2439 HOST_WIDE_INT diff, gaps = 0;
2441 while (next)
2443 /* Skip identical data-refs. In case two or more stmts share a
2444 data-ref (supported only for loads), we vectorize only the first
2445 stmt, and the rest get their vectorized loads from the first
2446 one. */
2447 if (!tree_int_cst_compare (DR_INIT (data_ref),
2448 DR_INIT (STMT_VINFO_DATA_REF (
2449 vinfo_for_stmt (next)))))
2451 if (DR_IS_WRITE (data_ref))
2453 if (dump_enabled_p ())
2454 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2455 "Two store stmts share the same dr.\n");
2456 return false;
2459 if (dump_enabled_p ())
2460 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2461 "Two or more load stmts share the same dr.\n");
2463 /* For loads, use the same data-ref load. */
2464 GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2466 prev = next;
2467 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2468 continue;
2471 prev = next;
2472 data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2474 /* All group members have the same STEP by construction. */
2475 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2477 /* Check that the distance between two accesses is equal to the type
2478 size. Otherwise, we have gaps. */
2479 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2480 - TREE_INT_CST_LOW (prev_init)) / type_size;
2481 if (diff != 1)
2483 /* FORNOW: SLP of accesses with gaps is not supported. */
2484 slp_impossible = true;
2485 if (DR_IS_WRITE (data_ref))
2487 if (dump_enabled_p ())
2488 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2489 "interleaved store with gaps\n");
2490 return false;
2493 gaps += diff - 1;
2496 last_accessed_element += diff;
2498 /* Store the gap from the previous member of the group. If there is no
2499 gap in the access, GROUP_GAP is always 1. */
2500 GROUP_GAP (vinfo_for_stmt (next)) = diff;
2502 prev_init = DR_INIT (data_ref);
2503 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2504 /* Count the number of data-refs in the chain. */
2505 count++;
2508 if (groupsize == 0)
2509 groupsize = count + gaps;
2511 /* This could be UINT_MAX, but as we are generating code in a very
2512 inefficient way we have to cap earlier. See PR78699 for example. */
2513 if (groupsize > 4096)
2515 if (dump_enabled_p ())
2516 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2517 "group is too large\n");
2518 return false;
2521 /* Check that the size of the interleaving is equal to count for stores,
2522 i.e., that there are no gaps. */
2523 if (groupsize != count
2524 && !DR_IS_READ (dr))
2526 if (dump_enabled_p ())
2527 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2528 "interleaved store with gaps\n");
2529 return false;
2532 /* If there is a gap after the last load in the group it is the
2533 difference between the groupsize and the last accessed
2534 element.
2535 When there is no gap, this difference should be 0. */
2536 GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - last_accessed_element;
2538 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2539 if (dump_enabled_p ())
2541 dump_printf_loc (MSG_NOTE, vect_location,
2542 "Detected interleaving ");
2543 if (DR_IS_READ (dr))
2544 dump_printf (MSG_NOTE, "load ");
2545 else
2546 dump_printf (MSG_NOTE, "store ");
2547 dump_printf (MSG_NOTE, "of size %u starting with ",
2548 (unsigned)groupsize);
2549 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
2550 if (GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
2551 dump_printf_loc (MSG_NOTE, vect_location,
2552 "There is a gap of %u elements after the group\n",
2553 GROUP_GAP (vinfo_for_stmt (stmt)));
2556 /* SLP: create an SLP data structure for every interleaving group of
2557 stores for further analysis in vect_analyze_slp. */
2558 if (DR_IS_WRITE (dr) && !slp_impossible)
2560 if (loop_vinfo)
2561 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2562 if (bb_vinfo)
2563 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2567 return true;
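/* Editorial sketch, not part of GCC: how the per-member gaps and the
   trailing gap fall out of the walk above, modelled on plain integers.
   INITS[] holds the byte offsets of the group members in ascending
   order, STEP and TYPE_SIZE are the (assumed constant) DR_STEP and
   element size.  All names are hypothetical; duplicate members are
   assumed to have been skipped already, and MEMBER_GAP[0] is left to
   the caller (in GCC the first member's GROUP_GAP holds the trailing
   gap).  */

static int
example_group_gaps (const long long *inits, unsigned int count,
                    long long step, long long type_size,
                    long long *member_gap, long long *trailing_gap)
{
  if (step % type_size != 0)
    return 0;                   /* non-element-sized gap at the end */
  long long groupsize = step / type_size;
  long long last_accessed_element = 1;
  for (unsigned int k = 1; k < count; k++)
    {
      /* Distance to the previous member, in elements; 1 means no gap.  */
      long long diff = (inits[k] - inits[k - 1]) / type_size;
      member_gap[k] = diff;
      last_accessed_element += diff;
    }
  /* Whatever of the group is left after the last member is the gap
     following the group.  */
  *trailing_gap = groupsize - last_accessed_element;
  return 1;
}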
2570 /* Analyze groups of accesses: check that DR belongs to a group of
2571 accesses of legal size, step, etc. Detect gaps, single element
2572 interleaving, and other special cases. Set grouped access info.
2573 Collect groups of strided stores for further use in SLP analysis. */
2575 static bool
2576 vect_analyze_group_access (struct data_reference *dr)
2578 if (!vect_analyze_group_access_1 (dr))
2580 /* Dissolve the group if present. */
2581 gimple *next;
2582 gimple *stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dr)));
2583 while (stmt)
2585 stmt_vec_info vinfo = vinfo_for_stmt (stmt);
2586 next = GROUP_NEXT_ELEMENT (vinfo);
2587 GROUP_FIRST_ELEMENT (vinfo) = NULL;
2588 GROUP_NEXT_ELEMENT (vinfo) = NULL;
2589 stmt = next;
2591 return false;
2593 return true;
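/* Editorial sketch, not part of GCC: dissolving a group simply walks the
   chain from its first element and clears both links, as above.  The
   struct and its fields are hypothetical stand-ins for
   GROUP_FIRST_ELEMENT and GROUP_NEXT_ELEMENT.  */

struct example_group_member
{
  struct example_group_member *first;
  struct example_group_member *next;
};

static void
example_dissolve_group (struct example_group_member *stmt)
{
  while (stmt)
    {
      struct example_group_member *next = stmt->next;
      stmt->first = 0;
      stmt->next = 0;
      stmt = next;
    }
}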
2596 /* Analyze the access pattern of the data-reference DR.
2597 In case of non-consecutive accesses call vect_analyze_group_access() to
2598 analyze groups of accesses. */
2600 static bool
2601 vect_analyze_data_ref_access (struct data_reference *dr)
2603 tree step = DR_STEP (dr);
2604 tree scalar_type = TREE_TYPE (DR_REF (dr));
2605 gimple *stmt = DR_STMT (dr);
2606 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2607 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2608 struct loop *loop = NULL;
2610 if (loop_vinfo)
2611 loop = LOOP_VINFO_LOOP (loop_vinfo);
2613 if (loop_vinfo && !step)
2615 if (dump_enabled_p ())
2616 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2617 "bad data-ref access in loop\n");
2618 return false;
2621 /* Allow loads with zero step in inner-loop vectorization. */
2622 if (loop_vinfo && integer_zerop (step))
2624 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2625 if (!nested_in_vect_loop_p (loop, stmt))
2626 return DR_IS_READ (dr);
2627 /* Allow references with zero step for outer loops marked
2628 with pragma omp simd only - it guarantees absence of
2629 loop-carried dependencies between inner loop iterations. */
2630 if (!loop->force_vectorize)
2632 if (dump_enabled_p ())
2633 dump_printf_loc (MSG_NOTE, vect_location,
2634 "zero step in inner loop of nest\n");
2635 return false;
2639 if (loop && nested_in_vect_loop_p (loop, stmt))
2641 /* Interleaved accesses are not yet supported within outer-loop
2642 vectorization for references in the inner-loop. */
2643 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2645 /* For the rest of the analysis we use the outer-loop step. */
2646 step = STMT_VINFO_DR_STEP (stmt_info);
2647 if (integer_zerop (step))
2649 if (dump_enabled_p ())
2650 dump_printf_loc (MSG_NOTE, vect_location,
2651 "zero step in outer loop.\n");
2652 return DR_IS_READ (dr);
2656 /* Consecutive? */
2657 if (TREE_CODE (step) == INTEGER_CST)
2659 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2660 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2661 || (dr_step < 0
2662 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2664 /* Mark that it is not interleaving. */
2665 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2666 return true;
2670 if (loop && nested_in_vect_loop_p (loop, stmt))
2672 if (dump_enabled_p ())
2673 dump_printf_loc (MSG_NOTE, vect_location,
2674 "grouped access in outer loop.\n");
2675 return false;
2679 /* Assume this is a DR handled by the non-constant strided load case. */
2680 if (TREE_CODE (step) != INTEGER_CST)
2681 return (STMT_VINFO_STRIDED_P (stmt_info)
2682 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2683 || vect_analyze_group_access (dr)));
2685 /* Not a consecutive access - check if it's part of an interleaving group. */
2686 return vect_analyze_group_access (dr);
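/* Editorial sketch, not part of GCC: the consecutive-access test above on
   plain integers.  STEP is a constant DR_STEP in bytes and TYPE_SIZE the
   scalar element size; both names are hypothetical.  */

static int
example_consecutive_access_p (long long step, long long type_size)
{
  /* Forward consecutive (step equals the element size) or backward
     consecutive (negative step of the same magnitude).  */
  return step == type_size || (step < 0 && -step == type_size);
}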
2689 /* Compare two data-references DRA and DRB so that they can be sorted
2690 into chunks suitable for grouping. */
2692 static int
2693 dr_group_sort_cmp (const void *dra_, const void *drb_)
2695 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2696 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2697 int cmp;
2699 /* Stabilize sort. */
2700 if (dra == drb)
2701 return 0;
2703 /* DRs in different loops never belong to the same group. */
2704 loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father;
2705 loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father;
2706 if (loopa != loopb)
2707 return loopa->num < loopb->num ? -1 : 1;
2709 /* Ordering of DRs according to base. */
2710 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0))
2712 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2713 DR_BASE_ADDRESS (drb));
2714 if (cmp != 0)
2715 return cmp;
2718 /* And according to DR_OFFSET. */
2719 if (!dr_equal_offsets_p (dra, drb))
2721 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2722 if (cmp != 0)
2723 return cmp;
2726 /* Put reads before writes. */
2727 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2728 return DR_IS_READ (dra) ? -1 : 1;
2730 /* Then sort by access size. */
2731 if (!operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2732 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))), 0))
2734 cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2735 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2736 if (cmp != 0)
2737 return cmp;
2740 /* And by step. */
2741 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2743 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2744 if (cmp != 0)
2745 return cmp;
2748 /* Then sort by DR_INIT. In case of identical DRs, sort by stmt UID. */
2749 cmp = tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb));
2750 if (cmp == 0)
2751 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2752 return cmp;
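/* Editorial sketch, not part of GCC: a qsort comparator over a simplified
   record, mirroring the ordering above (base, offset, reads before
   writes, access size, step, init).  The struct, its field names and the
   use of strcmp for the base are hypothetical simplifications; the loop
   number and stmt UID tie-breaks are omitted.  */

#include <string.h>

struct example_dr
{
  const char *base;             /* stands in for DR_BASE_ADDRESS */
  long long offset;             /* stands in for DR_OFFSET */
  int is_read;
  long long size;               /* access size in bytes */
  long long step;
  long long init;
};

static int
example_dr_group_sort_cmp (const void *a_, const void *b_)
{
  const struct example_dr *a = (const struct example_dr *) a_;
  const struct example_dr *b = (const struct example_dr *) b_;
  int cmp = strcmp (a->base, b->base);
  if (cmp != 0)
    return cmp;
  if (a->offset != b->offset)
    return a->offset < b->offset ? -1 : 1;
  if (a->is_read != b->is_read)
    return a->is_read ? -1 : 1;
  if (a->size != b->size)
    return a->size < b->size ? -1 : 1;
  if (a->step != b->step)
    return a->step < b->step ? -1 : 1;
  if (a->init != b->init)
    return a->init < b->init ? -1 : 1;
  return 0;
}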
2755 /* Function vect_analyze_data_ref_accesses.
2757 Analyze the access pattern of all the data references in the loop.
2759 FORNOW: the only access pattern that is considered vectorizable is a
2760 simple step 1 (consecutive) access.
2762 FORNOW: handle only arrays and pointer accesses. */
2764 bool
2765 vect_analyze_data_ref_accesses (vec_info *vinfo)
2767 unsigned int i;
2768 vec<data_reference_p> datarefs = vinfo->datarefs;
2769 struct data_reference *dr;
2771 if (dump_enabled_p ())
2772 dump_printf_loc (MSG_NOTE, vect_location,
2773 "=== vect_analyze_data_ref_accesses ===\n");
2775 if (datarefs.is_empty ())
2776 return true;
2778 /* Sort the array of datarefs to make building the interleaving chains
2779 linear. Don't modify the original vector's order; it is needed for
2780 determining which dependencies are reversed. */
2781 vec<data_reference_p> datarefs_copy = datarefs.copy ();
2782 datarefs_copy.qsort (dr_group_sort_cmp);
2784 /* Build the interleaving chains. */
2785 for (i = 0; i < datarefs_copy.length () - 1;)
2787 data_reference_p dra = datarefs_copy[i];
2788 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2789 stmt_vec_info lastinfo = NULL;
2790 if (! STMT_VINFO_VECTORIZABLE (stmtinfo_a))
2792 ++i;
2793 continue;
2795 for (i = i + 1; i < datarefs_copy.length (); ++i)
2797 data_reference_p drb = datarefs_copy[i];
2798 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2799 if (! STMT_VINFO_VECTORIZABLE (stmtinfo_b))
2800 break;
2802 /* ??? Imperfect sorting (non-compatible types, non-modulo
2803 accesses, same accesses) can lead to a group being artificially
2804 split here, as we don't just skip over those. If it really
2805 matters we can push those to a worklist and re-iterate
2806 over them. Then we can just skip ahead to the next DR here. */
2808 /* DRs in a different loop should not be put into the same
2809 interleaving group. */
2810 if (gimple_bb (DR_STMT (dra))->loop_father
2811 != gimple_bb (DR_STMT (drb))->loop_father)
2812 break;
2814 /* Check that the data-refs have the same first location (except init)
2815 and that they are both either stores or loads (not one load and one
2816 store, and not masked loads or stores). */
2817 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2818 || !operand_equal_p (DR_BASE_ADDRESS (dra),
2819 DR_BASE_ADDRESS (drb), 0)
2820 || !dr_equal_offsets_p (dra, drb)
2821 || !gimple_assign_single_p (DR_STMT (dra))
2822 || !gimple_assign_single_p (DR_STMT (drb)))
2823 break;
2825 /* Check that the data-refs have the same constant size. */
2826 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2827 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2828 if (!tree_fits_uhwi_p (sza)
2829 || !tree_fits_uhwi_p (szb)
2830 || !tree_int_cst_equal (sza, szb))
2831 break;
2833 /* Check that the data-refs have the same step. */
2834 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2835 break;
2837 /* Do not place the same access in the interleaving chain twice. */
2838 if (tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb)) == 0)
2839 break;
2841 /* Check the types are compatible.
2842 ??? We don't distinguish this during sorting. */
2843 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2844 TREE_TYPE (DR_REF (drb))))
2845 break;
2847 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
2848 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2849 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2850 gcc_assert (init_a <= init_b);
2852 /* If init_b == init_a + the size of the type * k, we have an
2853 interleaving, and DRA is accessed before DRB. */
2854 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
2855 if (type_size_a == 0
2856 || (init_b - init_a) % type_size_a != 0)
2857 break;
2859 /* If we have a store, the accesses must be adjacent. This splits
2860 groups into chunks we support (we don't support vectorization
2861 of stores with gaps). */
2862 if (!DR_IS_READ (dra)
2863 && (init_b - (HOST_WIDE_INT) TREE_INT_CST_LOW
2864 (DR_INIT (datarefs_copy[i-1]))
2865 != type_size_a))
2866 break;
2868 /* Requiring that the step (when it is a nonzero constant) be greater
2869 than the difference between the data-refs' inits splits groups into
2870 suitable sizes. */
2871 if (tree_fits_shwi_p (DR_STEP (dra)))
2873 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
2874 if (step != 0 && step <= (init_b - init_a))
2875 break;
2878 if (dump_enabled_p ())
2880 dump_printf_loc (MSG_NOTE, vect_location,
2881 "Detected interleaving ");
2882 if (DR_IS_READ (dra))
2883 dump_printf (MSG_NOTE, "load ");
2884 else
2885 dump_printf (MSG_NOTE, "store ");
2886 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2887 dump_printf (MSG_NOTE, " and ");
2888 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2889 dump_printf (MSG_NOTE, "\n");
2892 /* Link the found element into the group list. */
2893 if (!GROUP_FIRST_ELEMENT (stmtinfo_a))
2895 GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
2896 lastinfo = stmtinfo_a;
2898 GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
2899 GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
2900 lastinfo = stmtinfo_b;
2904 FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
2905 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
2906 && !vect_analyze_data_ref_access (dr))
2908 if (dump_enabled_p ())
2909 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2910 "not vectorized: complicated access pattern.\n");
2912 if (is_a <bb_vec_info> (vinfo))
2914 /* Mark the statement as not vectorizable. */
2915 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2916 continue;
2918 else
2920 datarefs_copy.release ();
2921 return false;
2925 datarefs_copy.release ();
2926 return true;
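/* Editorial sketch, not part of GCC: the per-pair checks that extend an
   interleaving chain in the loop above, reduced to the init/step
   arithmetic.  INIT_A <= INIT_B is guaranteed by the sort; equal base,
   offset, step and compatible types are assumed to have been checked
   already, and the store-adjacency rule is omitted.  All names are
   hypothetical.  */

static int
example_can_extend_group_p (long long init_a, long long init_b,
                            long long type_size, long long step)
{
  long long delta = init_b - init_a;
  if (delta == 0)
    return 0;                   /* same access twice */
  if (type_size == 0 || delta % type_size != 0)
    return 0;                   /* not a whole number of elements apart */
  if (step != 0 && step <= delta)
    return 0;                   /* B already belongs to a later iteration */
  return 1;
}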
2929 /* Function vect_vfa_segment_size.
2931 Create an expression that computes the size of the segment
2932 that will be accessed for a data reference. The function takes into
2933 account that realignment loads may access one more vector.
2935 Input:
2936 DR: The data reference.
2937 LENGTH_FACTOR: segment length to consider.
2939 Return an expression whose value is the size of segment which will be
2940 accessed by DR. */
2942 static tree
2943 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
2945 tree segment_length;
2947 if (integer_zerop (DR_STEP (dr)))
2948 segment_length = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
2949 else
2950 segment_length = size_binop (MULT_EXPR,
2951 fold_convert (sizetype, DR_STEP (dr)),
2952 fold_convert (sizetype, length_factor));
2954 if (vect_supportable_dr_alignment (dr, false)
2955 == dr_explicit_realign_optimized)
2957 tree vector_size = TYPE_SIZE_UNIT
2958 (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))));
2960 segment_length = size_binop (PLUS_EXPR, segment_length, vector_size);
2962 return segment_length;
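/* Editorial sketch, not part of GCC: the segment-size formula above on
   plain integers.  STEP, TYPE_SIZE, LENGTH_FACTOR and VECTOR_SIZE play
   the roles of DR_STEP, the scalar type size, the chosen length factor
   and the vector size; REALIGN_OPTIMIZED says whether the access uses
   dr_explicit_realign_optimized.  All names are hypothetical.  */

static long long
example_vfa_segment_size (long long step, long long type_size,
                          long long length_factor, long long vector_size,
                          int realign_optimized)
{
  long long segment = step == 0 ? type_size : step * length_factor;
  if (realign_optimized)
    segment += vector_size;     /* realignment may touch one extra vector */
  return segment;
}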
2965 /* Function vect_no_alias_p.
2967 Given data references A and B with equal base and offset, whose alias
2968 relation can be decided at compilation time, return TRUE if they do
2969 not alias each other; return FALSE otherwise. SEGMENT_LENGTH_A
2970 and SEGMENT_LENGTH_B are the memory lengths accessed by A and B
2971 respectively. */
2973 static bool
2974 vect_no_alias_p (struct data_reference *a, struct data_reference *b,
2975 tree segment_length_a, tree segment_length_b)
2977 gcc_assert (TREE_CODE (DR_INIT (a)) == INTEGER_CST
2978 && TREE_CODE (DR_INIT (b)) == INTEGER_CST);
2979 if (tree_int_cst_equal (DR_INIT (a), DR_INIT (b)))
2980 return false;
2982 tree seg_a_min = DR_INIT (a);
2983 tree seg_a_max = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_a_min),
2984 seg_a_min, segment_length_a);
2985 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
2986 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
2987 [a, a+12) */
2988 if (tree_int_cst_compare (DR_STEP (a), size_zero_node) < 0)
2990 tree unit_size = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (a)));
2991 seg_a_min = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_a_max),
2992 seg_a_max, unit_size);
2993 seg_a_max = fold_build2 (PLUS_EXPR, TREE_TYPE (DR_INIT (a)),
2994 DR_INIT (a), unit_size);
2996 tree seg_b_min = DR_INIT (b);
2997 tree seg_b_max = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_b_min),
2998 seg_b_min, segment_length_b);
2999 if (tree_int_cst_compare (DR_STEP (b), size_zero_node) < 0)
3001 tree unit_size = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (b)));
3002 seg_b_min = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_b_max),
3003 seg_b_max, unit_size);
3004 seg_b_max = fold_build2 (PLUS_EXPR, TREE_TYPE (DR_INIT (b)),
3005 DR_INIT (b), unit_size);
3008 if (tree_int_cst_le (seg_a_max, seg_b_min)
3009 || tree_int_cst_le (seg_b_max, seg_a_min))
3010 return true;
3012 return false;
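/* Editorial sketch, not part of GCC: the interval test above on plain
   integers.  INIT_*, LEN_* (negative for a negative step), STEP_* and
   UNIT_* mirror DR_INIT, the segment length, DR_STEP and TYPE_SIZE_UNIT;
   all names are hypothetical.  */

static int
example_no_alias_p (long long init_a, long long len_a, long long step_a,
                    long long unit_a,
                    long long init_b, long long len_b, long long step_b,
                    long long unit_b)
{
  if (init_a == init_b)
    return 0;                   /* identical start addresses alias */

  long long a_min = init_a, a_max = init_a + len_a;
  if (step_a < 0)
    {
      /* Negative step: LEN_A is negative, so the bytes touched are
         [init + len + unit, init + unit) rather than [init, init + len).  */
      a_min = init_a + len_a + unit_a;
      a_max = init_a + unit_a;
    }
  long long b_min = init_b, b_max = init_b + len_b;
  if (step_b < 0)
    {
      b_min = init_b + len_b + unit_b;
      b_max = init_b + unit_b;
    }
  /* Disjoint half-open intervals cannot alias.  */
  return a_max <= b_min || b_max <= a_min;
}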
3015 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3016 in DDR is >= VF. */
3018 static bool
3019 dependence_distance_ge_vf (data_dependence_relation *ddr,
3020 unsigned int loop_depth, unsigned HOST_WIDE_INT vf)
3022 if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3023 || DDR_NUM_DIST_VECTS (ddr) == 0)
3024 return false;
3026 /* If the dependence is exact, we should have limited the VF instead. */
3027 gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3029 unsigned int i;
3030 lambda_vector dist_v;
3031 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3033 HOST_WIDE_INT dist = dist_v[loop_depth];
3034 if (dist != 0
3035 && !(dist > 0 && DDR_REVERSED_P (ddr))
3036 && (unsigned HOST_WIDE_INT) abs_hwi (dist) < vf)
3037 return false;
3040 if (dump_enabled_p ())
3042 dump_printf_loc (MSG_NOTE, vect_location,
3043 "dependence distance between ");
3044 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
3045 dump_printf (MSG_NOTE, " and ");
3046 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
3047 dump_printf (MSG_NOTE, " is >= VF\n");
3050 return true;
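/* Editorial sketch, not part of GCC: the distance test above over a plain
   array of distances for one loop depth.  DIST[] and REVERSED stand in
   for the distance vectors and DDR_REVERSED_P; all names are
   hypothetical.  */

static int
example_distance_ge_vf_p (const long long *dist, unsigned int n,
                          unsigned long long vf, int reversed)
{
  for (unsigned int k = 0; k < n; k++)
    {
      long long d = dist[k];
      if (d != 0
          && !(d > 0 && reversed)
          && (unsigned long long) (d < 0 ? -d : d) < vf)
        return 0;               /* some dependence is closer than VF */
    }
  return 1;
}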
3053 /* Function vect_prune_runtime_alias_test_list.
3055 Prune a list of ddrs to be tested at run-time by versioning for alias.
3056 Merge several alias checks into one if possible.
3057 Return FALSE if the resulting list of ddrs is longer than allowed by
3058 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
3060 bool
3061 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3063 typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3064 hash_set <tree_pair_hash> compared_objects;
3066 vec<ddr_p> may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3067 vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3068 = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3069 vec<vec_object_pair> &check_unequal_addrs
3070 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3071 int vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3072 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3074 ddr_p ddr;
3075 unsigned int i;
3076 tree length_factor;
3078 if (dump_enabled_p ())
3079 dump_printf_loc (MSG_NOTE, vect_location,
3080 "=== vect_prune_runtime_alias_test_list ===\n");
3082 if (may_alias_ddrs.is_empty ())
3083 return true;
3085 comp_alias_ddrs.create (may_alias_ddrs.length ());
3087 unsigned int loop_depth
3088 = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3089 LOOP_VINFO_LOOP_NEST (loop_vinfo));
3091 /* First, we collect all data ref pairs for aliasing checks. */
3092 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3094 int comp_res;
3095 struct data_reference *dr_a, *dr_b;
3096 gimple *dr_group_first_a, *dr_group_first_b;
3097 tree segment_length_a, segment_length_b;
3098 gimple *stmt_a, *stmt_b;
3100 /* Ignore the alias if the VF we chose ended up being no greater
3101 than the dependence distance. */
3102 if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3103 continue;
3105 if (DDR_OBJECT_A (ddr))
3107 vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3108 if (!compared_objects.add (new_pair))
3110 if (dump_enabled_p ())
3112 dump_printf_loc (MSG_NOTE, vect_location, "checking that ");
3113 dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.first);
3114 dump_printf (MSG_NOTE, " and ");
3115 dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.second);
3116 dump_printf (MSG_NOTE, " have different addresses\n");
3118 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3120 continue;
3123 dr_a = DDR_A (ddr);
3124 stmt_a = DR_STMT (DDR_A (ddr));
3125 dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
3126 if (dr_group_first_a)
3128 stmt_a = dr_group_first_a;
3129 dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
3132 dr_b = DDR_B (ddr);
3133 stmt_b = DR_STMT (DDR_B (ddr));
3134 dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
3135 if (dr_group_first_b)
3137 stmt_b = dr_group_first_b;
3138 dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
3141 if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
3142 length_factor = scalar_loop_iters;
3143 else
3144 length_factor = size_int (vect_factor);
3145 segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
3146 segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
3148 comp_res = data_ref_compare_tree (DR_BASE_ADDRESS (dr_a),
3149 DR_BASE_ADDRESS (dr_b));
3150 if (comp_res == 0)
3151 comp_res = data_ref_compare_tree (DR_OFFSET (dr_a),
3152 DR_OFFSET (dr_b));
3154 /* Alias is known at compilation time. */
3155 if (comp_res == 0
3156 && TREE_CODE (DR_STEP (dr_a)) == INTEGER_CST
3157 && TREE_CODE (DR_STEP (dr_b)) == INTEGER_CST
3158 && TREE_CODE (segment_length_a) == INTEGER_CST
3159 && TREE_CODE (segment_length_b) == INTEGER_CST)
3161 if (vect_no_alias_p (dr_a, dr_b, segment_length_a, segment_length_b))
3162 continue;
3164 if (dump_enabled_p ())
3165 dump_printf_loc (MSG_NOTE, vect_location,
3166 "not vectorized: compilation time alias.\n");
3168 return false;
3171 dr_with_seg_len_pair_t dr_with_seg_len_pair
3172 (dr_with_seg_len (dr_a, segment_length_a),
3173 dr_with_seg_len (dr_b, segment_length_b));
3175 /* Canonicalize pairs by sorting the two DR members. */
3176 if (comp_res > 0)
3177 std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
3179 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3182 prune_runtime_alias_test_list (&comp_alias_ddrs,
3183 (unsigned HOST_WIDE_INT) vect_factor);
3185 unsigned int count = (comp_alias_ddrs.length ()
3186 + check_unequal_addrs.length ());
3187 dump_printf_loc (MSG_NOTE, vect_location,
3188 "improved number of alias checks from %d to %d\n",
3189 may_alias_ddrs.length (), count);
3190 if ((int) count > PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
3192 if (dump_enabled_p ())
3193 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3194 "number of versioning for alias "
3195 "run-time tests exceeds %d "
3196 "(--param vect-max-version-for-alias-checks)\n",
3197 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
3198 return false;
3201 return true;
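/* Editorial sketch, not part of GCC: the length-factor choice made above,
   i.e. how many iterations' worth of accesses each runtime alias check
   has to cover.  All names are hypothetical, and the rationale in the
   comment is my reading of the code rather than a documented rule.  */

static long long
example_length_factor (long long step_a, long long step_b,
                       long long scalar_niters, long long vf)
{
  /* With different steps the two references presumably drift apart over
     the whole loop, so the check covers the scalar iteration count;
     otherwise VF iterations are enough.  */
  return step_a != step_b ? scalar_niters : vf;
}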
3204 /* Return true if a non-affine read or write in STMT is suitable for a
3205 gather load or scatter store. Describe the operation in *INFO if so. */
3207 bool
3208 vect_check_gather_scatter (gimple *stmt, loop_vec_info loop_vinfo,
3209 gather_scatter_info *info)
3211 HOST_WIDE_INT scale = 1, pbitpos, pbitsize;
3212 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3213 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3214 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3215 tree offtype = NULL_TREE;
3216 tree decl, base, off;
3217 machine_mode pmode;
3218 int punsignedp, reversep, pvolatilep = 0;
3220 base = DR_REF (dr);
3221 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF;
3222 see if we can use the def stmt of the address. */
3223 if (is_gimple_call (stmt)
3224 && gimple_call_internal_p (stmt)
3225 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
3226 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
3227 && TREE_CODE (base) == MEM_REF
3228 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3229 && integer_zerop (TREE_OPERAND (base, 1))
3230 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3232 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3233 if (is_gimple_assign (def_stmt)
3234 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3235 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3238 /* The gather and scatter builtins need an address of the form
3239 loop_invariant + vector * {1, 2, 4, 8}
3240 or
3241 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3242 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3243 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3244 multiplications and additions in it. To get a vector, we need
3245 a single SSA_NAME that will be defined in the loop and will
3246 contain everything that is not loop invariant and that can be
3247 vectorized. The following code attempts to find such a preexisting
3248 SSA_NAME OFF and put the loop invariants into a tree BASE
3249 that can be gimplified before the loop. */
3250 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3251 &punsignedp, &reversep, &pvolatilep);
3252 gcc_assert (base && (pbitpos % BITS_PER_UNIT) == 0 && !reversep);
3254 if (TREE_CODE (base) == MEM_REF)
3256 if (!integer_zerop (TREE_OPERAND (base, 1)))
3258 if (off == NULL_TREE)
3260 offset_int moff = mem_ref_offset (base);
3261 off = wide_int_to_tree (sizetype, moff);
3263 else
3264 off = size_binop (PLUS_EXPR, off,
3265 fold_convert (sizetype, TREE_OPERAND (base, 1)));
3267 base = TREE_OPERAND (base, 0);
3269 else
3270 base = build_fold_addr_expr (base);
3272 if (off == NULL_TREE)
3273 off = size_zero_node;
3275 /* If base is not loop invariant, then either off is 0 and we start with
3276 just the constant offset in the loop-invariant BASE, continuing with
3277 base as OFF, or we give up.
3278 We could handle that case by gimplifying the addition of base + off
3279 into some SSA_NAME and using that as off, but for now punt. */
3280 if (!expr_invariant_in_loop_p (loop, base))
3282 if (!integer_zerop (off))
3283 return false;
3284 off = base;
3285 base = size_int (pbitpos / BITS_PER_UNIT);
3287 /* Otherwise put base + constant offset into the loop invariant BASE
3288 and continue with OFF. */
3289 else
3291 base = fold_convert (sizetype, base);
3292 base = size_binop (PLUS_EXPR, base, size_int (pbitpos / BITS_PER_UNIT));
3295 /* OFF at this point may be either a SSA_NAME or some tree expression
3296 from get_inner_reference. Try to peel off loop invariants from it
3297 into BASE as long as possible. */
3298 STRIP_NOPS (off);
3299 while (offtype == NULL_TREE)
3301 enum tree_code code;
3302 tree op0, op1, add = NULL_TREE;
3304 if (TREE_CODE (off) == SSA_NAME)
3306 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3308 if (expr_invariant_in_loop_p (loop, off))
3309 return false;
3311 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3312 break;
3314 op0 = gimple_assign_rhs1 (def_stmt);
3315 code = gimple_assign_rhs_code (def_stmt);
3316 op1 = gimple_assign_rhs2 (def_stmt);
3318 else
3320 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3321 return false;
3322 code = TREE_CODE (off);
3323 extract_ops_from_tree (off, &code, &op0, &op1);
3325 switch (code)
3327 case POINTER_PLUS_EXPR:
3328 case PLUS_EXPR:
3329 if (expr_invariant_in_loop_p (loop, op0))
3331 add = op0;
3332 off = op1;
3333 do_add:
3334 add = fold_convert (sizetype, add);
3335 if (scale != 1)
3336 add = size_binop (MULT_EXPR, add, size_int (scale));
3337 base = size_binop (PLUS_EXPR, base, add);
3338 continue;
3340 if (expr_invariant_in_loop_p (loop, op1))
3342 add = op1;
3343 off = op0;
3344 goto do_add;
3346 break;
3347 case MINUS_EXPR:
3348 if (expr_invariant_in_loop_p (loop, op1))
3350 add = fold_convert (sizetype, op1);
3351 add = size_binop (MINUS_EXPR, size_zero_node, add);
3352 off = op0;
3353 goto do_add;
3355 break;
3356 case MULT_EXPR:
3357 if (scale == 1 && tree_fits_shwi_p (op1))
3359 scale = tree_to_shwi (op1);
3360 off = op0;
3361 continue;
3363 break;
3364 case SSA_NAME:
3365 off = op0;
3366 continue;
3367 CASE_CONVERT:
3368 if (!POINTER_TYPE_P (TREE_TYPE (op0))
3369 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3370 break;
3371 if (TYPE_PRECISION (TREE_TYPE (op0))
3372 == TYPE_PRECISION (TREE_TYPE (off)))
3374 off = op0;
3375 continue;
3377 if (TYPE_PRECISION (TREE_TYPE (op0))
3378 < TYPE_PRECISION (TREE_TYPE (off)))
3380 off = op0;
3381 offtype = TREE_TYPE (off);
3382 STRIP_NOPS (off);
3383 continue;
3385 break;
3386 default:
3387 break;
3389 break;
3392 /* If at the end OFF still isn't a SSA_NAME or isn't
3393 defined in the loop, punt. */
3394 if (TREE_CODE (off) != SSA_NAME
3395 || expr_invariant_in_loop_p (loop, off))
3396 return false;
3398 if (offtype == NULL_TREE)
3399 offtype = TREE_TYPE (off);
3401 if (DR_IS_READ (dr))
3402 decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info),
3403 offtype, scale);
3404 else
3405 decl = targetm.vectorize.builtin_scatter (STMT_VINFO_VECTYPE (stmt_info),
3406 offtype, scale);
3408 if (decl == NULL_TREE)
3409 return false;
3411 info->decl = decl;
3412 info->base = base;
3413 info->offset = off;
3414 info->offset_dt = vect_unknown_def_type;
3415 info->offset_vectype = NULL_TREE;
3416 info->scale = scale;
3417 return true;
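/* Editorial sketch, not part of GCC: the peeling loop above maintains the
   invariant that BASE + OFF * SCALE keeps denoting the original address
   while invariants are folded out of OFF.  The numbers below are
   arbitrary illustrative values and all names are hypothetical.  */

#include <assert.h>

static void
example_gather_decomposition (void)
{
  long long base = 100, scale = 1;
  long long inv = 24, idx = 7;          /* off = inv + idx * 4 */
  long long addr = base + (inv + idx * 4) * scale;
  long long off;

  /* PLUS_EXPR with an invariant operand: fold INV (times SCALE) into
     BASE and continue with idx * 4 as OFF.  */
  base += inv * scale;
  off = idx * 4;
  assert (addr == base + off * scale);

  /* MULT_EXPR by a constant while SCALE is 1: fold the constant into
     SCALE and continue with idx as OFF.  */
  scale *= 4;
  off = idx;
  assert (addr == base + off * scale);
}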
3420 /* Function vect_analyze_data_refs.
3422 Find all the data references in the loop or basic block.
3424 The general structure of the analysis of data refs in the vectorizer is as
3425 follows:
3426 1- vect_analyze_data_refs(loop/bb): call
3427 compute_data_dependences_for_loop/bb to find and analyze all data-refs
3428 in the loop/bb and their dependences.
3429 2- vect_analyze_dependences(): apply dependence testing using ddrs.
3430 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
3431 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
3435 bool
3436 vect_analyze_data_refs (vec_info *vinfo, int *min_vf)
3438 struct loop *loop = NULL;
3439 unsigned int i;
3440 struct data_reference *dr;
3441 tree scalar_type;
3443 if (dump_enabled_p ())
3444 dump_printf_loc (MSG_NOTE, vect_location,
3445 "=== vect_analyze_data_refs ===\n");
3447 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3448 loop = LOOP_VINFO_LOOP (loop_vinfo);
3450 /* Go through the data-refs, check that the analysis succeeded. Update
3451 pointer from stmt_vec_info struct to DR and vectype. */
3453 vec<data_reference_p> datarefs = vinfo->datarefs;
3454 FOR_EACH_VEC_ELT (datarefs, i, dr)
3456 gimple *stmt;
3457 stmt_vec_info stmt_info;
3458 tree base, offset, init;
3459 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
3460 bool simd_lane_access = false;
3461 int vf;
3463 again:
3464 if (!dr || !DR_REF (dr))
3466 if (dump_enabled_p ())
3467 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3468 "not vectorized: unhandled data-ref\n");
3469 return false;
3472 stmt = DR_STMT (dr);
3473 stmt_info = vinfo_for_stmt (stmt);
3475 /* Discard clobbers from the dataref vector. We will remove
3476 clobber stmts during vectorization. */
3477 if (gimple_clobber_p (stmt))
3479 free_data_ref (dr);
3480 if (i == datarefs.length () - 1)
3482 datarefs.pop ();
3483 break;
3485 datarefs.ordered_remove (i);
3486 dr = datarefs[i];
3487 goto again;
3490 /* Check that analysis of the data-ref succeeded. */
3491 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
3492 || !DR_STEP (dr))
3494 bool maybe_gather
3495 = DR_IS_READ (dr)
3496 && !TREE_THIS_VOLATILE (DR_REF (dr))
3497 && targetm.vectorize.builtin_gather != NULL;
3498 bool maybe_scatter
3499 = DR_IS_WRITE (dr)
3500 && !TREE_THIS_VOLATILE (DR_REF (dr))
3501 && targetm.vectorize.builtin_scatter != NULL;
3502 bool maybe_simd_lane_access
3503 = is_a <loop_vec_info> (vinfo) && loop->simduid;
3505 /* If the target supports vector gather loads or scatter stores, or if
3506 this might be a SIMD lane access, see if they can be used. */
3507 if (is_a <loop_vec_info> (vinfo)
3508 && (maybe_gather || maybe_scatter || maybe_simd_lane_access)
3509 && !nested_in_vect_loop_p (loop, stmt))
3511 struct data_reference *newdr
3512 = create_data_ref (NULL, loop_containing_stmt (stmt),
3513 DR_REF (dr), stmt, !maybe_scatter,
3514 DR_IS_CONDITIONAL_IN_STMT (dr));
3515 gcc_assert (newdr != NULL && DR_REF (newdr));
3516 if (DR_BASE_ADDRESS (newdr)
3517 && DR_OFFSET (newdr)
3518 && DR_INIT (newdr)
3519 && DR_STEP (newdr)
3520 && integer_zerop (DR_STEP (newdr)))
3522 if (maybe_simd_lane_access)
3524 tree off = DR_OFFSET (newdr);
3525 STRIP_NOPS (off);
3526 if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
3527 && TREE_CODE (off) == MULT_EXPR
3528 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
3530 tree step = TREE_OPERAND (off, 1);
3531 off = TREE_OPERAND (off, 0);
3532 STRIP_NOPS (off);
3533 if (CONVERT_EXPR_P (off)
3534 && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
3535 0)))
3536 < TYPE_PRECISION (TREE_TYPE (off)))
3537 off = TREE_OPERAND (off, 0);
3538 if (TREE_CODE (off) == SSA_NAME)
3540 gimple *def = SSA_NAME_DEF_STMT (off);
3541 tree reft = TREE_TYPE (DR_REF (newdr));
3542 if (is_gimple_call (def)
3543 && gimple_call_internal_p (def)
3544 && (gimple_call_internal_fn (def)
3545 == IFN_GOMP_SIMD_LANE))
3547 tree arg = gimple_call_arg (def, 0);
3548 gcc_assert (TREE_CODE (arg) == SSA_NAME);
3549 arg = SSA_NAME_VAR (arg);
3550 if (arg == loop->simduid
3551 /* For now. */
3552 && tree_int_cst_equal
3553 (TYPE_SIZE_UNIT (reft),
3554 step))
3556 DR_OFFSET (newdr) = ssize_int (0);
3557 DR_STEP (newdr) = step;
3558 DR_OFFSET_ALIGNMENT (newdr)
3559 = BIGGEST_ALIGNMENT;
3560 DR_STEP_ALIGNMENT (newdr)
3561 = highest_pow2_factor (step);
3562 dr = newdr;
3563 simd_lane_access = true;
3569 if (!simd_lane_access && (maybe_gather || maybe_scatter))
3571 dr = newdr;
3572 if (maybe_gather)
3573 gatherscatter = GATHER;
3574 else
3575 gatherscatter = SCATTER;
3578 if (gatherscatter == SG_NONE && !simd_lane_access)
3579 free_data_ref (newdr);
3582 if (gatherscatter == SG_NONE && !simd_lane_access)
3584 if (dump_enabled_p ())
3586 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3587 "not vectorized: data ref analysis "
3588 "failed ");
3589 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3592 if (is_a <bb_vec_info> (vinfo))
3593 break;
3595 return false;
3599 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
3601 if (dump_enabled_p ())
3602 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3603 "not vectorized: base addr of dr is a "
3604 "constant\n");
3606 if (is_a <bb_vec_info> (vinfo))
3607 break;
3609 if (gatherscatter != SG_NONE || simd_lane_access)
3610 free_data_ref (dr);
3611 return false;
3614 if (TREE_THIS_VOLATILE (DR_REF (dr)))
3616 if (dump_enabled_p ())
3618 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3619 "not vectorized: volatile type ");
3620 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3623 if (is_a <bb_vec_info> (vinfo))
3624 break;
3626 return false;
3629 if (stmt_can_throw_internal (stmt))
3631 if (dump_enabled_p ())
3633 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3634 "not vectorized: statement can throw an "
3635 "exception ");
3636 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3639 if (is_a <bb_vec_info> (vinfo))
3640 break;
3642 if (gatherscatter != SG_NONE || simd_lane_access)
3643 free_data_ref (dr);
3644 return false;
3647 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
3648 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
3650 if (dump_enabled_p ())
3652 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3653 "not vectorized: statement is bitfield "
3654 "access ");
3655 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3658 if (is_a <bb_vec_info> (vinfo))
3659 break;
3661 if (gatherscatter != SG_NONE || simd_lane_access)
3662 free_data_ref (dr);
3663 return false;
3666 base = unshare_expr (DR_BASE_ADDRESS (dr));
3667 offset = unshare_expr (DR_OFFSET (dr));
3668 init = unshare_expr (DR_INIT (dr));
3670 if (is_gimple_call (stmt)
3671 && (!gimple_call_internal_p (stmt)
3672 || (gimple_call_internal_fn (stmt) != IFN_MASK_LOAD
3673 && gimple_call_internal_fn (stmt) != IFN_MASK_STORE)))
3675 if (dump_enabled_p ())
3677 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3678 "not vectorized: dr in a call ");
3679 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3682 if (is_a <bb_vec_info> (vinfo))
3683 break;
3685 if (gatherscatter != SG_NONE || simd_lane_access)
3686 free_data_ref (dr);
3687 return false;
3690 /* Update DR field in stmt_vec_info struct. */
3692 /* If the dataref is in an inner-loop of the loop that is considered
3693 for vectorization, we also want to analyze the access relative to
3694 the outer-loop (DR contains information only relative to the
3695 inner-most enclosing loop). We do that by building a reference to the
3696 first location accessed by the inner-loop, and analyze it relative to
3697 the outer-loop. */
3698 if (loop && nested_in_vect_loop_p (loop, stmt))
3700 /* Build a reference to the first location accessed by the
3701 inner loop: *(BASE + INIT + OFFSET). By construction,
3702 this address must be invariant in the inner loop, so we
3703 can consider it as being used in the outer loop. */
3704 tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
3705 init, offset);
3706 tree init_addr = fold_build_pointer_plus (base, init_offset);
3707 tree init_ref = build_fold_indirect_ref (init_addr);
3709 if (dump_enabled_p ())
3711 dump_printf_loc (MSG_NOTE, vect_location,
3712 "analyze in outer loop: ");
3713 dump_generic_expr (MSG_NOTE, TDF_SLIM, init_ref);
3714 dump_printf (MSG_NOTE, "\n");
3717 if (!dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
3718 init_ref, loop))
3719 /* dr_analyze_innermost already explained the failure. */
3720 return false;
3722 if (dump_enabled_p ())
3724 dump_printf_loc (MSG_NOTE, vect_location,
3725 "\touter base_address: ");
3726 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3727 STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3728 dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
3729 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3730 STMT_VINFO_DR_OFFSET (stmt_info));
3731 dump_printf (MSG_NOTE,
3732 "\n\touter constant offset from base address: ");
3733 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3734 STMT_VINFO_DR_INIT (stmt_info));
3735 dump_printf (MSG_NOTE, "\n\touter step: ");
3736 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3737 STMT_VINFO_DR_STEP (stmt_info));
3738 dump_printf (MSG_NOTE, "\n\touter base alignment: %d\n",
3739 STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info));
3740 dump_printf (MSG_NOTE, "\n\touter base misalignment: %d\n",
3741 STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info));
3742 dump_printf (MSG_NOTE, "\n\touter offset alignment: %d\n",
3743 STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info));
3744 dump_printf (MSG_NOTE, "\n\touter step alignment: %d\n",
3745 STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
3749 if (STMT_VINFO_DATA_REF (stmt_info))
3751 if (dump_enabled_p ())
3753 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3754 "not vectorized: more than one data ref "
3755 "in stmt: ");
3756 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3759 if (is_a <bb_vec_info> (vinfo))
3760 break;
3762 if (gatherscatter != SG_NONE || simd_lane_access)
3763 free_data_ref (dr);
3764 return false;
3767 STMT_VINFO_DATA_REF (stmt_info) = dr;
3768 if (simd_lane_access)
3770 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
3771 free_data_ref (datarefs[i]);
3772 datarefs[i] = dr;
3775 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == ADDR_EXPR
3776 && VAR_P (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0))
3777 && DECL_NONALIASED (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0)))
3779 if (dump_enabled_p ())
3781 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3782 "not vectorized: base object not addressable "
3783 "for stmt: ");
3784 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3786 if (is_a <bb_vec_info> (vinfo))
3788 /* In BB vectorization the ref can still participate
3789 in dependence analysis, we just can't vectorize it. */
3790 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
3791 continue;
3793 return false;
3796 /* Set vectype for STMT. */
3797 scalar_type = TREE_TYPE (DR_REF (dr));
3798 STMT_VINFO_VECTYPE (stmt_info)
3799 = get_vectype_for_scalar_type (scalar_type);
3800 if (!STMT_VINFO_VECTYPE (stmt_info))
3802 if (dump_enabled_p ())
3804 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3805 "not vectorized: no vectype for stmt: ");
3806 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3807 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
3808 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
3809 scalar_type);
3810 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3813 if (is_a <bb_vec_info> (vinfo))
3815 /* No vector type is fine, the ref can still participate
3816 in dependence analysis, we just can't vectorize it. */
3817 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
3818 continue;
3821 if (gatherscatter != SG_NONE || simd_lane_access)
3823 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3824 if (gatherscatter != SG_NONE)
3825 free_data_ref (dr);
3827 return false;
3829 else
3831 if (dump_enabled_p ())
3833 dump_printf_loc (MSG_NOTE, vect_location,
3834 "got vectype for stmt: ");
3835 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3836 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3837 STMT_VINFO_VECTYPE (stmt_info));
3838 dump_printf (MSG_NOTE, "\n");
3842 /* Adjust the minimal vectorization factor according to the
3843 vector type. */
3844 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
3845 if (vf > *min_vf)
3846 *min_vf = vf;
3848 if (gatherscatter != SG_NONE)
3850 gather_scatter_info gs_info;
3851 if (!vect_check_gather_scatter (stmt, as_a <loop_vec_info> (vinfo),
3852 &gs_info)
3853 || !get_vectype_for_scalar_type (TREE_TYPE (gs_info.offset)))
3855 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3856 free_data_ref (dr);
3857 if (dump_enabled_p ())
3859 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3860 (gatherscatter == GATHER) ?
3861 "not vectorized: not suitable for gather "
3862 "load " :
3863 "not vectorized: not suitable for scatter "
3864 "store ");
3865 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3867 return false;
3870 free_data_ref (datarefs[i]);
3871 datarefs[i] = dr;
3872 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
3875 else if (is_a <loop_vec_info> (vinfo)
3876 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
3878 if (nested_in_vect_loop_p (loop, stmt))
3880 if (dump_enabled_p ())
3882 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3883 "not vectorized: not suitable for strided "
3884 "load ");
3885 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3887 return false;
3889 STMT_VINFO_STRIDED_P (stmt_info) = true;
3893 /* If, when trying to vectorize a basic-block, we stopped the analysis at
3894 the first dataref we could not analyze, mark the rest of the datarefs
3895 as not vectorizable and truncate the vector of datarefs. That
3896 avoids spending useless time in analyzing their dependence. */
3897 if (i != datarefs.length ())
3899 gcc_assert (is_a <bb_vec_info> (vinfo));
3900 for (unsigned j = i; j < datarefs.length (); ++j)
3902 data_reference_p dr = datarefs[j];
3903 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
3904 free_data_ref (dr);
3906 datarefs.truncate (i);
3909 return true;
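
/* To make the classification above concrete, here is a standalone,
   illustrative sketch (not compiled as part of this file) of scalar loops
   whose data references roughly fall into the categories handled above.
   The array and parameter names are hypothetical.  */
#if 0
void
example_data_refs (int *a, int *b, int *idx, int stride, int n)
{
  int s = 0;
  for (int i = 0; i < n; i++)
    {
      a[i] = b[i] + 1;        /* Constant DR_STEP: contiguous access.  */
      s += b[0];              /* Zero DR_STEP: loop-invariant load.  */
      a[i * stride] = i;      /* DR_STEP not an INTEGER_CST: STRIDED_P.  */
      s += b[idx[i]];         /* Address not affine in i: gather candidate,
                                 if the target provides builtin_gather.  */
    }
  a[0] = s;
}
#endif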
3913 /* Function vect_get_new_vect_var.
3915 Returns a name for a new variable. The current naming scheme prepends a
3916 prefix that depends on VAR_KIND ("vect", "vectp", "stmp" or "mask"),
3917 followed by an underscore, to NAME if NAME is provided; otherwise the
3918 prefix alone is used. */
3920 tree
3921 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
3923 const char *prefix;
3924 tree new_vect_var;
3926 switch (var_kind)
3928 case vect_simple_var:
3929 prefix = "vect";
3930 break;
3931 case vect_scalar_var:
3932 prefix = "stmp";
3933 break;
3934 case vect_mask_var:
3935 prefix = "mask";
3936 break;
3937 case vect_pointer_var:
3938 prefix = "vectp";
3939 break;
3940 default:
3941 gcc_unreachable ();
3944 if (name)
3946 char* tmp = concat (prefix, "_", name, NULL);
3947 new_vect_var = create_tmp_reg (type, tmp);
3948 free (tmp);
3950 else
3951 new_vect_var = create_tmp_reg (type, prefix);
3953 return new_vect_var;
3956 /* Like vect_get_new_vect_var but return an SSA name. */
3958 tree
3959 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
3961 const char *prefix;
3962 tree new_vect_var;
3964 switch (var_kind)
3966 case vect_simple_var:
3967 prefix = "vect";
3968 break;
3969 case vect_scalar_var:
3970 prefix = "stmp";
3971 break;
3972 case vect_pointer_var:
3973 prefix = "vectp";
3974 break;
3975 default:
3976 gcc_unreachable ();
3979 if (name)
3981 char* tmp = concat (prefix, "_", name, NULL);
3982 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
3983 free (tmp);
3985 else
3986 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
3988 return new_vect_var;
3991 /* Duplicate ptr info and set alignment/misalignment on NAME from DR. */
3993 static void
3994 vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr,
3995 stmt_vec_info stmt_info)
3997 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr));
3998 unsigned int align = TYPE_ALIGN_UNIT (STMT_VINFO_VECTYPE (stmt_info));
3999 int misalign = DR_MISALIGNMENT (dr);
4000 if (misalign == DR_MISALIGNMENT_UNKNOWN)
4001 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4002 else
4003 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name), align, misalign);
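
/* A minimal standalone sketch of the invariant being recorded above,
   assuming DR_MISALIGNMENT is the byte offset of the access from an
   ALIGN-byte boundary (ALIGN a power of two) and a negative value means
   "unknown".  Not compiled as part of this file.  */
#if 0
#include <stdint.h>
#include <assert.h>

static void
check_recorded_misalignment (const void *p, unsigned int align, int misalign)
{
  if (misalign < 0)
    return;			/* Unknown: nothing can be asserted.  */
  assert ((uintptr_t) p % align == (unsigned int) misalign);
}
#endif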
4006 /* Function vect_create_addr_base_for_vector_ref.
4008 Create an expression that computes the address of the first memory location
4009 that will be accessed for a data reference.
4011 Input:
4012 STMT: The statement containing the data reference.
4013 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4014 OFFSET: Optional. If supplied, it is added to the initial address.
4015 LOOP: Specify relative to which loop-nest the address should be computed.
4016 For example, when the dataref is in an inner-loop nested in an
4017 outer-loop that is now being vectorized, LOOP can be either the
4018 outer-loop, or the inner-loop. The first memory location accessed
4019 by the following dataref ('in' points to short):
4021 for (i=0; i<N; i++)
4022 for (j=0; j<M; j++)
4023 s += in[i+j]
4025 is as follows:
4026 if LOOP=i_loop: &in (relative to i_loop)
4027 if LOOP=j_loop: &in+i*2B (relative to j_loop)
4028 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
4029 initial address. Unlike OFFSET, which is the number of elements to
4030 be added, BYTE_OFFSET is measured in bytes.
4032 Output:
4033 1. Return an SSA_NAME whose value is the address of the memory location of
4034 the first vector of the data reference.
4035 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4036 these statement(s) which define the returned SSA_NAME.
4038 FORNOW: We are only handling array accesses with step 1. */
4040 tree
4041 vect_create_addr_base_for_vector_ref (gimple *stmt,
4042 gimple_seq *new_stmt_list,
4043 tree offset,
4044 tree byte_offset)
4046 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4047 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4048 const char *base_name;
4049 tree addr_base;
4050 tree dest;
4051 gimple_seq seq = NULL;
4052 tree vect_ptr_type;
4053 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
4054 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4055 innermost_loop_behavior *drb = vect_dr_behavior (dr);
4057 tree data_ref_base = unshare_expr (drb->base_address);
4058 tree base_offset = unshare_expr (drb->offset);
4059 tree init = unshare_expr (drb->init);
4061 if (loop_vinfo)
4062 base_name = get_name (data_ref_base);
4063 else
4065 base_offset = ssize_int (0);
4066 init = ssize_int (0);
4067 base_name = get_name (DR_REF (dr));
4070 /* Create base_offset */
4071 base_offset = size_binop (PLUS_EXPR,
4072 fold_convert (sizetype, base_offset),
4073 fold_convert (sizetype, init));
4075 if (offset)
4077 offset = fold_build2 (MULT_EXPR, sizetype,
4078 fold_convert (sizetype, offset), step);
4079 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4080 base_offset, offset);
4082 if (byte_offset)
4084 byte_offset = fold_convert (sizetype, byte_offset);
4085 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4086 base_offset, byte_offset);
4089 /* base + base_offset */
4090 if (loop_vinfo)
4091 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4092 else
4094 addr_base = build1 (ADDR_EXPR,
4095 build_pointer_type (TREE_TYPE (DR_REF (dr))),
4096 unshare_expr (DR_REF (dr)));
4099 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4100 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4101 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4102 gimple_seq_add_seq (new_stmt_list, seq);
4104 if (DR_PTR_INFO (dr)
4105 && TREE_CODE (addr_base) == SSA_NAME
4106 && !SSA_NAME_PTR_INFO (addr_base))
4108 vect_duplicate_ssa_name_ptr_info (addr_base, dr, stmt_info);
4109 if (offset || byte_offset)
4110 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4113 if (dump_enabled_p ())
4115 dump_printf_loc (MSG_NOTE, vect_location, "created ");
4116 dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
4117 dump_printf (MSG_NOTE, "\n");
4120 return addr_base;
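
/* A standalone sketch of the address arithmetic built above, assuming a
   short element type; OFFSET is scaled by the element size while
   BYTE_OFFSET is not.  The function and parameter names are hypothetical
   and the sketch is not compiled as part of this file.  */
#if 0
#include <stddef.h>

static char *
addr_base_sketch (char *base_address, ptrdiff_t var_offset, ptrdiff_t init,
		  ptrdiff_t offset_in_elts, ptrdiff_t byte_offset)
{
  ptrdiff_t step = sizeof (short);	/* TYPE_SIZE_UNIT of the ref.  */
  ptrdiff_t base_offset = var_offset + init;
  base_offset += offset_in_elts * step;
  base_offset += byte_offset;
  return base_address + base_offset;	/* base + base_offset.  */
}
#endif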
4124 /* Function vect_create_data_ref_ptr.
4126 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4127 location accessed in the loop by STMT, along with the def-use update
4128 chain to appropriately advance the pointer through the loop iterations.
4129 Also set aliasing information for the pointer. This pointer is used by
4130 the callers to this function to create a memory reference expression for
4131 vector load/store access.
4133 Input:
4134 1. STMT: a stmt that references memory. Expected to be of the form
4135 GIMPLE_ASSIGN <name, data-ref> or
4136 GIMPLE_ASSIGN <data-ref, name>.
4137 2. AGGR_TYPE: the type of the reference, which should be either a vector
4138 or an array.
4139 3. AT_LOOP: the loop where the vector memref is to be created.
4140 4. OFFSET (optional): an offset to be added to the initial address accessed
4141 by the data-ref in STMT.
4142 5. BSI: location where the new stmts are to be placed if there is no loop
4143 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4144 pointing to the initial address.
4145 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4146 to the initial address accessed by the data-ref in STMT. This is
4147 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4148 in bytes.
4150 Output:
4151 1. Declare a new ptr to vector_type, and have it point to the base of the
4152 data reference (the initial address accessed by the data reference).
4153 For example, for vector of type V8HI, the following code is generated:
4155 v8hi *ap;
4156 ap = (v8hi *)initial_address;
4158 if OFFSET is not supplied:
4159 initial_address = &a[init];
4160 if OFFSET is supplied:
4161 initial_address = &a[init + OFFSET];
4162 if BYTE_OFFSET is supplied:
4163 initial_address = &a[init] + BYTE_OFFSET;
4165 Return the initial_address in INITIAL_ADDRESS.
4167 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4168 update the pointer in each iteration of the loop.
4170 Return the increment stmt that updates the pointer in PTR_INCR.
4172 3. Set INV_P to true if the access pattern of the data reference in the
4173 vectorized loop is invariant. Set it to false otherwise.
4175 4. Return the pointer. */
4177 tree
4178 vect_create_data_ref_ptr (gimple *stmt, tree aggr_type, struct loop *at_loop,
4179 tree offset, tree *initial_address,
4180 gimple_stmt_iterator *gsi, gimple **ptr_incr,
4181 bool only_init, bool *inv_p, tree byte_offset)
4183 const char *base_name;
4184 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4185 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4186 struct loop *loop = NULL;
4187 bool nested_in_vect_loop = false;
4188 struct loop *containing_loop = NULL;
4189 tree aggr_ptr_type;
4190 tree aggr_ptr;
4191 tree new_temp;
4192 gimple_seq new_stmt_list = NULL;
4193 edge pe = NULL;
4194 basic_block new_bb;
4195 tree aggr_ptr_init;
4196 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4197 tree aptr;
4198 gimple_stmt_iterator incr_gsi;
4199 bool insert_after;
4200 tree indx_before_incr, indx_after_incr;
4201 gimple *incr;
4202 tree step;
4203 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4205 gcc_assert (TREE_CODE (aggr_type) == ARRAY_TYPE
4206 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4208 if (loop_vinfo)
4210 loop = LOOP_VINFO_LOOP (loop_vinfo);
4211 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4212 containing_loop = (gimple_bb (stmt))->loop_father;
4213 pe = loop_preheader_edge (loop);
4215 else
4217 gcc_assert (bb_vinfo);
4218 only_init = true;
4219 *ptr_incr = NULL;
4222 /* Check the step (evolution) of the load in LOOP, and record
4223 whether it's invariant. */
4224 step = vect_dr_behavior (dr)->step;
4225 if (integer_zerop (step))
4226 *inv_p = true;
4227 else
4228 *inv_p = false;
4230 /* Create an expression for the first address accessed by this load
4231 in LOOP. */
4232 base_name = get_name (DR_BASE_ADDRESS (dr));
4234 if (dump_enabled_p ())
4236 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4237 dump_printf_loc (MSG_NOTE, vect_location,
4238 "create %s-pointer variable to type: ",
4239 get_tree_code_name (TREE_CODE (aggr_type)));
4240 dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4241 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4242 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4243 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4244 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4245 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4246 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4247 else
4248 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4249 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4250 dump_printf (MSG_NOTE, "\n");
4253 /* (1) Create the new aggregate-pointer variable.
4254 Vector and array types inherit the alias set of their component
4255 type by default so we need to use a ref-all pointer if the data
4256 reference does not conflict with the created aggregated data
4257 reference because it is not addressable. */
4258 bool need_ref_all = false;
4259 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4260 get_alias_set (DR_REF (dr))))
4261 need_ref_all = true;
4262 /* Likewise for any of the data references in the stmt group. */
4263 else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
4265 gimple *orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
4268 stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
4269 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4270 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4271 get_alias_set (DR_REF (sdr))))
4273 need_ref_all = true;
4274 break;
4276 orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);
4278 while (orig_stmt);
4280 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4281 need_ref_all);
4282 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4285 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4286 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4287 def-use update cycles for the pointer: one relative to the outer-loop
4288 (LOOP), which is what steps (3) and (4) below do. The other is relative
4289 to the inner-loop (which is the inner-most loop containing the dataref),
4290 and this is done by step (5) below.
4292 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4293 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4294 redundant. Steps (3),(4) create the following:
4296 vp0 = &base_addr;
4297 LOOP: vp1 = phi(vp0,vp2)
4300 vp2 = vp1 + step
4301 goto LOOP
4303 If there is an inner-loop nested in loop, then step (5) will also be
4304 applied, and an additional update in the inner-loop will be created:
4306 vp0 = &base_addr;
4307 LOOP: vp1 = phi(vp0,vp2)
4309 inner: vp3 = phi(vp1,vp4)
4310 vp4 = vp3 + inner_step
4311 if () goto inner
4313 vp2 = vp1 + step
4314 if () goto LOOP */
4316 /* (2) Calculate the initial address of the aggregate-pointer, and set
4317 the aggregate-pointer to point to it before the loop. */
4319 /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader. */
4321 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
4322 offset, byte_offset);
4323 if (new_stmt_list)
4325 if (pe)
4327 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4328 gcc_assert (!new_bb);
4330 else
4331 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4334 *initial_address = new_temp;
4335 aggr_ptr_init = new_temp;
4337 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4338 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4339 inner-loop nested in LOOP (during outer-loop vectorization). */
4341 /* No update in loop is required. */
4342 if (only_init && (!loop_vinfo || at_loop == loop))
4343 aptr = aggr_ptr_init;
4344 else
4346 /* The step of the aggregate pointer is the type size. */
4347 tree iv_step = TYPE_SIZE_UNIT (aggr_type);
4348 /* One exception to the above is when the scalar step of the load in
4349 LOOP is zero. In this case the step here is also zero. */
4350 if (*inv_p)
4351 iv_step = size_zero_node;
4352 else if (tree_int_cst_sgn (step) == -1)
4353 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4355 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4357 create_iv (aggr_ptr_init,
4358 fold_convert (aggr_ptr_type, iv_step),
4359 aggr_ptr, loop, &incr_gsi, insert_after,
4360 &indx_before_incr, &indx_after_incr);
4361 incr = gsi_stmt (incr_gsi);
4362 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4364 /* Copy the points-to information if it exists. */
4365 if (DR_PTR_INFO (dr))
4367 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info);
4368 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info);
4370 if (ptr_incr)
4371 *ptr_incr = incr;
4373 aptr = indx_before_incr;
4376 if (!nested_in_vect_loop || only_init)
4377 return aptr;
4380 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4381 nested in LOOP, if it exists. */
4383 gcc_assert (nested_in_vect_loop);
4384 if (!only_init)
4386 standard_iv_increment_position (containing_loop, &incr_gsi,
4387 &insert_after);
4388 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4389 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4390 &indx_after_incr);
4391 incr = gsi_stmt (incr_gsi);
4392 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4394 /* Copy the points-to information if it exists. */
4395 if (DR_PTR_INFO (dr))
4397 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info);
4398 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info);
4400 if (ptr_incr)
4401 *ptr_incr = incr;
4403 return indx_before_incr;
4405 else
4406 gcc_unreachable ();
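
/* A standalone sketch of the pointer def-use cycle set up above for the
   simple (non-outer-loop) case: the pointer starts at the initial address
   computed in the preheader and advances by TYPE_SIZE_UNIT (aggr_type)
   per iteration.  The vector type and loop are hypothetical; not compiled
   as part of this file.  */
#if 0
typedef int v4si __attribute__ ((vector_size (16)));

static void
copy_vectors (v4si *dst, const v4si *src, int nvecs)
{
  const v4si *vp = src;		 /* vp0 = &base_addr (preheader).  */
  for (int i = 0; i < nvecs; i++)
    {
      dst[i] = *vp;		 /* Use of vp1 = phi (vp0, vp2).  */
      vp = vp + 1;		 /* vp2 = vp1 + step (one aggregate).  */
    }
}
#endif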
4410 /* Function bump_vector_ptr
4412 Increment a pointer (to a vector type) by vector-size. If requested,
4413 i.e. if PTR_INCR is given, then also connect the new increment stmt
4414 to the existing def-use update-chain of the pointer, by modifying
4415 the PTR_INCR as illustrated below:
4417 The pointer def-use update-chain before this function:
4418 DATAREF_PTR = phi (p_0, p_2)
4419 ....
4420 PTR_INCR: p_2 = DATAREF_PTR + step
4422 The pointer def-use update-chain after this function:
4423 DATAREF_PTR = phi (p_0, p_2)
4424 ....
4425 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4426 ....
4427 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4429 Input:
4430 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4431 in the loop.
4432 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4433 the loop. The increment amount across iterations is expected
4434 to be vector_size.
4435 BSI - location where the new update stmt is to be placed.
4436 STMT - the original scalar memory-access stmt that is being vectorized.
4437 BUMP - optional. The offset by which to bump the pointer. If not given,
4438 the offset is assumed to be vector_size.
4440 Output: Return NEW_DATAREF_PTR as illustrated above.
4444 tree
4445 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4446 gimple *stmt, tree bump)
4448 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4449 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4450 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4451 tree update = TYPE_SIZE_UNIT (vectype);
4452 gassign *incr_stmt;
4453 ssa_op_iter iter;
4454 use_operand_p use_p;
4455 tree new_dataref_ptr;
4457 if (bump)
4458 update = bump;
4460 if (TREE_CODE (dataref_ptr) == SSA_NAME)
4461 new_dataref_ptr = copy_ssa_name (dataref_ptr);
4462 else
4463 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4464 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4465 dataref_ptr, update);
4466 vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4468 /* Copy the points-to information if it exists. */
4469 if (DR_PTR_INFO (dr))
4471 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4472 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4475 if (!ptr_incr)
4476 return new_dataref_ptr;
4478 /* Update the vector-pointer's cross-iteration increment. */
4479 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4481 tree use = USE_FROM_PTR (use_p);
4483 if (use == dataref_ptr)
4484 SET_USE (use_p, new_dataref_ptr);
4485 else
4486 gcc_assert (tree_int_cst_compare (use, update) == 0);
4489 return new_dataref_ptr;
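
/* A standalone sketch of the def-use rewiring above when two vectors are
   accessed per iteration: the bump produces the address of the second
   vector, and the cross-iteration increment is then applied to the bumped
   pointer, so the pointer advances past both vectors.  Hypothetical types
   and names; not compiled as part of this file.  */
#if 0
typedef int v4si __attribute__ ((vector_size (16)));

static void
load_two_per_iter (v4si *dst, const v4si *src, int npairs)
{
  const v4si *dataref_ptr = src;
  for (int i = 0; i < npairs; i++)
    {
      dst[2 * i] = dataref_ptr[0];
      const v4si *new_dataref_ptr = dataref_ptr + 1;	/* The bump.  */
      dst[2 * i + 1] = new_dataref_ptr[0];
      dataref_ptr = new_dataref_ptr + 1;    /* PTR_INCR now uses the bump.  */
    }
}
#endif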
4493 /* Function vect_create_destination_var.
4495 Create a new temporary of type VECTYPE. */
4497 tree
4498 vect_create_destination_var (tree scalar_dest, tree vectype)
4500 tree vec_dest;
4501 const char *name;
4502 char *new_name;
4503 tree type;
4504 enum vect_var_kind kind;
4506 kind = vectype
4507 ? VECTOR_BOOLEAN_TYPE_P (vectype)
4508 ? vect_mask_var
4509 : vect_simple_var
4510 : vect_scalar_var;
4511 type = vectype ? vectype : TREE_TYPE (scalar_dest);
4513 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
4515 name = get_name (scalar_dest);
4516 if (name)
4517 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
4518 else
4519 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
4520 vec_dest = vect_get_new_vect_var (type, kind, new_name);
4521 free (new_name);
4523 return vec_dest;
4526 /* Function vect_grouped_store_supported.
4528 Returns TRUE if interleave high and interleave low permutations
4529 are supported, and FALSE otherwise. */
4531 bool
4532 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
4534 machine_mode mode = TYPE_MODE (vectype);
4536 /* vect_permute_store_chain requires the group size to be equal to 3 or
4537 be a power of two. */
4538 if (count != 3 && exact_log2 (count) == -1)
4540 if (dump_enabled_p ())
4541 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4542 "the size of the group of accesses"
4543 " is not a power of 2 or not eqaul to 3\n");
4544 return false;
4547 /* Check that the permutation is supported. */
4548 if (VECTOR_MODE_P (mode))
4550 unsigned int i, nelt = GET_MODE_NUNITS (mode);
4551 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4553 if (count == 3)
4555 unsigned int j0 = 0, j1 = 0, j2 = 0;
4556 unsigned int i, j;
4558 for (j = 0; j < 3; j++)
4560 int nelt0 = ((3 - j) * nelt) % 3;
4561 int nelt1 = ((3 - j) * nelt + 1) % 3;
4562 int nelt2 = ((3 - j) * nelt + 2) % 3;
4563 for (i = 0; i < nelt; i++)
4565 if (3 * i + nelt0 < nelt)
4566 sel[3 * i + nelt0] = j0++;
4567 if (3 * i + nelt1 < nelt)
4568 sel[3 * i + nelt1] = nelt + j1++;
4569 if (3 * i + nelt2 < nelt)
4570 sel[3 * i + nelt2] = 0;
4572 if (!can_vec_perm_p (mode, false, sel))
4574 if (dump_enabled_p ())
4575 dump_printf (MSG_MISSED_OPTIMIZATION,
4576 "permutaion op not supported by target.\n");
4577 return false;
4580 for (i = 0; i < nelt; i++)
4582 if (3 * i + nelt0 < nelt)
4583 sel[3 * i + nelt0] = 3 * i + nelt0;
4584 if (3 * i + nelt1 < nelt)
4585 sel[3 * i + nelt1] = 3 * i + nelt1;
4586 if (3 * i + nelt2 < nelt)
4587 sel[3 * i + nelt2] = nelt + j2++;
4589 if (!can_vec_perm_p (mode, false, sel))
4591 if (dump_enabled_p ())
4592 dump_printf (MSG_MISSED_OPTIMIZATION,
4593 "permutaion op not supported by target.\n");
4594 return false;
4597 return true;
4599 else
4601 /* If length is not equal to 3 then only power of 2 is supported. */
4602 gcc_assert (pow2p_hwi (count));
4604 for (i = 0; i < nelt / 2; i++)
4606 sel[i * 2] = i;
4607 sel[i * 2 + 1] = i + nelt;
4609 if (can_vec_perm_p (mode, false, sel))
4611 for (i = 0; i < nelt; i++)
4612 sel[i] += nelt / 2;
4613 if (can_vec_perm_p (mode, false, sel))
4614 return true;
4619 if (dump_enabled_p ())
4620 dump_printf (MSG_MISSED_OPTIMIZATION,
4621 "permutaion op not supported by target.\n");
4622 return false;
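
/* A standalone sketch (not compiled here) of the two power-of-two selectors
   queried above, printed for an 8-element mode: the "high" interleave mixes
   the low halves of the two input vectors, the "low" interleave the high
   halves.  */
#if 0
#include <stdio.h>

int
main (void)
{
  unsigned int nelt = 8, sel[8];
  for (unsigned int i = 0; i < nelt / 2; i++)
    {
      sel[i * 2] = i;		   /* Element i of the first vector.  */
      sel[i * 2 + 1] = i + nelt;   /* Element i of the second vector.  */
    }
  for (unsigned int i = 0; i < nelt; i++)
    printf ("%u ", sel[i]);		   /* 0 8 1 9 2 10 3 11 */
  printf ("\n");
  for (unsigned int i = 0; i < nelt; i++)
    printf ("%u ", sel[i] + nelt / 2);	   /* 4 12 5 13 6 14 7 15 */
  printf ("\n");
  return 0;
}
#endif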
4626 /* Return TRUE if vec_store_lanes is available for COUNT vectors of
4627 type VECTYPE. */
4629 bool
4630 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
4632 return vect_lanes_optab_supported_p ("vec_store_lanes",
4633 vec_store_lanes_optab,
4634 vectype, count);
4638 /* Function vect_permute_store_chain.
4640 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4641 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
4642 the data correctly for the stores. Return the final references for stores
4643 in RESULT_CHAIN.
4645 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4646 The input is 4 vectors each containing 8 elements. We assign a number to
4647 each element, the input sequence is:
4649 1st vec: 0 1 2 3 4 5 6 7
4650 2nd vec: 8 9 10 11 12 13 14 15
4651 3rd vec: 16 17 18 19 20 21 22 23
4652 4th vec: 24 25 26 27 28 29 30 31
4654 The output sequence should be:
4656 1st vec: 0 8 16 24 1 9 17 25
4657 2nd vec: 2 10 18 26 3 11 19 27
4658 3rd vec: 4 12 20 28 5 13 21 29
4659 4th vec: 6 14 22 30 7 15 23 31
4661 i.e., we interleave the contents of the four vectors in their order.
4663 We use interleave_high/low instructions to create such output. The input of
4664 each interleave_high/low operation is two vectors:
4665 1st vec 2nd vec
4666 0 1 2 3 4 5 6 7
4667 the even elements of the result vector are obtained left-to-right from the
4668 high/low elements of the first vector. The odd elements of the result are
4669 obtained left-to-right from the high/low elements of the second vector.
4670 The output of interleave_high will be: 0 4 1 5
4671 and of interleave_low: 2 6 3 7
4674 The permutation is done in log LENGTH stages. In each stage interleave_high
4675 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4676 where the first argument is taken from the first half of DR_CHAIN and the
4677 second argument from its second half.
4678 In our example,
4680 I1: interleave_high (1st vec, 3rd vec)
4681 I2: interleave_low (1st vec, 3rd vec)
4682 I3: interleave_high (2nd vec, 4th vec)
4683 I4: interleave_low (2nd vec, 4th vec)
4685 The output for the first stage is:
4687 I1: 0 16 1 17 2 18 3 19
4688 I2: 4 20 5 21 6 22 7 23
4689 I3: 8 24 9 25 10 26 11 27
4690 I4: 12 28 13 29 14 30 15 31
4692 The output of the second stage, i.e. the final result is:
4694 I1: 0 8 16 24 1 9 17 25
4695 I2: 2 10 18 26 3 11 19 27
4696 I3: 4 12 20 28 5 13 21 29
4697 I4: 6 14 22 30 7 15 23 31. */
4699 void
4700 vect_permute_store_chain (vec<tree> dr_chain,
4701 unsigned int length,
4702 gimple *stmt,
4703 gimple_stmt_iterator *gsi,
4704 vec<tree> *result_chain)
4706 tree vect1, vect2, high, low;
4707 gimple *perm_stmt;
4708 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4709 tree perm_mask_low, perm_mask_high;
4710 tree data_ref;
4711 tree perm3_mask_low, perm3_mask_high;
4712 unsigned int i, n, log_length = exact_log2 (length);
4713 unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype);
4714 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4716 result_chain->quick_grow (length);
4717 memcpy (result_chain->address (), dr_chain.address (),
4718 length * sizeof (tree));
4720 if (length == 3)
4722 unsigned int j0 = 0, j1 = 0, j2 = 0;
4724 for (j = 0; j < 3; j++)
4726 int nelt0 = ((3 - j) * nelt) % 3;
4727 int nelt1 = ((3 - j) * nelt + 1) % 3;
4728 int nelt2 = ((3 - j) * nelt + 2) % 3;
4730 for (i = 0; i < nelt; i++)
4732 if (3 * i + nelt0 < nelt)
4733 sel[3 * i + nelt0] = j0++;
4734 if (3 * i + nelt1 < nelt)
4735 sel[3 * i + nelt1] = nelt + j1++;
4736 if (3 * i + nelt2 < nelt)
4737 sel[3 * i + nelt2] = 0;
4739 perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
4741 for (i = 0; i < nelt; i++)
4743 if (3 * i + nelt0 < nelt)
4744 sel[3 * i + nelt0] = 3 * i + nelt0;
4745 if (3 * i + nelt1 < nelt)
4746 sel[3 * i + nelt1] = 3 * i + nelt1;
4747 if (3 * i + nelt2 < nelt)
4748 sel[3 * i + nelt2] = nelt + j2++;
4750 perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
4752 vect1 = dr_chain[0];
4753 vect2 = dr_chain[1];
4755 /* Create interleaving stmt:
4756 low = VEC_PERM_EXPR <vect1, vect2,
4757 {j, nelt, *, j + 1, nelt + j + 1, *,
4758 j + 2, nelt + j + 2, *, ...}> */
4759 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
4760 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4761 vect2, perm3_mask_low);
4762 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4764 vect1 = data_ref;
4765 vect2 = dr_chain[2];
4766 /* Create interleaving stmt:
4767 low = VEC_PERM_EXPR <vect1, vect2,
4768 {0, 1, nelt + j, 3, 4, nelt + j + 1,
4769 6, 7, nelt + j + 2, ...}> */
4770 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
4771 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4772 vect2, perm3_mask_high);
4773 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4774 (*result_chain)[j] = data_ref;
4777 else
4779 /* If length is not equal to 3 then only power of 2 is supported. */
4780 gcc_assert (pow2p_hwi (length));
4782 for (i = 0, n = nelt / 2; i < n; i++)
4784 sel[i * 2] = i;
4785 sel[i * 2 + 1] = i + nelt;
4787 perm_mask_high = vect_gen_perm_mask_checked (vectype, sel);
4789 for (i = 0; i < nelt; i++)
4790 sel[i] += nelt / 2;
4791 perm_mask_low = vect_gen_perm_mask_checked (vectype, sel);
4793 for (i = 0, n = log_length; i < n; i++)
4795 for (j = 0; j < length/2; j++)
4797 vect1 = dr_chain[j];
4798 vect2 = dr_chain[j+length/2];
4800 /* Create interleaving stmt:
4801 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
4802 ...}> */
4803 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
4804 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
4805 vect2, perm_mask_high);
4806 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4807 (*result_chain)[2*j] = high;
4809 /* Create interleaving stmt:
4810 low = VEC_PERM_EXPR <vect1, vect2,
4811 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
4812 ...}> */
4813 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
4814 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
4815 vect2, perm_mask_low);
4816 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4817 (*result_chain)[2*j+1] = low;
4819 memcpy (dr_chain.address (), result_chain->address (),
4820 length * sizeof (tree));
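
/* A standalone sketch (not compiled here) that simulates the two interleave
   stages documented before vect_permute_store_chain on LENGTH == 4 chains
   of 8 short elements, using plain arrays in place of vector operands.  */
#if 0
#include <stdio.h>
#include <string.h>

#define NELT 8
#define LEN 4

/* Interleave the halves of V1 and V2 starting at element BASE:
   { v1[base], v2[base], v1[base + 1], v2[base + 1], ... }.  */
static void
interleave (const short *v1, const short *v2, unsigned int base, short *out)
{
  for (unsigned int i = 0; i < NELT / 2; i++)
    {
      out[2 * i] = v1[base + i];
      out[2 * i + 1] = v2[base + i];
    }
}

int
main (void)
{
  short chain[LEN][NELT], result[LEN][NELT];
  for (int j = 0; j < LEN; j++)
    for (int i = 0; i < NELT; i++)
      chain[j][i] = j * NELT + i;

  for (int stage = 0; stage < 2; stage++)	/* log2 (LEN) stages.  */
    {
      for (int j = 0; j < LEN / 2; j++)
	{
	  interleave (chain[j], chain[j + LEN / 2], 0, result[2 * j]);
	  interleave (chain[j], chain[j + LEN / 2], NELT / 2,
		      result[2 * j + 1]);
	}
      memcpy (chain, result, sizeof (chain));
    }

  for (int j = 0; j < LEN; j++)
    {
      for (int i = 0; i < NELT; i++)
	printf (" %2d", chain[j][i]);
      printf ("\n");	/* 0 8 16 24 1 9 17 25, then 2 10 18 26 ..., etc.  */
    }
  return 0;
}
#endif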
4825 /* Function vect_setup_realignment
4827 This function is called when vectorizing an unaligned load using
4828 the dr_explicit_realign[_optimized] scheme.
4829 This function generates the following code at the loop prolog:
4831 p = initial_addr;
4832 x msq_init = *(floor(p)); # prolog load
4833 realignment_token = call target_builtin;
4834 loop:
4835 x msq = phi (msq_init, ---)
4837 The stmts marked with x are generated only for the case of
4838 dr_explicit_realign_optimized.
4840 The code above sets up a new (vector) pointer, pointing to the first
4841 location accessed by STMT, and a "floor-aligned" load using that pointer.
4842 It also generates code to compute the "realignment-token" (if the relevant
4843 target hook was defined), and creates a phi-node at the loop-header bb
4844 whose arguments are the result of the prolog-load (created by this
4845 function) and the result of a load that takes place in the loop (to be
4846 created by the caller to this function).
4848 For the case of dr_explicit_realign_optimized:
4849 The caller to this function uses the phi-result (msq) to create the
4850 realignment code inside the loop, and sets up the missing phi argument,
4851 as follows:
4852 loop:
4853 msq = phi (msq_init, lsq)
4854 lsq = *(floor(p')); # load in loop
4855 result = realign_load (msq, lsq, realignment_token);
4857 For the case of dr_explicit_realign:
4858 loop:
4859 msq = *(floor(p)); # load in loop
4860 p' = p + (VS-1);
4861 lsq = *(floor(p')); # load in loop
4862 result = realign_load (msq, lsq, realignment_token);
4864 Input:
4865 STMT - (scalar) load stmt to be vectorized. This load accesses
4866 a memory location that may be unaligned.
4867 BSI - place where new code is to be inserted.
4868 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
4869 is used.
4871 Output:
4872 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
4873 target hook, if defined.
4874 Return value - the result of the loop-header phi node. */
4876 tree
4877 vect_setup_realignment (gimple *stmt, gimple_stmt_iterator *gsi,
4878 tree *realignment_token,
4879 enum dr_alignment_support alignment_support_scheme,
4880 tree init_addr,
4881 struct loop **at_loop)
4883 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4884 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4885 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4886 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4887 struct loop *loop = NULL;
4888 edge pe = NULL;
4889 tree scalar_dest = gimple_assign_lhs (stmt);
4890 tree vec_dest;
4891 gimple *inc;
4892 tree ptr;
4893 tree data_ref;
4894 basic_block new_bb;
4895 tree msq_init = NULL_TREE;
4896 tree new_temp;
4897 gphi *phi_stmt;
4898 tree msq = NULL_TREE;
4899 gimple_seq stmts = NULL;
4900 bool inv_p;
4901 bool compute_in_loop = false;
4902 bool nested_in_vect_loop = false;
4903 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
4904 struct loop *loop_for_initial_load = NULL;
4906 if (loop_vinfo)
4908 loop = LOOP_VINFO_LOOP (loop_vinfo);
4909 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4912 gcc_assert (alignment_support_scheme == dr_explicit_realign
4913 || alignment_support_scheme == dr_explicit_realign_optimized);
4915 /* We need to generate three things:
4916 1. the misalignment computation
4917 2. the extra vector load (for the optimized realignment scheme).
4918 3. the phi node for the two vectors from which the realignment is
4919 done (for the optimized realignment scheme). */
4921 /* 1. Determine where to generate the misalignment computation.
4923 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
4924 calculation will be generated by this function, outside the loop (in the
4925 preheader). Otherwise, INIT_ADDR had already been computed for us by the
4926 caller, inside the loop.
4928 Background: If the misalignment remains fixed throughout the iterations of
4929 the loop, then both realignment schemes are applicable, and also the
4930 misalignment computation can be done outside LOOP. This is because we are
4931 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
4932 are a multiple of VS (the Vector Size), and therefore the misalignment in
4933 different vectorized LOOP iterations is always the same.
4934 The problem arises only if the memory access is in an inner-loop nested
4935 inside LOOP, which is now being vectorized using outer-loop vectorization.
4936 This is the only case when the misalignment of the memory access may not
4937 remain fixed throughout the iterations of the inner-loop (as explained in
4938 detail in vect_supportable_dr_alignment). In this case, not only is the
4939 optimized realignment scheme not applicable, but also the misalignment
4940 computation (and generation of the realignment token that is passed to
4941 REALIGN_LOAD) have to be done inside the loop.
4943 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
4944 or not, which in turn determines if the misalignment is computed inside
4945 the inner-loop, or outside LOOP. */
4947 if (init_addr != NULL_TREE || !loop_vinfo)
4949 compute_in_loop = true;
4950 gcc_assert (alignment_support_scheme == dr_explicit_realign);
4954 /* 2. Determine where to generate the extra vector load.
4956 For the optimized realignment scheme, instead of generating two vector
4957 loads in each iteration, we generate a single extra vector load in the
4958 preheader of the loop, and in each iteration reuse the result of the
4959 vector load from the previous iteration. In case the memory access is in
4960 an inner-loop nested inside LOOP, which is now being vectorized using
4961 outer-loop vectorization, we need to determine whether this initial vector
4962 load should be generated at the preheader of the inner-loop, or can be
4963 generated at the preheader of LOOP. If the memory access has no evolution
4964 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
4965 to be generated inside LOOP (in the preheader of the inner-loop). */
4967 if (nested_in_vect_loop)
4969 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
4970 bool invariant_in_outerloop =
4971 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
4972 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
4974 else
4975 loop_for_initial_load = loop;
4976 if (at_loop)
4977 *at_loop = loop_for_initial_load;
4979 if (loop_for_initial_load)
4980 pe = loop_preheader_edge (loop_for_initial_load);
4982 /* 3. For the case of the optimized realignment, create the first vector
4983 load at the loop preheader. */
4985 if (alignment_support_scheme == dr_explicit_realign_optimized)
4987 /* Create msq_init = *(floor(p1)) in the loop preheader */
4988 gassign *new_stmt;
4990 gcc_assert (!compute_in_loop);
4991 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4992 ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
4993 NULL_TREE, &init_addr, NULL, &inc,
4994 true, &inv_p);
4995 if (TREE_CODE (ptr) == SSA_NAME)
4996 new_temp = copy_ssa_name (ptr);
4997 else
4998 new_temp = make_ssa_name (TREE_TYPE (ptr));
4999 new_stmt = gimple_build_assign
5000 (new_temp, BIT_AND_EXPR, ptr,
5001 build_int_cst (TREE_TYPE (ptr),
5002 -(HOST_WIDE_INT)TYPE_ALIGN_UNIT (vectype)));
5003 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5004 gcc_assert (!new_bb);
5005 data_ref
5006 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5007 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5008 new_stmt = gimple_build_assign (vec_dest, data_ref);
5009 new_temp = make_ssa_name (vec_dest, new_stmt);
5010 gimple_assign_set_lhs (new_stmt, new_temp);
5011 if (pe)
5013 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5014 gcc_assert (!new_bb);
5016 else
5017 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5019 msq_init = gimple_assign_lhs (new_stmt);
5022 /* 4. Create realignment token using a target builtin, if available.
5023 It is done either inside the containing loop, or before LOOP (as
5024 determined above). */
5026 if (targetm.vectorize.builtin_mask_for_load)
5028 gcall *new_stmt;
5029 tree builtin_decl;
5031 /* Compute INIT_ADDR - the initial address accessed by this memref. */
5032 if (!init_addr)
5034 /* Generate the INIT_ADDR computation outside LOOP. */
5035 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
5036 NULL_TREE);
5037 if (loop)
5039 pe = loop_preheader_edge (loop);
5040 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5041 gcc_assert (!new_bb);
5043 else
5044 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5047 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5048 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5049 vec_dest =
5050 vect_create_destination_var (scalar_dest,
5051 gimple_call_return_type (new_stmt));
5052 new_temp = make_ssa_name (vec_dest, new_stmt);
5053 gimple_call_set_lhs (new_stmt, new_temp);
5055 if (compute_in_loop)
5056 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5057 else
5059 /* Generate the misalignment computation outside LOOP. */
5060 pe = loop_preheader_edge (loop);
5061 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5062 gcc_assert (!new_bb);
5065 *realignment_token = gimple_call_lhs (new_stmt);
5067 /* The result of the CALL_EXPR to this builtin is determined from
5068 the value of the parameter and no global variables are touched
5069 which makes the builtin a "const" function. Requiring the
5070 builtin to have the "const" attribute makes it unnecessary
5071 to call mark_call_clobbered. */
5072 gcc_assert (TREE_READONLY (builtin_decl));
5075 if (alignment_support_scheme == dr_explicit_realign)
5076 return msq;
5078 gcc_assert (!compute_in_loop);
5079 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5082 /* 5. Create msq = phi <msq_init, lsq> in loop */
5084 pe = loop_preheader_edge (containing_loop);
5085 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5086 msq = make_ssa_name (vec_dest);
5087 phi_stmt = create_phi_node (msq, containing_loop->header);
5088 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5090 return msq;
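
/* A standalone sketch of the effect of the realignment scheme set up above,
   using scalar code on 4-element "vectors" of ints: the two aligned loads
   bracketing the misaligned address are combined according to the
   misalignment, which is what the target's realign_load does with the mask
   produced by builtin_mask_for_load.  Hypothetical and not compiled as part
   of this file.  */
#if 0
#include <stdint.h>

#define VS 4	/* Elements per vector.  */

static void
realign_load_sketch (const int *p /* possibly misaligned */, int *out)
{
  uintptr_t addr = (uintptr_t) p;
  const int *msq = (const int *) (addr & -(uintptr_t) (VS * sizeof (int)));
  const int *lsq = msq + VS;		     /* floor (p + VS - 1) when p is
						actually misaligned.  */
  unsigned int mis = (addr / sizeof (int)) % VS;   /* Realignment token.  */
  for (unsigned int i = 0; i < VS; i++)
    out[i] = i + mis < VS ? msq[i + mis] : lsq[i + mis - VS];
}
#endif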
5094 /* Function vect_grouped_load_supported.
5096 COUNT is the size of the load group (the number of statements plus the
5097 number of gaps). SINGLE_ELEMENT_P is true if there is actually
5098 only one statement, with a gap of COUNT - 1.
5100 Returns true if a suitable permute exists. */
5102 bool
5103 vect_grouped_load_supported (tree vectype, bool single_element_p,
5104 unsigned HOST_WIDE_INT count)
5106 machine_mode mode = TYPE_MODE (vectype);
5108 /* If this is single-element interleaving with an element distance
5109 that leaves unused vector loads around, punt - we would at least create
5110 very sub-optimal code in that case (and blow up memory,
5111 see PR65518). */
5112 if (single_element_p && count > TYPE_VECTOR_SUBPARTS (vectype))
5114 if (dump_enabled_p ())
5115 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5116 "single-element interleaving not supported "
5117 "for not adjacent vector loads\n");
5118 return false;
5121 /* vect_permute_load_chain requires the group size to be equal to 3 or
5122 be a power of two. */
5123 if (count != 3 && exact_log2 (count) == -1)
5125 if (dump_enabled_p ())
5126 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5127 "the size of the group of accesses"
5128 " is not a power of 2 or not equal to 3\n");
5129 return false;
5132 /* Check that the permutation is supported. */
5133 if (VECTOR_MODE_P (mode))
5135 unsigned int i, j, nelt = GET_MODE_NUNITS (mode);
5136 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5138 if (count == 3)
5140 unsigned int k;
5141 for (k = 0; k < 3; k++)
5143 for (i = 0; i < nelt; i++)
5144 if (3 * i + k < 2 * nelt)
5145 sel[i] = 3 * i + k;
5146 else
5147 sel[i] = 0;
5148 if (!can_vec_perm_p (mode, false, sel))
5150 if (dump_enabled_p ())
5151 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5152 "shuffle of 3 loads is not supported by"
5153 " target\n");
5154 return false;
5156 for (i = 0, j = 0; i < nelt; i++)
5157 if (3 * i + k < 2 * nelt)
5158 sel[i] = i;
5159 else
5160 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5161 if (!can_vec_perm_p (mode, false, sel))
5163 if (dump_enabled_p ())
5164 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5165 "shuffle of 3 loads is not supported by"
5166 " target\n");
5167 return false;
5170 return true;
5172 else
5174 /* If length is not equal to 3 then only power of 2 is supported. */
5175 gcc_assert (pow2p_hwi (count));
5176 for (i = 0; i < nelt; i++)
5177 sel[i] = i * 2;
5178 if (can_vec_perm_p (mode, false, sel))
5180 for (i = 0; i < nelt; i++)
5181 sel[i] = i * 2 + 1;
5182 if (can_vec_perm_p (mode, false, sel))
5183 return true;
5188 if (dump_enabled_p ())
5189 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5190 "extract even/odd not supported by target\n");
5191 return false;
5194 /* Return TRUE if vec_load_lanes is available for COUNT vectors of
5195 type VECTYPE. */
5197 bool
5198 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
5200 return vect_lanes_optab_supported_p ("vec_load_lanes",
5201 vec_load_lanes_optab,
5202 vectype, count);
5205 /* Function vect_permute_load_chain.
5207 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5208 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5209 the input data correctly. Return the final references for loads in
5210 RESULT_CHAIN.
5212 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5213 The input is 4 vectors each containing 8 elements. We assign a number to each
5214 element, the input sequence is:
5216 1st vec: 0 1 2 3 4 5 6 7
5217 2nd vec: 8 9 10 11 12 13 14 15
5218 3rd vec: 16 17 18 19 20 21 22 23
5219 4th vec: 24 25 26 27 28 29 30 31
5221 The output sequence should be:
5223 1st vec: 0 4 8 12 16 20 24 28
5224 2nd vec: 1 5 9 13 17 21 25 29
5225 3rd vec: 2 6 10 14 18 22 26 30
5226 4th vec: 3 7 11 15 19 23 27 31
5228 i.e., the first output vector should contain the first elements of each
5229 interleaving group, etc.
5231 We use extract_even/odd instructions to create such output. The input of
5232 each extract_even/odd operation is two vectors
5233 1st vec 2nd vec
5234 0 1 2 3 4 5 6 7
5236 and the output is the vector of extracted even/odd elements. The output of
5237 extract_even will be: 0 2 4 6
5238 and of extract_odd: 1 3 5 7
5241 The permutation is done in log LENGTH stages. In each stage extract_even
5242 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5243 their order. In our example,
5245 E1: extract_even (1st vec, 2nd vec)
5246 E2: extract_odd (1st vec, 2nd vec)
5247 E3: extract_even (3rd vec, 4th vec)
5248 E4: extract_odd (3rd vec, 4th vec)
5250 The output for the first stage will be:
5252 E1: 0 2 4 6 8 10 12 14
5253 E2: 1 3 5 7 9 11 13 15
5254 E3: 16 18 20 22 24 26 28 30
5255 E4: 17 19 21 23 25 27 29 31
5257 In order to proceed and create the correct sequence for the next stage (or
5258 for the correct output, if the second stage is the last one, as in our
5259 example), we first put the output of extract_even operation and then the
5260 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5261 The input for the second stage is:
5263 1st vec (E1): 0 2 4 6 8 10 12 14
5264 2nd vec (E3): 16 18 20 22 24 26 28 30
5265 3rd vec (E2): 1 3 5 7 9 11 13 15
5266 4th vec (E4): 17 19 21 23 25 27 29 31
5268 The output of the second stage:
5270 E1: 0 4 8 12 16 20 24 28
5271 E2: 2 6 10 14 18 22 26 30
5272 E3: 1 5 9 13 17 21 25 29
5273 E4: 3 7 11 15 19 23 27 31
5275 And RESULT_CHAIN after reordering:
5277 1st vec (E1): 0 4 8 12 16 20 24 28
5278 2nd vec (E3): 1 5 9 13 17 21 25 29
5279 3rd vec (E2): 2 6 10 14 18 22 26 30
5280 4th vec (E4): 3 7 11 15 19 23 27 31. */
5282 static void
5283 vect_permute_load_chain (vec<tree> dr_chain,
5284 unsigned int length,
5285 gimple *stmt,
5286 gimple_stmt_iterator *gsi,
5287 vec<tree> *result_chain)
5289 tree data_ref, first_vect, second_vect;
5290 tree perm_mask_even, perm_mask_odd;
5291 tree perm3_mask_low, perm3_mask_high;
5292 gimple *perm_stmt;
5293 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5294 unsigned int i, j, log_length = exact_log2 (length);
5295 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5296 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5298 result_chain->quick_grow (length);
5299 memcpy (result_chain->address (), dr_chain.address (),
5300 length * sizeof (tree));
5302 if (length == 3)
5304 unsigned int k;
5306 for (k = 0; k < 3; k++)
5308 for (i = 0; i < nelt; i++)
5309 if (3 * i + k < 2 * nelt)
5310 sel[i] = 3 * i + k;
5311 else
5312 sel[i] = 0;
5313 perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
5315 for (i = 0, j = 0; i < nelt; i++)
5316 if (3 * i + k < 2 * nelt)
5317 sel[i] = i;
5318 else
5319 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5321 perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
5323 first_vect = dr_chain[0];
5324 second_vect = dr_chain[1];
5326 /* Create interleaving stmt (low part of):
5327 low = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5328 ...}> */
5329 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5330 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5331 second_vect, perm3_mask_low);
5332 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5334 /* Create interleaving stmt (high part of):
5335 high = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5336 ...}> */
5337 first_vect = data_ref;
5338 second_vect = dr_chain[2];
5339 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5340 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5341 second_vect, perm3_mask_high);
5342 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5343 (*result_chain)[k] = data_ref;
5346 else
5348 /* If length is not equal to 3 then only power of 2 is supported. */
5349 gcc_assert (pow2p_hwi (length));
5351 for (i = 0; i < nelt; ++i)
5352 sel[i] = i * 2;
5353 perm_mask_even = vect_gen_perm_mask_checked (vectype, sel);
5355 for (i = 0; i < nelt; ++i)
5356 sel[i] = i * 2 + 1;
5357 perm_mask_odd = vect_gen_perm_mask_checked (vectype, sel);
5359 for (i = 0; i < log_length; i++)
5361 for (j = 0; j < length; j += 2)
5363 first_vect = dr_chain[j];
5364 second_vect = dr_chain[j+1];
5366 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5367 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5368 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5369 first_vect, second_vect,
5370 perm_mask_even);
5371 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5372 (*result_chain)[j/2] = data_ref;
5374 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5375 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5376 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5377 first_vect, second_vect,
5378 perm_mask_odd);
5379 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5380 (*result_chain)[j/2+length/2] = data_ref;
5382 memcpy (dr_chain.address (), result_chain->address (),
5383 length * sizeof (tree));
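/* A standalone sketch (not part of GCC) of the extract-even/extract-odd
   network documented above, modelling each vector as an array of NELT ints.
   NELT, LENGTH, extract_even_odd and the driver below are illustrative names
   only; like the non-3 case of the function itself, the sketch assumes
   LENGTH is a power of two.  */

#include <stdio.h>
#include <string.h>

#define NELT 8		/* elements per "vector" */
#define LENGTH 4	/* vectors in the load chain */

static void
extract_even_odd (const int *a, const int *b, int *even, int *odd)
{
  for (int i = 0; i < NELT; i++)
    {
      even[i] = 2 * i < NELT ? a[2 * i] : b[2 * i - NELT];
      odd[i] = 2 * i + 1 < NELT ? a[2 * i + 1] : b[2 * i + 1 - NELT];
    }
}

int
main (void)
{
  int chain[LENGTH][NELT], result[LENGTH][NELT];

  for (int v = 0; v < LENGTH; v++)
    for (int i = 0; i < NELT; i++)
      chain[v][i] = v * NELT + i;	/* elements 0, 1, 2, ... as above */

  /* log2 (LENGTH) stages; evens land in the first half of the result chain,
     odds in the second half, as in vect_permute_load_chain.  */
  for (int len = LENGTH; len > 1; len /= 2)
    {
      for (int j = 0; j < LENGTH; j += 2)
	extract_even_odd (chain[j], chain[j + 1],
			  result[j / 2], result[j / 2 + LENGTH / 2]);
      memcpy (chain, result, sizeof (chain));
    }

  for (int v = 0; v < LENGTH; v++)
    {
      for (int i = 0; i < NELT; i++)
	printf (" %2d", result[v][i]);
      printf ("\n");	/* 0 4 8 ... / 1 5 9 ... / 2 6 10 ... / 3 7 11 ...  */
    }
  return 0;
}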
5388 /* Function vect_shift_permute_load_chain.
5390 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate a
5391 sequence of stmts to reorder the input data accordingly.
5392 Return the final references for loads in RESULT_CHAIN.
5393 Return true if successful, false otherwise.
5395 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5396 The input is 3 vectors each containing 8 elements. We assign a
5397 number to each element, the input sequence is:
5399 1st vec: 0 1 2 3 4 5 6 7
5400 2nd vec: 8 9 10 11 12 13 14 15
5401 3rd vec: 16 17 18 19 20 21 22 23
5403 The output sequence should be:
5405 1st vec: 0 3 6 9 12 15 18 21
5406 2nd vec: 1 4 7 10 13 16 19 22
5407 3rd vec: 2 5 8 11 14 17 20 23
5409 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
5411 First we shuffle all 3 vectors to get the correct element order:
5413 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
5414 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
5415 3rd vec: (16 19 22) (17 20 23) (18 21)
5417 Next we unite and shift the vectors 3 times:
5419 1st step:
5420 shift right by 6 the concatenation of:
5421 "1st vec" and "2nd vec"
5422 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
5423 "2nd vec" and "3rd vec"
5424 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
5425 "3rd vec" and "1st vec"
5426 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
5427 | New vectors |
5429 So the new vectors are now:
5431 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
5432 2nd vec: (10 13) (16 19 22) (17 20 23)
5433 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
5435 2nd step:
5436 shift right by 5 the concatenation of:
5437 "1st vec" and "3rd vec"
5438 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
5439 "2nd vec" and "1st vec"
5440 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
5441 "3rd vec" and "2nd vec"
5442 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
5443 | New vectors |
5445 So the new vectors are now:
5447 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
5448 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
5449 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
5451 3rd step:
5452 shift right by 5 the concatenation of:
5453 "1st vec" and "1st vec"
5454 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
5455 shift right by 3 the concatenation of:
5456 "2nd vec" and "2nd vec"
5457 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
5458 | New vectors |
5460 So now all vectors are READY:
5461 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
5462 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
5463 3rd vec: ( 1 4 7) (10 13) (16 19 22)
5465 This algorithm is faster than the one in vect_permute_load_chain if:
5466 1. "shift of a concatenation" is faster than general permutation.
5467 This is usually so.
5468 2. The TARGET machine can't execute vector instructions in parallel.
5469 This is because each step of the algorithm depends on the previous one.
5470 The algorithm in vect_permute_load_chain is much more parallel.
5472 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
5475 static bool
5476 vect_shift_permute_load_chain (vec<tree> dr_chain,
5477 unsigned int length,
5478 gimple *stmt,
5479 gimple_stmt_iterator *gsi,
5480 vec<tree> *result_chain)
5482 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
5483 tree perm2_mask1, perm2_mask2, perm3_mask;
5484 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
5485 gimple *perm_stmt;
5487 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5488 unsigned int i;
5489 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5490 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5491 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5492 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5494 result_chain->quick_grow (length);
5495 memcpy (result_chain->address (), dr_chain.address (),
5496 length * sizeof (tree));
5498 if (pow2p_hwi (length) && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 4)
5500 unsigned int j, log_length = exact_log2 (length);
5501 for (i = 0; i < nelt / 2; ++i)
5502 sel[i] = i * 2;
5503 for (i = 0; i < nelt / 2; ++i)
5504 sel[nelt / 2 + i] = i * 2 + 1;
5505 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5507 if (dump_enabled_p ())
5508 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5509 "shuffle of 2 fields structure is not \
5510 supported by target\n");
5511 return false;
5513 perm2_mask1 = vect_gen_perm_mask_checked (vectype, sel);
5515 for (i = 0; i < nelt / 2; ++i)
5516 sel[i] = i * 2 + 1;
5517 for (i = 0; i < nelt / 2; ++i)
5518 sel[nelt / 2 + i] = i * 2;
5519 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5521 if (dump_enabled_p ())
5522 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5523 "shuffle of 2 fields structure is not \
5524 supported by target\n");
5525 return false;
5527 perm2_mask2 = vect_gen_perm_mask_checked (vectype, sel);
5529 /* Generating permutation constant to shift all elements.
5530 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
5531 for (i = 0; i < nelt; i++)
5532 sel[i] = nelt / 2 + i;
5533 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5535 if (dump_enabled_p ())
5536 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5537 "shift permutation is not supported by target\n");
5538 return false;
5540 shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5542 /* Generating permutation constant to select vector from 2.
5543 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
5544 for (i = 0; i < nelt / 2; i++)
5545 sel[i] = i;
5546 for (i = nelt / 2; i < nelt; i++)
5547 sel[i] = nelt + i;
5548 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5550 if (dump_enabled_p ())
5551 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5552 "select is not supported by target\n");
5553 return false;
5555 select_mask = vect_gen_perm_mask_checked (vectype, sel);
5557 for (i = 0; i < log_length; i++)
5559 for (j = 0; j < length; j += 2)
5561 first_vect = dr_chain[j];
5562 second_vect = dr_chain[j + 1];
5564 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5565 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5566 first_vect, first_vect,
5567 perm2_mask1);
5568 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5569 vect[0] = data_ref;
5571 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5572 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5573 second_vect, second_vect,
5574 perm2_mask2);
5575 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5576 vect[1] = data_ref;
5578 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
5579 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5580 vect[0], vect[1], shift1_mask);
5581 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5582 (*result_chain)[j/2 + length/2] = data_ref;
5584 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
5585 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5586 vect[0], vect[1], select_mask);
5587 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5588 (*result_chain)[j/2] = data_ref;
5590 memcpy (dr_chain.address (), result_chain->address (),
5591 length * sizeof (tree));
5593 return true;
5595 if (length == 3 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 2)
5597 unsigned int k = 0, l = 0;
5599 /* Generating permutation constant to get all elements in the right order.
5600 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
5601 for (i = 0; i < nelt; i++)
5603 if (3 * k + (l % 3) >= nelt)
5605 k = 0;
5606 l += (3 - (nelt % 3));
5608 sel[i] = 3 * k + (l % 3);
5609 k++;
5611 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5613 if (dump_enabled_p ())
5614 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5615 "shuffle of 3 fields structure is not \
5616 supported by target\n");
5617 return false;
5619 perm3_mask = vect_gen_perm_mask_checked (vectype, sel);
5621 /* Generating permutation constant to shift all elements.
5622 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
5623 for (i = 0; i < nelt; i++)
5624 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
5625 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5627 if (dump_enabled_p ())
5628 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5629 "shift permutation is not supported by target\n");
5630 return false;
5632 shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5634 /* Generating permutation constant to shift all elements.
5635 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5636 for (i = 0; i < nelt; i++)
5637 sel[i] = 2 * (nelt / 3) + 1 + i;
5638 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5640 if (dump_enabled_p ())
5641 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5642 "shift permutation is not supported by target\n");
5643 return false;
5645 shift2_mask = vect_gen_perm_mask_checked (vectype, sel);
5647 /* Generating permutation constant to shift all elements.
5648 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
5649 for (i = 0; i < nelt; i++)
5650 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
5651 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5653 if (dump_enabled_p ())
5654 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5655 "shift permutation is not supported by target\n");
5656 return false;
5658 shift3_mask = vect_gen_perm_mask_checked (vectype, sel);
5660 /* Generating permutation constant to shift all elements.
5661 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5662 for (i = 0; i < nelt; i++)
5663 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
5664 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5666 if (dump_enabled_p ())
5667 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5668 "shift permutation is not supported by target\n");
5669 return false;
5671 shift4_mask = vect_gen_perm_mask_checked (vectype, sel);
5673 for (k = 0; k < 3; k++)
5675 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
5676 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5677 dr_chain[k], dr_chain[k],
5678 perm3_mask);
5679 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5680 vect[k] = data_ref;
5683 for (k = 0; k < 3; k++)
5685 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
5686 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5687 vect[k % 3], vect[(k + 1) % 3],
5688 shift1_mask);
5689 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5690 vect_shift[k] = data_ref;
5693 for (k = 0; k < 3; k++)
5695 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
5696 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5697 vect_shift[(4 - k) % 3],
5698 vect_shift[(3 - k) % 3],
5699 shift2_mask);
5700 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5701 vect[k] = data_ref;
5704 (*result_chain)[3 - (nelt % 3)] = vect[2];
5706 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
5707 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
5708 vect[0], shift3_mask);
5709 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5710 (*result_chain)[nelt % 3] = data_ref;
5712 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
5713 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
5714 vect[1], shift4_mask);
5715 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5716 (*result_chain)[0] = data_ref;
5717 return true;
5719 return false;
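/* A standalone scalar model (not GCC code) of the length-3 shift permutation
   above for eight-element vectors: a plain two-input permute stands in for
   VEC_PERM_EXPR, and the masks are the ones vect_shift_permute_load_chain
   computes for nelt == 8.  All names below are illustrative.  */

#include <stdio.h>

#define NELT 8

/* Two-input permute: out[i] = sel[i] < NELT ? a[sel[i]] : b[sel[i] - NELT].  */
static void
vec_perm (const int *a, const int *b, const int *sel, int *out)
{
  for (int i = 0; i < NELT; i++)
    out[i] = sel[i] < NELT ? a[sel[i]] : b[sel[i] - NELT];
}

int
main (void)
{
  int dr_chain[3][NELT], vect[3][NELT], vect_shift[3][NELT], result[3][NELT];
  int perm3[NELT] = { 0, 3, 6, 1, 4, 7, 2, 5 };
  int shift1[NELT], shift2[NELT], shift3[NELT], shift4[NELT];

  for (int i = 0; i < NELT; i++)
    {
      shift1[i] = 2 * (NELT / 3) + (NELT % 3) + i;	/* {6 .. 13} */
      shift2[i] = 2 * (NELT / 3) + 1 + i;		/* {5 .. 12} */
      shift3[i] = NELT / 3 + (NELT % 3) / 2 + i;	/* {3 .. 10} */
      shift4[i] = 2 * (NELT / 3) + (NELT % 3) / 2 + i;	/* {5 .. 12} */
    }

  for (int k = 0; k < 3; k++)
    for (int i = 0; i < NELT; i++)
      dr_chain[k][i] = k * NELT + i;	/* elements 0 .. 23 as above */

  for (int k = 0; k < 3; k++)		/* in-vector shuffle */
    vec_perm (dr_chain[k], dr_chain[k], perm3, vect[k]);
  for (int k = 0; k < 3; k++)		/* 1st shift step */
    vec_perm (vect[k % 3], vect[(k + 1) % 3], shift1, vect_shift[k]);
  for (int k = 0; k < 3; k++)		/* 2nd shift step */
    vec_perm (vect_shift[(4 - k) % 3], vect_shift[(3 - k) % 3], shift2,
	      vect[k]);

  /* Final placement; vect[2] is already in the right order.  */
  for (int i = 0; i < NELT; i++)
    result[3 - NELT % 3][i] = vect[2][i];
  vec_perm (vect[0], vect[0], shift3, result[NELT % 3]);
  vec_perm (vect[1], vect[1], shift4, result[0]);

  for (int k = 0; k < 3; k++)
    {
      for (int i = 0; i < NELT; i++)
	printf (" %2d", result[k][i]);
      printf ("\n");	/* 0 3 6 .. 21 / 1 4 7 .. 22 / 2 5 8 .. 23 */
    }
  return 0;
}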
5722 /* Function vect_transform_grouped_load.
5724 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
5725 to perform their permutation and assign the resulting vectorized statements
5726 to the scalar statements.
5729 void
5730 vect_transform_grouped_load (gimple *stmt, vec<tree> dr_chain, int size,
5731 gimple_stmt_iterator *gsi)
5733 machine_mode mode;
5734 vec<tree> result_chain = vNULL;
5736 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
5737 RESULT_CHAIN is the output of vect_permute_load_chain; it contains permuted
5738 vectors that are ready for vector computation. */
5739 result_chain.create (size);
5741 /* If the reassociation width for the vector type is 2 or greater, the target
5742 machine can execute 2 or more vector instructions in parallel.  Otherwise try
5743 to get the chain for the load group using vect_shift_permute_load_chain. */
5744 mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
5745 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
5746 || pow2p_hwi (size)
5747 || !vect_shift_permute_load_chain (dr_chain, size, stmt,
5748 gsi, &result_chain))
5749 vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
5750 vect_record_grouped_load_vectors (stmt, result_chain);
5751 result_chain.release ();
5754 /* RESULT_CHAIN contains the output of a group of grouped loads that were
5755 generated as part of the vectorization of STMT. Assign the statement
5756 for each vector to the associated scalar statement. */
5758 void
5759 vect_record_grouped_load_vectors (gimple *stmt, vec<tree> result_chain)
5761 gimple *first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
5762 gimple *next_stmt, *new_stmt;
5763 unsigned int i, gap_count;
5764 tree tmp_data_ref;
5766 /* Put a permuted data-ref in the VECTORIZED_STMT field.
5767 Since we scan the chain starting from its first node, their order
5768 corresponds to the order of data-refs in RESULT_CHAIN. */
5769 next_stmt = first_stmt;
5770 gap_count = 1;
5771 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
5773 if (!next_stmt)
5774 break;
5776 /* Skip the gaps.  Loads created for the gaps will be removed by the dead
5777 code elimination pass later.  No need to check for the first stmt in
5778 the group, since it always exists.
5779 GROUP_GAP is the number of steps in elements from the previous
5780 access (if there is no gap GROUP_GAP is 1). We skip loads that
5781 correspond to the gaps. */
5782 if (next_stmt != first_stmt
5783 && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
5785 gap_count++;
5786 continue;
5789 while (next_stmt)
5791 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
5792 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
5793 copies, and we put the new vector statement in the first available
5794 RELATED_STMT. */
5795 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
5796 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
5797 else
5799 if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5801 gimple *prev_stmt =
5802 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
5803 gimple *rel_stmt =
5804 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
5805 while (rel_stmt)
5807 prev_stmt = rel_stmt;
5808 rel_stmt =
5809 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
5812 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
5813 new_stmt;
5817 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
5818 gap_count = 1;
5819 /* If NEXT_STMT accesses the same DR as the previous statement,
5820 put the same TMP_DATA_REF as its vectorized statement; otherwise
5821 get the next data-ref from RESULT_CHAIN. */
5822 if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5823 break;
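/* A minimal standalone model (not GCC code) of just the gap handling in
   vect_record_grouped_load_vectors above: group_gap[m] plays the role of
   GROUP_GAP for the m-th group member, and vectors whose position falls
   inside a gap are simply left unassigned (they correspond to the loads
   later removed by dead code elimination).  The group layout is made up
   for illustration.  */

#include <stdio.h>

int
main (void)
{
  /* Three group members; the second is three elements after the first,
     so two permuted vectors in between belong to the gap.  */
  int group_gap[] = { 1, 3, 1 };
  int n_members = 3, n_vectors = 5;

  int member = 0, gap_count = 1;
  for (int v = 0; v < n_vectors && member < n_members; v++)
    {
      if (member != 0 && gap_count < group_gap[member])
	{
	  gap_count++;		/* vector V lies in a gap: skip it */
	  continue;
	}
      printf ("vector %d -> group member %d\n", v, member);
      member++;
      gap_count = 1;
    }
  return 0;
}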
5828 /* Function vect_can_force_dr_alignment_p.
5830 Returns whether the alignment of a DECL can be forced to be aligned
5831 on an ALIGNMENT-bit boundary. */
5833 bool
5834 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
5836 if (!VAR_P (decl))
5837 return false;
5839 if (decl_in_symtab_p (decl)
5840 && !symtab_node::get (decl)->can_increase_alignment_p ())
5841 return false;
5843 if (TREE_STATIC (decl))
5844 return (alignment <= MAX_OFILE_ALIGNMENT);
5845 else
5846 return (alignment <= MAX_STACK_ALIGNMENT);
5850 /* Return whether the data reference DR is supported with respect to its
5851 alignment.
5852 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
5853 if it is aligned, i.e., check if it is possible to vectorize it with different
5854 alignment. */
5856 enum dr_alignment_support
5857 vect_supportable_dr_alignment (struct data_reference *dr,
5858 bool check_aligned_accesses)
5860 gimple *stmt = DR_STMT (dr);
5861 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5862 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5863 machine_mode mode = TYPE_MODE (vectype);
5864 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5865 struct loop *vect_loop = NULL;
5866 bool nested_in_vect_loop = false;
5868 if (aligned_access_p (dr) && !check_aligned_accesses)
5869 return dr_aligned;
5871 /* For now assume all conditional loads/stores support unaligned
5872 access without any special code. */
5873 if (is_gimple_call (stmt)
5874 && gimple_call_internal_p (stmt)
5875 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
5876 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
5877 return dr_unaligned_supported;
5879 if (loop_vinfo)
5881 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
5882 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
5885 /* Possibly unaligned access. */
5887 /* We can choose between using the implicit realignment scheme (generating
5888 a misaligned_move stmt) and the explicit realignment scheme (generating
5889 aligned loads with a REALIGN_LOAD). There are two variants to the
5890 explicit realignment scheme: optimized, and unoptimized.
5891 We can optimize the realignment only if the step between consecutive
5892 vector loads is equal to the vector size. Since the vector memory
5893 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
5894 is guaranteed that the misalignment amount remains the same throughout the
5895 execution of the vectorized loop. Therefore, we can create the
5896 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
5897 at the loop preheader.
5899 However, in the case of outer-loop vectorization, when vectorizing a
5900 memory access in the inner-loop nested within the LOOP that is now being
5901 vectorized, while it is guaranteed that the misalignment of the
5902 vectorized memory access will remain the same in different outer-loop
5903 iterations, it is *not* guaranteed that it will remain the same throughout
5904 the execution of the inner-loop. This is because the inner-loop advances
5905 with the original scalar step (and not in steps of VS). If the inner-loop
5906 step happens to be a multiple of VS, then the misalignment remains fixed
5907 and we can use the optimized realignment scheme. For example:
5909 for (i=0; i<N; i++)
5910 for (j=0; j<M; j++)
5911 s += a[i+j];
5913 When vectorizing the i-loop in the above example, the step between
5914 consecutive vector loads is 1, and so the misalignment does not remain
5915 fixed across the execution of the inner-loop, and the realignment cannot
5916 be optimized (as illustrated in the following pseudo vectorized loop):
5918 for (i=0; i<N; i+=4)
5919 for (j=0; j<M; j++){
5920 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
5921 // when j is {0,1,2,3,4,5,6,7,...} respectively.
5922 // (assuming that we start from an aligned address).
5925 We therefore have to use the unoptimized realignment scheme:
5927 for (i=0; i<N; i+=4)
5928 for (j=k; j<M; j+=4)
5929 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
5930 // that the misalignment of the initial address is
5931 // 0).
5933 The loop can then be vectorized as follows:
5935 for (k=0; k<4; k++){
5936 rt = get_realignment_token (&vp[k]);
5937 for (i=0; i<N; i+=4){
5938 v1 = vp[i+k];
5939 for (j=k; j<M; j+=4){
5940 v2 = vp[i+j+VS-1];
5941 va = REALIGN_LOAD <v1,v2,rt>;
5942 vs += va;
5943 v1 = v2;
5946 } */
5948 if (DR_IS_READ (dr))
5950 bool is_packed = false;
5951 tree type = (TREE_TYPE (DR_REF (dr)));
5953 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
5954 && (!targetm.vectorize.builtin_mask_for_load
5955 || targetm.vectorize.builtin_mask_for_load ()))
5957 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5959 /* If we are doing SLP then the accesses need not have the
5960 same alignment, instead it depends on the SLP group size. */
5961 if (loop_vinfo
5962 && STMT_SLP_TYPE (stmt_info)
5963 && (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5964 * GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)))
5965 % TYPE_VECTOR_SUBPARTS (vectype) != 0))
5967 else if (!loop_vinfo
5968 || (nested_in_vect_loop
5969 && (TREE_INT_CST_LOW (DR_STEP (dr))
5970 != GET_MODE_SIZE (TYPE_MODE (vectype)))))
5971 return dr_explicit_realign;
5972 else
5973 return dr_explicit_realign_optimized;
5975 if (!known_alignment_for_access_p (dr))
5976 is_packed = not_size_aligned (DR_REF (dr));
5978 if (targetm.vectorize.support_vector_misalignment
5979 (mode, type, DR_MISALIGNMENT (dr), is_packed))
5980 /* Can't software pipeline the loads, but can at least do them. */
5981 return dr_unaligned_supported;
5983 else
5985 bool is_packed = false;
5986 tree type = (TREE_TYPE (DR_REF (dr)));
5988 if (!known_alignment_for_access_p (dr))
5989 is_packed = not_size_aligned (DR_REF (dr));
5991 if (targetm.vectorize.support_vector_misalignment
5992 (mode, type, DR_MISALIGNMENT (dr), is_packed))
5993 return dr_unaligned_supported;
5996 /* Unsupported. */
5997 return dr_unaligned_unsupported;
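/* A standalone scalar model (not GCC code) of the explicit realignment
   scheme discussed in vect_supportable_dr_alignment above: the misaligned
   vector starting at OFS is rebuilt from two aligned loads and a
   realignment token, which here is simply the element misalignment
   OFS % VS rather than a target permutation mask.  Names and sizes are
   illustrative.  */

#include <stdio.h>

#define VS 4	/* elements per vector */

/* va = REALIGN_LOAD <v1, v2, rt>: the last VS - rt elements of v1 followed
   by the first rt elements of v2.  */
static void
realign_load (const int *v1, const int *v2, int rt, int *va)
{
  for (int i = 0; i < VS; i++)
    va[i] = i + rt < VS ? v1[i + rt] : v2[i + rt - VS];
}

int
main (void)
{
  int a[16];
  for (int i = 0; i < 16; i++)
    a[i] = i;

  int ofs = 3;			/* misaligned start */
  int rt = ofs % VS;		/* realignment token */
  const int *p = a + ofs - rt;	/* aligned address at or below a + ofs */

  int va[VS];
  realign_load (p, p + VS, rt, va);
  for (int i = 0; i < VS; i++)
    printf (" %d", va[i]);	/* prints 3 4 5 6 */
  printf ("\n");
  return 0;
}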