Turn SECONDARY_MEMORY_NEEDED_MODE into a target hook
[official-gcc.git] / gcc / tree-vect-data-refs.c
blob070c707fdaf2a3778a3eee8abcd7d036c9c5b9cb
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2017 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "memmodel.h"
32 #include "tm_p.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "cgraph.h"
36 #include "dumpfile.h"
37 #include "alias.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "tree-eh.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop.h"
47 #include "cfgloop.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
50 #include "expr.h"
51 #include "builtins.h"
52 #include "params.h"
53 #include "tree-cfg.h"
54 #include "tree-hash-traits.h"
56 /* Return true if load- or store-lanes optab OPTAB is implemented for
57 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
59 static bool
60 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
61 tree vectype, unsigned HOST_WIDE_INT count)
63 machine_mode mode;
64 scalar_int_mode array_mode;
65 bool limit_p;
67 mode = TYPE_MODE (vectype);
68 limit_p = !targetm.array_mode_supported_p (mode, count);
69 if (!int_mode_for_size (count * GET_MODE_BITSIZE (mode),
70 limit_p).exists (&array_mode))
72 if (dump_enabled_p ())
73 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
74 "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC "]\n",
75 GET_MODE_NAME (mode), count);
76 return false;
79 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
81 if (dump_enabled_p ())
82 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
83 "cannot use %s<%s><%s>\n", name,
84 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
85 return false;
88 if (dump_enabled_p ())
89 dump_printf_loc (MSG_NOTE, vect_location,
90 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
91 GET_MODE_NAME (mode));
93 return true;
97 /* Return the smallest scalar part of STMT.
98 This is used to determine the vectype of the stmt. We generally set the
99 vectype according to the type of the result (lhs). For stmts whose
100 result-type is different than the type of the arguments (e.g., demotion,
101 promotion), vectype will be reset appropriately (later). Note that we have
102 to visit the smallest datatype in this function, because that determines the
103 VF. If the smallest datatype in the loop is present only as the rhs of a
104 promotion operation - we'd miss it.
105 Such a case, where a variable of this datatype does not appear in the lhs
106 anywhere in the loop, can only occur if it's an invariant: e.g.:
107 'int_x = (int) short_inv', which we'd expect to have been optimized away by
108 invariant motion. However, we cannot rely on invariant motion to always
109 take invariants out of the loop, and so in the case of promotion we also
110 have to check the rhs.
111 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
112 types. */
114 tree
115 vect_get_smallest_scalar_type (gimple *stmt, HOST_WIDE_INT *lhs_size_unit,
116 HOST_WIDE_INT *rhs_size_unit)
118 tree scalar_type = gimple_expr_type (stmt);
119 HOST_WIDE_INT lhs, rhs;
121 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
123 if (is_gimple_assign (stmt)
124 && (gimple_assign_cast_p (stmt)
125 || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
126 || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
127 || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
129 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
131 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
132 if (rhs < lhs)
133 scalar_type = rhs_type;
136 *lhs_size_unit = lhs;
137 *rhs_size_unit = rhs;
138 return scalar_type;
142 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
143 tested at run-time. Return TRUE if DDR was successfully inserted.
144 Return false if versioning is not supported. */
146 static bool
147 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
149 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
151 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
152 return false;
154 if (!runtime_alias_check_p (ddr, loop,
155 optimize_loop_nest_for_speed_p (loop)))
156 return false;
158 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
159 return true;
163 /* A subroutine of vect_analyze_data_ref_dependence. Handle
164 DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
165 distances. These distances are conservatively correct but they don't
166 reflect a guaranteed dependence.
168 Return true if this function does all the work necessary to avoid
169 an alias or false if the caller should use the dependence distances
170 to limit the vectorization factor in the usual way. LOOP_DEPTH is
171 the depth of the loop described by LOOP_VINFO and the other arguments
172 are as for vect_analyze_data_ref_dependence. */
174 static bool
175 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
176 loop_vec_info loop_vinfo,
177 int loop_depth, int *max_vf)
179 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
180 lambda_vector dist_v;
181 unsigned int i;
182 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
184 int dist = dist_v[loop_depth];
185 if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
187 /* If the user asserted safelen >= DIST consecutive iterations
188 can be executed concurrently, assume independence.
190 ??? An alternative would be to add the alias check even
191 in this case, and vectorize the fallback loop with the
192 maximum VF set to safelen. However, if the user has
193 explicitly given a length, it's less likely that that
194 would be a win. */
195 if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
197 if (loop->safelen < *max_vf)
198 *max_vf = loop->safelen;
199 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
200 continue;
203 /* For dependence distances of 2 or more, we have the option
204 of limiting VF or checking for an alias at runtime.
205 Prefer to check at runtime if we can, to avoid limiting
206 the VF unnecessarily when the bases are in fact independent.
208 Note that the alias checks will be removed if the VF ends up
209 being small enough. */
210 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
213 return true;
217 /* Function vect_analyze_data_ref_dependence.
219 Return TRUE if there (might) exist a dependence between a memory-reference
220 DRA and a memory-reference DRB. When versioning for alias may check a
221 dependence at run-time, return FALSE. Adjust *MAX_VF according to
222 the data dependence. */
224 static bool
225 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
226 loop_vec_info loop_vinfo, int *max_vf)
228 unsigned int i;
229 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
230 struct data_reference *dra = DDR_A (ddr);
231 struct data_reference *drb = DDR_B (ddr);
232 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
233 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
234 lambda_vector dist_v;
235 unsigned int loop_depth;
237 /* In loop analysis all data references should be vectorizable. */
238 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
239 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
240 gcc_unreachable ();
242 /* Independent data accesses. */
243 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
244 return false;
246 if (dra == drb
247 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
248 return false;
250 /* We do not have to consider dependences between accesses that belong
251 to the same group. */
252 if (GROUP_FIRST_ELEMENT (stmtinfo_a)
253 && GROUP_FIRST_ELEMENT (stmtinfo_a) == GROUP_FIRST_ELEMENT (stmtinfo_b))
254 return false;
256 /* Even if we have an anti-dependence then, as the vectorized loop covers at
257 least two scalar iterations, there is always also a true dependence.
258 As the vectorizer does not re-order loads and stores we can ignore
259 the anti-dependence if TBAA can disambiguate both DRs similar to the
260 case with known negative distance anti-dependences (positive
261 distance anti-dependences would violate TBAA constraints). */
262 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
263 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
264 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
265 get_alias_set (DR_REF (drb))))
266 return false;
268 /* Unknown data dependence. */
269 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
271 /* If user asserted safelen consecutive iterations can be
272 executed concurrently, assume independence. */
273 if (loop->safelen >= 2)
275 if (loop->safelen < *max_vf)
276 *max_vf = loop->safelen;
277 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
278 return false;
281 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
282 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
284 if (dump_enabled_p ())
286 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
287 "versioning for alias not supported for: "
288 "can't determine dependence between ");
289 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
290 DR_REF (dra));
291 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
292 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
293 DR_REF (drb));
294 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
296 return true;
299 if (dump_enabled_p ())
301 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
302 "versioning for alias required: "
303 "can't determine dependence between ");
304 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
305 DR_REF (dra));
306 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
307 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
308 DR_REF (drb));
309 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
312 /* Add to list of ddrs that need to be tested at run-time. */
313 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
316 /* Known data dependence. */
317 if (DDR_NUM_DIST_VECTS (ddr) == 0)
319 /* If user asserted safelen consecutive iterations can be
320 executed concurrently, assume independence. */
321 if (loop->safelen >= 2)
323 if (loop->safelen < *max_vf)
324 *max_vf = loop->safelen;
325 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
326 return false;
329 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
330 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
332 if (dump_enabled_p ())
334 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
335 "versioning for alias not supported for: "
336 "bad dist vector for ");
337 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
338 DR_REF (dra));
339 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
340 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
341 DR_REF (drb));
342 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
344 return true;
347 if (dump_enabled_p ())
349 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
350 "versioning for alias required: "
351 "bad dist vector for ");
352 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
353 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
354 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
355 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
357 /* Add to list of ddrs that need to be tested at run-time. */
358 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
361 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
363 if (DDR_COULD_BE_INDEPENDENT_P (ddr)
364 && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
365 loop_depth, max_vf))
366 return false;
368 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
370 int dist = dist_v[loop_depth];
372 if (dump_enabled_p ())
373 dump_printf_loc (MSG_NOTE, vect_location,
374 "dependence distance = %d.\n", dist);
376 if (dist == 0)
378 if (dump_enabled_p ())
380 dump_printf_loc (MSG_NOTE, vect_location,
381 "dependence distance == 0 between ");
382 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
383 dump_printf (MSG_NOTE, " and ");
384 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
385 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
388 /* When we perform grouped accesses and perform implicit CSE
389 by detecting equal accesses and doing disambiguation with
390 runtime alias tests like for
391 .. = a[i];
392 .. = a[i+1];
393 a[i] = ..;
394 a[i+1] = ..;
395 *p = ..;
396 .. = a[i];
397 .. = a[i+1];
398 where we will end up loading { a[i], a[i+1] } once, make
399 sure that inserting group loads before the first load and
400 stores after the last store will do the right thing.
401 Similar for groups like
402 a[i] = ...;
403 ... = a[i];
404 a[i+1] = ...;
405 where loads from the group interleave with the store. */
406 if (STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
407 || STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
409 gimple *earlier_stmt;
410 earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
411 if (DR_IS_WRITE
412 (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
414 if (dump_enabled_p ())
415 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
416 "READ_WRITE dependence in interleaving."
417 "\n");
418 return true;
422 continue;
425 if (dist > 0 && DDR_REVERSED_P (ddr))
427 /* If DDR_REVERSED_P the order of the data-refs in DDR was
428 reversed (to make distance vector positive), and the actual
429 distance is negative. */
430 if (dump_enabled_p ())
431 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
432 "dependence distance negative.\n");
433 /* Record a negative dependence distance to later limit the
434 amount of stmt copying / unrolling we can perform.
435 Only need to handle read-after-write dependence. */
436 if (DR_IS_READ (drb)
437 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
438 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
439 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
440 continue;
443 if (abs (dist) >= 2
444 && abs (dist) < *max_vf)
446 /* The dependence distance requires reduction of the maximal
447 vectorization factor. */
448 *max_vf = abs (dist);
449 if (dump_enabled_p ())
450 dump_printf_loc (MSG_NOTE, vect_location,
451 "adjusting maximal vectorization factor to %i\n",
452 *max_vf);
455 if (abs (dist) >= *max_vf)
457 /* Dependence distance does not create dependence, as far as
458 vectorization is concerned, in this case. */
459 if (dump_enabled_p ())
460 dump_printf_loc (MSG_NOTE, vect_location,
461 "dependence distance >= VF.\n");
462 continue;
465 if (dump_enabled_p ())
467 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
468 "not vectorized, possible dependence "
469 "between data-refs ");
470 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
471 dump_printf (MSG_NOTE, " and ");
472 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
473 dump_printf (MSG_NOTE, "\n");
476 return true;
479 return false;
482 /* Function vect_analyze_data_ref_dependences.
484 Examine all the data references in the loop, and make sure there do not
485 exist any data dependences between them. Set *MAX_VF according to
486 the maximum vectorization factor the data dependences allow. */
488 bool
489 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo, int *max_vf)
491 unsigned int i;
492 struct data_dependence_relation *ddr;
494 if (dump_enabled_p ())
495 dump_printf_loc (MSG_NOTE, vect_location,
496 "=== vect_analyze_data_ref_dependences ===\n");
498 LOOP_VINFO_DDRS (loop_vinfo)
499 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
500 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
501 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
502 /* We need read-read dependences to compute STMT_VINFO_SAME_ALIGN_REFS. */
503 if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
504 &LOOP_VINFO_DDRS (loop_vinfo),
505 LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
506 return false;
508 /* For epilogues we either have no aliases or alias versioning
509 was applied to original loop. Therefore we may just get max_vf
510 using VF of original loop. */
511 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
512 *max_vf = LOOP_VINFO_ORIG_VECT_FACTOR (loop_vinfo);
513 else
514 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
515 if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
516 return false;
518 return true;
522 /* Function vect_slp_analyze_data_ref_dependence.
524 Return TRUE if there (might) exist a dependence between a memory-reference
525 DRA and a memory-reference DRB. When versioning for alias may check a
526 dependence at run-time, return FALSE. Adjust *MAX_VF according to
527 the data dependence. */
529 static bool
530 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
532 struct data_reference *dra = DDR_A (ddr);
533 struct data_reference *drb = DDR_B (ddr);
535 /* We need to check dependences of statements marked as unvectorizable
536 as well, they still can prohibit vectorization. */
538 /* Independent data accesses. */
539 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
540 return false;
542 if (dra == drb)
543 return false;
545 /* Read-read is OK. */
546 if (DR_IS_READ (dra) && DR_IS_READ (drb))
547 return false;
549 /* If dra and drb are part of the same interleaving chain consider
550 them independent. */
551 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
552 && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
553 == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
554 return false;
556 /* Unknown data dependence. */
557 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
559 if (dump_enabled_p ())
561 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
562 "can't determine dependence between ");
563 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
564 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
565 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
566 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
569 else if (dump_enabled_p ())
571 dump_printf_loc (MSG_NOTE, vect_location,
572 "determined dependence between ");
573 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
574 dump_printf (MSG_NOTE, " and ");
575 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
576 dump_printf (MSG_NOTE, "\n");
579 return true;
583 /* Analyze dependences involved in the transform of SLP NODE. STORES
584 contain the vector of scalar stores of this instance if we are
585 disambiguating the loads. */
587 static bool
588 vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
589 vec<gimple *> stores, gimple *last_store)
591 /* This walks over all stmts involved in the SLP load/store done
592 in NODE verifying we can sink them up to the last stmt in the
593 group. */
594 gimple *last_access = vect_find_last_scalar_stmt_in_slp (node);
595 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
597 gimple *access = SLP_TREE_SCALAR_STMTS (node)[k];
598 if (access == last_access)
599 continue;
600 data_reference *dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (access));
601 for (gimple_stmt_iterator gsi = gsi_for_stmt (access);
602 gsi_stmt (gsi) != last_access; gsi_next (&gsi))
604 gimple *stmt = gsi_stmt (gsi);
605 if (! gimple_vuse (stmt)
606 || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
607 continue;
609 /* If we couldn't record a (single) data reference for this
610 stmt we have to give up. */
611 /* ??? Here and below if dependence analysis fails we can resort
612 to the alias oracle which can handle more kinds of stmts. */
613 data_reference *dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
614 if (!dr_b)
615 return false;
617 bool dependent = false;
618 /* If we run into a store of this same instance (we've just
619 marked those) then delay dependence checking until we run
620 into the last store because this is where it will have
621 been sunk to (and we verify if we can do that as well). */
622 if (gimple_visited_p (stmt))
624 if (stmt != last_store)
625 continue;
626 unsigned i;
627 gimple *store;
628 FOR_EACH_VEC_ELT (stores, i, store)
630 data_reference *store_dr
631 = STMT_VINFO_DATA_REF (vinfo_for_stmt (store));
632 ddr_p ddr = initialize_data_dependence_relation
633 (dr_a, store_dr, vNULL);
634 dependent = vect_slp_analyze_data_ref_dependence (ddr);
635 free_dependence_relation (ddr);
636 if (dependent)
637 break;
640 else
642 ddr_p ddr = initialize_data_dependence_relation (dr_a,
643 dr_b, vNULL);
644 dependent = vect_slp_analyze_data_ref_dependence (ddr);
645 free_dependence_relation (ddr);
647 if (dependent)
648 return false;
651 return true;
655 /* Function vect_analyze_data_ref_dependences.
657 Examine all the data references in the basic-block, and make sure there
658 do not exist any data dependences between them. Set *MAX_VF according to
659 the maximum vectorization factor the data dependences allow. */
661 bool
662 vect_slp_analyze_instance_dependence (slp_instance instance)
664 if (dump_enabled_p ())
665 dump_printf_loc (MSG_NOTE, vect_location,
666 "=== vect_slp_analyze_instance_dependence ===\n");
668 /* The stores of this instance are at the root of the SLP tree. */
669 slp_tree store = SLP_INSTANCE_TREE (instance);
670 if (! STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (store)[0])))
671 store = NULL;
673 /* Verify we can sink stores to the vectorized stmt insert location. */
674 gimple *last_store = NULL;
675 if (store)
677 if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
678 return false;
680 /* Mark stores in this instance and remember the last one. */
681 last_store = vect_find_last_scalar_stmt_in_slp (store);
682 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
683 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], true);
686 bool res = true;
688 /* Verify we can sink loads to the vectorized stmt insert location,
689 special-casing stores of this instance. */
690 slp_tree load;
691 unsigned int i;
692 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
693 if (! vect_slp_analyze_node_dependences (instance, load,
694 store
695 ? SLP_TREE_SCALAR_STMTS (store)
696 : vNULL, last_store))
698 res = false;
699 break;
702 /* Unset the visited flag. */
703 if (store)
704 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
705 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], false);
707 return res;
710 /* Record in VINFO the base alignment guarantee given by DRB. STMT is
711 the statement that contains DRB, which is useful for recording in the
712 dump file. */
714 static void
715 vect_record_base_alignment (vec_info *vinfo, gimple *stmt,
716 innermost_loop_behavior *drb)
718 bool existed;
719 innermost_loop_behavior *&entry
720 = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
721 if (!existed || entry->base_alignment < drb->base_alignment)
723 entry = drb;
724 if (dump_enabled_p ())
726 dump_printf_loc (MSG_NOTE, vect_location,
727 "recording new base alignment for ");
728 dump_generic_expr (MSG_NOTE, TDF_SLIM, drb->base_address);
729 dump_printf (MSG_NOTE, "\n");
730 dump_printf_loc (MSG_NOTE, vect_location,
731 " alignment: %d\n", drb->base_alignment);
732 dump_printf_loc (MSG_NOTE, vect_location,
733 " misalignment: %d\n", drb->base_misalignment);
734 dump_printf_loc (MSG_NOTE, vect_location,
735 " based on: ");
736 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
741 /* If the region we're going to vectorize is reached, all unconditional
742 data references occur at least once. We can therefore pool the base
743 alignment guarantees from each unconditional reference. Do this by
744 going through all the data references in VINFO and checking whether
745 the containing statement makes the reference unconditionally. If so,
746 record the alignment of the base address in VINFO so that it can be
747 used for all other references with the same base. */
749 void
750 vect_record_base_alignments (vec_info *vinfo)
752 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
753 struct loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
754 data_reference *dr;
755 unsigned int i;
756 FOR_EACH_VEC_ELT (vinfo->datarefs, i, dr)
757 if (!DR_IS_CONDITIONAL_IN_STMT (dr))
759 gimple *stmt = DR_STMT (dr);
760 vect_record_base_alignment (vinfo, stmt, &DR_INNERMOST (dr));
762 /* If DR is nested in the loop that is being vectorized, we can also
763 record the alignment of the base wrt the outer loop. */
764 if (loop && nested_in_vect_loop_p (loop, stmt))
766 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
767 vect_record_base_alignment
768 (vinfo, stmt, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
773 /* Function vect_compute_data_ref_alignment
775 Compute the misalignment of the data reference DR.
777 Output:
778 1. If during the misalignment computation it is found that the data reference
779 cannot be vectorized then false is returned.
780 2. DR_MISALIGNMENT (DR) is defined.
782 FOR NOW: No analysis is actually performed. Misalignment is calculated
783 only for trivial cases. TODO. */
785 bool
786 vect_compute_data_ref_alignment (struct data_reference *dr)
788 gimple *stmt = DR_STMT (dr);
789 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
790 vec_base_alignments *base_alignments = &stmt_info->vinfo->base_alignments;
791 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
792 struct loop *loop = NULL;
793 tree ref = DR_REF (dr);
794 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
796 if (dump_enabled_p ())
797 dump_printf_loc (MSG_NOTE, vect_location,
798 "vect_compute_data_ref_alignment:\n");
800 if (loop_vinfo)
801 loop = LOOP_VINFO_LOOP (loop_vinfo);
803 /* Initialize misalignment to unknown. */
804 SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
806 innermost_loop_behavior *drb = vect_dr_behavior (dr);
807 bool step_preserves_misalignment_p;
809 /* No step for BB vectorization. */
810 if (!loop)
812 gcc_assert (integer_zerop (drb->step));
813 step_preserves_misalignment_p = true;
816 /* In case the dataref is in an inner-loop of the loop that is being
817 vectorized (LOOP), we use the base and misalignment information
818 relative to the outer-loop (LOOP). This is ok only if the misalignment
819 stays the same throughout the execution of the inner-loop, which is why
820 we have to check that the stride of the dataref in the inner-loop evenly
821 divides by the vector size. */
822 else if (nested_in_vect_loop_p (loop, stmt))
824 step_preserves_misalignment_p
825 = (DR_STEP_ALIGNMENT (dr)
826 % GET_MODE_SIZE (TYPE_MODE (vectype))) == 0;
828 if (dump_enabled_p ())
830 if (step_preserves_misalignment_p)
831 dump_printf_loc (MSG_NOTE, vect_location,
832 "inner step divides the vector-size.\n");
833 else
834 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
835 "inner step doesn't divide the vector-size.\n");
839 /* Similarly we can only use base and misalignment information relative to
840 an innermost loop if the misalignment stays the same throughout the
841 execution of the loop. As above, this is the case if the stride of
842 the dataref evenly divides by the vector size. */
843 else
845 unsigned vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
846 step_preserves_misalignment_p
847 = ((DR_STEP_ALIGNMENT (dr) * vf)
848 % GET_MODE_SIZE (TYPE_MODE (vectype))) == 0;
850 if (!step_preserves_misalignment_p && dump_enabled_p ())
851 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
852 "step doesn't divide the vector-size.\n");
855 unsigned int base_alignment = drb->base_alignment;
856 unsigned int base_misalignment = drb->base_misalignment;
857 unsigned HOST_WIDE_INT vector_alignment = TYPE_ALIGN_UNIT (vectype);
859 /* Calculate the maximum of the pooled base address alignment and the
860 alignment that we can compute for DR itself. */
861 innermost_loop_behavior **entry = base_alignments->get (drb->base_address);
862 if (entry && base_alignment < (*entry)->base_alignment)
864 base_alignment = (*entry)->base_alignment;
865 base_misalignment = (*entry)->base_misalignment;
868 if (drb->offset_alignment < vector_alignment
869 || !step_preserves_misalignment_p
870 /* We need to know whether the step wrt the vectorized loop is
871 negative when computing the starting misalignment below. */
872 || TREE_CODE (drb->step) != INTEGER_CST)
874 if (dump_enabled_p ())
876 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
877 "Unknown alignment for access: ");
878 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
879 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
881 return true;
884 if (base_alignment < vector_alignment)
886 tree base = drb->base_address;
887 if (TREE_CODE (base) == ADDR_EXPR)
888 base = TREE_OPERAND (base, 0);
889 if (!vect_can_force_dr_alignment_p (base,
890 vector_alignment * BITS_PER_UNIT))
892 if (dump_enabled_p ())
894 dump_printf_loc (MSG_NOTE, vect_location,
895 "can't force alignment of ref: ");
896 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
897 dump_printf (MSG_NOTE, "\n");
899 return true;
902 if (DECL_USER_ALIGN (base))
904 if (dump_enabled_p ())
906 dump_printf_loc (MSG_NOTE, vect_location,
907 "not forcing alignment of user-aligned "
908 "variable: ");
909 dump_generic_expr (MSG_NOTE, TDF_SLIM, base);
910 dump_printf (MSG_NOTE, "\n");
912 return true;
915 /* Force the alignment of the decl.
916 NOTE: This is the only change to the code we make during
917 the analysis phase, before deciding to vectorize the loop. */
918 if (dump_enabled_p ())
920 dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
921 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
922 dump_printf (MSG_NOTE, "\n");
925 DR_VECT_AUX (dr)->base_decl = base;
926 DR_VECT_AUX (dr)->base_misaligned = true;
927 base_misalignment = 0;
929 unsigned int misalignment = (base_misalignment
930 + TREE_INT_CST_LOW (drb->init));
932 /* If this is a backward running DR then first access in the larger
933 vectype actually is N-1 elements before the address in the DR.
934 Adjust misalign accordingly. */
935 if (tree_int_cst_sgn (drb->step) < 0)
936 /* PLUS because STEP is negative. */
937 misalignment += ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
938 * TREE_INT_CST_LOW (drb->step));
940 SET_DR_MISALIGNMENT (dr, misalignment & (vector_alignment - 1));
942 if (dump_enabled_p ())
944 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
945 "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
946 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
947 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
950 return true;
954 /* Function vect_update_misalignment_for_peel.
955 Sets DR's misalignment
956 - to 0 if it has the same alignment as DR_PEEL,
957 - to the misalignment computed using NPEEL if DR's salignment is known,
958 - to -1 (unknown) otherwise.
960 DR - the data reference whose misalignment is to be adjusted.
961 DR_PEEL - the data reference whose misalignment is being made
962 zero in the vector loop by the peel.
963 NPEEL - the number of iterations in the peel loop if the misalignment
964 of DR_PEEL is known at compile time. */
966 static void
967 vect_update_misalignment_for_peel (struct data_reference *dr,
968 struct data_reference *dr_peel, int npeel)
970 unsigned int i;
971 vec<dr_p> same_aligned_drs;
972 struct data_reference *current_dr;
973 int dr_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
974 int dr_peel_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr_peel))));
975 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
976 stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));
978 /* For interleaved data accesses the step in the loop must be multiplied by
979 the size of the interleaving group. */
980 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
981 dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
982 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
983 dr_peel_size *= GROUP_SIZE (peel_stmt_info);
985 /* It can be assumed that the data refs with the same alignment as dr_peel
986 are aligned in the vector loop. */
987 same_aligned_drs
988 = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
989 FOR_EACH_VEC_ELT (same_aligned_drs, i, current_dr)
991 if (current_dr != dr)
992 continue;
993 gcc_assert (!known_alignment_for_access_p (dr)
994 || !known_alignment_for_access_p (dr_peel)
995 || (DR_MISALIGNMENT (dr) / dr_size
996 == DR_MISALIGNMENT (dr_peel) / dr_peel_size));
997 SET_DR_MISALIGNMENT (dr, 0);
998 return;
1001 if (known_alignment_for_access_p (dr)
1002 && known_alignment_for_access_p (dr_peel))
1004 bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
1005 int misal = DR_MISALIGNMENT (dr);
1006 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1007 misal += negative ? -npeel * dr_size : npeel * dr_size;
1008 misal &= (TYPE_ALIGN (vectype) / BITS_PER_UNIT) - 1;
1009 SET_DR_MISALIGNMENT (dr, misal);
1010 return;
1013 if (dump_enabled_p ())
1014 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1015 "to unknown (-1).\n");
1016 SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
1020 /* Function verify_data_ref_alignment
1022 Return TRUE if DR can be handled with respect to alignment. */
1024 static bool
1025 verify_data_ref_alignment (data_reference_p dr)
1027 enum dr_alignment_support supportable_dr_alignment
1028 = vect_supportable_dr_alignment (dr, false);
1029 if (!supportable_dr_alignment)
1031 if (dump_enabled_p ())
1033 if (DR_IS_READ (dr))
1034 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1035 "not vectorized: unsupported unaligned load.");
1036 else
1037 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1038 "not vectorized: unsupported unaligned "
1039 "store.");
1041 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
1042 DR_REF (dr));
1043 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1045 return false;
1048 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
1049 dump_printf_loc (MSG_NOTE, vect_location,
1050 "Vectorizing an unaligned access.\n");
1052 return true;
1055 /* Function vect_verify_datarefs_alignment
1057 Return TRUE if all data references in the loop can be
1058 handled with respect to alignment. */
1060 bool
1061 vect_verify_datarefs_alignment (loop_vec_info vinfo)
1063 vec<data_reference_p> datarefs = vinfo->datarefs;
1064 struct data_reference *dr;
1065 unsigned int i;
1067 FOR_EACH_VEC_ELT (datarefs, i, dr)
1069 gimple *stmt = DR_STMT (dr);
1070 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1072 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1073 continue;
1075 /* For interleaving, only the alignment of the first access matters. */
1076 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1077 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1078 continue;
1080 /* Strided accesses perform only component accesses, alignment is
1081 irrelevant for them. */
1082 if (STMT_VINFO_STRIDED_P (stmt_info)
1083 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1084 continue;
1086 if (! verify_data_ref_alignment (dr))
1087 return false;
1090 return true;
1093 /* Given an memory reference EXP return whether its alignment is less
1094 than its size. */
1096 static bool
1097 not_size_aligned (tree exp)
1099 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1100 return true;
1102 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1103 > get_object_alignment (exp));
1106 /* Function vector_alignment_reachable_p
1108 Return true if vector alignment for DR is reachable by peeling
1109 a few loop iterations. Return false otherwise. */
1111 static bool
1112 vector_alignment_reachable_p (struct data_reference *dr)
1114 gimple *stmt = DR_STMT (dr);
1115 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1116 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1118 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1120 /* For interleaved access we peel only if number of iterations in
1121 the prolog loop ({VF - misalignment}), is a multiple of the
1122 number of the interleaved accesses. */
1123 int elem_size, mis_in_elements;
1124 int nelements = TYPE_VECTOR_SUBPARTS (vectype);
1126 /* FORNOW: handle only known alignment. */
1127 if (!known_alignment_for_access_p (dr))
1128 return false;
1130 elem_size = GET_MODE_SIZE (TYPE_MODE (vectype)) / nelements;
1131 mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
1133 if ((nelements - mis_in_elements) % GROUP_SIZE (stmt_info))
1134 return false;
1137 /* If misalignment is known at the compile time then allow peeling
1138 only if natural alignment is reachable through peeling. */
1139 if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
1141 HOST_WIDE_INT elmsize =
1142 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1143 if (dump_enabled_p ())
1145 dump_printf_loc (MSG_NOTE, vect_location,
1146 "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1147 dump_printf (MSG_NOTE,
1148 ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
1150 if (DR_MISALIGNMENT (dr) % elmsize)
1152 if (dump_enabled_p ())
1153 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1154 "data size does not divide the misalignment.\n");
1155 return false;
1159 if (!known_alignment_for_access_p (dr))
1161 tree type = TREE_TYPE (DR_REF (dr));
1162 bool is_packed = not_size_aligned (DR_REF (dr));
1163 if (dump_enabled_p ())
1164 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1165 "Unknown misalignment, %snaturally aligned\n",
1166 is_packed ? "not " : "");
1167 return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1170 return true;
1174 /* Calculate the cost of the memory access represented by DR. */
1176 static void
1177 vect_get_data_access_cost (struct data_reference *dr,
1178 unsigned int *inside_cost,
1179 unsigned int *outside_cost,
1180 stmt_vector_for_cost *body_cost_vec)
1182 gimple *stmt = DR_STMT (dr);
1183 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1184 int nunits = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
1185 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1186 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1187 int ncopies = MAX (1, vf / nunits); /* TODO: Handle SLP properly */
1189 if (DR_IS_READ (dr))
1190 vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1191 NULL, body_cost_vec, false);
1192 else
1193 vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
1195 if (dump_enabled_p ())
1196 dump_printf_loc (MSG_NOTE, vect_location,
1197 "vect_get_data_access_cost: inside_cost = %d, "
1198 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1202 typedef struct _vect_peel_info
1204 struct data_reference *dr;
1205 int npeel;
1206 unsigned int count;
1207 } *vect_peel_info;
1209 typedef struct _vect_peel_extended_info
1211 struct _vect_peel_info peel_info;
1212 unsigned int inside_cost;
1213 unsigned int outside_cost;
1214 } *vect_peel_extended_info;
1217 /* Peeling hashtable helpers. */
1219 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1221 static inline hashval_t hash (const _vect_peel_info *);
1222 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1225 inline hashval_t
1226 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1228 return (hashval_t) peel_info->npeel;
1231 inline bool
1232 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1234 return (a->npeel == b->npeel);
1238 /* Insert DR into peeling hash table with NPEEL as key. */
1240 static void
1241 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1242 loop_vec_info loop_vinfo, struct data_reference *dr,
1243 int npeel)
1245 struct _vect_peel_info elem, *slot;
1246 _vect_peel_info **new_slot;
1247 bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1249 elem.npeel = npeel;
1250 slot = peeling_htab->find (&elem);
1251 if (slot)
1252 slot->count++;
1253 else
1255 slot = XNEW (struct _vect_peel_info);
1256 slot->npeel = npeel;
1257 slot->dr = dr;
1258 slot->count = 1;
1259 new_slot = peeling_htab->find_slot (slot, INSERT);
1260 *new_slot = slot;
1263 if (!supportable_dr_alignment
1264 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1265 slot->count += VECT_MAX_COST;
1269 /* Traverse peeling hash table to find peeling option that aligns maximum
1270 number of data accesses. */
1273 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1274 _vect_peel_extended_info *max)
1276 vect_peel_info elem = *slot;
1278 if (elem->count > max->peel_info.count
1279 || (elem->count == max->peel_info.count
1280 && max->peel_info.npeel > elem->npeel))
1282 max->peel_info.npeel = elem->npeel;
1283 max->peel_info.count = elem->count;
1284 max->peel_info.dr = elem->dr;
1287 return 1;
1290 /* Get the costs of peeling NPEEL iterations checking data access costs
1291 for all data refs. If UNKNOWN_MISALIGNMENT is true, we assume DR0's
1292 misalignment will be zero after peeling. */
1294 static void
1295 vect_get_peeling_costs_all_drs (vec<data_reference_p> datarefs,
1296 struct data_reference *dr0,
1297 unsigned int *inside_cost,
1298 unsigned int *outside_cost,
1299 stmt_vector_for_cost *body_cost_vec,
1300 unsigned int npeel,
1301 bool unknown_misalignment)
1303 unsigned i;
1304 data_reference *dr;
1306 FOR_EACH_VEC_ELT (datarefs, i, dr)
1308 gimple *stmt = DR_STMT (dr);
1309 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1310 /* For interleaving, only the alignment of the first access
1311 matters. */
1312 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1313 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1314 continue;
1316 /* Strided accesses perform only component accesses, alignment is
1317 irrelevant for them. */
1318 if (STMT_VINFO_STRIDED_P (stmt_info)
1319 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1320 continue;
1322 int save_misalignment;
1323 save_misalignment = DR_MISALIGNMENT (dr);
1324 if (npeel == 0)
1326 else if (unknown_misalignment && dr == dr0)
1327 SET_DR_MISALIGNMENT (dr, 0);
1328 else
1329 vect_update_misalignment_for_peel (dr, dr0, npeel);
1330 vect_get_data_access_cost (dr, inside_cost, outside_cost,
1331 body_cost_vec);
1332 SET_DR_MISALIGNMENT (dr, save_misalignment);
1336 /* Traverse peeling hash table and calculate cost for each peeling option.
1337 Find the one with the lowest cost. */
1340 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1341 _vect_peel_extended_info *min)
1343 vect_peel_info elem = *slot;
1344 int dummy;
1345 unsigned int inside_cost = 0, outside_cost = 0;
1346 gimple *stmt = DR_STMT (elem->dr);
1347 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1348 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1349 stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1350 epilogue_cost_vec;
1352 prologue_cost_vec.create (2);
1353 body_cost_vec.create (2);
1354 epilogue_cost_vec.create (2);
1356 vect_get_peeling_costs_all_drs (LOOP_VINFO_DATAREFS (loop_vinfo),
1357 elem->dr, &inside_cost, &outside_cost,
1358 &body_cost_vec, elem->npeel, false);
1360 body_cost_vec.release ();
1362 outside_cost += vect_get_known_peeling_cost
1363 (loop_vinfo, elem->npeel, &dummy,
1364 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1365 &prologue_cost_vec, &epilogue_cost_vec);
1367 /* Prologue and epilogue costs are added to the target model later.
1368 These costs depend only on the scalar iteration cost, the
1369 number of peeling iterations finally chosen, and the number of
1370 misaligned statements. So discard the information found here. */
1371 prologue_cost_vec.release ();
1372 epilogue_cost_vec.release ();
1374 if (inside_cost < min->inside_cost
1375 || (inside_cost == min->inside_cost
1376 && outside_cost < min->outside_cost))
1378 min->inside_cost = inside_cost;
1379 min->outside_cost = outside_cost;
1380 min->peel_info.dr = elem->dr;
1381 min->peel_info.npeel = elem->npeel;
1382 min->peel_info.count = elem->count;
1385 return 1;
1389 /* Choose best peeling option by traversing peeling hash table and either
1390 choosing an option with the lowest cost (if cost model is enabled) or the
1391 option that aligns as many accesses as possible. */
1393 static struct _vect_peel_extended_info
1394 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1395 loop_vec_info loop_vinfo)
1397 struct _vect_peel_extended_info res;
1399 res.peel_info.dr = NULL;
1401 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1403 res.inside_cost = INT_MAX;
1404 res.outside_cost = INT_MAX;
1405 peeling_htab->traverse <_vect_peel_extended_info *,
1406 vect_peeling_hash_get_lowest_cost> (&res);
1408 else
1410 res.peel_info.count = 0;
1411 peeling_htab->traverse <_vect_peel_extended_info *,
1412 vect_peeling_hash_get_most_frequent> (&res);
1413 res.inside_cost = 0;
1414 res.outside_cost = 0;
1417 return res;
1420 /* Return true if the new peeling NPEEL is supported. */
1422 static bool
1423 vect_peeling_supportable (loop_vec_info loop_vinfo, struct data_reference *dr0,
1424 unsigned npeel)
1426 unsigned i;
1427 struct data_reference *dr = NULL;
1428 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1429 gimple *stmt;
1430 stmt_vec_info stmt_info;
1431 enum dr_alignment_support supportable_dr_alignment;
1433 /* Ensure that all data refs can be vectorized after the peel. */
1434 FOR_EACH_VEC_ELT (datarefs, i, dr)
1436 int save_misalignment;
1438 if (dr == dr0)
1439 continue;
1441 stmt = DR_STMT (dr);
1442 stmt_info = vinfo_for_stmt (stmt);
1443 /* For interleaving, only the alignment of the first access
1444 matters. */
1445 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1446 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1447 continue;
1449 /* Strided accesses perform only component accesses, alignment is
1450 irrelevant for them. */
1451 if (STMT_VINFO_STRIDED_P (stmt_info)
1452 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1453 continue;
1455 save_misalignment = DR_MISALIGNMENT (dr);
1456 vect_update_misalignment_for_peel (dr, dr0, npeel);
1457 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1458 SET_DR_MISALIGNMENT (dr, save_misalignment);
1460 if (!supportable_dr_alignment)
1461 return false;
1464 return true;
1467 /* Function vect_enhance_data_refs_alignment
1469 This pass will use loop versioning and loop peeling in order to enhance
1470 the alignment of data references in the loop.
1472 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1473 original loop is to be vectorized. Any other loops that are created by
1474 the transformations performed in this pass - are not supposed to be
1475 vectorized. This restriction will be relaxed.
1477 This pass will require a cost model to guide it whether to apply peeling
1478 or versioning or a combination of the two. For example, the scheme that
1479 intel uses when given a loop with several memory accesses, is as follows:
1480 choose one memory access ('p') which alignment you want to force by doing
1481 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1482 other accesses are not necessarily aligned, or (2) use loop versioning to
1483 generate one loop in which all accesses are aligned, and another loop in
1484 which only 'p' is necessarily aligned.
1486 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1487 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Ximmin Tian, International
1488 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1490 Devising a cost model is the most critical aspect of this work. It will
1491 guide us on which access to peel for, whether to use loop versioning, how
1492 many versions to create, etc. The cost model will probably consist of
1493 generic considerations as well as target specific considerations (on
1494 powerpc for example, misaligned stores are more painful than misaligned
1495 loads).
1497 Here are the general steps involved in alignment enhancements:
1499 -- original loop, before alignment analysis:
1500 for (i=0; i<N; i++){
1501 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1502 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1505 -- After vect_compute_data_refs_alignment:
1506 for (i=0; i<N; i++){
1507 x = q[i]; # DR_MISALIGNMENT(q) = 3
1508 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1511 -- Possibility 1: we do loop versioning:
1512 if (p is aligned) {
1513 for (i=0; i<N; i++){ # loop 1A
1514 x = q[i]; # DR_MISALIGNMENT(q) = 3
1515 p[i] = y; # DR_MISALIGNMENT(p) = 0
1518 else {
1519 for (i=0; i<N; i++){ # loop 1B
1520 x = q[i]; # DR_MISALIGNMENT(q) = 3
1521 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1525 -- Possibility 2: we do loop peeling:
1526 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1527 x = q[i];
1528 p[i] = y;
1530 for (i = 3; i < N; i++){ # loop 2A
1531 x = q[i]; # DR_MISALIGNMENT(q) = 0
1532 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1535 -- Possibility 3: combination of loop peeling and versioning:
1536 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1537 x = q[i];
1538 p[i] = y;
1540 if (p is aligned) {
1541 for (i = 3; i<N; i++){ # loop 3A
1542 x = q[i]; # DR_MISALIGNMENT(q) = 0
1543 p[i] = y; # DR_MISALIGNMENT(p) = 0
1546 else {
1547 for (i = 3; i<N; i++){ # loop 3B
1548 x = q[i]; # DR_MISALIGNMENT(q) = 0
1549 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1553 These loops are later passed to loop_transform to be vectorized. The
1554 vectorizer will use the alignment information to guide the transformation
1555 (whether to generate regular loads/stores, or with special handling for
1556 misalignment). */
1558 bool
1559 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1561 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1562 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1563 enum dr_alignment_support supportable_dr_alignment;
1564 struct data_reference *dr0 = NULL, *first_store = NULL;
1565 struct data_reference *dr;
1566 unsigned int i, j;
1567 bool do_peeling = false;
1568 bool do_versioning = false;
1569 bool stat;
1570 gimple *stmt;
1571 stmt_vec_info stmt_info;
1572 unsigned int npeel = 0;
1573 bool one_misalignment_known = false;
1574 bool one_misalignment_unknown = false;
1575 bool one_dr_unsupportable = false;
1576 struct data_reference *unsupportable_dr = NULL;
1577 unsigned int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1578 unsigned possible_npeel_number = 1;
1579 tree vectype;
1580 unsigned int nelements, mis, same_align_drs_max = 0;
1581 hash_table<peel_info_hasher> peeling_htab (1);
1583 if (dump_enabled_p ())
1584 dump_printf_loc (MSG_NOTE, vect_location,
1585 "=== vect_enhance_data_refs_alignment ===\n");
1587 /* Reset data so we can safely be called multiple times. */
1588 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1589 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1591 /* While cost model enhancements are expected in the future, the high level
1592 view of the code at this time is as follows:
1594 A) If there is a misaligned access then see if peeling to align
1595 this access can make all data references satisfy
1596 vect_supportable_dr_alignment. If so, update data structures
1597 as needed and return true.
1599 B) If peeling wasn't possible and there is a data reference with an
1600 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1601 then see if loop versioning checks can be used to make all data
1602 references satisfy vect_supportable_dr_alignment. If so, update
1603 data structures as needed and return true.
1605 C) If neither peeling nor versioning were successful then return false if
1606 any data reference does not satisfy vect_supportable_dr_alignment.
1608 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1610 Note, Possibility 3 above (which is peeling and versioning together) is not
1611 being done at this time. */
1613 /* (1) Peeling to force alignment. */
1615 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1616 Considerations:
1617 + How many accesses will become aligned due to the peeling
1618 - How many accesses will become unaligned due to the peeling,
1619 and the cost of misaligned accesses.
1620 - The cost of peeling (the extra runtime checks, the increase
1621 in code size). */
1623 FOR_EACH_VEC_ELT (datarefs, i, dr)
1625 stmt = DR_STMT (dr);
1626 stmt_info = vinfo_for_stmt (stmt);
1628 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1629 continue;
1631 /* For interleaving, only the alignment of the first access
1632 matters. */
1633 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1634 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1635 continue;
1637 /* For invariant accesses there is nothing to enhance. */
1638 if (integer_zerop (DR_STEP (dr)))
1639 continue;
1641 /* Strided accesses perform only component accesses, alignment is
1642 irrelevant for them. */
1643 if (STMT_VINFO_STRIDED_P (stmt_info)
1644 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1645 continue;
1647 supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1648 do_peeling = vector_alignment_reachable_p (dr);
1649 if (do_peeling)
1651 if (known_alignment_for_access_p (dr))
1653 unsigned int npeel_tmp = 0;
1654 bool negative = tree_int_cst_compare (DR_STEP (dr),
1655 size_zero_node) < 0;
1657 vectype = STMT_VINFO_VECTYPE (stmt_info);
1658 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1659 mis = DR_MISALIGNMENT (dr) / GET_MODE_SIZE (TYPE_MODE (
1660 TREE_TYPE (DR_REF (dr))));
1661 if (DR_MISALIGNMENT (dr) != 0)
1662 npeel_tmp = (negative ? (mis - nelements)
1663 : (nelements - mis)) & (nelements - 1);
1665 /* For multiple types, it is possible that the bigger type access
1666 will have more than one peeling option. E.g., a loop with two
1667 types: one of size (vector size / 4), and the other one of
1668 size (vector size / 8). Vectorization factor will 8. If both
1669 accesses are misaligned by 3, the first one needs one scalar
1670 iteration to be aligned, and the second one needs 5. But the
1671 first one will be aligned also by peeling 5 scalar
1672 iterations, and in that case both accesses will be aligned.
1673 Hence, except for the immediate peeling amount, we also want
1674 to try to add full vector size, while we don't exceed
1675 vectorization factor.
1676 We do this automatically for cost model, since we calculate
1677 cost for every peeling option. */
1678 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1680 if (STMT_SLP_TYPE (stmt_info))
1681 possible_npeel_number
1682 = (vf * GROUP_SIZE (stmt_info)) / nelements;
1683 else
1684 possible_npeel_number = vf / nelements;
1686 /* NPEEL_TMP is 0 when there is no misalignment, but also
1687 allow peeling NELEMENTS. */
1688 if (DR_MISALIGNMENT (dr) == 0)
1689 possible_npeel_number++;
1692 /* Save info about DR in the hash table. Also include peeling
1693 amounts according to the explanation above. */
1694 for (j = 0; j < possible_npeel_number; j++)
1696 vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1697 dr, npeel_tmp);
1698 npeel_tmp += nelements;
1701 one_misalignment_known = true;
1703 else
1705 /* If we don't know any misalignment values, we prefer
1706 peeling for data-ref that has the maximum number of data-refs
1707 with the same alignment, unless the target prefers to align
1708 stores over load. */
1709 unsigned same_align_drs
1710 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1711 if (!dr0
1712 || same_align_drs_max < same_align_drs)
1714 same_align_drs_max = same_align_drs;
1715 dr0 = dr;
1717 /* For data-refs with the same number of related
1718 accesses prefer the one where the misalign
1719 computation will be invariant in the outermost loop. */
1720 else if (same_align_drs_max == same_align_drs)
1722 struct loop *ivloop0, *ivloop;
1723 ivloop0 = outermost_invariant_loop_for_expr
1724 (loop, DR_BASE_ADDRESS (dr0));
1725 ivloop = outermost_invariant_loop_for_expr
1726 (loop, DR_BASE_ADDRESS (dr));
1727 if ((ivloop && !ivloop0)
1728 || (ivloop && ivloop0
1729 && flow_loop_nested_p (ivloop, ivloop0)))
1730 dr0 = dr;
1733 one_misalignment_unknown = true;
1735 /* Check for data refs with unsupportable alignment that
1736 can be peeled. */
1737 if (!supportable_dr_alignment)
1739 one_dr_unsupportable = true;
1740 unsupportable_dr = dr;
1743 if (!first_store && DR_IS_WRITE (dr))
1744 first_store = dr;
1747 else
1749 if (!aligned_access_p (dr))
1751 if (dump_enabled_p ())
1752 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1753 "vector alignment may not be reachable\n");
1754 break;
1759 /* Check if we can possibly peel the loop. */
1760 if (!vect_can_advance_ivs_p (loop_vinfo)
1761 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1762 || loop->inner)
1763 do_peeling = false;
1765 struct _vect_peel_extended_info peel_for_known_alignment;
1766 struct _vect_peel_extended_info peel_for_unknown_alignment;
1767 struct _vect_peel_extended_info best_peel;
1769 peel_for_unknown_alignment.inside_cost = INT_MAX;
1770 peel_for_unknown_alignment.outside_cost = INT_MAX;
1771 peel_for_unknown_alignment.peel_info.count = 0;
1773 if (do_peeling
1774 && one_misalignment_unknown)
1776 /* Check if the target requires to prefer stores over loads, i.e., if
1777 misaligned stores are more expensive than misaligned loads (taking
1778 drs with same alignment into account). */
1779 unsigned int load_inside_cost = 0;
1780 unsigned int load_outside_cost = 0;
1781 unsigned int store_inside_cost = 0;
1782 unsigned int store_outside_cost = 0;
1784 stmt_vector_for_cost dummy;
1785 dummy.create (2);
1786 vect_get_peeling_costs_all_drs (datarefs, dr0,
1787 &load_inside_cost,
1788 &load_outside_cost,
1789 &dummy, vf / 2, true);
1790 dummy.release ();
1792 if (first_store)
1794 dummy.create (2);
1795 vect_get_peeling_costs_all_drs (datarefs, first_store,
1796 &store_inside_cost,
1797 &store_outside_cost,
1798 &dummy, vf / 2, true);
1799 dummy.release ();
1801 else
1803 store_inside_cost = INT_MAX;
1804 store_outside_cost = INT_MAX;
1807 if (load_inside_cost > store_inside_cost
1808 || (load_inside_cost == store_inside_cost
1809 && load_outside_cost > store_outside_cost))
1811 dr0 = first_store;
1812 peel_for_unknown_alignment.inside_cost = store_inside_cost;
1813 peel_for_unknown_alignment.outside_cost = store_outside_cost;
1815 else
1817 peel_for_unknown_alignment.inside_cost = load_inside_cost;
1818 peel_for_unknown_alignment.outside_cost = load_outside_cost;
1821 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1822 prologue_cost_vec.create (2);
1823 epilogue_cost_vec.create (2);
1825 int dummy2;
1826 peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
1827 (loop_vinfo, vf / 2, &dummy2,
1828 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1829 &prologue_cost_vec, &epilogue_cost_vec);
1831 prologue_cost_vec.release ();
1832 epilogue_cost_vec.release ();
1834 peel_for_unknown_alignment.peel_info.count = 1
1835 + STMT_VINFO_SAME_ALIGN_REFS
1836 (vinfo_for_stmt (DR_STMT (dr0))).length ();
1839 peel_for_unknown_alignment.peel_info.npeel = 0;
1840 peel_for_unknown_alignment.peel_info.dr = dr0;
1842 best_peel = peel_for_unknown_alignment;
1844 peel_for_known_alignment.inside_cost = INT_MAX;
1845 peel_for_known_alignment.outside_cost = INT_MAX;
1846 peel_for_known_alignment.peel_info.count = 0;
1847 peel_for_known_alignment.peel_info.dr = NULL;
1849 if (do_peeling && one_misalignment_known)
1851 /* Peeling is possible, but there is no data access that is not supported
1852 unless aligned. So we try to choose the best possible peeling from
1853 the hash table. */
1854 peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
1855 (&peeling_htab, loop_vinfo);
1858 /* Compare costs of peeling for known and unknown alignment. */
1859 if (peel_for_known_alignment.peel_info.dr != NULL
1860 && peel_for_unknown_alignment.inside_cost
1861 >= peel_for_known_alignment.inside_cost)
1863 best_peel = peel_for_known_alignment;
1865 /* If the best peeling for known alignment has NPEEL == 0, perform no
1866 peeling at all except if there is an unsupportable dr that we can
1867 align. */
1868 if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
1869 do_peeling = false;
1872 /* If there is an unsupportable data ref, prefer this over all choices so far
1873 since we'd have to discard a chosen peeling except when it accidentally
1874 aligned the unsupportable data ref. */
1875 if (one_dr_unsupportable)
1876 dr0 = unsupportable_dr;
1877 else if (do_peeling)
1879 /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
1880 TODO: Use nopeel_outside_cost or get rid of it? */
1881 unsigned nopeel_inside_cost = 0;
1882 unsigned nopeel_outside_cost = 0;
1884 stmt_vector_for_cost dummy;
1885 dummy.create (2);
1886 vect_get_peeling_costs_all_drs (datarefs, NULL, &nopeel_inside_cost,
1887 &nopeel_outside_cost, &dummy, 0, false);
1888 dummy.release ();
1890 /* Add epilogue costs. As we do not peel for alignment here, no prologue
1891 costs will be recorded. */
1892 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1893 prologue_cost_vec.create (2);
1894 epilogue_cost_vec.create (2);
1896 int dummy2;
1897 nopeel_outside_cost += vect_get_known_peeling_cost
1898 (loop_vinfo, 0, &dummy2,
1899 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1900 &prologue_cost_vec, &epilogue_cost_vec);
1902 prologue_cost_vec.release ();
1903 epilogue_cost_vec.release ();
1905 npeel = best_peel.peel_info.npeel;
1906 dr0 = best_peel.peel_info.dr;
1908 /* If no peeling is not more expensive than the best peeling we
1909 have so far, don't perform any peeling. */
1910 if (nopeel_inside_cost <= best_peel.inside_cost)
1911 do_peeling = false;
1914 if (do_peeling)
1916 stmt = DR_STMT (dr0);
1917 stmt_info = vinfo_for_stmt (stmt);
1918 vectype = STMT_VINFO_VECTYPE (stmt_info);
1919 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1921 if (known_alignment_for_access_p (dr0))
1923 bool negative = tree_int_cst_compare (DR_STEP (dr0),
1924 size_zero_node) < 0;
1925 if (!npeel)
1927 /* Since it's known at compile time, compute the number of
1928 iterations in the peeled loop (the peeling factor) for use in
1929 updating DR_MISALIGNMENT values. The peeling factor is the
1930 vectorization factor minus the misalignment as an element
1931 count. */
1932 mis = DR_MISALIGNMENT (dr0);
1933 mis /= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr0))));
1934 npeel = ((negative ? mis - nelements : nelements - mis)
1935 & (nelements - 1));
1938 /* For interleaved data access every iteration accesses all the
1939 members of the group, therefore we divide the number of iterations
1940 by the group size. */
1941 stmt_info = vinfo_for_stmt (DR_STMT (dr0));
1942 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1943 npeel /= GROUP_SIZE (stmt_info);
1945 if (dump_enabled_p ())
1946 dump_printf_loc (MSG_NOTE, vect_location,
1947 "Try peeling by %d\n", npeel);
1950 /* Ensure that all datarefs can be vectorized after the peel. */
1951 if (!vect_peeling_supportable (loop_vinfo, dr0, npeel))
1952 do_peeling = false;
1954 /* Check if all datarefs are supportable and log. */
1955 if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
1957 stat = vect_verify_datarefs_alignment (loop_vinfo);
1958 if (!stat)
1959 do_peeling = false;
1960 else
1961 return stat;
1964 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
1965 if (do_peeling)
1967 unsigned max_allowed_peel
1968 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
1969 if (max_allowed_peel != (unsigned)-1)
1971 unsigned max_peel = npeel;
1972 if (max_peel == 0)
1974 gimple *dr_stmt = DR_STMT (dr0);
1975 stmt_vec_info vinfo = vinfo_for_stmt (dr_stmt);
1976 tree vtype = STMT_VINFO_VECTYPE (vinfo);
1977 max_peel = TYPE_VECTOR_SUBPARTS (vtype) - 1;
1979 if (max_peel > max_allowed_peel)
1981 do_peeling = false;
1982 if (dump_enabled_p ())
1983 dump_printf_loc (MSG_NOTE, vect_location,
1984 "Disable peeling, max peels reached: %d\n", max_peel);
1989 /* Cost model #2 - if peeling may result in a remaining loop not
1990 iterating enough to be vectorized then do not peel. */
1991 if (do_peeling
1992 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1994 unsigned max_peel
1995 = npeel == 0 ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1 : npeel;
1996 if (LOOP_VINFO_INT_NITERS (loop_vinfo)
1997 < LOOP_VINFO_VECT_FACTOR (loop_vinfo) + max_peel)
1998 do_peeling = false;
2001 if (do_peeling)
2003 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2004 If the misalignment of DR_i is identical to that of dr0 then set
2005 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
2006 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2007 by the peeling factor times the element size of DR_i (MOD the
2008 vectorization factor times the size). Otherwise, the
2009 misalignment of DR_i must be set to unknown. */
2010 FOR_EACH_VEC_ELT (datarefs, i, dr)
2011 if (dr != dr0)
2013 /* Strided accesses perform only component accesses, alignment
2014 is irrelevant for them. */
2015 stmt_info = vinfo_for_stmt (DR_STMT (dr));
2016 if (STMT_VINFO_STRIDED_P (stmt_info)
2017 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2018 continue;
2020 vect_update_misalignment_for_peel (dr, dr0, npeel);
2023 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
2024 if (npeel)
2025 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2026 else
2027 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2028 = DR_MISALIGNMENT (dr0);
2029 SET_DR_MISALIGNMENT (dr0, 0);
2030 if (dump_enabled_p ())
2032 dump_printf_loc (MSG_NOTE, vect_location,
2033 "Alignment of access forced using peeling.\n");
2034 dump_printf_loc (MSG_NOTE, vect_location,
2035 "Peeling for alignment will be applied.\n");
2038 /* The inside-loop cost will be accounted for in vectorizable_load
2039 and vectorizable_store correctly with adjusted alignments.
2040 Drop the body_cst_vec on the floor here. */
2041 stat = vect_verify_datarefs_alignment (loop_vinfo);
2042 gcc_assert (stat);
2043 return stat;
2047 /* (2) Versioning to force alignment. */
2049 /* Try versioning if:
2050 1) optimize loop for speed
2051 2) there is at least one unsupported misaligned data ref with an unknown
2052 misalignment, and
2053 3) all misaligned data refs with a known misalignment are supported, and
2054 4) the number of runtime alignment checks is within reason. */
2056 do_versioning =
2057 optimize_loop_nest_for_speed_p (loop)
2058 && (!loop->inner); /* FORNOW */
2060 if (do_versioning)
2062 FOR_EACH_VEC_ELT (datarefs, i, dr)
2064 stmt = DR_STMT (dr);
2065 stmt_info = vinfo_for_stmt (stmt);
2067 /* For interleaving, only the alignment of the first access
2068 matters. */
2069 if (aligned_access_p (dr)
2070 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2071 && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
2072 continue;
2074 if (STMT_VINFO_STRIDED_P (stmt_info))
2076 /* Strided loads perform only component accesses, alignment is
2077 irrelevant for them. */
2078 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
2079 continue;
2080 do_versioning = false;
2081 break;
2084 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
2086 if (!supportable_dr_alignment)
2088 gimple *stmt;
2089 int mask;
2090 tree vectype;
2092 if (known_alignment_for_access_p (dr)
2093 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2094 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
2096 do_versioning = false;
2097 break;
2100 stmt = DR_STMT (dr);
2101 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
2102 gcc_assert (vectype);
2104 /* The rightmost bits of an aligned address must be zeros.
2105 Construct the mask needed for this test. For example,
2106 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2107 mask must be 15 = 0xf. */
2108 mask = GET_MODE_SIZE (TYPE_MODE (vectype)) - 1;
2110 /* FORNOW: use the same mask to test all potentially unaligned
2111 references in the loop. The vectorizer currently supports
2112 a single vector size, see the reference to
2113 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
2114 vectorization factor is computed. */
2115 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
2116 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
2117 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2118 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
2119 DR_STMT (dr));
2123 /* Versioning requires at least one misaligned data reference. */
2124 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2125 do_versioning = false;
2126 else if (!do_versioning)
2127 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2130 if (do_versioning)
2132 vec<gimple *> may_misalign_stmts
2133 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2134 gimple *stmt;
2136 /* It can now be assumed that the data references in the statements
2137 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2138 of the loop being vectorized. */
2139 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
2141 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2142 dr = STMT_VINFO_DATA_REF (stmt_info);
2143 SET_DR_MISALIGNMENT (dr, 0);
2144 if (dump_enabled_p ())
2145 dump_printf_loc (MSG_NOTE, vect_location,
2146 "Alignment of access forced using versioning.\n");
2149 if (dump_enabled_p ())
2150 dump_printf_loc (MSG_NOTE, vect_location,
2151 "Versioning for alignment will be applied.\n");
2153 /* Peeling and versioning can't be done together at this time. */
2154 gcc_assert (! (do_peeling && do_versioning));
2156 stat = vect_verify_datarefs_alignment (loop_vinfo);
2157 gcc_assert (stat);
2158 return stat;
2161 /* This point is reached if neither peeling nor versioning is being done. */
2162 gcc_assert (! (do_peeling || do_versioning));
2164 stat = vect_verify_datarefs_alignment (loop_vinfo);
2165 return stat;
2169 /* Function vect_find_same_alignment_drs.
2171 Update group and alignment relations according to the chosen
2172 vectorization factor. */
2174 static void
2175 vect_find_same_alignment_drs (struct data_dependence_relation *ddr)
2177 struct data_reference *dra = DDR_A (ddr);
2178 struct data_reference *drb = DDR_B (ddr);
2179 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2180 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2182 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
2183 return;
2185 if (dra == drb)
2186 return;
2188 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)
2189 || !operand_equal_p (DR_OFFSET (dra), DR_OFFSET (drb), 0)
2190 || !operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2191 return;
2193 /* Two references with distance zero have the same alignment. */
2194 offset_int diff = (wi::to_offset (DR_INIT (dra))
2195 - wi::to_offset (DR_INIT (drb)));
2196 if (diff != 0)
2198 /* Get the wider of the two alignments. */
2199 unsigned int align_a = TYPE_ALIGN_UNIT (STMT_VINFO_VECTYPE (stmtinfo_a));
2200 unsigned int align_b = TYPE_ALIGN_UNIT (STMT_VINFO_VECTYPE (stmtinfo_b));
2201 unsigned int max_align = MAX (align_a, align_b);
2203 /* Require the gap to be a multiple of the larger vector alignment. */
2204 if (!wi::multiple_of_p (diff, max_align, SIGNED))
2205 return;
2208 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2209 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
2210 if (dump_enabled_p ())
2212 dump_printf_loc (MSG_NOTE, vect_location,
2213 "accesses have the same alignment: ");
2214 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2215 dump_printf (MSG_NOTE, " and ");
2216 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2217 dump_printf (MSG_NOTE, "\n");
2222 /* Function vect_analyze_data_refs_alignment
2224 Analyze the alignment of the data-references in the loop.
2225 Return FALSE if a data reference is found that cannot be vectorized. */
2227 bool
2228 vect_analyze_data_refs_alignment (loop_vec_info vinfo)
2230 if (dump_enabled_p ())
2231 dump_printf_loc (MSG_NOTE, vect_location,
2232 "=== vect_analyze_data_refs_alignment ===\n");
2234 /* Mark groups of data references with same alignment using
2235 data dependence information. */
2236 vec<ddr_p> ddrs = vinfo->ddrs;
2237 struct data_dependence_relation *ddr;
2238 unsigned int i;
2240 FOR_EACH_VEC_ELT (ddrs, i, ddr)
2241 vect_find_same_alignment_drs (ddr);
2243 vec<data_reference_p> datarefs = vinfo->datarefs;
2244 struct data_reference *dr;
2246 vect_record_base_alignments (vinfo);
2247 FOR_EACH_VEC_ELT (datarefs, i, dr)
2249 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
2250 if (STMT_VINFO_VECTORIZABLE (stmt_info)
2251 && !vect_compute_data_ref_alignment (dr))
2253 /* Strided accesses perform only component accesses, misalignment
2254 information is irrelevant for them. */
2255 if (STMT_VINFO_STRIDED_P (stmt_info)
2256 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2257 continue;
2259 if (dump_enabled_p ())
2260 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2261 "not vectorized: can't calculate alignment "
2262 "for data ref.\n");
2264 return false;
2268 return true;
2272 /* Analyze alignment of DRs of stmts in NODE. */
2274 static bool
2275 vect_slp_analyze_and_verify_node_alignment (slp_tree node)
2277 /* We vectorize from the first scalar stmt in the node unless
2278 the node is permuted in which case we start from the first
2279 element in the group. */
2280 gimple *first_stmt = SLP_TREE_SCALAR_STMTS (node)[0];
2281 data_reference_p first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2282 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2283 first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_stmt));
2285 data_reference_p dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2286 if (! vect_compute_data_ref_alignment (dr)
2287 /* For creating the data-ref pointer we need alignment of the
2288 first element anyway. */
2289 || (dr != first_dr
2290 && ! vect_compute_data_ref_alignment (first_dr))
2291 || ! verify_data_ref_alignment (dr))
2293 if (dump_enabled_p ())
2294 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2295 "not vectorized: bad data alignment in basic "
2296 "block.\n");
2297 return false;
2300 return true;
2303 /* Function vect_slp_analyze_instance_alignment
2305 Analyze the alignment of the data-references in the SLP instance.
2306 Return FALSE if a data reference is found that cannot be vectorized. */
2308 bool
2309 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
2311 if (dump_enabled_p ())
2312 dump_printf_loc (MSG_NOTE, vect_location,
2313 "=== vect_slp_analyze_and_verify_instance_alignment ===\n");
2315 slp_tree node;
2316 unsigned i;
2317 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2318 if (! vect_slp_analyze_and_verify_node_alignment (node))
2319 return false;
2321 node = SLP_INSTANCE_TREE (instance);
2322 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]))
2323 && ! vect_slp_analyze_and_verify_node_alignment
2324 (SLP_INSTANCE_TREE (instance)))
2325 return false;
2327 return true;
2331 /* Analyze groups of accesses: check that DR belongs to a group of
2332 accesses of legal size, step, etc. Detect gaps, single element
2333 interleaving, and other special cases. Set grouped access info.
2334 Collect groups of strided stores for further use in SLP analysis.
2335 Worker for vect_analyze_group_access. */
2337 static bool
2338 vect_analyze_group_access_1 (struct data_reference *dr)
2340 tree step = DR_STEP (dr);
2341 tree scalar_type = TREE_TYPE (DR_REF (dr));
2342 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2343 gimple *stmt = DR_STMT (dr);
2344 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2345 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2346 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2347 HOST_WIDE_INT dr_step = -1;
2348 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2349 bool slp_impossible = false;
2351 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2352 size of the interleaving group (including gaps). */
2353 if (tree_fits_shwi_p (step))
2355 dr_step = tree_to_shwi (step);
2356 /* Check that STEP is a multiple of type size. Otherwise there is
2357 a non-element-sized gap at the end of the group which we
2358 cannot represent in GROUP_GAP or GROUP_SIZE.
2359 ??? As we can handle non-constant step fine here we should
2360 simply remove uses of GROUP_GAP between the last and first
2361 element and instead rely on DR_STEP. GROUP_SIZE then would
2362 simply not include that gap. */
2363 if ((dr_step % type_size) != 0)
2365 if (dump_enabled_p ())
2367 dump_printf_loc (MSG_NOTE, vect_location,
2368 "Step ");
2369 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2370 dump_printf (MSG_NOTE,
2371 " is not a multiple of the element size for ");
2372 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2373 dump_printf (MSG_NOTE, "\n");
2375 return false;
2377 groupsize = absu_hwi (dr_step) / type_size;
2379 else
2380 groupsize = 0;
2382 /* Not consecutive access is possible only if it is a part of interleaving. */
2383 if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2385 /* Check if it this DR is a part of interleaving, and is a single
2386 element of the group that is accessed in the loop. */
2388 /* Gaps are supported only for loads. STEP must be a multiple of the type
2389 size. The size of the group must be a power of 2. */
2390 if (DR_IS_READ (dr)
2391 && (dr_step % type_size) == 0
2392 && groupsize > 0
2393 && pow2p_hwi (groupsize))
2395 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2396 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2397 GROUP_GAP (stmt_info) = groupsize - 1;
2398 if (dump_enabled_p ())
2400 dump_printf_loc (MSG_NOTE, vect_location,
2401 "Detected single element interleaving ");
2402 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2403 dump_printf (MSG_NOTE, " step ");
2404 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2405 dump_printf (MSG_NOTE, "\n");
2408 return true;
2411 if (dump_enabled_p ())
2413 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2414 "not consecutive access ");
2415 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2418 if (bb_vinfo)
2420 /* Mark the statement as unvectorizable. */
2421 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2422 return true;
2425 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2426 STMT_VINFO_STRIDED_P (stmt_info) = true;
2427 return true;
2430 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2432 /* First stmt in the interleaving chain. Check the chain. */
2433 gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2434 struct data_reference *data_ref = dr;
2435 unsigned int count = 1;
2436 tree prev_init = DR_INIT (data_ref);
2437 gimple *prev = stmt;
2438 HOST_WIDE_INT diff, gaps = 0;
2440 while (next)
2442 /* Skip same data-refs. In case that two or more stmts share
2443 data-ref (supported only for loads), we vectorize only the first
2444 stmt, and the rest get their vectorized loads from the first
2445 one. */
2446 if (!tree_int_cst_compare (DR_INIT (data_ref),
2447 DR_INIT (STMT_VINFO_DATA_REF (
2448 vinfo_for_stmt (next)))))
2450 if (DR_IS_WRITE (data_ref))
2452 if (dump_enabled_p ())
2453 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2454 "Two store stmts share the same dr.\n");
2455 return false;
2458 if (dump_enabled_p ())
2459 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2460 "Two or more load stmts share the same dr.\n");
2462 /* For load use the same data-ref load. */
2463 GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2465 prev = next;
2466 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2467 continue;
2470 prev = next;
2471 data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2473 /* All group members have the same STEP by construction. */
2474 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2476 /* Check that the distance between two accesses is equal to the type
2477 size. Otherwise, we have gaps. */
2478 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2479 - TREE_INT_CST_LOW (prev_init)) / type_size;
2480 if (diff != 1)
2482 /* FORNOW: SLP of accesses with gaps is not supported. */
2483 slp_impossible = true;
2484 if (DR_IS_WRITE (data_ref))
2486 if (dump_enabled_p ())
2487 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2488 "interleaved store with gaps\n");
2489 return false;
2492 gaps += diff - 1;
2495 last_accessed_element += diff;
2497 /* Store the gap from the previous member of the group. If there is no
2498 gap in the access, GROUP_GAP is always 1. */
2499 GROUP_GAP (vinfo_for_stmt (next)) = diff;
2501 prev_init = DR_INIT (data_ref);
2502 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2503 /* Count the number of data-refs in the chain. */
2504 count++;
2507 if (groupsize == 0)
2508 groupsize = count + gaps;
2510 /* This could be UINT_MAX but as we are generating code in a very
2511 inefficient way we have to cap earlier. See PR78699 for example. */
2512 if (groupsize > 4096)
2514 if (dump_enabled_p ())
2515 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2516 "group is too large\n");
2517 return false;
2520 /* Check that the size of the interleaving is equal to count for stores,
2521 i.e., that there are no gaps. */
2522 if (groupsize != count
2523 && !DR_IS_READ (dr))
2525 if (dump_enabled_p ())
2526 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2527 "interleaved store with gaps\n");
2528 return false;
2531 /* If there is a gap after the last load in the group it is the
2532 difference between the groupsize and the last accessed
2533 element.
2534 When there is no gap, this difference should be 0. */
2535 GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - last_accessed_element;
2537 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2538 if (dump_enabled_p ())
2540 dump_printf_loc (MSG_NOTE, vect_location,
2541 "Detected interleaving ");
2542 if (DR_IS_READ (dr))
2543 dump_printf (MSG_NOTE, "load ");
2544 else
2545 dump_printf (MSG_NOTE, "store ");
2546 dump_printf (MSG_NOTE, "of size %u starting with ",
2547 (unsigned)groupsize);
2548 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
2549 if (GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
2550 dump_printf_loc (MSG_NOTE, vect_location,
2551 "There is a gap of %u elements after the group\n",
2552 GROUP_GAP (vinfo_for_stmt (stmt)));
2555 /* SLP: create an SLP data structure for every interleaving group of
2556 stores for further analysis in vect_analyse_slp. */
2557 if (DR_IS_WRITE (dr) && !slp_impossible)
2559 if (loop_vinfo)
2560 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2561 if (bb_vinfo)
2562 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2566 return true;
2569 /* Analyze groups of accesses: check that DR belongs to a group of
2570 accesses of legal size, step, etc. Detect gaps, single element
2571 interleaving, and other special cases. Set grouped access info.
2572 Collect groups of strided stores for further use in SLP analysis. */
2574 static bool
2575 vect_analyze_group_access (struct data_reference *dr)
2577 if (!vect_analyze_group_access_1 (dr))
2579 /* Dissolve the group if present. */
2580 gimple *next;
2581 gimple *stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dr)));
2582 while (stmt)
2584 stmt_vec_info vinfo = vinfo_for_stmt (stmt);
2585 next = GROUP_NEXT_ELEMENT (vinfo);
2586 GROUP_FIRST_ELEMENT (vinfo) = NULL;
2587 GROUP_NEXT_ELEMENT (vinfo) = NULL;
2588 stmt = next;
2590 return false;
2592 return true;
2595 /* Analyze the access pattern of the data-reference DR.
2596 In case of non-consecutive accesses call vect_analyze_group_access() to
2597 analyze groups of accesses. */
2599 static bool
2600 vect_analyze_data_ref_access (struct data_reference *dr)
2602 tree step = DR_STEP (dr);
2603 tree scalar_type = TREE_TYPE (DR_REF (dr));
2604 gimple *stmt = DR_STMT (dr);
2605 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2606 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2607 struct loop *loop = NULL;
2609 if (loop_vinfo)
2610 loop = LOOP_VINFO_LOOP (loop_vinfo);
2612 if (loop_vinfo && !step)
2614 if (dump_enabled_p ())
2615 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2616 "bad data-ref access in loop\n");
2617 return false;
2620 /* Allow loads with zero step in inner-loop vectorization. */
2621 if (loop_vinfo && integer_zerop (step))
2623 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2624 if (!nested_in_vect_loop_p (loop, stmt))
2625 return DR_IS_READ (dr);
2626 /* Allow references with zero step for outer loops marked
2627 with pragma omp simd only - it guarantees absence of
2628 loop-carried dependencies between inner loop iterations. */
2629 if (!loop->force_vectorize)
2631 if (dump_enabled_p ())
2632 dump_printf_loc (MSG_NOTE, vect_location,
2633 "zero step in inner loop of nest\n");
2634 return false;
2638 if (loop && nested_in_vect_loop_p (loop, stmt))
2640 /* Interleaved accesses are not yet supported within outer-loop
2641 vectorization for references in the inner-loop. */
2642 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2644 /* For the rest of the analysis we use the outer-loop step. */
2645 step = STMT_VINFO_DR_STEP (stmt_info);
2646 if (integer_zerop (step))
2648 if (dump_enabled_p ())
2649 dump_printf_loc (MSG_NOTE, vect_location,
2650 "zero step in outer loop.\n");
2651 return DR_IS_READ (dr);
2655 /* Consecutive? */
2656 if (TREE_CODE (step) == INTEGER_CST)
2658 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2659 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2660 || (dr_step < 0
2661 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2663 /* Mark that it is not interleaving. */
2664 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2665 return true;
2669 if (loop && nested_in_vect_loop_p (loop, stmt))
2671 if (dump_enabled_p ())
2672 dump_printf_loc (MSG_NOTE, vect_location,
2673 "grouped access in outer loop.\n");
2674 return false;
2678 /* Assume this is a DR handled by non-constant strided load case. */
2679 if (TREE_CODE (step) != INTEGER_CST)
2680 return (STMT_VINFO_STRIDED_P (stmt_info)
2681 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2682 || vect_analyze_group_access (dr)));
2684 /* Not consecutive access - check if it's a part of interleaving group. */
2685 return vect_analyze_group_access (dr);
2688 /* Compare two data-references DRA and DRB to group them into chunks
2689 suitable for grouping. */
2691 static int
2692 dr_group_sort_cmp (const void *dra_, const void *drb_)
2694 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2695 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2696 int cmp;
2698 /* Stabilize sort. */
2699 if (dra == drb)
2700 return 0;
2702 /* DRs in different loops never belong to the same group. */
2703 loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father;
2704 loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father;
2705 if (loopa != loopb)
2706 return loopa->num < loopb->num ? -1 : 1;
2708 /* Ordering of DRs according to base. */
2709 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0))
2711 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2712 DR_BASE_ADDRESS (drb));
2713 if (cmp != 0)
2714 return cmp;
2717 /* And according to DR_OFFSET. */
2718 if (!dr_equal_offsets_p (dra, drb))
2720 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2721 if (cmp != 0)
2722 return cmp;
2725 /* Put reads before writes. */
2726 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2727 return DR_IS_READ (dra) ? -1 : 1;
2729 /* Then sort after access size. */
2730 if (!operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2731 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))), 0))
2733 cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2734 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2735 if (cmp != 0)
2736 return cmp;
2739 /* And after step. */
2740 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2742 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2743 if (cmp != 0)
2744 return cmp;
2747 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
2748 cmp = tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb));
2749 if (cmp == 0)
2750 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2751 return cmp;
2754 /* Function vect_analyze_data_ref_accesses.
2756 Analyze the access pattern of all the data references in the loop.
2758 FORNOW: the only access pattern that is considered vectorizable is a
2759 simple step 1 (consecutive) access.
2761 FORNOW: handle only arrays and pointer accesses. */
2763 bool
2764 vect_analyze_data_ref_accesses (vec_info *vinfo)
2766 unsigned int i;
2767 vec<data_reference_p> datarefs = vinfo->datarefs;
2768 struct data_reference *dr;
2770 if (dump_enabled_p ())
2771 dump_printf_loc (MSG_NOTE, vect_location,
2772 "=== vect_analyze_data_ref_accesses ===\n");
2774 if (datarefs.is_empty ())
2775 return true;
2777 /* Sort the array of datarefs to make building the interleaving chains
2778 linear. Don't modify the original vector's order, it is needed for
2779 determining what dependencies are reversed. */
2780 vec<data_reference_p> datarefs_copy = datarefs.copy ();
2781 datarefs_copy.qsort (dr_group_sort_cmp);
2783 /* Build the interleaving chains. */
2784 for (i = 0; i < datarefs_copy.length () - 1;)
2786 data_reference_p dra = datarefs_copy[i];
2787 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2788 stmt_vec_info lastinfo = NULL;
2789 if (! STMT_VINFO_VECTORIZABLE (stmtinfo_a))
2791 ++i;
2792 continue;
2794 for (i = i + 1; i < datarefs_copy.length (); ++i)
2796 data_reference_p drb = datarefs_copy[i];
2797 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2798 if (! STMT_VINFO_VECTORIZABLE (stmtinfo_b))
2799 break;
2801 /* ??? Imperfect sorting (non-compatible types, non-modulo
2802 accesses, same accesses) can lead to a group to be artificially
2803 split here as we don't just skip over those. If it really
2804 matters we can push those to a worklist and re-iterate
2805 over them. The we can just skip ahead to the next DR here. */
2807 /* DRs in a different loop should not be put into the same
2808 interleaving group. */
2809 if (gimple_bb (DR_STMT (dra))->loop_father
2810 != gimple_bb (DR_STMT (drb))->loop_father)
2811 break;
2813 /* Check that the data-refs have same first location (except init)
2814 and they are both either store or load (not load and store,
2815 not masked loads or stores). */
2816 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2817 || !operand_equal_p (DR_BASE_ADDRESS (dra),
2818 DR_BASE_ADDRESS (drb), 0)
2819 || !dr_equal_offsets_p (dra, drb)
2820 || !gimple_assign_single_p (DR_STMT (dra))
2821 || !gimple_assign_single_p (DR_STMT (drb)))
2822 break;
2824 /* Check that the data-refs have the same constant size. */
2825 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2826 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2827 if (!tree_fits_uhwi_p (sza)
2828 || !tree_fits_uhwi_p (szb)
2829 || !tree_int_cst_equal (sza, szb))
2830 break;
2832 /* Check that the data-refs have the same step. */
2833 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2834 break;
2836 /* Do not place the same access in the interleaving chain twice. */
2837 if (tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb)) == 0)
2838 break;
2840 /* Check the types are compatible.
2841 ??? We don't distinguish this during sorting. */
2842 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2843 TREE_TYPE (DR_REF (drb))))
2844 break;
2846 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
2847 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2848 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2849 gcc_assert (init_a <= init_b);
2851 /* If init_b == init_a + the size of the type * k, we have an
2852 interleaving, and DRA is accessed before DRB. */
2853 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
2854 if (type_size_a == 0
2855 || (init_b - init_a) % type_size_a != 0)
2856 break;
2858 /* If we have a store, the accesses are adjacent. This splits
2859 groups into chunks we support (we don't support vectorization
2860 of stores with gaps). */
2861 if (!DR_IS_READ (dra)
2862 && (init_b - (HOST_WIDE_INT) TREE_INT_CST_LOW
2863 (DR_INIT (datarefs_copy[i-1]))
2864 != type_size_a))
2865 break;
2867 /* If the step (if not zero or non-constant) is greater than the
2868 difference between data-refs' inits this splits groups into
2869 suitable sizes. */
2870 if (tree_fits_shwi_p (DR_STEP (dra)))
2872 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
2873 if (step != 0 && step <= (init_b - init_a))
2874 break;
2877 if (dump_enabled_p ())
2879 dump_printf_loc (MSG_NOTE, vect_location,
2880 "Detected interleaving ");
2881 if (DR_IS_READ (dra))
2882 dump_printf (MSG_NOTE, "load ");
2883 else
2884 dump_printf (MSG_NOTE, "store ");
2885 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2886 dump_printf (MSG_NOTE, " and ");
2887 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2888 dump_printf (MSG_NOTE, "\n");
2891 /* Link the found element into the group list. */
2892 if (!GROUP_FIRST_ELEMENT (stmtinfo_a))
2894 GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
2895 lastinfo = stmtinfo_a;
2897 GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
2898 GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
2899 lastinfo = stmtinfo_b;
2903 FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
2904 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
2905 && !vect_analyze_data_ref_access (dr))
2907 if (dump_enabled_p ())
2908 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2909 "not vectorized: complicated access pattern.\n");
2911 if (is_a <bb_vec_info> (vinfo))
2913 /* Mark the statement as not vectorizable. */
2914 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2915 continue;
2917 else
2919 datarefs_copy.release ();
2920 return false;
2924 datarefs_copy.release ();
2925 return true;
2928 /* Function vect_vfa_segment_size.
2930 Create an expression that computes the size of segment
2931 that will be accessed for a data reference. The functions takes into
2932 account that realignment loads may access one more vector.
2934 Input:
2935 DR: The data reference.
2936 LENGTH_FACTOR: segment length to consider.
2938 Return an expression whose value is the size of segment which will be
2939 accessed by DR. */
2941 static tree
2942 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
2944 tree segment_length;
2946 if (integer_zerop (DR_STEP (dr)))
2947 segment_length = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
2948 else
2949 segment_length = size_binop (MULT_EXPR,
2950 fold_convert (sizetype, DR_STEP (dr)),
2951 fold_convert (sizetype, length_factor));
2953 if (vect_supportable_dr_alignment (dr, false)
2954 == dr_explicit_realign_optimized)
2956 tree vector_size = TYPE_SIZE_UNIT
2957 (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))));
2959 segment_length = size_binop (PLUS_EXPR, segment_length, vector_size);
2961 return segment_length;
2964 /* Function vect_no_alias_p.
2966 Given data references A and B with equal base and offset, the alias
2967 relation can be decided at compilation time, return TRUE if they do
2968 not alias to each other; return FALSE otherwise. SEGMENT_LENGTH_A
2969 and SEGMENT_LENGTH_B are the memory lengths accessed by A and B
2970 respectively. */
2972 static bool
2973 vect_no_alias_p (struct data_reference *a, struct data_reference *b,
2974 tree segment_length_a, tree segment_length_b)
2976 gcc_assert (TREE_CODE (DR_INIT (a)) == INTEGER_CST
2977 && TREE_CODE (DR_INIT (b)) == INTEGER_CST);
2978 if (tree_int_cst_equal (DR_INIT (a), DR_INIT (b)))
2979 return false;
2981 tree seg_a_min = DR_INIT (a);
2982 tree seg_a_max = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_a_min),
2983 seg_a_min, segment_length_a);
2984 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
2985 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
2986 [a, a+12) */
2987 if (tree_int_cst_compare (DR_STEP (a), size_zero_node) < 0)
2989 tree unit_size = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (a)));
2990 seg_a_min = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_a_max),
2991 seg_a_max, unit_size);
2992 seg_a_max = fold_build2 (PLUS_EXPR, TREE_TYPE (DR_INIT (a)),
2993 DR_INIT (a), unit_size);
2995 tree seg_b_min = DR_INIT (b);
2996 tree seg_b_max = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_b_min),
2997 seg_b_min, segment_length_b);
2998 if (tree_int_cst_compare (DR_STEP (b), size_zero_node) < 0)
3000 tree unit_size = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (b)));
3001 seg_b_min = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_b_max),
3002 seg_b_max, unit_size);
3003 seg_b_max = fold_build2 (PLUS_EXPR, TREE_TYPE (DR_INIT (b)),
3004 DR_INIT (b), unit_size);
3007 if (tree_int_cst_le (seg_a_max, seg_b_min)
3008 || tree_int_cst_le (seg_b_max, seg_a_min))
3009 return true;
3011 return false;
3014 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3015 in DDR is >= VF. */
3017 static bool
3018 dependence_distance_ge_vf (data_dependence_relation *ddr,
3019 unsigned int loop_depth, unsigned HOST_WIDE_INT vf)
3021 if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3022 || DDR_NUM_DIST_VECTS (ddr) == 0)
3023 return false;
3025 /* If the dependence is exact, we should have limited the VF instead. */
3026 gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3028 unsigned int i;
3029 lambda_vector dist_v;
3030 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3032 HOST_WIDE_INT dist = dist_v[loop_depth];
3033 if (dist != 0
3034 && !(dist > 0 && DDR_REVERSED_P (ddr))
3035 && (unsigned HOST_WIDE_INT) abs_hwi (dist) < vf)
3036 return false;
3039 if (dump_enabled_p ())
3041 dump_printf_loc (MSG_NOTE, vect_location,
3042 "dependence distance between ");
3043 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
3044 dump_printf (MSG_NOTE, " and ");
3045 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
3046 dump_printf (MSG_NOTE, " is >= VF\n");
3049 return true;
3052 /* Function vect_prune_runtime_alias_test_list.
3054 Prune a list of ddrs to be tested at run-time by versioning for alias.
3055 Merge several alias checks into one if possible.
3056 Return FALSE if resulting list of ddrs is longer then allowed by
3057 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
3059 bool
3060 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3062 typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3063 hash_set <tree_pair_hash> compared_objects;
3065 vec<ddr_p> may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3066 vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3067 = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3068 vec<vec_object_pair> &check_unequal_addrs
3069 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3070 int vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3071 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3073 ddr_p ddr;
3074 unsigned int i;
3075 tree length_factor;
3077 if (dump_enabled_p ())
3078 dump_printf_loc (MSG_NOTE, vect_location,
3079 "=== vect_prune_runtime_alias_test_list ===\n");
3081 if (may_alias_ddrs.is_empty ())
3082 return true;
3084 comp_alias_ddrs.create (may_alias_ddrs.length ());
3086 unsigned int loop_depth
3087 = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3088 LOOP_VINFO_LOOP_NEST (loop_vinfo));
3090 /* First, we collect all data ref pairs for aliasing checks. */
3091 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3093 int comp_res;
3094 struct data_reference *dr_a, *dr_b;
3095 gimple *dr_group_first_a, *dr_group_first_b;
3096 tree segment_length_a, segment_length_b;
3097 gimple *stmt_a, *stmt_b;
3099 /* Ignore the alias if the VF we chose ended up being no greater
3100 than the dependence distance. */
3101 if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3102 continue;
3104 if (DDR_OBJECT_A (ddr))
3106 vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3107 if (!compared_objects.add (new_pair))
3109 if (dump_enabled_p ())
3111 dump_printf_loc (MSG_NOTE, vect_location, "checking that ");
3112 dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.first);
3113 dump_printf (MSG_NOTE, " and ");
3114 dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.second);
3115 dump_printf (MSG_NOTE, " have different addresses\n");
3117 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3119 continue;
3122 dr_a = DDR_A (ddr);
3123 stmt_a = DR_STMT (DDR_A (ddr));
3124 dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
3125 if (dr_group_first_a)
3127 stmt_a = dr_group_first_a;
3128 dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
3131 dr_b = DDR_B (ddr);
3132 stmt_b = DR_STMT (DDR_B (ddr));
3133 dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
3134 if (dr_group_first_b)
3136 stmt_b = dr_group_first_b;
3137 dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
3140 if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
3141 length_factor = scalar_loop_iters;
3142 else
3143 length_factor = size_int (vect_factor);
3144 segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
3145 segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
3147 comp_res = data_ref_compare_tree (DR_BASE_ADDRESS (dr_a),
3148 DR_BASE_ADDRESS (dr_b));
3149 if (comp_res == 0)
3150 comp_res = data_ref_compare_tree (DR_OFFSET (dr_a),
3151 DR_OFFSET (dr_b));
3153 /* Alias is known at compilation time. */
3154 if (comp_res == 0
3155 && TREE_CODE (DR_STEP (dr_a)) == INTEGER_CST
3156 && TREE_CODE (DR_STEP (dr_b)) == INTEGER_CST
3157 && TREE_CODE (segment_length_a) == INTEGER_CST
3158 && TREE_CODE (segment_length_b) == INTEGER_CST)
3160 if (vect_no_alias_p (dr_a, dr_b, segment_length_a, segment_length_b))
3161 continue;
3163 if (dump_enabled_p ())
3164 dump_printf_loc (MSG_NOTE, vect_location,
3165 "not vectorized: compilation time alias.\n");
3167 return false;
3170 dr_with_seg_len_pair_t dr_with_seg_len_pair
3171 (dr_with_seg_len (dr_a, segment_length_a),
3172 dr_with_seg_len (dr_b, segment_length_b));
3174 /* Canonicalize pairs by sorting the two DR members. */
3175 if (comp_res > 0)
3176 std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
3178 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3181 prune_runtime_alias_test_list (&comp_alias_ddrs,
3182 (unsigned HOST_WIDE_INT) vect_factor);
3184 unsigned int count = (comp_alias_ddrs.length ()
3185 + check_unequal_addrs.length ());
3186 dump_printf_loc (MSG_NOTE, vect_location,
3187 "improved number of alias checks from %d to %d\n",
3188 may_alias_ddrs.length (), count);
3189 if ((int) count > PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
3191 if (dump_enabled_p ())
3192 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3193 "number of versioning for alias "
3194 "run-time tests exceeds %d "
3195 "(--param vect-max-version-for-alias-checks)\n",
3196 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
3197 return false;
3200 return true;
3203 /* Return true if a non-affine read or write in STMT is suitable for a
3204 gather load or scatter store. Describe the operation in *INFO if so. */
3206 bool
3207 vect_check_gather_scatter (gimple *stmt, loop_vec_info loop_vinfo,
3208 gather_scatter_info *info)
3210 HOST_WIDE_INT scale = 1, pbitpos, pbitsize;
3211 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3212 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3213 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3214 tree offtype = NULL_TREE;
3215 tree decl, base, off;
3216 machine_mode pmode;
3217 int punsignedp, reversep, pvolatilep = 0;
3219 base = DR_REF (dr);
3220 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3221 see if we can use the def stmt of the address. */
3222 if (is_gimple_call (stmt)
3223 && gimple_call_internal_p (stmt)
3224 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
3225 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
3226 && TREE_CODE (base) == MEM_REF
3227 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3228 && integer_zerop (TREE_OPERAND (base, 1))
3229 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3231 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3232 if (is_gimple_assign (def_stmt)
3233 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3234 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3237 /* The gather and scatter builtins need address of the form
3238 loop_invariant + vector * {1, 2, 4, 8}
3240 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3241 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3242 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3243 multiplications and additions in it. To get a vector, we need
3244 a single SSA_NAME that will be defined in the loop and will
3245 contain everything that is not loop invariant and that can be
3246 vectorized. The following code attempts to find such a preexistng
3247 SSA_NAME OFF and put the loop invariants into a tree BASE
3248 that can be gimplified before the loop. */
3249 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3250 &punsignedp, &reversep, &pvolatilep);
3251 gcc_assert (base && (pbitpos % BITS_PER_UNIT) == 0 && !reversep);
3253 if (TREE_CODE (base) == MEM_REF)
3255 if (!integer_zerop (TREE_OPERAND (base, 1)))
3257 if (off == NULL_TREE)
3259 offset_int moff = mem_ref_offset (base);
3260 off = wide_int_to_tree (sizetype, moff);
3262 else
3263 off = size_binop (PLUS_EXPR, off,
3264 fold_convert (sizetype, TREE_OPERAND (base, 1)));
3266 base = TREE_OPERAND (base, 0);
3268 else
3269 base = build_fold_addr_expr (base);
3271 if (off == NULL_TREE)
3272 off = size_zero_node;
3274 /* If base is not loop invariant, either off is 0, then we start with just
3275 the constant offset in the loop invariant BASE and continue with base
3276 as OFF, otherwise give up.
3277 We could handle that case by gimplifying the addition of base + off
3278 into some SSA_NAME and use that as off, but for now punt. */
3279 if (!expr_invariant_in_loop_p (loop, base))
3281 if (!integer_zerop (off))
3282 return false;
3283 off = base;
3284 base = size_int (pbitpos / BITS_PER_UNIT);
3286 /* Otherwise put base + constant offset into the loop invariant BASE
3287 and continue with OFF. */
3288 else
3290 base = fold_convert (sizetype, base);
3291 base = size_binop (PLUS_EXPR, base, size_int (pbitpos / BITS_PER_UNIT));
3294 /* OFF at this point may be either a SSA_NAME or some tree expression
3295 from get_inner_reference. Try to peel off loop invariants from it
3296 into BASE as long as possible. */
3297 STRIP_NOPS (off);
3298 while (offtype == NULL_TREE)
3300 enum tree_code code;
3301 tree op0, op1, add = NULL_TREE;
3303 if (TREE_CODE (off) == SSA_NAME)
3305 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3307 if (expr_invariant_in_loop_p (loop, off))
3308 return false;
3310 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3311 break;
3313 op0 = gimple_assign_rhs1 (def_stmt);
3314 code = gimple_assign_rhs_code (def_stmt);
3315 op1 = gimple_assign_rhs2 (def_stmt);
3317 else
3319 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3320 return false;
3321 code = TREE_CODE (off);
3322 extract_ops_from_tree (off, &code, &op0, &op1);
3324 switch (code)
3326 case POINTER_PLUS_EXPR:
3327 case PLUS_EXPR:
3328 if (expr_invariant_in_loop_p (loop, op0))
3330 add = op0;
3331 off = op1;
3332 do_add:
3333 add = fold_convert (sizetype, add);
3334 if (scale != 1)
3335 add = size_binop (MULT_EXPR, add, size_int (scale));
3336 base = size_binop (PLUS_EXPR, base, add);
3337 continue;
3339 if (expr_invariant_in_loop_p (loop, op1))
3341 add = op1;
3342 off = op0;
3343 goto do_add;
3345 break;
3346 case MINUS_EXPR:
3347 if (expr_invariant_in_loop_p (loop, op1))
3349 add = fold_convert (sizetype, op1);
3350 add = size_binop (MINUS_EXPR, size_zero_node, add);
3351 off = op0;
3352 goto do_add;
3354 break;
3355 case MULT_EXPR:
3356 if (scale == 1 && tree_fits_shwi_p (op1))
3358 scale = tree_to_shwi (op1);
3359 off = op0;
3360 continue;
3362 break;
3363 case SSA_NAME:
3364 off = op0;
3365 continue;
3366 CASE_CONVERT:
3367 if (!POINTER_TYPE_P (TREE_TYPE (op0))
3368 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3369 break;
3370 if (TYPE_PRECISION (TREE_TYPE (op0))
3371 == TYPE_PRECISION (TREE_TYPE (off)))
3373 off = op0;
3374 continue;
3376 if (TYPE_PRECISION (TREE_TYPE (op0))
3377 < TYPE_PRECISION (TREE_TYPE (off)))
3379 off = op0;
3380 offtype = TREE_TYPE (off);
3381 STRIP_NOPS (off);
3382 continue;
3384 break;
3385 default:
3386 break;
3388 break;
3391 /* If at the end OFF still isn't a SSA_NAME or isn't
3392 defined in the loop, punt. */
3393 if (TREE_CODE (off) != SSA_NAME
3394 || expr_invariant_in_loop_p (loop, off))
3395 return false;
3397 if (offtype == NULL_TREE)
3398 offtype = TREE_TYPE (off);
3400 if (DR_IS_READ (dr))
3401 decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info),
3402 offtype, scale);
3403 else
3404 decl = targetm.vectorize.builtin_scatter (STMT_VINFO_VECTYPE (stmt_info),
3405 offtype, scale);
3407 if (decl == NULL_TREE)
3408 return false;
3410 info->decl = decl;
3411 info->base = base;
3412 info->offset = off;
3413 info->offset_dt = vect_unknown_def_type;
3414 info->offset_vectype = NULL_TREE;
3415 info->scale = scale;
3416 return true;
3419 /* Function vect_analyze_data_refs.
3421 Find all the data references in the loop or basic block.
3423 The general structure of the analysis of data refs in the vectorizer is as
3424 follows:
3425 1- vect_analyze_data_refs(loop/bb): call
3426 compute_data_dependences_for_loop/bb to find and analyze all data-refs
3427 in the loop/bb and their dependences.
3428 2- vect_analyze_dependences(): apply dependence testing using ddrs.
3429 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
3430 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
3434 bool
3435 vect_analyze_data_refs (vec_info *vinfo, int *min_vf)
3437 struct loop *loop = NULL;
3438 unsigned int i;
3439 struct data_reference *dr;
3440 tree scalar_type;
3442 if (dump_enabled_p ())
3443 dump_printf_loc (MSG_NOTE, vect_location,
3444 "=== vect_analyze_data_refs ===\n");
3446 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3447 loop = LOOP_VINFO_LOOP (loop_vinfo);
3449 /* Go through the data-refs, check that the analysis succeeded. Update
3450 pointer from stmt_vec_info struct to DR and vectype. */
3452 vec<data_reference_p> datarefs = vinfo->datarefs;
3453 FOR_EACH_VEC_ELT (datarefs, i, dr)
3455 gimple *stmt;
3456 stmt_vec_info stmt_info;
3457 tree base, offset, init;
3458 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
3459 bool simd_lane_access = false;
3460 int vf;
3462 again:
3463 if (!dr || !DR_REF (dr))
3465 if (dump_enabled_p ())
3466 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3467 "not vectorized: unhandled data-ref\n");
3468 return false;
3471 stmt = DR_STMT (dr);
3472 stmt_info = vinfo_for_stmt (stmt);
3474 /* Discard clobbers from the dataref vector. We will remove
3475 clobber stmts during vectorization. */
3476 if (gimple_clobber_p (stmt))
3478 free_data_ref (dr);
3479 if (i == datarefs.length () - 1)
3481 datarefs.pop ();
3482 break;
3484 datarefs.ordered_remove (i);
3485 dr = datarefs[i];
3486 goto again;
3489 /* Check that analysis of the data-ref succeeded. */
3490 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
3491 || !DR_STEP (dr))
3493 bool maybe_gather
3494 = DR_IS_READ (dr)
3495 && !TREE_THIS_VOLATILE (DR_REF (dr))
3496 && targetm.vectorize.builtin_gather != NULL;
3497 bool maybe_scatter
3498 = DR_IS_WRITE (dr)
3499 && !TREE_THIS_VOLATILE (DR_REF (dr))
3500 && targetm.vectorize.builtin_scatter != NULL;
3501 bool maybe_simd_lane_access
3502 = is_a <loop_vec_info> (vinfo) && loop->simduid;
3504 /* If target supports vector gather loads or scatter stores, or if
3505 this might be a SIMD lane access, see if they can't be used. */
3506 if (is_a <loop_vec_info> (vinfo)
3507 && (maybe_gather || maybe_scatter || maybe_simd_lane_access)
3508 && !nested_in_vect_loop_p (loop, stmt))
3510 struct data_reference *newdr
3511 = create_data_ref (NULL, loop_containing_stmt (stmt),
3512 DR_REF (dr), stmt, !maybe_scatter,
3513 DR_IS_CONDITIONAL_IN_STMT (dr));
3514 gcc_assert (newdr != NULL && DR_REF (newdr));
3515 if (DR_BASE_ADDRESS (newdr)
3516 && DR_OFFSET (newdr)
3517 && DR_INIT (newdr)
3518 && DR_STEP (newdr)
3519 && integer_zerop (DR_STEP (newdr)))
3521 if (maybe_simd_lane_access)
3523 tree off = DR_OFFSET (newdr);
3524 STRIP_NOPS (off);
3525 if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
3526 && TREE_CODE (off) == MULT_EXPR
3527 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
3529 tree step = TREE_OPERAND (off, 1);
3530 off = TREE_OPERAND (off, 0);
3531 STRIP_NOPS (off);
3532 if (CONVERT_EXPR_P (off)
3533 && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
3534 0)))
3535 < TYPE_PRECISION (TREE_TYPE (off)))
3536 off = TREE_OPERAND (off, 0);
3537 if (TREE_CODE (off) == SSA_NAME)
3539 gimple *def = SSA_NAME_DEF_STMT (off);
3540 tree reft = TREE_TYPE (DR_REF (newdr));
3541 if (is_gimple_call (def)
3542 && gimple_call_internal_p (def)
3543 && (gimple_call_internal_fn (def)
3544 == IFN_GOMP_SIMD_LANE))
3546 tree arg = gimple_call_arg (def, 0);
3547 gcc_assert (TREE_CODE (arg) == SSA_NAME);
3548 arg = SSA_NAME_VAR (arg);
3549 if (arg == loop->simduid
3550 /* For now. */
3551 && tree_int_cst_equal
3552 (TYPE_SIZE_UNIT (reft),
3553 step))
3555 DR_OFFSET (newdr) = ssize_int (0);
3556 DR_STEP (newdr) = step;
3557 DR_OFFSET_ALIGNMENT (newdr)
3558 = BIGGEST_ALIGNMENT;
3559 DR_STEP_ALIGNMENT (newdr)
3560 = highest_pow2_factor (step);
3561 dr = newdr;
3562 simd_lane_access = true;
3568 if (!simd_lane_access && (maybe_gather || maybe_scatter))
3570 dr = newdr;
3571 if (maybe_gather)
3572 gatherscatter = GATHER;
3573 else
3574 gatherscatter = SCATTER;
3577 if (gatherscatter == SG_NONE && !simd_lane_access)
3578 free_data_ref (newdr);
3581 if (gatherscatter == SG_NONE && !simd_lane_access)
3583 if (dump_enabled_p ())
3585 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3586 "not vectorized: data ref analysis "
3587 "failed ");
3588 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3591 if (is_a <bb_vec_info> (vinfo))
3592 break;
3594 return false;
3598 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
3600 if (dump_enabled_p ())
3601 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3602 "not vectorized: base addr of dr is a "
3603 "constant\n");
3605 if (is_a <bb_vec_info> (vinfo))
3606 break;
3608 if (gatherscatter != SG_NONE || simd_lane_access)
3609 free_data_ref (dr);
3610 return false;
3613 if (TREE_THIS_VOLATILE (DR_REF (dr)))
3615 if (dump_enabled_p ())
3617 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3618 "not vectorized: volatile type ");
3619 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3622 if (is_a <bb_vec_info> (vinfo))
3623 break;
3625 return false;
3628 if (stmt_can_throw_internal (stmt))
3630 if (dump_enabled_p ())
3632 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3633 "not vectorized: statement can throw an "
3634 "exception ");
3635 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3638 if (is_a <bb_vec_info> (vinfo))
3639 break;
3641 if (gatherscatter != SG_NONE || simd_lane_access)
3642 free_data_ref (dr);
3643 return false;
3646 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
3647 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
3649 if (dump_enabled_p ())
3651 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3652 "not vectorized: statement is bitfield "
3653 "access ");
3654 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3657 if (is_a <bb_vec_info> (vinfo))
3658 break;
3660 if (gatherscatter != SG_NONE || simd_lane_access)
3661 free_data_ref (dr);
3662 return false;
3665 base = unshare_expr (DR_BASE_ADDRESS (dr));
3666 offset = unshare_expr (DR_OFFSET (dr));
3667 init = unshare_expr (DR_INIT (dr));
3669 if (is_gimple_call (stmt)
3670 && (!gimple_call_internal_p (stmt)
3671 || (gimple_call_internal_fn (stmt) != IFN_MASK_LOAD
3672 && gimple_call_internal_fn (stmt) != IFN_MASK_STORE)))
3674 if (dump_enabled_p ())
3676 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3677 "not vectorized: dr in a call ");
3678 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3681 if (is_a <bb_vec_info> (vinfo))
3682 break;
3684 if (gatherscatter != SG_NONE || simd_lane_access)
3685 free_data_ref (dr);
3686 return false;
3689 /* Update DR field in stmt_vec_info struct. */
3691 /* If the dataref is in an inner-loop of the loop that is considered for
3692 for vectorization, we also want to analyze the access relative to
3693 the outer-loop (DR contains information only relative to the
3694 inner-most enclosing loop). We do that by building a reference to the
3695 first location accessed by the inner-loop, and analyze it relative to
3696 the outer-loop. */
3697 if (loop && nested_in_vect_loop_p (loop, stmt))
3699 /* Build a reference to the first location accessed by the
3700 inner loop: *(BASE + INIT + OFFSET). By construction,
3701 this address must be invariant in the inner loop, so we
3702 can consider it as being used in the outer loop. */
3703 tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
3704 init, offset);
3705 tree init_addr = fold_build_pointer_plus (base, init_offset);
3706 tree init_ref = build_fold_indirect_ref (init_addr);
3708 if (dump_enabled_p ())
3710 dump_printf_loc (MSG_NOTE, vect_location,
3711 "analyze in outer loop: ");
3712 dump_generic_expr (MSG_NOTE, TDF_SLIM, init_ref);
3713 dump_printf (MSG_NOTE, "\n");
3716 if (!dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
3717 init_ref, loop))
3718 /* dr_analyze_innermost already explained the failure. */
3719 return false;
3721 if (dump_enabled_p ())
3723 dump_printf_loc (MSG_NOTE, vect_location,
3724 "\touter base_address: ");
3725 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3726 STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3727 dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
3728 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3729 STMT_VINFO_DR_OFFSET (stmt_info));
3730 dump_printf (MSG_NOTE,
3731 "\n\touter constant offset from base address: ");
3732 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3733 STMT_VINFO_DR_INIT (stmt_info));
3734 dump_printf (MSG_NOTE, "\n\touter step: ");
3735 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3736 STMT_VINFO_DR_STEP (stmt_info));
3737 dump_printf (MSG_NOTE, "\n\touter base alignment: %d\n",
3738 STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info));
3739 dump_printf (MSG_NOTE, "\n\touter base misalignment: %d\n",
3740 STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info));
3741 dump_printf (MSG_NOTE, "\n\touter offset alignment: %d\n",
3742 STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info));
3743 dump_printf (MSG_NOTE, "\n\touter step alignment: %d\n",
3744 STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
3748 if (STMT_VINFO_DATA_REF (stmt_info))
3750 if (dump_enabled_p ())
3752 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3753 "not vectorized: more than one data ref "
3754 "in stmt: ");
3755 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3758 if (is_a <bb_vec_info> (vinfo))
3759 break;
3761 if (gatherscatter != SG_NONE || simd_lane_access)
3762 free_data_ref (dr);
3763 return false;
3766 STMT_VINFO_DATA_REF (stmt_info) = dr;
3767 if (simd_lane_access)
3769 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
3770 free_data_ref (datarefs[i]);
3771 datarefs[i] = dr;
3774 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == ADDR_EXPR
3775 && VAR_P (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0))
3776 && DECL_NONALIASED (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0)))
3778 if (dump_enabled_p ())
3780 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3781 "not vectorized: base object not addressable "
3782 "for stmt: ");
3783 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3785 if (is_a <bb_vec_info> (vinfo))
3787 /* In BB vectorization the ref can still participate
3788 in dependence analysis, we just can't vectorize it. */
3789 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
3790 continue;
3792 return false;
3795 /* Set vectype for STMT. */
3796 scalar_type = TREE_TYPE (DR_REF (dr));
3797 STMT_VINFO_VECTYPE (stmt_info)
3798 = get_vectype_for_scalar_type (scalar_type);
3799 if (!STMT_VINFO_VECTYPE (stmt_info))
3801 if (dump_enabled_p ())
3803 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3804 "not vectorized: no vectype for stmt: ");
3805 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3806 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
3807 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
3808 scalar_type);
3809 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3812 if (is_a <bb_vec_info> (vinfo))
3814 /* No vector type is fine, the ref can still participate
3815 in dependence analysis, we just can't vectorize it. */
3816 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
3817 continue;
3820 if (gatherscatter != SG_NONE || simd_lane_access)
3822 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3823 if (gatherscatter != SG_NONE)
3824 free_data_ref (dr);
3826 return false;
3828 else
3830 if (dump_enabled_p ())
3832 dump_printf_loc (MSG_NOTE, vect_location,
3833 "got vectype for stmt: ");
3834 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3835 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3836 STMT_VINFO_VECTYPE (stmt_info));
3837 dump_printf (MSG_NOTE, "\n");
3841 /* Adjust the minimal vectorization factor according to the
3842 vector type. */
3843 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
3844 if (vf > *min_vf)
3845 *min_vf = vf;
3847 if (gatherscatter != SG_NONE)
3849 gather_scatter_info gs_info;
3850 if (!vect_check_gather_scatter (stmt, as_a <loop_vec_info> (vinfo),
3851 &gs_info)
3852 || !get_vectype_for_scalar_type (TREE_TYPE (gs_info.offset)))
3854 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3855 free_data_ref (dr);
3856 if (dump_enabled_p ())
3858 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3859 (gatherscatter == GATHER) ?
3860 "not vectorized: not suitable for gather "
3861 "load " :
3862 "not vectorized: not suitable for scatter "
3863 "store ");
3864 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3866 return false;
3869 free_data_ref (datarefs[i]);
3870 datarefs[i] = dr;
3871 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
3874 else if (is_a <loop_vec_info> (vinfo)
3875 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
3877 if (nested_in_vect_loop_p (loop, stmt))
3879 if (dump_enabled_p ())
3881 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3882 "not vectorized: not suitable for strided "
3883 "load ");
3884 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3886 return false;
3888 STMT_VINFO_STRIDED_P (stmt_info) = true;
3892 /* If we stopped analysis at the first dataref we could not analyze
3893 when trying to vectorize a basic-block mark the rest of the datarefs
3894 as not vectorizable and truncate the vector of datarefs. That
3895 avoids spending useless time in analyzing their dependence. */
3896 if (i != datarefs.length ())
3898 gcc_assert (is_a <bb_vec_info> (vinfo));
3899 for (unsigned j = i; j < datarefs.length (); ++j)
3901 data_reference_p dr = datarefs[j];
3902 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
3903 free_data_ref (dr);
3905 datarefs.truncate (i);
3908 return true;
3912 /* Function vect_get_new_vect_var.
3914 Returns a name for a new variable. The current naming scheme appends the
3915 prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
3916 the name of vectorizer generated variables, and appends that to NAME if
3917 provided. */
3919 tree
3920 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
3922 const char *prefix;
3923 tree new_vect_var;
3925 switch (var_kind)
3927 case vect_simple_var:
3928 prefix = "vect";
3929 break;
3930 case vect_scalar_var:
3931 prefix = "stmp";
3932 break;
3933 case vect_mask_var:
3934 prefix = "mask";
3935 break;
3936 case vect_pointer_var:
3937 prefix = "vectp";
3938 break;
3939 default:
3940 gcc_unreachable ();
3943 if (name)
3945 char* tmp = concat (prefix, "_", name, NULL);
3946 new_vect_var = create_tmp_reg (type, tmp);
3947 free (tmp);
3949 else
3950 new_vect_var = create_tmp_reg (type, prefix);
3952 return new_vect_var;
3955 /* Like vect_get_new_vect_var but return an SSA name. */
3957 tree
3958 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
3960 const char *prefix;
3961 tree new_vect_var;
3963 switch (var_kind)
3965 case vect_simple_var:
3966 prefix = "vect";
3967 break;
3968 case vect_scalar_var:
3969 prefix = "stmp";
3970 break;
3971 case vect_pointer_var:
3972 prefix = "vectp";
3973 break;
3974 default:
3975 gcc_unreachable ();
3978 if (name)
3980 char* tmp = concat (prefix, "_", name, NULL);
3981 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
3982 free (tmp);
3984 else
3985 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
3987 return new_vect_var;
3990 /* Duplicate ptr info and set alignment/misaligment on NAME from DR. */
3992 static void
3993 vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr,
3994 stmt_vec_info stmt_info)
3996 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr));
3997 unsigned int align = TYPE_ALIGN_UNIT (STMT_VINFO_VECTYPE (stmt_info));
3998 int misalign = DR_MISALIGNMENT (dr);
3999 if (misalign == DR_MISALIGNMENT_UNKNOWN)
4000 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4001 else
4002 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name), align, misalign);
4005 /* Function vect_create_addr_base_for_vector_ref.
4007 Create an expression that computes the address of the first memory location
4008 that will be accessed for a data reference.
4010 Input:
4011 STMT: The statement containing the data reference.
4012 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4013 OFFSET: Optional. If supplied, it is be added to the initial address.
4014 LOOP: Specify relative to which loop-nest should the address be computed.
4015 For example, when the dataref is in an inner-loop nested in an
4016 outer-loop that is now being vectorized, LOOP can be either the
4017 outer-loop, or the inner-loop. The first memory location accessed
4018 by the following dataref ('in' points to short):
4020 for (i=0; i<N; i++)
4021 for (j=0; j<M; j++)
4022 s += in[i+j]
4024 is as follows:
4025 if LOOP=i_loop: &in (relative to i_loop)
4026 if LOOP=j_loop: &in+i*2B (relative to j_loop)
4027 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
4028 initial address. Unlike OFFSET, which is number of elements to
4029 be added, BYTE_OFFSET is measured in bytes.
4031 Output:
4032 1. Return an SSA_NAME whose value is the address of the memory location of
4033 the first vector of the data reference.
4034 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4035 these statement(s) which define the returned SSA_NAME.
4037 FORNOW: We are only handling array accesses with step 1. */
4039 tree
4040 vect_create_addr_base_for_vector_ref (gimple *stmt,
4041 gimple_seq *new_stmt_list,
4042 tree offset,
4043 tree byte_offset)
4045 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4046 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4047 const char *base_name;
4048 tree addr_base;
4049 tree dest;
4050 gimple_seq seq = NULL;
4051 tree vect_ptr_type;
4052 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
4053 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4054 innermost_loop_behavior *drb = vect_dr_behavior (dr);
4056 tree data_ref_base = unshare_expr (drb->base_address);
4057 tree base_offset = unshare_expr (drb->offset);
4058 tree init = unshare_expr (drb->init);
4060 if (loop_vinfo)
4061 base_name = get_name (data_ref_base);
4062 else
4064 base_offset = ssize_int (0);
4065 init = ssize_int (0);
4066 base_name = get_name (DR_REF (dr));
4069 /* Create base_offset */
4070 base_offset = size_binop (PLUS_EXPR,
4071 fold_convert (sizetype, base_offset),
4072 fold_convert (sizetype, init));
4074 if (offset)
4076 offset = fold_build2 (MULT_EXPR, sizetype,
4077 fold_convert (sizetype, offset), step);
4078 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4079 base_offset, offset);
4081 if (byte_offset)
4083 byte_offset = fold_convert (sizetype, byte_offset);
4084 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4085 base_offset, byte_offset);
4088 /* base + base_offset */
4089 if (loop_vinfo)
4090 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4091 else
4093 addr_base = build1 (ADDR_EXPR,
4094 build_pointer_type (TREE_TYPE (DR_REF (dr))),
4095 unshare_expr (DR_REF (dr)));
4098 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4099 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4100 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4101 gimple_seq_add_seq (new_stmt_list, seq);
4103 if (DR_PTR_INFO (dr)
4104 && TREE_CODE (addr_base) == SSA_NAME
4105 && !SSA_NAME_PTR_INFO (addr_base))
4107 vect_duplicate_ssa_name_ptr_info (addr_base, dr, stmt_info);
4108 if (offset || byte_offset)
4109 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4112 if (dump_enabled_p ())
4114 dump_printf_loc (MSG_NOTE, vect_location, "created ");
4115 dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
4116 dump_printf (MSG_NOTE, "\n");
4119 return addr_base;
4123 /* Function vect_create_data_ref_ptr.
4125 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4126 location accessed in the loop by STMT, along with the def-use update
4127 chain to appropriately advance the pointer through the loop iterations.
4128 Also set aliasing information for the pointer. This pointer is used by
4129 the callers to this function to create a memory reference expression for
4130 vector load/store access.
4132 Input:
4133 1. STMT: a stmt that references memory. Expected to be of the form
4134 GIMPLE_ASSIGN <name, data-ref> or
4135 GIMPLE_ASSIGN <data-ref, name>.
4136 2. AGGR_TYPE: the type of the reference, which should be either a vector
4137 or an array.
4138 3. AT_LOOP: the loop where the vector memref is to be created.
4139 4. OFFSET (optional): an offset to be added to the initial address accessed
4140 by the data-ref in STMT.
4141 5. BSI: location where the new stmts are to be placed if there is no loop
4142 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4143 pointing to the initial address.
4144 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4145 to the initial address accessed by the data-ref in STMT. This is
4146 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4147 in bytes.
4149 Output:
4150 1. Declare a new ptr to vector_type, and have it point to the base of the
4151 data reference (initial addressed accessed by the data reference).
4152 For example, for vector of type V8HI, the following code is generated:
4154 v8hi *ap;
4155 ap = (v8hi *)initial_address;
4157 if OFFSET is not supplied:
4158 initial_address = &a[init];
4159 if OFFSET is supplied:
4160 initial_address = &a[init + OFFSET];
4161 if BYTE_OFFSET is supplied:
4162 initial_address = &a[init] + BYTE_OFFSET;
4164 Return the initial_address in INITIAL_ADDRESS.
4166 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4167 update the pointer in each iteration of the loop.
4169 Return the increment stmt that updates the pointer in PTR_INCR.
4171 3. Set INV_P to true if the access pattern of the data reference in the
4172 vectorized loop is invariant. Set it to false otherwise.
4174 4. Return the pointer. */
4176 tree
4177 vect_create_data_ref_ptr (gimple *stmt, tree aggr_type, struct loop *at_loop,
4178 tree offset, tree *initial_address,
4179 gimple_stmt_iterator *gsi, gimple **ptr_incr,
4180 bool only_init, bool *inv_p, tree byte_offset)
4182 const char *base_name;
4183 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4184 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4185 struct loop *loop = NULL;
4186 bool nested_in_vect_loop = false;
4187 struct loop *containing_loop = NULL;
4188 tree aggr_ptr_type;
4189 tree aggr_ptr;
4190 tree new_temp;
4191 gimple_seq new_stmt_list = NULL;
4192 edge pe = NULL;
4193 basic_block new_bb;
4194 tree aggr_ptr_init;
4195 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4196 tree aptr;
4197 gimple_stmt_iterator incr_gsi;
4198 bool insert_after;
4199 tree indx_before_incr, indx_after_incr;
4200 gimple *incr;
4201 tree step;
4202 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4204 gcc_assert (TREE_CODE (aggr_type) == ARRAY_TYPE
4205 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4207 if (loop_vinfo)
4209 loop = LOOP_VINFO_LOOP (loop_vinfo);
4210 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4211 containing_loop = (gimple_bb (stmt))->loop_father;
4212 pe = loop_preheader_edge (loop);
4214 else
4216 gcc_assert (bb_vinfo);
4217 only_init = true;
4218 *ptr_incr = NULL;
4221 /* Check the step (evolution) of the load in LOOP, and record
4222 whether it's invariant. */
4223 step = vect_dr_behavior (dr)->step;
4224 if (integer_zerop (step))
4225 *inv_p = true;
4226 else
4227 *inv_p = false;
4229 /* Create an expression for the first address accessed by this load
4230 in LOOP. */
4231 base_name = get_name (DR_BASE_ADDRESS (dr));
4233 if (dump_enabled_p ())
4235 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4236 dump_printf_loc (MSG_NOTE, vect_location,
4237 "create %s-pointer variable to type: ",
4238 get_tree_code_name (TREE_CODE (aggr_type)));
4239 dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4240 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4241 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4242 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4243 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4244 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4245 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4246 else
4247 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4248 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4249 dump_printf (MSG_NOTE, "\n");
4252 /* (1) Create the new aggregate-pointer variable.
4253 Vector and array types inherit the alias set of their component
4254 type by default so we need to use a ref-all pointer if the data
4255 reference does not conflict with the created aggregated data
4256 reference because it is not addressable. */
4257 bool need_ref_all = false;
4258 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4259 get_alias_set (DR_REF (dr))))
4260 need_ref_all = true;
4261 /* Likewise for any of the data references in the stmt group. */
4262 else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
4264 gimple *orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
4267 stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
4268 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4269 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4270 get_alias_set (DR_REF (sdr))))
4272 need_ref_all = true;
4273 break;
4275 orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);
4277 while (orig_stmt);
4279 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4280 need_ref_all);
4281 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4284 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4285 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4286 def-use update cycles for the pointer: one relative to the outer-loop
4287 (LOOP), which is what steps (3) and (4) below do. The other is relative
4288 to the inner-loop (which is the inner-most loop containing the dataref),
4289 and this is done be step (5) below.
4291 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4292 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4293 redundant. Steps (3),(4) create the following:
4295 vp0 = &base_addr;
4296 LOOP: vp1 = phi(vp0,vp2)
4299 vp2 = vp1 + step
4300 goto LOOP
4302 If there is an inner-loop nested in loop, then step (5) will also be
4303 applied, and an additional update in the inner-loop will be created:
4305 vp0 = &base_addr;
4306 LOOP: vp1 = phi(vp0,vp2)
4308 inner: vp3 = phi(vp1,vp4)
4309 vp4 = vp3 + inner_step
4310 if () goto inner
4312 vp2 = vp1 + step
4313 if () goto LOOP */
4315 /* (2) Calculate the initial address of the aggregate-pointer, and set
4316 the aggregate-pointer to point to it before the loop. */
4318 /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader. */
4320 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
4321 offset, byte_offset);
4322 if (new_stmt_list)
4324 if (pe)
4326 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4327 gcc_assert (!new_bb);
4329 else
4330 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4333 *initial_address = new_temp;
4334 aggr_ptr_init = new_temp;
4336 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4337 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4338 inner-loop nested in LOOP (during outer-loop vectorization). */
4340 /* No update in loop is required. */
4341 if (only_init && (!loop_vinfo || at_loop == loop))
4342 aptr = aggr_ptr_init;
4343 else
4345 /* The step of the aggregate pointer is the type size. */
4346 tree iv_step = TYPE_SIZE_UNIT (aggr_type);
4347 /* One exception to the above is when the scalar step of the load in
4348 LOOP is zero. In this case the step here is also zero. */
4349 if (*inv_p)
4350 iv_step = size_zero_node;
4351 else if (tree_int_cst_sgn (step) == -1)
4352 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4354 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4356 create_iv (aggr_ptr_init,
4357 fold_convert (aggr_ptr_type, iv_step),
4358 aggr_ptr, loop, &incr_gsi, insert_after,
4359 &indx_before_incr, &indx_after_incr);
4360 incr = gsi_stmt (incr_gsi);
4361 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4363 /* Copy the points-to information if it exists. */
4364 if (DR_PTR_INFO (dr))
4366 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info);
4367 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info);
4369 if (ptr_incr)
4370 *ptr_incr = incr;
4372 aptr = indx_before_incr;
4375 if (!nested_in_vect_loop || only_init)
4376 return aptr;
4379 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4380 nested in LOOP, if exists. */
4382 gcc_assert (nested_in_vect_loop);
4383 if (!only_init)
4385 standard_iv_increment_position (containing_loop, &incr_gsi,
4386 &insert_after);
4387 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4388 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4389 &indx_after_incr);
4390 incr = gsi_stmt (incr_gsi);
4391 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4393 /* Copy the points-to information if it exists. */
4394 if (DR_PTR_INFO (dr))
4396 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info);
4397 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info);
4399 if (ptr_incr)
4400 *ptr_incr = incr;
4402 return indx_before_incr;
4404 else
4405 gcc_unreachable ();
4409 /* Function bump_vector_ptr
4411 Increment a pointer (to a vector type) by vector-size. If requested,
4412 i.e. if PTR-INCR is given, then also connect the new increment stmt
4413 to the existing def-use update-chain of the pointer, by modifying
4414 the PTR_INCR as illustrated below:
4416 The pointer def-use update-chain before this function:
4417 DATAREF_PTR = phi (p_0, p_2)
4418 ....
4419 PTR_INCR: p_2 = DATAREF_PTR + step
4421 The pointer def-use update-chain after this function:
4422 DATAREF_PTR = phi (p_0, p_2)
4423 ....
4424 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4425 ....
4426 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4428 Input:
4429 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4430 in the loop.
4431 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4432 the loop. The increment amount across iterations is expected
4433 to be vector_size.
4434 BSI - location where the new update stmt is to be placed.
4435 STMT - the original scalar memory-access stmt that is being vectorized.
4436 BUMP - optional. The offset by which to bump the pointer. If not given,
4437 the offset is assumed to be vector_size.
4439 Output: Return NEW_DATAREF_PTR as illustrated above.
4443 tree
4444 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4445 gimple *stmt, tree bump)
4447 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4448 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4449 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4450 tree update = TYPE_SIZE_UNIT (vectype);
4451 gassign *incr_stmt;
4452 ssa_op_iter iter;
4453 use_operand_p use_p;
4454 tree new_dataref_ptr;
4456 if (bump)
4457 update = bump;
4459 if (TREE_CODE (dataref_ptr) == SSA_NAME)
4460 new_dataref_ptr = copy_ssa_name (dataref_ptr);
4461 else
4462 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4463 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4464 dataref_ptr, update);
4465 vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4467 /* Copy the points-to information if it exists. */
4468 if (DR_PTR_INFO (dr))
4470 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4471 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4474 if (!ptr_incr)
4475 return new_dataref_ptr;
4477 /* Update the vector-pointer's cross-iteration increment. */
4478 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4480 tree use = USE_FROM_PTR (use_p);
4482 if (use == dataref_ptr)
4483 SET_USE (use_p, new_dataref_ptr);
4484 else
4485 gcc_assert (tree_int_cst_compare (use, update) == 0);
4488 return new_dataref_ptr;
4492 /* Function vect_create_destination_var.
4494 Create a new temporary of type VECTYPE. */
4496 tree
4497 vect_create_destination_var (tree scalar_dest, tree vectype)
4499 tree vec_dest;
4500 const char *name;
4501 char *new_name;
4502 tree type;
4503 enum vect_var_kind kind;
4505 kind = vectype
4506 ? VECTOR_BOOLEAN_TYPE_P (vectype)
4507 ? vect_mask_var
4508 : vect_simple_var
4509 : vect_scalar_var;
4510 type = vectype ? vectype : TREE_TYPE (scalar_dest);
4512 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
4514 name = get_name (scalar_dest);
4515 if (name)
4516 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
4517 else
4518 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
4519 vec_dest = vect_get_new_vect_var (type, kind, new_name);
4520 free (new_name);
4522 return vec_dest;
4525 /* Function vect_grouped_store_supported.
4527 Returns TRUE if interleave high and interleave low permutations
4528 are supported, and FALSE otherwise. */
4530 bool
4531 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
4533 machine_mode mode = TYPE_MODE (vectype);
4535 /* vect_permute_store_chain requires the group size to be equal to 3 or
4536 be a power of two. */
4537 if (count != 3 && exact_log2 (count) == -1)
4539 if (dump_enabled_p ())
4540 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4541 "the size of the group of accesses"
4542 " is not a power of 2 or not eqaul to 3\n");
4543 return false;
4546 /* Check that the permutation is supported. */
4547 if (VECTOR_MODE_P (mode))
4549 unsigned int i, nelt = GET_MODE_NUNITS (mode);
4550 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4552 if (count == 3)
4554 unsigned int j0 = 0, j1 = 0, j2 = 0;
4555 unsigned int i, j;
4557 for (j = 0; j < 3; j++)
4559 int nelt0 = ((3 - j) * nelt) % 3;
4560 int nelt1 = ((3 - j) * nelt + 1) % 3;
4561 int nelt2 = ((3 - j) * nelt + 2) % 3;
4562 for (i = 0; i < nelt; i++)
4564 if (3 * i + nelt0 < nelt)
4565 sel[3 * i + nelt0] = j0++;
4566 if (3 * i + nelt1 < nelt)
4567 sel[3 * i + nelt1] = nelt + j1++;
4568 if (3 * i + nelt2 < nelt)
4569 sel[3 * i + nelt2] = 0;
4571 if (!can_vec_perm_p (mode, false, sel))
4573 if (dump_enabled_p ())
4574 dump_printf (MSG_MISSED_OPTIMIZATION,
4575 "permutaion op not supported by target.\n");
4576 return false;
4579 for (i = 0; i < nelt; i++)
4581 if (3 * i + nelt0 < nelt)
4582 sel[3 * i + nelt0] = 3 * i + nelt0;
4583 if (3 * i + nelt1 < nelt)
4584 sel[3 * i + nelt1] = 3 * i + nelt1;
4585 if (3 * i + nelt2 < nelt)
4586 sel[3 * i + nelt2] = nelt + j2++;
4588 if (!can_vec_perm_p (mode, false, sel))
4590 if (dump_enabled_p ())
4591 dump_printf (MSG_MISSED_OPTIMIZATION,
4592 "permutaion op not supported by target.\n");
4593 return false;
4596 return true;
4598 else
4600 /* If length is not equal to 3 then only power of 2 is supported. */
4601 gcc_assert (pow2p_hwi (count));
4603 for (i = 0; i < nelt / 2; i++)
4605 sel[i * 2] = i;
4606 sel[i * 2 + 1] = i + nelt;
4608 if (can_vec_perm_p (mode, false, sel))
4610 for (i = 0; i < nelt; i++)
4611 sel[i] += nelt / 2;
4612 if (can_vec_perm_p (mode, false, sel))
4613 return true;
4618 if (dump_enabled_p ())
4619 dump_printf (MSG_MISSED_OPTIMIZATION,
4620 "permutaion op not supported by target.\n");
4621 return false;
4625 /* Return TRUE if vec_store_lanes is available for COUNT vectors of
4626 type VECTYPE. */
4628 bool
4629 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
4631 return vect_lanes_optab_supported_p ("vec_store_lanes",
4632 vec_store_lanes_optab,
4633 vectype, count);
4637 /* Function vect_permute_store_chain.
4639 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4640 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
4641 the data correctly for the stores. Return the final references for stores
4642 in RESULT_CHAIN.
4644 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4645 The input is 4 vectors each containing 8 elements. We assign a number to
4646 each element, the input sequence is:
4648 1st vec: 0 1 2 3 4 5 6 7
4649 2nd vec: 8 9 10 11 12 13 14 15
4650 3rd vec: 16 17 18 19 20 21 22 23
4651 4th vec: 24 25 26 27 28 29 30 31
4653 The output sequence should be:
4655 1st vec: 0 8 16 24 1 9 17 25
4656 2nd vec: 2 10 18 26 3 11 19 27
4657 3rd vec: 4 12 20 28 5 13 21 30
4658 4th vec: 6 14 22 30 7 15 23 31
4660 i.e., we interleave the contents of the four vectors in their order.
4662 We use interleave_high/low instructions to create such output. The input of
4663 each interleave_high/low operation is two vectors:
4664 1st vec 2nd vec
4665 0 1 2 3 4 5 6 7
4666 the even elements of the result vector are obtained left-to-right from the
4667 high/low elements of the first vector. The odd elements of the result are
4668 obtained left-to-right from the high/low elements of the second vector.
4669 The output of interleave_high will be: 0 4 1 5
4670 and of interleave_low: 2 6 3 7
4673 The permutation is done in log LENGTH stages. In each stage interleave_high
4674 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4675 where the first argument is taken from the first half of DR_CHAIN and the
4676 second argument from it's second half.
4677 In our example,
4679 I1: interleave_high (1st vec, 3rd vec)
4680 I2: interleave_low (1st vec, 3rd vec)
4681 I3: interleave_high (2nd vec, 4th vec)
4682 I4: interleave_low (2nd vec, 4th vec)
4684 The output for the first stage is:
4686 I1: 0 16 1 17 2 18 3 19
4687 I2: 4 20 5 21 6 22 7 23
4688 I3: 8 24 9 25 10 26 11 27
4689 I4: 12 28 13 29 14 30 15 31
4691 The output of the second stage, i.e. the final result is:
4693 I1: 0 8 16 24 1 9 17 25
4694 I2: 2 10 18 26 3 11 19 27
4695 I3: 4 12 20 28 5 13 21 30
4696 I4: 6 14 22 30 7 15 23 31. */
4698 void
4699 vect_permute_store_chain (vec<tree> dr_chain,
4700 unsigned int length,
4701 gimple *stmt,
4702 gimple_stmt_iterator *gsi,
4703 vec<tree> *result_chain)
4705 tree vect1, vect2, high, low;
4706 gimple *perm_stmt;
4707 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4708 tree perm_mask_low, perm_mask_high;
4709 tree data_ref;
4710 tree perm3_mask_low, perm3_mask_high;
4711 unsigned int i, n, log_length = exact_log2 (length);
4712 unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype);
4713 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4715 result_chain->quick_grow (length);
4716 memcpy (result_chain->address (), dr_chain.address (),
4717 length * sizeof (tree));
4719 if (length == 3)
4721 unsigned int j0 = 0, j1 = 0, j2 = 0;
4723 for (j = 0; j < 3; j++)
4725 int nelt0 = ((3 - j) * nelt) % 3;
4726 int nelt1 = ((3 - j) * nelt + 1) % 3;
4727 int nelt2 = ((3 - j) * nelt + 2) % 3;
4729 for (i = 0; i < nelt; i++)
4731 if (3 * i + nelt0 < nelt)
4732 sel[3 * i + nelt0] = j0++;
4733 if (3 * i + nelt1 < nelt)
4734 sel[3 * i + nelt1] = nelt + j1++;
4735 if (3 * i + nelt2 < nelt)
4736 sel[3 * i + nelt2] = 0;
4738 perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
4740 for (i = 0; i < nelt; i++)
4742 if (3 * i + nelt0 < nelt)
4743 sel[3 * i + nelt0] = 3 * i + nelt0;
4744 if (3 * i + nelt1 < nelt)
4745 sel[3 * i + nelt1] = 3 * i + nelt1;
4746 if (3 * i + nelt2 < nelt)
4747 sel[3 * i + nelt2] = nelt + j2++;
4749 perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
4751 vect1 = dr_chain[0];
4752 vect2 = dr_chain[1];
4754 /* Create interleaving stmt:
4755 low = VEC_PERM_EXPR <vect1, vect2,
4756 {j, nelt, *, j + 1, nelt + j + 1, *,
4757 j + 2, nelt + j + 2, *, ...}> */
4758 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
4759 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4760 vect2, perm3_mask_low);
4761 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4763 vect1 = data_ref;
4764 vect2 = dr_chain[2];
4765 /* Create interleaving stmt:
4766 low = VEC_PERM_EXPR <vect1, vect2,
4767 {0, 1, nelt + j, 3, 4, nelt + j + 1,
4768 6, 7, nelt + j + 2, ...}> */
4769 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
4770 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4771 vect2, perm3_mask_high);
4772 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4773 (*result_chain)[j] = data_ref;
4776 else
4778 /* If length is not equal to 3 then only power of 2 is supported. */
4779 gcc_assert (pow2p_hwi (length));
4781 for (i = 0, n = nelt / 2; i < n; i++)
4783 sel[i * 2] = i;
4784 sel[i * 2 + 1] = i + nelt;
4786 perm_mask_high = vect_gen_perm_mask_checked (vectype, sel);
4788 for (i = 0; i < nelt; i++)
4789 sel[i] += nelt / 2;
4790 perm_mask_low = vect_gen_perm_mask_checked (vectype, sel);
4792 for (i = 0, n = log_length; i < n; i++)
4794 for (j = 0; j < length/2; j++)
4796 vect1 = dr_chain[j];
4797 vect2 = dr_chain[j+length/2];
4799 /* Create interleaving stmt:
4800 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
4801 ...}> */
4802 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
4803 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
4804 vect2, perm_mask_high);
4805 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4806 (*result_chain)[2*j] = high;
4808 /* Create interleaving stmt:
4809 low = VEC_PERM_EXPR <vect1, vect2,
4810 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
4811 ...}> */
4812 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
4813 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
4814 vect2, perm_mask_low);
4815 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4816 (*result_chain)[2*j+1] = low;
4818 memcpy (dr_chain.address (), result_chain->address (),
4819 length * sizeof (tree));
4824 /* Function vect_setup_realignment
4826 This function is called when vectorizing an unaligned load using
4827 the dr_explicit_realign[_optimized] scheme.
4828 This function generates the following code at the loop prolog:
4830 p = initial_addr;
4831 x msq_init = *(floor(p)); # prolog load
4832 realignment_token = call target_builtin;
4833 loop:
4834 x msq = phi (msq_init, ---)
4836 The stmts marked with x are generated only for the case of
4837 dr_explicit_realign_optimized.
4839 The code above sets up a new (vector) pointer, pointing to the first
4840 location accessed by STMT, and a "floor-aligned" load using that pointer.
4841 It also generates code to compute the "realignment-token" (if the relevant
4842 target hook was defined), and creates a phi-node at the loop-header bb
4843 whose arguments are the result of the prolog-load (created by this
4844 function) and the result of a load that takes place in the loop (to be
4845 created by the caller to this function).
4847 For the case of dr_explicit_realign_optimized:
4848 The caller to this function uses the phi-result (msq) to create the
4849 realignment code inside the loop, and sets up the missing phi argument,
4850 as follows:
4851 loop:
4852 msq = phi (msq_init, lsq)
4853 lsq = *(floor(p')); # load in loop
4854 result = realign_load (msq, lsq, realignment_token);
4856 For the case of dr_explicit_realign:
4857 loop:
4858 msq = *(floor(p)); # load in loop
4859 p' = p + (VS-1);
4860 lsq = *(floor(p')); # load in loop
4861 result = realign_load (msq, lsq, realignment_token);
4863 Input:
4864 STMT - (scalar) load stmt to be vectorized. This load accesses
4865 a memory location that may be unaligned.
4866 BSI - place where new code is to be inserted.
4867 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
4868 is used.
4870 Output:
4871 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
4872 target hook, if defined.
4873 Return value - the result of the loop-header phi node. */
4875 tree
4876 vect_setup_realignment (gimple *stmt, gimple_stmt_iterator *gsi,
4877 tree *realignment_token,
4878 enum dr_alignment_support alignment_support_scheme,
4879 tree init_addr,
4880 struct loop **at_loop)
4882 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4883 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4884 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4885 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4886 struct loop *loop = NULL;
4887 edge pe = NULL;
4888 tree scalar_dest = gimple_assign_lhs (stmt);
4889 tree vec_dest;
4890 gimple *inc;
4891 tree ptr;
4892 tree data_ref;
4893 basic_block new_bb;
4894 tree msq_init = NULL_TREE;
4895 tree new_temp;
4896 gphi *phi_stmt;
4897 tree msq = NULL_TREE;
4898 gimple_seq stmts = NULL;
4899 bool inv_p;
4900 bool compute_in_loop = false;
4901 bool nested_in_vect_loop = false;
4902 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
4903 struct loop *loop_for_initial_load = NULL;
4905 if (loop_vinfo)
4907 loop = LOOP_VINFO_LOOP (loop_vinfo);
4908 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4911 gcc_assert (alignment_support_scheme == dr_explicit_realign
4912 || alignment_support_scheme == dr_explicit_realign_optimized);
4914 /* We need to generate three things:
4915 1. the misalignment computation
4916 2. the extra vector load (for the optimized realignment scheme).
4917 3. the phi node for the two vectors from which the realignment is
4918 done (for the optimized realignment scheme). */
4920 /* 1. Determine where to generate the misalignment computation.
4922 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
4923 calculation will be generated by this function, outside the loop (in the
4924 preheader). Otherwise, INIT_ADDR had already been computed for us by the
4925 caller, inside the loop.
4927 Background: If the misalignment remains fixed throughout the iterations of
4928 the loop, then both realignment schemes are applicable, and also the
4929 misalignment computation can be done outside LOOP. This is because we are
4930 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
4931 are a multiple of VS (the Vector Size), and therefore the misalignment in
4932 different vectorized LOOP iterations is always the same.
4933 The problem arises only if the memory access is in an inner-loop nested
4934 inside LOOP, which is now being vectorized using outer-loop vectorization.
4935 This is the only case when the misalignment of the memory access may not
4936 remain fixed throughout the iterations of the inner-loop (as explained in
4937 detail in vect_supportable_dr_alignment). In this case, not only is the
4938 optimized realignment scheme not applicable, but also the misalignment
4939 computation (and generation of the realignment token that is passed to
4940 REALIGN_LOAD) have to be done inside the loop.
4942 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
4943 or not, which in turn determines if the misalignment is computed inside
4944 the inner-loop, or outside LOOP. */
4946 if (init_addr != NULL_TREE || !loop_vinfo)
4948 compute_in_loop = true;
4949 gcc_assert (alignment_support_scheme == dr_explicit_realign);
4953 /* 2. Determine where to generate the extra vector load.
4955 For the optimized realignment scheme, instead of generating two vector
4956 loads in each iteration, we generate a single extra vector load in the
4957 preheader of the loop, and in each iteration reuse the result of the
4958 vector load from the previous iteration. In case the memory access is in
4959 an inner-loop nested inside LOOP, which is now being vectorized using
4960 outer-loop vectorization, we need to determine whether this initial vector
4961 load should be generated at the preheader of the inner-loop, or can be
4962 generated at the preheader of LOOP. If the memory access has no evolution
4963 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
4964 to be generated inside LOOP (in the preheader of the inner-loop). */
4966 if (nested_in_vect_loop)
4968 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
4969 bool invariant_in_outerloop =
4970 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
4971 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
4973 else
4974 loop_for_initial_load = loop;
4975 if (at_loop)
4976 *at_loop = loop_for_initial_load;
4978 if (loop_for_initial_load)
4979 pe = loop_preheader_edge (loop_for_initial_load);
4981 /* 3. For the case of the optimized realignment, create the first vector
4982 load at the loop preheader. */
4984 if (alignment_support_scheme == dr_explicit_realign_optimized)
4986 /* Create msq_init = *(floor(p1)) in the loop preheader */
4987 gassign *new_stmt;
4989 gcc_assert (!compute_in_loop);
4990 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4991 ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
4992 NULL_TREE, &init_addr, NULL, &inc,
4993 true, &inv_p);
4994 if (TREE_CODE (ptr) == SSA_NAME)
4995 new_temp = copy_ssa_name (ptr);
4996 else
4997 new_temp = make_ssa_name (TREE_TYPE (ptr));
4998 new_stmt = gimple_build_assign
4999 (new_temp, BIT_AND_EXPR, ptr,
5000 build_int_cst (TREE_TYPE (ptr),
5001 -(HOST_WIDE_INT)TYPE_ALIGN_UNIT (vectype)));
5002 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5003 gcc_assert (!new_bb);
5004 data_ref
5005 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5006 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5007 new_stmt = gimple_build_assign (vec_dest, data_ref);
5008 new_temp = make_ssa_name (vec_dest, new_stmt);
5009 gimple_assign_set_lhs (new_stmt, new_temp);
5010 if (pe)
5012 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5013 gcc_assert (!new_bb);
5015 else
5016 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5018 msq_init = gimple_assign_lhs (new_stmt);
5021 /* 4. Create realignment token using a target builtin, if available.
5022 It is done either inside the containing loop, or before LOOP (as
5023 determined above). */
5025 if (targetm.vectorize.builtin_mask_for_load)
5027 gcall *new_stmt;
5028 tree builtin_decl;
5030 /* Compute INIT_ADDR - the initial addressed accessed by this memref. */
5031 if (!init_addr)
5033 /* Generate the INIT_ADDR computation outside LOOP. */
5034 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
5035 NULL_TREE);
5036 if (loop)
5038 pe = loop_preheader_edge (loop);
5039 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5040 gcc_assert (!new_bb);
5042 else
5043 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5046 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5047 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5048 vec_dest =
5049 vect_create_destination_var (scalar_dest,
5050 gimple_call_return_type (new_stmt));
5051 new_temp = make_ssa_name (vec_dest, new_stmt);
5052 gimple_call_set_lhs (new_stmt, new_temp);
5054 if (compute_in_loop)
5055 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5056 else
5058 /* Generate the misalignment computation outside LOOP. */
5059 pe = loop_preheader_edge (loop);
5060 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5061 gcc_assert (!new_bb);
5064 *realignment_token = gimple_call_lhs (new_stmt);
5066 /* The result of the CALL_EXPR to this builtin is determined from
5067 the value of the parameter and no global variables are touched
5068 which makes the builtin a "const" function. Requiring the
5069 builtin to have the "const" attribute makes it unnecessary
5070 to call mark_call_clobbered. */
5071 gcc_assert (TREE_READONLY (builtin_decl));
5074 if (alignment_support_scheme == dr_explicit_realign)
5075 return msq;
5077 gcc_assert (!compute_in_loop);
5078 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5081 /* 5. Create msq = phi <msq_init, lsq> in loop */
5083 pe = loop_preheader_edge (containing_loop);
5084 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5085 msq = make_ssa_name (vec_dest);
5086 phi_stmt = create_phi_node (msq, containing_loop->header);
5087 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5089 return msq;
5093 /* Function vect_grouped_load_supported.
5095 COUNT is the size of the load group (the number of statements plus the
5096 number of gaps). SINGLE_ELEMENT_P is true if there is actually
5097 only one statement, with a gap of COUNT - 1.
5099 Returns true if a suitable permute exists. */
5101 bool
5102 vect_grouped_load_supported (tree vectype, bool single_element_p,
5103 unsigned HOST_WIDE_INT count)
5105 machine_mode mode = TYPE_MODE (vectype);
5107 /* If this is single-element interleaving with an element distance
5108 that leaves unused vector loads around punt - we at least create
5109 very sub-optimal code in that case (and blow up memory,
5110 see PR65518). */
5111 if (single_element_p && count > TYPE_VECTOR_SUBPARTS (vectype))
5113 if (dump_enabled_p ())
5114 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5115 "single-element interleaving not supported "
5116 "for not adjacent vector loads\n");
5117 return false;
5120 /* vect_permute_load_chain requires the group size to be equal to 3 or
5121 be a power of two. */
5122 if (count != 3 && exact_log2 (count) == -1)
5124 if (dump_enabled_p ())
5125 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5126 "the size of the group of accesses"
5127 " is not a power of 2 or not equal to 3\n");
5128 return false;
5131 /* Check that the permutation is supported. */
5132 if (VECTOR_MODE_P (mode))
5134 unsigned int i, j, nelt = GET_MODE_NUNITS (mode);
5135 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5137 if (count == 3)
5139 unsigned int k;
5140 for (k = 0; k < 3; k++)
5142 for (i = 0; i < nelt; i++)
5143 if (3 * i + k < 2 * nelt)
5144 sel[i] = 3 * i + k;
5145 else
5146 sel[i] = 0;
5147 if (!can_vec_perm_p (mode, false, sel))
5149 if (dump_enabled_p ())
5150 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5151 "shuffle of 3 loads is not supported by"
5152 " target\n");
5153 return false;
5155 for (i = 0, j = 0; i < nelt; i++)
5156 if (3 * i + k < 2 * nelt)
5157 sel[i] = i;
5158 else
5159 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5160 if (!can_vec_perm_p (mode, false, sel))
5162 if (dump_enabled_p ())
5163 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5164 "shuffle of 3 loads is not supported by"
5165 " target\n");
5166 return false;
5169 return true;
5171 else
5173 /* If length is not equal to 3 then only power of 2 is supported. */
5174 gcc_assert (pow2p_hwi (count));
5175 for (i = 0; i < nelt; i++)
5176 sel[i] = i * 2;
5177 if (can_vec_perm_p (mode, false, sel))
5179 for (i = 0; i < nelt; i++)
5180 sel[i] = i * 2 + 1;
5181 if (can_vec_perm_p (mode, false, sel))
5182 return true;
5187 if (dump_enabled_p ())
5188 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5189 "extract even/odd not supported by target\n");
5190 return false;
5193 /* Return TRUE if vec_load_lanes is available for COUNT vectors of
5194 type VECTYPE. */
5196 bool
5197 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
5199 return vect_lanes_optab_supported_p ("vec_load_lanes",
5200 vec_load_lanes_optab,
5201 vectype, count);
5204 /* Function vect_permute_load_chain.
5206 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5207 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5208 the input data correctly. Return the final references for loads in
5209 RESULT_CHAIN.
5211 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5212 The input is 4 vectors each containing 8 elements. We assign a number to each
5213 element, the input sequence is:
5215 1st vec: 0 1 2 3 4 5 6 7
5216 2nd vec: 8 9 10 11 12 13 14 15
5217 3rd vec: 16 17 18 19 20 21 22 23
5218 4th vec: 24 25 26 27 28 29 30 31
5220 The output sequence should be:
5222 1st vec: 0 4 8 12 16 20 24 28
5223 2nd vec: 1 5 9 13 17 21 25 29
5224 3rd vec: 2 6 10 14 18 22 26 30
5225 4th vec: 3 7 11 15 19 23 27 31
5227 i.e., the first output vector should contain the first elements of each
5228 interleaving group, etc.
5230 We use extract_even/odd instructions to create such output. The input of
5231 each extract_even/odd operation is two vectors
5232 1st vec 2nd vec
5233 0 1 2 3 4 5 6 7
5235 and the output is the vector of extracted even/odd elements. The output of
5236 extract_even will be: 0 2 4 6
5237 and of extract_odd: 1 3 5 7
5240 The permutation is done in log LENGTH stages. In each stage extract_even
5241 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5242 their order. In our example,
5244 E1: extract_even (1st vec, 2nd vec)
5245 E2: extract_odd (1st vec, 2nd vec)
5246 E3: extract_even (3rd vec, 4th vec)
5247 E4: extract_odd (3rd vec, 4th vec)
5249 The output for the first stage will be:
5251 E1: 0 2 4 6 8 10 12 14
5252 E2: 1 3 5 7 9 11 13 15
5253 E3: 16 18 20 22 24 26 28 30
5254 E4: 17 19 21 23 25 27 29 31
5256 In order to proceed and create the correct sequence for the next stage (or
5257 for the correct output, if the second stage is the last one, as in our
5258 example), we first put the output of extract_even operation and then the
5259 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5260 The input for the second stage is:
5262 1st vec (E1): 0 2 4 6 8 10 12 14
5263 2nd vec (E3): 16 18 20 22 24 26 28 30
5264 3rd vec (E2): 1 3 5 7 9 11 13 15
5265 4th vec (E4): 17 19 21 23 25 27 29 31
5267 The output of the second stage:
5269 E1: 0 4 8 12 16 20 24 28
5270 E2: 2 6 10 14 18 22 26 30
5271 E3: 1 5 9 13 17 21 25 29
5272 E4: 3 7 11 15 19 23 27 31
5274 And RESULT_CHAIN after reordering:
5276 1st vec (E1): 0 4 8 12 16 20 24 28
5277 2nd vec (E3): 1 5 9 13 17 21 25 29
5278 3rd vec (E2): 2 6 10 14 18 22 26 30
5279 4th vec (E4): 3 7 11 15 19 23 27 31. */
5281 static void
5282 vect_permute_load_chain (vec<tree> dr_chain,
5283 unsigned int length,
5284 gimple *stmt,
5285 gimple_stmt_iterator *gsi,
5286 vec<tree> *result_chain)
5288 tree data_ref, first_vect, second_vect;
5289 tree perm_mask_even, perm_mask_odd;
5290 tree perm3_mask_low, perm3_mask_high;
5291 gimple *perm_stmt;
5292 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5293 unsigned int i, j, log_length = exact_log2 (length);
5294 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5295 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5297 result_chain->quick_grow (length);
5298 memcpy (result_chain->address (), dr_chain.address (),
5299 length * sizeof (tree));
5301 if (length == 3)
5303 unsigned int k;
5305 for (k = 0; k < 3; k++)
5307 for (i = 0; i < nelt; i++)
5308 if (3 * i + k < 2 * nelt)
5309 sel[i] = 3 * i + k;
5310 else
5311 sel[i] = 0;
5312 perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
5314 for (i = 0, j = 0; i < nelt; i++)
5315 if (3 * i + k < 2 * nelt)
5316 sel[i] = i;
5317 else
5318 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5320 perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
5322 first_vect = dr_chain[0];
5323 second_vect = dr_chain[1];
5325 /* Create interleaving stmt (low part of):
5326 low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5327 ...}> */
5328 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5329 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5330 second_vect, perm3_mask_low);
5331 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5333 /* Create interleaving stmt (high part of):
5334 high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5335 ...}> */
5336 first_vect = data_ref;
5337 second_vect = dr_chain[2];
5338 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5339 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5340 second_vect, perm3_mask_high);
5341 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5342 (*result_chain)[k] = data_ref;
5345 else
5347 /* If length is not equal to 3 then only power of 2 is supported. */
5348 gcc_assert (pow2p_hwi (length));
5350 for (i = 0; i < nelt; ++i)
5351 sel[i] = i * 2;
5352 perm_mask_even = vect_gen_perm_mask_checked (vectype, sel);
5354 for (i = 0; i < nelt; ++i)
5355 sel[i] = i * 2 + 1;
5356 perm_mask_odd = vect_gen_perm_mask_checked (vectype, sel);
5358 for (i = 0; i < log_length; i++)
5360 for (j = 0; j < length; j += 2)
5362 first_vect = dr_chain[j];
5363 second_vect = dr_chain[j+1];
5365 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5366 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5367 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5368 first_vect, second_vect,
5369 perm_mask_even);
5370 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5371 (*result_chain)[j/2] = data_ref;
5373 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5374 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5375 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5376 first_vect, second_vect,
5377 perm_mask_odd);
5378 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5379 (*result_chain)[j/2+length/2] = data_ref;
5381 memcpy (dr_chain.address (), result_chain->address (),
5382 length * sizeof (tree));
5387 /* Function vect_shift_permute_load_chain.
5389 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
5390 sequence of stmts to reorder the input data accordingly.
5391 Return the final references for loads in RESULT_CHAIN.
5392 Return true if successed, false otherwise.
5394 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5395 The input is 3 vectors each containing 8 elements. We assign a
5396 number to each element, the input sequence is:
5398 1st vec: 0 1 2 3 4 5 6 7
5399 2nd vec: 8 9 10 11 12 13 14 15
5400 3rd vec: 16 17 18 19 20 21 22 23
5402 The output sequence should be:
5404 1st vec: 0 3 6 9 12 15 18 21
5405 2nd vec: 1 4 7 10 13 16 19 22
5406 3rd vec: 2 5 8 11 14 17 20 23
5408 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
5410 First we shuffle all 3 vectors to get correct elements order:
5412 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
5413 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
5414 3rd vec: (16 19 22) (17 20 23) (18 21)
5416 Next we unite and shift vector 3 times:
5418 1st step:
5419 shift right by 6 the concatenation of:
5420 "1st vec" and "2nd vec"
5421 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
5422 "2nd vec" and "3rd vec"
5423 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
5424 "3rd vec" and "1st vec"
5425 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
5426 | New vectors |
5428 So that now new vectors are:
5430 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
5431 2nd vec: (10 13) (16 19 22) (17 20 23)
5432 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
5434 2nd step:
5435 shift right by 5 the concatenation of:
5436 "1st vec" and "3rd vec"
5437 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
5438 "2nd vec" and "1st vec"
5439 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
5440 "3rd vec" and "2nd vec"
5441 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
5442 | New vectors |
5444 So that now new vectors are:
5446 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
5447 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
5448 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
5450 3rd step:
5451 shift right by 5 the concatenation of:
5452 "1st vec" and "1st vec"
5453 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
5454 shift right by 3 the concatenation of:
5455 "2nd vec" and "2nd vec"
5456 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
5457 | New vectors |
5459 So that now all vectors are READY:
5460 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
5461 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
5462 3rd vec: ( 1 4 7) (10 13) (16 19 22)
5464 This algorithm is faster than one in vect_permute_load_chain if:
5465 1. "shift of a concatination" is faster than general permutation.
5466 This is usually so.
5467 2. The TARGET machine can't execute vector instructions in parallel.
5468 This is because each step of the algorithm depends on previous.
5469 The algorithm in vect_permute_load_chain is much more parallel.
5471 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
5474 static bool
5475 vect_shift_permute_load_chain (vec<tree> dr_chain,
5476 unsigned int length,
5477 gimple *stmt,
5478 gimple_stmt_iterator *gsi,
5479 vec<tree> *result_chain)
5481 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
5482 tree perm2_mask1, perm2_mask2, perm3_mask;
5483 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
5484 gimple *perm_stmt;
5486 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5487 unsigned int i;
5488 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5489 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5490 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5491 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5493 result_chain->quick_grow (length);
5494 memcpy (result_chain->address (), dr_chain.address (),
5495 length * sizeof (tree));
5497 if (pow2p_hwi (length) && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 4)
5499 unsigned int j, log_length = exact_log2 (length);
5500 for (i = 0; i < nelt / 2; ++i)
5501 sel[i] = i * 2;
5502 for (i = 0; i < nelt / 2; ++i)
5503 sel[nelt / 2 + i] = i * 2 + 1;
5504 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5506 if (dump_enabled_p ())
5507 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5508 "shuffle of 2 fields structure is not \
5509 supported by target\n");
5510 return false;
5512 perm2_mask1 = vect_gen_perm_mask_checked (vectype, sel);
5514 for (i = 0; i < nelt / 2; ++i)
5515 sel[i] = i * 2 + 1;
5516 for (i = 0; i < nelt / 2; ++i)
5517 sel[nelt / 2 + i] = i * 2;
5518 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5520 if (dump_enabled_p ())
5521 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5522 "shuffle of 2 fields structure is not \
5523 supported by target\n");
5524 return false;
5526 perm2_mask2 = vect_gen_perm_mask_checked (vectype, sel);
5528 /* Generating permutation constant to shift all elements.
5529 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
5530 for (i = 0; i < nelt; i++)
5531 sel[i] = nelt / 2 + i;
5532 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5534 if (dump_enabled_p ())
5535 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5536 "shift permutation is not supported by target\n");
5537 return false;
5539 shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5541 /* Generating permutation constant to select vector from 2.
5542 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
5543 for (i = 0; i < nelt / 2; i++)
5544 sel[i] = i;
5545 for (i = nelt / 2; i < nelt; i++)
5546 sel[i] = nelt + i;
5547 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5549 if (dump_enabled_p ())
5550 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5551 "select is not supported by target\n");
5552 return false;
5554 select_mask = vect_gen_perm_mask_checked (vectype, sel);
5556 for (i = 0; i < log_length; i++)
5558 for (j = 0; j < length; j += 2)
5560 first_vect = dr_chain[j];
5561 second_vect = dr_chain[j + 1];
5563 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5564 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5565 first_vect, first_vect,
5566 perm2_mask1);
5567 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5568 vect[0] = data_ref;
5570 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5571 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5572 second_vect, second_vect,
5573 perm2_mask2);
5574 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5575 vect[1] = data_ref;
5577 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
5578 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5579 vect[0], vect[1], shift1_mask);
5580 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5581 (*result_chain)[j/2 + length/2] = data_ref;
5583 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
5584 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5585 vect[0], vect[1], select_mask);
5586 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5587 (*result_chain)[j/2] = data_ref;
5589 memcpy (dr_chain.address (), result_chain->address (),
5590 length * sizeof (tree));
5592 return true;
5594 if (length == 3 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 2)
5596 unsigned int k = 0, l = 0;
5598 /* Generating permutation constant to get all elements in rigth order.
5599 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
5600 for (i = 0; i < nelt; i++)
5602 if (3 * k + (l % 3) >= nelt)
5604 k = 0;
5605 l += (3 - (nelt % 3));
5607 sel[i] = 3 * k + (l % 3);
5608 k++;
5610 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5612 if (dump_enabled_p ())
5613 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5614 "shuffle of 3 fields structure is not \
5615 supported by target\n");
5616 return false;
5618 perm3_mask = vect_gen_perm_mask_checked (vectype, sel);
5620 /* Generating permutation constant to shift all elements.
5621 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
5622 for (i = 0; i < nelt; i++)
5623 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
5624 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5626 if (dump_enabled_p ())
5627 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5628 "shift permutation is not supported by target\n");
5629 return false;
5631 shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5633 /* Generating permutation constant to shift all elements.
5634 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5635 for (i = 0; i < nelt; i++)
5636 sel[i] = 2 * (nelt / 3) + 1 + i;
5637 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5639 if (dump_enabled_p ())
5640 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5641 "shift permutation is not supported by target\n");
5642 return false;
5644 shift2_mask = vect_gen_perm_mask_checked (vectype, sel);
5646 /* Generating permutation constant to shift all elements.
5647 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
5648 for (i = 0; i < nelt; i++)
5649 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
5650 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5652 if (dump_enabled_p ())
5653 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5654 "shift permutation is not supported by target\n");
5655 return false;
5657 shift3_mask = vect_gen_perm_mask_checked (vectype, sel);
5659 /* Generating permutation constant to shift all elements.
5660 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5661 for (i = 0; i < nelt; i++)
5662 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
5663 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5665 if (dump_enabled_p ())
5666 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5667 "shift permutation is not supported by target\n");
5668 return false;
5670 shift4_mask = vect_gen_perm_mask_checked (vectype, sel);
5672 for (k = 0; k < 3; k++)
5674 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
5675 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5676 dr_chain[k], dr_chain[k],
5677 perm3_mask);
5678 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5679 vect[k] = data_ref;
5682 for (k = 0; k < 3; k++)
5684 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
5685 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5686 vect[k % 3], vect[(k + 1) % 3],
5687 shift1_mask);
5688 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5689 vect_shift[k] = data_ref;
5692 for (k = 0; k < 3; k++)
5694 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
5695 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5696 vect_shift[(4 - k) % 3],
5697 vect_shift[(3 - k) % 3],
5698 shift2_mask);
5699 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5700 vect[k] = data_ref;
5703 (*result_chain)[3 - (nelt % 3)] = vect[2];
5705 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
5706 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
5707 vect[0], shift3_mask);
5708 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5709 (*result_chain)[nelt % 3] = data_ref;
5711 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
5712 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
5713 vect[1], shift4_mask);
5714 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5715 (*result_chain)[0] = data_ref;
5716 return true;
5718 return false;
5721 /* Function vect_transform_grouped_load.
5723 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
5724 to perform their permutation and ascribe the result vectorized statements to
5725 the scalar statements.
5728 void
5729 vect_transform_grouped_load (gimple *stmt, vec<tree> dr_chain, int size,
5730 gimple_stmt_iterator *gsi)
5732 machine_mode mode;
5733 vec<tree> result_chain = vNULL;
5735 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
5736 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
5737 vectors, that are ready for vector computation. */
5738 result_chain.create (size);
5740 /* If reassociation width for vector type is 2 or greater target machine can
5741 execute 2 or more vector instructions in parallel. Otherwise try to
5742 get chain for loads group using vect_shift_permute_load_chain. */
5743 mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
5744 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
5745 || pow2p_hwi (size)
5746 || !vect_shift_permute_load_chain (dr_chain, size, stmt,
5747 gsi, &result_chain))
5748 vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
5749 vect_record_grouped_load_vectors (stmt, result_chain);
5750 result_chain.release ();
5753 /* RESULT_CHAIN contains the output of a group of grouped loads that were
5754 generated as part of the vectorization of STMT. Assign the statement
5755 for each vector to the associated scalar statement. */
5757 void
5758 vect_record_grouped_load_vectors (gimple *stmt, vec<tree> result_chain)
5760 gimple *first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
5761 gimple *next_stmt, *new_stmt;
5762 unsigned int i, gap_count;
5763 tree tmp_data_ref;
5765 /* Put a permuted data-ref in the VECTORIZED_STMT field.
5766 Since we scan the chain starting from it's first node, their order
5767 corresponds the order of data-refs in RESULT_CHAIN. */
5768 next_stmt = first_stmt;
5769 gap_count = 1;
5770 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
5772 if (!next_stmt)
5773 break;
5775 /* Skip the gaps. Loads created for the gaps will be removed by dead
5776 code elimination pass later. No need to check for the first stmt in
5777 the group, since it always exists.
5778 GROUP_GAP is the number of steps in elements from the previous
5779 access (if there is no gap GROUP_GAP is 1). We skip loads that
5780 correspond to the gaps. */
5781 if (next_stmt != first_stmt
5782 && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
5784 gap_count++;
5785 continue;
5788 while (next_stmt)
5790 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
5791 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
5792 copies, and we put the new vector statement in the first available
5793 RELATED_STMT. */
5794 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
5795 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
5796 else
5798 if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5800 gimple *prev_stmt =
5801 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
5802 gimple *rel_stmt =
5803 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
5804 while (rel_stmt)
5806 prev_stmt = rel_stmt;
5807 rel_stmt =
5808 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
5811 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
5812 new_stmt;
5816 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
5817 gap_count = 1;
5818 /* If NEXT_STMT accesses the same DR as the previous statement,
5819 put the same TMP_DATA_REF as its vectorized statement; otherwise
5820 get the next data-ref from RESULT_CHAIN. */
5821 if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5822 break;
5827 /* Function vect_force_dr_alignment_p.
5829 Returns whether the alignment of a DECL can be forced to be aligned
5830 on ALIGNMENT bit boundary. */
5832 bool
5833 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
5835 if (!VAR_P (decl))
5836 return false;
5838 if (decl_in_symtab_p (decl)
5839 && !symtab_node::get (decl)->can_increase_alignment_p ())
5840 return false;
5842 if (TREE_STATIC (decl))
5843 return (alignment <= MAX_OFILE_ALIGNMENT);
5844 else
5845 return (alignment <= MAX_STACK_ALIGNMENT);
5849 /* Return whether the data reference DR is supported with respect to its
5850 alignment.
5851 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
5852 it is aligned, i.e., check if it is possible to vectorize it with different
5853 alignment. */
5855 enum dr_alignment_support
5856 vect_supportable_dr_alignment (struct data_reference *dr,
5857 bool check_aligned_accesses)
5859 gimple *stmt = DR_STMT (dr);
5860 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5861 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5862 machine_mode mode = TYPE_MODE (vectype);
5863 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5864 struct loop *vect_loop = NULL;
5865 bool nested_in_vect_loop = false;
5867 if (aligned_access_p (dr) && !check_aligned_accesses)
5868 return dr_aligned;
5870 /* For now assume all conditional loads/stores support unaligned
5871 access without any special code. */
5872 if (is_gimple_call (stmt)
5873 && gimple_call_internal_p (stmt)
5874 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
5875 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
5876 return dr_unaligned_supported;
5878 if (loop_vinfo)
5880 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
5881 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
5884 /* Possibly unaligned access. */
5886 /* We can choose between using the implicit realignment scheme (generating
5887 a misaligned_move stmt) and the explicit realignment scheme (generating
5888 aligned loads with a REALIGN_LOAD). There are two variants to the
5889 explicit realignment scheme: optimized, and unoptimized.
5890 We can optimize the realignment only if the step between consecutive
5891 vector loads is equal to the vector size. Since the vector memory
5892 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
5893 is guaranteed that the misalignment amount remains the same throughout the
5894 execution of the vectorized loop. Therefore, we can create the
5895 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
5896 at the loop preheader.
5898 However, in the case of outer-loop vectorization, when vectorizing a
5899 memory access in the inner-loop nested within the LOOP that is now being
5900 vectorized, while it is guaranteed that the misalignment of the
5901 vectorized memory access will remain the same in different outer-loop
5902 iterations, it is *not* guaranteed that is will remain the same throughout
5903 the execution of the inner-loop. This is because the inner-loop advances
5904 with the original scalar step (and not in steps of VS). If the inner-loop
5905 step happens to be a multiple of VS, then the misalignment remains fixed
5906 and we can use the optimized realignment scheme. For example:
5908 for (i=0; i<N; i++)
5909 for (j=0; j<M; j++)
5910 s += a[i+j];
5912 When vectorizing the i-loop in the above example, the step between
5913 consecutive vector loads is 1, and so the misalignment does not remain
5914 fixed across the execution of the inner-loop, and the realignment cannot
5915 be optimized (as illustrated in the following pseudo vectorized loop):
5917 for (i=0; i<N; i+=4)
5918 for (j=0; j<M; j++){
5919 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
5920 // when j is {0,1,2,3,4,5,6,7,...} respectively.
5921 // (assuming that we start from an aligned address).
5924 We therefore have to use the unoptimized realignment scheme:
5926 for (i=0; i<N; i+=4)
5927 for (j=k; j<M; j+=4)
5928 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
5929 // that the misalignment of the initial address is
5930 // 0).
5932 The loop can then be vectorized as follows:
5934 for (k=0; k<4; k++){
5935 rt = get_realignment_token (&vp[k]);
5936 for (i=0; i<N; i+=4){
5937 v1 = vp[i+k];
5938 for (j=k; j<M; j+=4){
5939 v2 = vp[i+j+VS-1];
5940 va = REALIGN_LOAD <v1,v2,rt>;
5941 vs += va;
5942 v1 = v2;
5945 } */
5947 if (DR_IS_READ (dr))
5949 bool is_packed = false;
5950 tree type = (TREE_TYPE (DR_REF (dr)));
5952 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
5953 && (!targetm.vectorize.builtin_mask_for_load
5954 || targetm.vectorize.builtin_mask_for_load ()))
5956 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5958 /* If we are doing SLP then the accesses need not have the
5959 same alignment, instead it depends on the SLP group size. */
5960 if (loop_vinfo
5961 && STMT_SLP_TYPE (stmt_info)
5962 && (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5963 * GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)))
5964 % TYPE_VECTOR_SUBPARTS (vectype) != 0))
5966 else if (!loop_vinfo
5967 || (nested_in_vect_loop
5968 && (TREE_INT_CST_LOW (DR_STEP (dr))
5969 != GET_MODE_SIZE (TYPE_MODE (vectype)))))
5970 return dr_explicit_realign;
5971 else
5972 return dr_explicit_realign_optimized;
5974 if (!known_alignment_for_access_p (dr))
5975 is_packed = not_size_aligned (DR_REF (dr));
5977 if (targetm.vectorize.support_vector_misalignment
5978 (mode, type, DR_MISALIGNMENT (dr), is_packed))
5979 /* Can't software pipeline the loads, but can at least do them. */
5980 return dr_unaligned_supported;
5982 else
5984 bool is_packed = false;
5985 tree type = (TREE_TYPE (DR_REF (dr)));
5987 if (!known_alignment_for_access_p (dr))
5988 is_packed = not_size_aligned (DR_REF (dr));
5990 if (targetm.vectorize.support_vector_misalignment
5991 (mode, type, DR_MISALIGNMENT (dr), is_packed))
5992 return dr_unaligned_supported;
5995 /* Unsupported. */
5996 return dr_unaligned_unsupported;