PR middle-end/71758
[official-gcc.git] / gcc / tree-vect-data-refs.c
blob: 353d1df25d494ee17370901b22942f29326f7c21
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2016 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "tm_p.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "cgraph.h"
35 #include "dumpfile.h"
36 #include "alias.h"
37 #include "fold-const.h"
38 #include "stor-layout.h"
39 #include "tree-eh.h"
40 #include "gimplify.h"
41 #include "gimple-iterator.h"
42 #include "gimplify-me.h"
43 #include "tree-ssa-loop-ivopts.h"
44 #include "tree-ssa-loop-manip.h"
45 #include "tree-ssa-loop.h"
46 #include "cfgloop.h"
47 #include "tree-scalar-evolution.h"
48 #include "tree-vectorizer.h"
49 #include "expr.h"
50 #include "builtins.h"
51 #include "params.h"
53 /* Return true if load- or store-lanes optab OPTAB is implemented for
54 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
56 static bool
57 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
58 tree vectype, unsigned HOST_WIDE_INT count)
60 machine_mode mode, array_mode;
61 bool limit_p;
63 mode = TYPE_MODE (vectype);
64 limit_p = !targetm.array_mode_supported_p (mode, count);
65 array_mode = mode_for_size (count * GET_MODE_BITSIZE (mode),
66 MODE_INT, limit_p);
68 if (array_mode == BLKmode)
70 if (dump_enabled_p ())
71 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
72 "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC "]\n",
73 GET_MODE_NAME (mode), count);
74 return false;
77 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
79 if (dump_enabled_p ())
80 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
81 "cannot use %s<%s><%s>\n", name,
82 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
83 return false;
86 if (dump_enabled_p ())
87 dump_printf_loc (MSG_NOTE, vect_location,
88 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
89 GET_MODE_NAME (mode));
91 return true;
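/* Editorial illustration (not part of the original file): a minimal, assumed
   example of the grouped access pattern that the load/store-lanes optabs
   target, here with COUNT == 2.

     void
     store_two_lanes (int *restrict a, int *restrict x, int *restrict y, int n)
     {
       for (int i = 0; i < n; i++)
         {
           a[2 * i]     = x[i];   // even lane
           a[2 * i + 1] = y[i];   // odd lane
         }
     }

   If the target provides a store-lanes instruction for this mode (e.g. an
   ARM NEON vst2-style structure store), the check above succeeds and both
   lanes can be written by one instruction per vector iteration.  */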
95 /* Return the smallest scalar part of STMT.
96 This is used to determine the vectype of the stmt. We generally set the
97 vectype according to the type of the result (lhs). For stmts whose
98 result-type is different than the type of the arguments (e.g., demotion,
99 promotion), vectype will be reset appropriately (later). Note that we have
100 to visit the smallest datatype in this function, because that determines the
101 VF. If the smallest datatype in the loop is present only as the rhs of a
102 promotion operation, we'd miss it.
103 Such a case, where a variable of this datatype does not appear in the lhs
104 anywhere in the loop, can only occur if it's an invariant: e.g.:
105 'int_x = (int) short_inv', which we'd expect to have been optimized away by
106 invariant motion. However, we cannot rely on invariant motion to always
107 take invariants out of the loop, and so in the case of promotion we also
108 have to check the rhs.
109 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
110 types. */
112 tree
113 vect_get_smallest_scalar_type (gimple *stmt, HOST_WIDE_INT *lhs_size_unit,
114 HOST_WIDE_INT *rhs_size_unit)
116 tree scalar_type = gimple_expr_type (stmt);
117 HOST_WIDE_INT lhs, rhs;
119 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
121 if (is_gimple_assign (stmt)
122 && (gimple_assign_cast_p (stmt)
123 || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
124 || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
125 || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
127 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
129 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
130 if (rhs < lhs)
131 scalar_type = rhs_type;
134 *lhs_size_unit = lhs;
135 *rhs_size_unit = rhs;
136 return scalar_type;
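/* Editorial illustration (not part of the original file): a case where the
   smallest scalar type appears only on the rhs of a widening operation, so
   looking at the lhs alone would miss it.

     void
     widen (int *restrict d, short *restrict s, int n)
     {
       for (int i = 0; i < n; i++)
         d[i] = (int) s[i];   // 'short' occurs only as the rhs of the cast
     }

   Here the smallest scalar type is 'short' (2 bytes), which allows a larger
   vectorization factor than 'int' alone would suggest.  */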
140 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
141 tested at run-time. Return TRUE if DDR was successfully inserted.
142 Return false if versioning is not supported. */
144 static bool
145 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
147 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
149 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
150 return false;
152 if (dump_enabled_p ())
154 dump_printf_loc (MSG_NOTE, vect_location,
155 "mark for run-time aliasing test between ");
156 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
157 dump_printf (MSG_NOTE, " and ");
158 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
159 dump_printf (MSG_NOTE, "\n");
162 if (optimize_loop_nest_for_size_p (loop))
164 if (dump_enabled_p ())
165 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
166 "versioning not supported when optimizing"
167 " for size.\n");
168 return false;
171 /* FORNOW: We don't support versioning with outer-loop vectorization. */
172 if (loop->inner)
174 if (dump_enabled_p ())
175 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
176 "versioning not yet supported for outer-loops.\n");
177 return false;
180 /* FORNOW: We don't support creating runtime alias tests for non-constant
181 step. */
182 if (TREE_CODE (DR_STEP (DDR_A (ddr))) != INTEGER_CST
183 || TREE_CODE (DR_STEP (DDR_B (ddr))) != INTEGER_CST)
185 if (dump_enabled_p ())
186 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
187 "versioning not yet supported for non-constant "
188 "step\n");
189 return false;
192 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
193 return true;
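/* Editorial illustration (not part of the original file): versioning for
   alias conceptually guards the vectorized loop with an overlap test that
   is roughly equivalent to

     if (a + n <= b || b + n <= a)
       // vectorized loop: the two references cannot overlap
     else
       // scalar fallback loop

   for two accesses of N elements based at A and B.  The exact check that is
   emitted is built later from the DDRs recorded here.  */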
197 /* Function vect_analyze_data_ref_dependence.
199 Return TRUE if there (might) exist a dependence between a memory-reference
200 DRA and a memory-reference DRB. When versioning for alias can handle a
201 possible dependence with a run-time check, return FALSE. Adjust *MAX_VF
202 according to the data dependence.
204 static bool
205 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
206 loop_vec_info loop_vinfo, int *max_vf)
208 unsigned int i;
209 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
210 struct data_reference *dra = DDR_A (ddr);
211 struct data_reference *drb = DDR_B (ddr);
212 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
213 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
214 lambda_vector dist_v;
215 unsigned int loop_depth;
217 /* In loop analysis all data references should be vectorizable. */
218 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
219 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
220 gcc_unreachable ();
222 /* Independent data accesses. */
223 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
224 return false;
226 if (dra == drb
227 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
228 return false;
230 /* We do not have to consider dependences between accesses that belong
231 to the same group. */
232 if (GROUP_FIRST_ELEMENT (stmtinfo_a)
233 && GROUP_FIRST_ELEMENT (stmtinfo_a) == GROUP_FIRST_ELEMENT (stmtinfo_b))
234 return false;
236 /* Even if we have an anti-dependence then, as the vectorized loop covers at
237 least two scalar iterations, there is always also a true dependence.
238 As the vectorizer does not re-order loads and stores we can ignore
239 the anti-dependence if TBAA can disambiguate both DRs similar to the
240 case with known negative distance anti-dependences (positive
241 distance anti-dependences would violate TBAA constraints). */
242 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
243 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
244 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
245 get_alias_set (DR_REF (drb))))
246 return false;
248 /* Unknown data dependence. */
249 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
251 /* If user asserted safelen consecutive iterations can be
252 executed concurrently, assume independence. */
253 if (loop->safelen >= 2)
255 if (loop->safelen < *max_vf)
256 *max_vf = loop->safelen;
257 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
258 return false;
261 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
262 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
264 if (dump_enabled_p ())
266 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
267 "versioning for alias not supported for: "
268 "can't determine dependence between ");
269 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
270 DR_REF (dra));
271 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
272 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
273 DR_REF (drb));
274 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
276 return true;
279 if (dump_enabled_p ())
281 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
282 "versioning for alias required: "
283 "can't determine dependence between ");
284 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
285 DR_REF (dra));
286 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
287 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
288 DR_REF (drb));
289 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
292 /* Add to list of ddrs that need to be tested at run-time. */
293 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
296 /* Known data dependence. */
297 if (DDR_NUM_DIST_VECTS (ddr) == 0)
299 /* If user asserted safelen consecutive iterations can be
300 executed concurrently, assume independence. */
301 if (loop->safelen >= 2)
303 if (loop->safelen < *max_vf)
304 *max_vf = loop->safelen;
305 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
306 return false;
309 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
310 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
312 if (dump_enabled_p ())
314 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
315 "versioning for alias not supported for: "
316 "bad dist vector for ");
317 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
318 DR_REF (dra));
319 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
320 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
321 DR_REF (drb));
322 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
324 return true;
327 if (dump_enabled_p ())
329 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
330 "versioning for alias required: "
331 "bad dist vector for ");
332 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
333 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
334 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
335 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
337 /* Add to list of ddrs that need to be tested at run-time. */
338 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
341 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
342 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
344 int dist = dist_v[loop_depth];
346 if (dump_enabled_p ())
347 dump_printf_loc (MSG_NOTE, vect_location,
348 "dependence distance = %d.\n", dist);
350 if (dist == 0)
352 if (dump_enabled_p ())
354 dump_printf_loc (MSG_NOTE, vect_location,
355 "dependence distance == 0 between ");
356 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
357 dump_printf (MSG_NOTE, " and ");
358 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
359 dump_printf (MSG_NOTE, "\n");
362 /* When we perform grouped accesses and perform implicit CSE
363 by detecting equal accesses and doing disambiguation with
364 runtime alias tests like for
365 .. = a[i];
366 .. = a[i+1];
367 a[i] = ..;
368 a[i+1] = ..;
369 *p = ..;
370 .. = a[i];
371 .. = a[i+1];
372 where we will end up loading { a[i], a[i+1] } once, make
373 sure that inserting group loads before the first load and
374 stores after the last store will do the right thing.
375 Similar for groups like
376 a[i] = ...;
377 ... = a[i];
378 a[i+1] = ...;
379 where loads from the group interleave with the store. */
380 if (STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
381 || STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
383 gimple *earlier_stmt;
384 earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
385 if (DR_IS_WRITE
386 (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
388 if (dump_enabled_p ())
389 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
390 "READ_WRITE dependence in interleaving."
391 "\n");
392 return true;
396 continue;
399 if (dist > 0 && DDR_REVERSED_P (ddr))
401 /* If DDR_REVERSED_P the order of the data-refs in DDR was
402 reversed (to make distance vector positive), and the actual
403 distance is negative. */
404 if (dump_enabled_p ())
405 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
406 "dependence distance negative.\n");
407 /* Record a negative dependence distance to later limit the
408 amount of stmt copying / unrolling we can perform.
409 Only need to handle read-after-write dependence. */
410 if (DR_IS_READ (drb)
411 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
412 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
413 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
414 continue;
417 if (abs (dist) >= 2
418 && abs (dist) < *max_vf)
420 /* The dependence distance requires reduction of the maximal
421 vectorization factor. */
422 *max_vf = abs (dist);
423 if (dump_enabled_p ())
424 dump_printf_loc (MSG_NOTE, vect_location,
425 "adjusting maximal vectorization factor to %i\n",
426 *max_vf);
429 if (abs (dist) >= *max_vf)
431 /* Dependence distance does not create dependence, as far as
432 vectorization is concerned, in this case. */
433 if (dump_enabled_p ())
434 dump_printf_loc (MSG_NOTE, vect_location,
435 "dependence distance >= VF.\n");
436 continue;
439 if (dump_enabled_p ())
441 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
442 "not vectorized, possible dependence "
443 "between data-refs ");
444 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
445 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
446 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
447 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
450 return true;
453 return false;
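/* Editorial illustration (not part of the original file): a loop-carried
   dependence with a known distance, e.g. a loop that writes a[i + 4] and
   reads a[i] in every iteration, has a dependence distance of 4.  The
   checks above then either clamp *max_vf to 4 or, when the distance vector
   was reversed, record the distance in STMT_VINFO_MIN_NEG_DIST to limit
   later unrolling, so that a single vector iteration never reads an element
   stored by that same iteration.  */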
456 /* Function vect_analyze_data_ref_dependences.
458 Examine all the data references in the loop, and make sure there do not
459 exist any data dependences between them. Set *MAX_VF according to
460 the maximum vectorization factor the data dependences allow. */
462 bool
463 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo, int *max_vf)
465 unsigned int i;
466 struct data_dependence_relation *ddr;
468 if (dump_enabled_p ())
469 dump_printf_loc (MSG_NOTE, vect_location,
470 "=== vect_analyze_data_ref_dependences ===\n");
472 LOOP_VINFO_DDRS (loop_vinfo)
473 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
474 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
475 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
476 if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
477 &LOOP_VINFO_DDRS (loop_vinfo),
478 LOOP_VINFO_LOOP_NEST (loop_vinfo), false))
479 return false;
481 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
482 if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
483 return false;
485 return true;
489 /* Function vect_slp_analyze_data_ref_dependence.
491 Return TRUE if there (might) exist a dependence between a memory-reference
492 DRA and a memory-reference DRB that prohibits SLP vectorization.  Unlike
493 the loop variant above, no run-time versioning for alias is attempted and
494 no vectorization factor is adjusted.
496 static bool
497 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
499 struct data_reference *dra = DDR_A (ddr);
500 struct data_reference *drb = DDR_B (ddr);
502 /* We need to check dependences of statements marked as unvectorizable
503 as well; they can still prohibit vectorization. */
505 /* Independent data accesses. */
506 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
507 return false;
509 if (dra == drb)
510 return false;
512 /* Read-read is OK. */
513 if (DR_IS_READ (dra) && DR_IS_READ (drb))
514 return false;
516 /* If dra and drb are part of the same interleaving chain consider
517 them independent. */
518 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
519 && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
520 == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
521 return false;
523 /* Unknown data dependence. */
524 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
526 if (dump_enabled_p ())
528 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
529 "can't determine dependence between ");
530 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
531 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
532 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
533 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
536 else if (dump_enabled_p ())
538 dump_printf_loc (MSG_NOTE, vect_location,
539 "determined dependence between ");
540 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
541 dump_printf (MSG_NOTE, " and ");
542 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
543 dump_printf (MSG_NOTE, "\n");
546 return true;
550 /* Analyze dependences involved in the transform of SLP NODE. STORES
551 contains the vector of scalar stores of this instance if we are
552 disambiguating the loads. */
554 static bool
555 vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
556 vec<gimple *> stores, gimple *last_store)
558 /* This walks over all stmts involved in the SLP load/store done
559 in NODE verifying we can sink them up to the last stmt in the
560 group. */
561 gimple *last_access = vect_find_last_scalar_stmt_in_slp (node);
562 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
564 gimple *access = SLP_TREE_SCALAR_STMTS (node)[k];
565 if (access == last_access)
566 continue;
567 data_reference *dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (access));
568 for (gimple_stmt_iterator gsi = gsi_for_stmt (access);
569 gsi_stmt (gsi) != last_access; gsi_next (&gsi))
571 gimple *stmt = gsi_stmt (gsi);
572 if (! gimple_vuse (stmt)
573 || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
574 continue;
576 /* If we couldn't record a (single) data reference for this
577 stmt we have to give up. */
578 /* ??? Here and below if dependence analysis fails we can resort
579 to the alias oracle which can handle more kinds of stmts. */
580 data_reference *dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
581 if (!dr_b)
582 return false;
584 /* If we run into a store of this same instance (we've just
585 marked those) then delay dependence checking until we run
586 into the last store because this is where it will have
587 been sunk to (and we verify if we can do that as well). */
588 if (gimple_visited_p (stmt))
590 if (stmt != last_store)
591 continue;
592 unsigned i;
593 gimple *store;
594 FOR_EACH_VEC_ELT (stores, i, store)
596 data_reference *store_dr
597 = STMT_VINFO_DATA_REF (vinfo_for_stmt (store));
598 ddr_p ddr = initialize_data_dependence_relation
599 (dr_a, store_dr, vNULL);
600 if (vect_slp_analyze_data_ref_dependence (ddr))
602 free_dependence_relation (ddr);
603 return false;
605 free_dependence_relation (ddr);
609 ddr_p ddr = initialize_data_dependence_relation (dr_a, dr_b, vNULL);
610 if (vect_slp_analyze_data_ref_dependence (ddr))
612 free_dependence_relation (ddr);
613 return false;
615 free_dependence_relation (ddr);
618 return true;
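/* Editorial illustration (not part of the original file): the sinking check
   above handles groups whose members are separated by other memory
   statements, e.g.

     a[0] = x;   // first store of the SLP group
     *p = 1;     // unrelated store in between
     a[1] = y;   // last store; the vector store is emitted here

   Vectorizing the group means moving the store to a[0] down to the last
   store, which is only valid if a[0] and *p do not alias.  */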
622 /* Function vect_slp_analyze_instance_dependence.
624 Examine all the data references in the SLP instance, and make sure there
625 do not exist any data dependences that would prohibit vectorizing the
626 instance. */
628 bool
629 vect_slp_analyze_instance_dependence (slp_instance instance)
631 if (dump_enabled_p ())
632 dump_printf_loc (MSG_NOTE, vect_location,
633 "=== vect_slp_analyze_instance_dependence ===\n");
635 /* The stores of this instance are at the root of the SLP tree. */
636 slp_tree store = SLP_INSTANCE_TREE (instance);
637 if (! STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (store)[0])))
638 store = NULL;
640 /* Verify we can sink stores to the vectorized stmt insert location. */
641 gimple *last_store = NULL;
642 if (store)
644 if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
645 return false;
647 /* Mark stores in this instance and remember the last one. */
648 last_store = vect_find_last_scalar_stmt_in_slp (store);
649 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
650 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], true);
653 bool res = true;
655 /* Verify we can sink loads to the vectorized stmt insert location,
656 special-casing stores of this instance. */
657 slp_tree load;
658 unsigned int i;
659 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
660 if (! vect_slp_analyze_node_dependences (instance, load,
661 store
662 ? SLP_TREE_SCALAR_STMTS (store)
663 : vNULL, last_store))
665 res = false;
666 break;
669 /* Unset the visited flag. */
670 if (store)
671 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
672 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], false);
674 return res;
677 /* Function vect_compute_data_ref_alignment
679 Compute the misalignment of the data reference DR.
681 Output:
682 1. If during the misalignment computation it is found that the data reference
683 cannot be vectorized then false is returned.
684 2. DR_MISALIGNMENT (DR) is defined.
686 FOR NOW: No analysis is actually performed. Misalignment is calculated
687 only for trivial cases. TODO. */
689 bool
690 vect_compute_data_ref_alignment (struct data_reference *dr)
692 gimple *stmt = DR_STMT (dr);
693 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
694 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
695 struct loop *loop = NULL;
696 tree ref = DR_REF (dr);
697 tree vectype;
698 tree base, base_addr;
699 tree misalign = NULL_TREE;
700 tree aligned_to;
701 tree step;
702 unsigned HOST_WIDE_INT alignment;
704 if (dump_enabled_p ())
705 dump_printf_loc (MSG_NOTE, vect_location,
706 "vect_compute_data_ref_alignment:\n");
708 if (loop_vinfo)
709 loop = LOOP_VINFO_LOOP (loop_vinfo);
711 /* Initialize misalignment to unknown. */
712 SET_DR_MISALIGNMENT (dr, -1);
714 if (tree_fits_shwi_p (DR_STEP (dr)))
715 misalign = DR_INIT (dr);
716 aligned_to = DR_ALIGNED_TO (dr);
717 base_addr = DR_BASE_ADDRESS (dr);
718 vectype = STMT_VINFO_VECTYPE (stmt_info);
720 /* In case the dataref is in an inner-loop of the loop that is being
721 vectorized (LOOP), we use the base and misalignment information
722 relative to the outer-loop (LOOP). This is ok only if the misalignment
723 stays the same throughout the execution of the inner-loop, which is why
724 we have to check that the stride of the dataref in the inner-loop is a
725 multiple of the vector size. */
726 if (loop && nested_in_vect_loop_p (loop, stmt))
728 tree step = DR_STEP (dr);
730 if (tree_fits_shwi_p (step)
731 && tree_to_shwi (step) % GET_MODE_SIZE (TYPE_MODE (vectype)) == 0)
733 if (dump_enabled_p ())
734 dump_printf_loc (MSG_NOTE, vect_location,
735 "inner step divides the vector-size.\n");
736 misalign = STMT_VINFO_DR_INIT (stmt_info);
737 aligned_to = STMT_VINFO_DR_ALIGNED_TO (stmt_info);
738 base_addr = STMT_VINFO_DR_BASE_ADDRESS (stmt_info);
740 else
742 if (dump_enabled_p ())
743 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
744 "inner step doesn't divide the vector-size.\n");
745 misalign = NULL_TREE;
749 /* Similarly we can only use base and misalignment information relative to
750 an innermost loop if the misalignment stays the same throughout the
751 execution of the loop. As above, this is the case if the stride of the
752 dataref times the vectorization factor is a multiple of the vector size. */
753 else
755 tree step = DR_STEP (dr);
756 unsigned vf = loop ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) : 1;
758 if (tree_fits_shwi_p (step)
759 && ((tree_to_shwi (step) * vf)
760 % GET_MODE_SIZE (TYPE_MODE (vectype)) != 0))
762 if (dump_enabled_p ())
763 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
764 "step doesn't divide the vector-size.\n");
765 misalign = NULL_TREE;
769 /* To look at alignment of the base we have to preserve an inner MEM_REF
770 as that carries alignment information of the actual access. */
771 base = ref;
772 while (handled_component_p (base))
773 base = TREE_OPERAND (base, 0);
774 if (TREE_CODE (base) == MEM_REF)
775 base = build2 (MEM_REF, TREE_TYPE (base), base_addr,
776 build_int_cst (TREE_TYPE (TREE_OPERAND (base, 1)), 0));
777 unsigned int base_alignment = get_object_alignment (base);
779 if (base_alignment >= TYPE_ALIGN (TREE_TYPE (vectype)))
780 DR_VECT_AUX (dr)->base_element_aligned = true;
782 alignment = TYPE_ALIGN_UNIT (vectype);
784 if ((compare_tree_int (aligned_to, alignment) < 0)
785 || !misalign)
787 if (dump_enabled_p ())
789 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
790 "Unknown alignment for access: ");
791 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
792 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
794 return true;
797 if (base_alignment < TYPE_ALIGN (vectype))
799 /* Strip an inner MEM_REF to a bare decl if possible. */
800 if (TREE_CODE (base) == MEM_REF
801 && integer_zerop (TREE_OPERAND (base, 1))
802 && TREE_CODE (TREE_OPERAND (base, 0)) == ADDR_EXPR)
803 base = TREE_OPERAND (TREE_OPERAND (base, 0), 0);
805 if (!vect_can_force_dr_alignment_p (base, TYPE_ALIGN (vectype)))
807 if (dump_enabled_p ())
809 dump_printf_loc (MSG_NOTE, vect_location,
810 "can't force alignment of ref: ");
811 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
812 dump_printf (MSG_NOTE, "\n");
814 return true;
817 /* Force the alignment of the decl.
818 NOTE: This is the only change to the code we make during
819 the analysis phase, before deciding to vectorize the loop. */
820 if (dump_enabled_p ())
822 dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
823 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
824 dump_printf (MSG_NOTE, "\n");
827 DR_VECT_AUX (dr)->base_decl = base;
828 DR_VECT_AUX (dr)->base_misaligned = true;
829 DR_VECT_AUX (dr)->base_element_aligned = true;
832 if (loop && nested_in_vect_loop_p (loop, stmt))
833 step = STMT_VINFO_DR_STEP (stmt_info);
834 else
835 step = DR_STEP (dr);
836 /* If this is a backward running DR then the first access in the larger
837 vectype actually is N-1 elements before the address in the DR.
838 Adjust misalign accordingly. */
839 if (tree_int_cst_sgn (step) < 0)
841 tree offset = ssize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
842 /* DR_STEP(dr) is the same as -TYPE_SIZE of the scalar type,
843 otherwise we wouldn't be here. */
844 offset = fold_build2 (MULT_EXPR, ssizetype, offset, step);
845 /* PLUS because STEP was negative. */
846 misalign = size_binop (PLUS_EXPR, misalign, offset);
849 SET_DR_MISALIGNMENT (dr,
850 wi::mod_floor (misalign, alignment, SIGNED).to_uhwi ());
852 if (dump_enabled_p ())
854 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
855 "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
856 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
857 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
860 return true;
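/* Editorial worked example (not part of the original file), under assumed
   numbers: with a V4SI vectype (16-byte alignment, 4-byte elements), a base
   aligned to 16 bytes, DR_INIT == 4 and a negative step of -4, the code
   above computes

     misalign = 4 + (4 - 1) * -4 = -8,  mod_floor (-8, 16) = 8,

   so DR_MISALIGNMENT is recorded as 8 bytes; with a positive step it would
   simply be 4.  */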
864 /* Function vect_update_misalignment_for_peel
866 DR - the data reference whose misalignment is to be adjusted.
867 DR_PEEL - the data reference whose misalignment is being made
868 zero in the vector loop by the peel.
869 NPEEL - the number of iterations in the peel loop if the misalignment
870 of DR_PEEL is known at compile time. */
872 static void
873 vect_update_misalignment_for_peel (struct data_reference *dr,
874 struct data_reference *dr_peel, int npeel)
876 unsigned int i;
877 vec<dr_p> same_align_drs;
878 struct data_reference *current_dr;
879 int dr_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
880 int dr_peel_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr_peel))));
881 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
882 stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));
884 /* For interleaved data accesses the step in the loop must be multiplied by
885 the size of the interleaving group. */
886 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
887 dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
888 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
889 dr_peel_size *= GROUP_SIZE (peel_stmt_info);
891 /* It can be assumed that the data refs with the same alignment as dr_peel
892 are aligned in the vector loop. */
893 same_align_drs
894 = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
895 FOR_EACH_VEC_ELT (same_align_drs, i, current_dr)
897 if (current_dr != dr)
898 continue;
899 gcc_assert (DR_MISALIGNMENT (dr) / dr_size ==
900 DR_MISALIGNMENT (dr_peel) / dr_peel_size);
901 SET_DR_MISALIGNMENT (dr, 0);
902 return;
905 if (known_alignment_for_access_p (dr)
906 && known_alignment_for_access_p (dr_peel))
908 bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
909 int misal = DR_MISALIGNMENT (dr);
910 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
911 misal += negative ? -npeel * dr_size : npeel * dr_size;
912 misal &= (TYPE_ALIGN (vectype) / BITS_PER_UNIT) - 1;
913 SET_DR_MISALIGNMENT (dr, misal);
914 return;
917 if (dump_enabled_p ())
918 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment to -1.\n");
919 SET_DR_MISALIGNMENT (dr, -1);
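/* Editorial worked example (not part of the original file), under assumed
   numbers: peeling NPEEL == 3 iterations for a DR with 4-byte elements,
   DR_MISALIGNMENT == 4 and a 16-byte vectype alignment gives

     misal = 4 + 3 * 4 = 16,  16 & 15 = 0,

   i.e. this DR also becomes aligned by the peel.  */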
923 /* Function verify_data_ref_alignment
925 Return TRUE if DR can be handled with respect to alignment. */
927 static bool
928 verify_data_ref_alignment (data_reference_p dr)
930 enum dr_alignment_support supportable_dr_alignment
931 = vect_supportable_dr_alignment (dr, false);
932 if (!supportable_dr_alignment)
934 if (dump_enabled_p ())
936 if (DR_IS_READ (dr))
937 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
938 "not vectorized: unsupported unaligned load.");
939 else
940 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
941 "not vectorized: unsupported unaligned "
942 "store.");
944 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
945 DR_REF (dr));
946 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
948 return false;
951 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
952 dump_printf_loc (MSG_NOTE, vect_location,
953 "Vectorizing an unaligned access.\n");
955 return true;
958 /* Function vect_verify_datarefs_alignment
960 Return TRUE if all data references in the loop can be
961 handled with respect to alignment. */
963 bool
964 vect_verify_datarefs_alignment (loop_vec_info vinfo)
966 vec<data_reference_p> datarefs = vinfo->datarefs;
967 struct data_reference *dr;
968 unsigned int i;
970 FOR_EACH_VEC_ELT (datarefs, i, dr)
972 gimple *stmt = DR_STMT (dr);
973 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
975 if (!STMT_VINFO_RELEVANT_P (stmt_info))
976 continue;
978 /* For interleaving, only the alignment of the first access matters. */
979 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
980 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
981 continue;
983 /* Strided accesses perform only component accesses, alignment is
984 irrelevant for them. */
985 if (STMT_VINFO_STRIDED_P (stmt_info)
986 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
987 continue;
989 if (! verify_data_ref_alignment (dr))
990 return false;
993 return true;
996 /* Given a memory reference EXP, return whether its alignment is less
997 than its size. */
999 static bool
1000 not_size_aligned (tree exp)
1002 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1003 return true;
1005 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1006 > get_object_alignment (exp));
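/* Editorial illustration (not part of the original file): a reference that
   is not size-aligned, e.g.

     struct __attribute__((packed)) S { char c; int i; };
     int get (struct S *s) { return s->i; }

   Here TYPE_SIZE of the int access is 32 bits while get_object_alignment of
   the reference is only 8 bits, so not_size_aligned returns true and the
   access is treated as packed.  */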
1009 /* Function vector_alignment_reachable_p
1011 Return true if vector alignment for DR is reachable by peeling
1012 a few loop iterations. Return false otherwise. */
1014 static bool
1015 vector_alignment_reachable_p (struct data_reference *dr)
1017 gimple *stmt = DR_STMT (dr);
1018 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1019 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1021 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1023 /* For interleaved access we peel only if the number of iterations in
1024 the prolog loop (VF - misalignment) is a multiple of the
1025 interleaving group size. */
1026 int elem_size, mis_in_elements;
1027 int nelements = TYPE_VECTOR_SUBPARTS (vectype);
1029 /* FORNOW: handle only known alignment. */
1030 if (!known_alignment_for_access_p (dr))
1031 return false;
1033 elem_size = GET_MODE_SIZE (TYPE_MODE (vectype)) / nelements;
1034 mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
1036 if ((nelements - mis_in_elements) % GROUP_SIZE (stmt_info))
1037 return false;
1040 /* If misalignment is known at compile time then allow peeling
1041 only if natural alignment is reachable through peeling. */
1042 if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
1044 HOST_WIDE_INT elmsize =
1045 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1046 if (dump_enabled_p ())
1048 dump_printf_loc (MSG_NOTE, vect_location,
1049 "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1050 dump_printf (MSG_NOTE,
1051 ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
1053 if (DR_MISALIGNMENT (dr) % elmsize)
1055 if (dump_enabled_p ())
1056 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1057 "data size does not divide the misalignment.\n");
1058 return false;
1062 if (!known_alignment_for_access_p (dr))
1064 tree type = TREE_TYPE (DR_REF (dr));
1065 bool is_packed = not_size_aligned (DR_REF (dr));
1066 if (dump_enabled_p ())
1067 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1068 "Unknown misalignment, is_packed = %d\n",is_packed);
1069 if ((TYPE_USER_ALIGN (type) && !is_packed)
1070 || targetm.vectorize.vector_alignment_reachable (type, is_packed))
1071 return true;
1072 else
1073 return false;
1076 return true;
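/* Editorial worked example (not part of the original file), under assumed
   numbers: for a V4SI access with DR_MISALIGNMENT == 4 bytes, elem_size is
   4 and mis_in_elements is 1, so the prolog needs 4 - 1 == 3 peeled
   iterations.  With GROUP_SIZE == 2 this is not a multiple of the group
   size and peeling to alignment is rejected; with GROUP_SIZE == 3 it would
   be accepted.  */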
1080 /* Calculate the cost of the memory access represented by DR. */
1082 static void
1083 vect_get_data_access_cost (struct data_reference *dr,
1084 unsigned int *inside_cost,
1085 unsigned int *outside_cost,
1086 stmt_vector_for_cost *body_cost_vec)
1088 gimple *stmt = DR_STMT (dr);
1089 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1090 int nunits = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
1091 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1092 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1093 int ncopies = vf / nunits;
1095 if (DR_IS_READ (dr))
1096 vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1097 NULL, body_cost_vec, false);
1098 else
1099 vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
1101 if (dump_enabled_p ())
1102 dump_printf_loc (MSG_NOTE, vect_location,
1103 "vect_get_data_access_cost: inside_cost = %d, "
1104 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1108 typedef struct _vect_peel_info
1110 int npeel;
1111 struct data_reference *dr;
1112 unsigned int count;
1113 } *vect_peel_info;
1115 typedef struct _vect_peel_extended_info
1117 struct _vect_peel_info peel_info;
1118 unsigned int inside_cost;
1119 unsigned int outside_cost;
1120 stmt_vector_for_cost body_cost_vec;
1121 } *vect_peel_extended_info;
1124 /* Peeling hashtable helpers. */
1126 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1128 static inline hashval_t hash (const _vect_peel_info *);
1129 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1132 inline hashval_t
1133 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1135 return (hashval_t) peel_info->npeel;
1138 inline bool
1139 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1141 return (a->npeel == b->npeel);
1145 /* Insert DR into peeling hash table with NPEEL as key. */
1147 static void
1148 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1149 loop_vec_info loop_vinfo, struct data_reference *dr,
1150 int npeel)
1152 struct _vect_peel_info elem, *slot;
1153 _vect_peel_info **new_slot;
1154 bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1156 elem.npeel = npeel;
1157 slot = peeling_htab->find (&elem);
1158 if (slot)
1159 slot->count++;
1160 else
1162 slot = XNEW (struct _vect_peel_info);
1163 slot->npeel = npeel;
1164 slot->dr = dr;
1165 slot->count = 1;
1166 new_slot = peeling_htab->find_slot (slot, INSERT);
1167 *new_slot = slot;
1170 if (!supportable_dr_alignment
1171 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1172 slot->count += VECT_MAX_COST;
1176 /* Traverse peeling hash table to find peeling option that aligns maximum
1177 number of data accesses. */
1180 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1181 _vect_peel_extended_info *max)
1183 vect_peel_info elem = *slot;
1185 if (elem->count > max->peel_info.count
1186 || (elem->count == max->peel_info.count
1187 && max->peel_info.npeel > elem->npeel))
1189 max->peel_info.npeel = elem->npeel;
1190 max->peel_info.count = elem->count;
1191 max->peel_info.dr = elem->dr;
1194 return 1;
1198 /* Traverse peeling hash table and calculate cost for each peeling option.
1199 Find the one with the lowest cost. */
1202 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1203 _vect_peel_extended_info *min)
1205 vect_peel_info elem = *slot;
1206 int save_misalignment, dummy;
1207 unsigned int inside_cost = 0, outside_cost = 0, i;
1208 gimple *stmt = DR_STMT (elem->dr);
1209 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1210 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1211 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1212 struct data_reference *dr;
1213 stmt_vector_for_cost prologue_cost_vec, body_cost_vec, epilogue_cost_vec;
1215 prologue_cost_vec.create (2);
1216 body_cost_vec.create (2);
1217 epilogue_cost_vec.create (2);
1219 FOR_EACH_VEC_ELT (datarefs, i, dr)
1221 stmt = DR_STMT (dr);
1222 stmt_info = vinfo_for_stmt (stmt);
1223 /* For interleaving, only the alignment of the first access
1224 matters. */
1225 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1226 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1227 continue;
1229 /* Strided accesses perform only component accesses, alignment is
1230 irrelevant for them. */
1231 if (STMT_VINFO_STRIDED_P (stmt_info)
1232 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1233 continue;
1235 save_misalignment = DR_MISALIGNMENT (dr);
1236 vect_update_misalignment_for_peel (dr, elem->dr, elem->npeel);
1237 vect_get_data_access_cost (dr, &inside_cost, &outside_cost,
1238 &body_cost_vec);
1239 SET_DR_MISALIGNMENT (dr, save_misalignment);
1242 outside_cost += vect_get_known_peeling_cost
1243 (loop_vinfo, elem->npeel, &dummy,
1244 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1245 &prologue_cost_vec, &epilogue_cost_vec);
1247 /* Prologue and epilogue costs are added to the target model later.
1248 These costs depend only on the scalar iteration cost, the
1249 number of peeling iterations finally chosen, and the number of
1250 misaligned statements. So discard the information found here. */
1251 prologue_cost_vec.release ();
1252 epilogue_cost_vec.release ();
1254 if (inside_cost < min->inside_cost
1255 || (inside_cost == min->inside_cost && outside_cost < min->outside_cost))
1257 min->inside_cost = inside_cost;
1258 min->outside_cost = outside_cost;
1259 min->body_cost_vec.release ();
1260 min->body_cost_vec = body_cost_vec;
1261 min->peel_info.dr = elem->dr;
1262 min->peel_info.npeel = elem->npeel;
1264 else
1265 body_cost_vec.release ();
1267 return 1;
1271 /* Choose best peeling option by traversing peeling hash table and either
1272 choosing an option with the lowest cost (if cost model is enabled) or the
1273 option that aligns as many accesses as possible. */
1275 static struct data_reference *
1276 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1277 loop_vec_info loop_vinfo,
1278 unsigned int *npeel,
1279 stmt_vector_for_cost *body_cost_vec)
1281 struct _vect_peel_extended_info res;
1283 res.peel_info.dr = NULL;
1284 res.body_cost_vec = stmt_vector_for_cost ();
1286 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1288 res.inside_cost = INT_MAX;
1289 res.outside_cost = INT_MAX;
1290 peeling_htab->traverse <_vect_peel_extended_info *,
1291 vect_peeling_hash_get_lowest_cost> (&res);
1293 else
1295 res.peel_info.count = 0;
1296 peeling_htab->traverse <_vect_peel_extended_info *,
1297 vect_peeling_hash_get_most_frequent> (&res);
1300 *npeel = res.peel_info.npeel;
1301 *body_cost_vec = res.body_cost_vec;
1302 return res.peel_info.dr;
1306 /* Function vect_enhance_data_refs_alignment
1308 This pass will use loop versioning and loop peeling in order to enhance
1309 the alignment of data references in the loop.
1311 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1312 original loop is to be vectorized. Any other loops that are created by
1313 the transformations performed in this pass are not supposed to be
1314 vectorized. This restriction will be relaxed.
1316 This pass will require a cost model to guide it whether to apply peeling
1317 or versioning or a combination of the two. For example, the scheme that
1318 Intel uses when given a loop with several memory accesses is as follows:
1319 choose one memory access ('p') whose alignment you want to force by doing
1320 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1321 other accesses are not necessarily aligned, or (2) use loop versioning to
1322 generate one loop in which all accesses are aligned, and another loop in
1323 which only 'p' is necessarily aligned.
1325 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1326 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1327 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1329 Devising a cost model is the most critical aspect of this work. It will
1330 guide us on which access to peel for, whether to use loop versioning, how
1331 many versions to create, etc. The cost model will probably consist of
1332 generic considerations as well as target specific considerations (on
1333 powerpc for example, misaligned stores are more painful than misaligned
1334 loads).
1336 Here are the general steps involved in alignment enhancements:
1338 -- original loop, before alignment analysis:
1339 for (i=0; i<N; i++){
1340 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1341 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1344 -- After vect_compute_data_refs_alignment:
1345 for (i=0; i<N; i++){
1346 x = q[i]; # DR_MISALIGNMENT(q) = 3
1347 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1350 -- Possibility 1: we do loop versioning:
1351 if (p is aligned) {
1352 for (i=0; i<N; i++){ # loop 1A
1353 x = q[i]; # DR_MISALIGNMENT(q) = 3
1354 p[i] = y; # DR_MISALIGNMENT(p) = 0
1357 else {
1358 for (i=0; i<N; i++){ # loop 1B
1359 x = q[i]; # DR_MISALIGNMENT(q) = 3
1360 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1364 -- Possibility 2: we do loop peeling:
1365 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1366 x = q[i];
1367 p[i] = y;
1369 for (i = 3; i < N; i++){ # loop 2A
1370 x = q[i]; # DR_MISALIGNMENT(q) = 0
1371 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1374 -- Possibility 3: combination of loop peeling and versioning:
1375 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1376 x = q[i];
1377 p[i] = y;
1379 if (p is aligned) {
1380 for (i = 3; i<N; i++){ # loop 3A
1381 x = q[i]; # DR_MISALIGNMENT(q) = 0
1382 p[i] = y; # DR_MISALIGNMENT(p) = 0
1385 else {
1386 for (i = 3; i<N; i++){ # loop 3B
1387 x = q[i]; # DR_MISALIGNMENT(q) = 0
1388 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1392 These loops are later passed to loop_transform to be vectorized. The
1393 vectorizer will use the alignment information to guide the transformation
1394 (whether to generate regular loads/stores, or with special handling for
1395 misalignment). */
1397 bool
1398 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1400 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1401 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1402 enum dr_alignment_support supportable_dr_alignment;
1403 struct data_reference *dr0 = NULL, *first_store = NULL;
1404 struct data_reference *dr;
1405 unsigned int i, j;
1406 bool do_peeling = false;
1407 bool do_versioning = false;
1408 bool stat;
1409 gimple *stmt;
1410 stmt_vec_info stmt_info;
1411 unsigned int npeel = 0;
1412 bool all_misalignments_unknown = true;
1413 unsigned int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1414 unsigned possible_npeel_number = 1;
1415 tree vectype;
1416 unsigned int nelements, mis, same_align_drs_max = 0;
1417 stmt_vector_for_cost body_cost_vec = stmt_vector_for_cost ();
1418 hash_table<peel_info_hasher> peeling_htab (1);
1420 if (dump_enabled_p ())
1421 dump_printf_loc (MSG_NOTE, vect_location,
1422 "=== vect_enhance_data_refs_alignment ===\n");
1424 /* Reset data so we can safely be called multiple times. */
1425 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1426 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1428 /* While cost model enhancements are expected in the future, the high level
1429 view of the code at this time is as follows:
1431 A) If there is a misaligned access then see if peeling to align
1432 this access can make all data references satisfy
1433 vect_supportable_dr_alignment. If so, update data structures
1434 as needed and return true.
1436 B) If peeling wasn't possible and there is a data reference with an
1437 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1438 then see if loop versioning checks can be used to make all data
1439 references satisfy vect_supportable_dr_alignment. If so, update
1440 data structures as needed and return true.
1442 C) If neither peeling nor versioning were successful then return false if
1443 any data reference does not satisfy vect_supportable_dr_alignment.
1445 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1447 Note, Possibility 3 above (which is peeling and versioning together) is not
1448 being done at this time. */
1450 /* (1) Peeling to force alignment. */
1452 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1453 Considerations:
1454 + How many accesses will become aligned due to the peeling
1455 - How many accesses will become unaligned due to the peeling,
1456 and the cost of misaligned accesses.
1457 - The cost of peeling (the extra runtime checks, the increase
1458 in code size). */
1460 FOR_EACH_VEC_ELT (datarefs, i, dr)
1462 stmt = DR_STMT (dr);
1463 stmt_info = vinfo_for_stmt (stmt);
1465 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1466 continue;
1468 /* For interleaving, only the alignment of the first access
1469 matters. */
1470 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1471 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1472 continue;
1474 /* For invariant accesses there is nothing to enhance. */
1475 if (integer_zerop (DR_STEP (dr)))
1476 continue;
1478 /* Strided accesses perform only component accesses, alignment is
1479 irrelevant for them. */
1480 if (STMT_VINFO_STRIDED_P (stmt_info)
1481 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1482 continue;
1484 supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1485 do_peeling = vector_alignment_reachable_p (dr);
1486 if (do_peeling)
1488 if (known_alignment_for_access_p (dr))
1490 unsigned int npeel_tmp;
1491 bool negative = tree_int_cst_compare (DR_STEP (dr),
1492 size_zero_node) < 0;
1494 /* Save info about DR in the hash table. */
1495 vectype = STMT_VINFO_VECTYPE (stmt_info);
1496 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1497 mis = DR_MISALIGNMENT (dr) / GET_MODE_SIZE (TYPE_MODE (
1498 TREE_TYPE (DR_REF (dr))));
1499 npeel_tmp = (negative
1500 ? (mis - nelements) : (nelements - mis))
1501 & (nelements - 1);
1503 /* For multiple types, it is possible that the bigger type access
1504 will have more than one peeling option. E.g., a loop with two
1505 types: one of size (vector size / 4), and the other one of
1506 size (vector size / 8). The vectorization factor will be 8. If both
1507 accesses are misaligned by 3, the first one needs one scalar
1508 iteration to be aligned, and the second one needs 5. But the
1509 first one will be aligned also by peeling 5 scalar
1510 iterations, and in that case both accesses will be aligned.
1511 Hence, except for the immediate peeling amount, we also want
1512 to try to add full vector size, while we don't exceed
1513 the vectorization factor.
1514 We do this automatically for the cost model, since we calculate the cost
1515 for every peeling option. */
1516 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1518 if (STMT_SLP_TYPE (stmt_info))
1519 possible_npeel_number
1520 = (vf * GROUP_SIZE (stmt_info)) / nelements;
1521 else
1522 possible_npeel_number = vf / nelements;
1525 /* Handle the aligned case. We may decide to align some other
1526 access, making DR unaligned. */
1527 if (DR_MISALIGNMENT (dr) == 0)
1529 npeel_tmp = 0;
1530 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1531 possible_npeel_number++;
1534 for (j = 0; j < possible_npeel_number; j++)
1536 vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1537 dr, npeel_tmp);
1538 npeel_tmp += nelements;
1541 all_misalignments_unknown = false;
1542 /* Data-ref that was chosen for the case that all the
1543 misalignments are unknown is not relevant anymore, since we
1544 have a data-ref with known alignment. */
1545 dr0 = NULL;
1547 else
1549 /* If we don't know any misalignment values, we prefer
1550 peeling for the data-ref that has the maximum number of data-refs
1551 with the same alignment, unless the target prefers to align
1552 stores over loads. */
1553 if (all_misalignments_unknown)
1555 unsigned same_align_drs
1556 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1557 if (!dr0
1558 || same_align_drs_max < same_align_drs)
1560 same_align_drs_max = same_align_drs;
1561 dr0 = dr;
1563 /* For data-refs with the same number of related
1564 accesses prefer the one where the misalign
1565 computation will be invariant in the outermost loop. */
1566 else if (same_align_drs_max == same_align_drs)
1568 struct loop *ivloop0, *ivloop;
1569 ivloop0 = outermost_invariant_loop_for_expr
1570 (loop, DR_BASE_ADDRESS (dr0));
1571 ivloop = outermost_invariant_loop_for_expr
1572 (loop, DR_BASE_ADDRESS (dr));
1573 if ((ivloop && !ivloop0)
1574 || (ivloop && ivloop0
1575 && flow_loop_nested_p (ivloop, ivloop0)))
1576 dr0 = dr;
1579 if (!first_store && DR_IS_WRITE (dr))
1580 first_store = dr;
1583 /* If there are both known and unknown misaligned accesses in the
1584 loop, we choose peeling amount according to the known
1585 accesses. */
1586 if (!supportable_dr_alignment)
1588 dr0 = dr;
1589 if (!first_store && DR_IS_WRITE (dr))
1590 first_store = dr;
1594 else
1596 if (!aligned_access_p (dr))
1598 if (dump_enabled_p ())
1599 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1600 "vector alignment may not be reachable\n");
1601 break;
1606 /* Check if we can possibly peel the loop. */
1607 if (!vect_can_advance_ivs_p (loop_vinfo)
1608 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1609 || loop->inner)
1610 do_peeling = false;
1612 if (do_peeling
1613 && all_misalignments_unknown
1614 && vect_supportable_dr_alignment (dr0, false))
1616 /* Check if the target prefers to align stores over loads, i.e., if
1617 misaligned stores are more expensive than misaligned loads (taking
1618 drs with same alignment into account). */
1619 if (first_store && DR_IS_READ (dr0))
1621 unsigned int load_inside_cost = 0, load_outside_cost = 0;
1622 unsigned int store_inside_cost = 0, store_outside_cost = 0;
1623 unsigned int load_inside_penalty = 0, load_outside_penalty = 0;
1624 unsigned int store_inside_penalty = 0, store_outside_penalty = 0;
1625 stmt_vector_for_cost dummy;
1626 dummy.create (2);
1628 vect_get_data_access_cost (dr0, &load_inside_cost, &load_outside_cost,
1629 &dummy);
1630 vect_get_data_access_cost (first_store, &store_inside_cost,
1631 &store_outside_cost, &dummy);
1633 dummy.release ();
1635 /* Calculate the penalty for leaving FIRST_STORE unaligned (by
1636 aligning the load DR0). */
1637 load_inside_penalty = store_inside_cost;
1638 load_outside_penalty = store_outside_cost;
1639 for (i = 0;
1640 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1641 DR_STMT (first_store))).iterate (i, &dr);
1642 i++)
1643 if (DR_IS_READ (dr))
1645 load_inside_penalty += load_inside_cost;
1646 load_outside_penalty += load_outside_cost;
1648 else
1650 load_inside_penalty += store_inside_cost;
1651 load_outside_penalty += store_outside_cost;
1654 /* Calculate the penalty for leaving DR0 unaligned (by
1655 aligning the FIRST_STORE). */
1656 store_inside_penalty = load_inside_cost;
1657 store_outside_penalty = load_outside_cost;
1658 for (i = 0;
1659 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1660 DR_STMT (dr0))).iterate (i, &dr);
1661 i++)
1662 if (DR_IS_READ (dr))
1664 store_inside_penalty += load_inside_cost;
1665 store_outside_penalty += load_outside_cost;
1667 else
1669 store_inside_penalty += store_inside_cost;
1670 store_outside_penalty += store_outside_cost;
1673 if (load_inside_penalty > store_inside_penalty
1674 || (load_inside_penalty == store_inside_penalty
1675 && load_outside_penalty > store_outside_penalty))
1676 dr0 = first_store;
1679 /* In case there are only loads with different unknown misalignments, use
1680 peeling only if it may help to align other accesses in the loop or
1681 if it may help improve load bandwidth when we'd end up using
1682 unaligned loads. */
1683 tree dr0_vt = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr0)));
1684 if (!first_store
1685 && !STMT_VINFO_SAME_ALIGN_REFS (
1686 vinfo_for_stmt (DR_STMT (dr0))).length ()
1687 && (vect_supportable_dr_alignment (dr0, false)
1688 != dr_unaligned_supported
1689 || (builtin_vectorization_cost (vector_load, dr0_vt, 0)
1690 == builtin_vectorization_cost (unaligned_load, dr0_vt, -1))))
1691 do_peeling = false;
1694 if (do_peeling && !dr0)
1696 /* Peeling is possible, but there is no data access that is not supported
1697 unless aligned. So we try to choose the best possible peeling. */
1699 /* We should get here only if there are drs with known misalignment. */
1700 gcc_assert (!all_misalignments_unknown);
1702 /* Choose the best peeling from the hash table. */
1703 dr0 = vect_peeling_hash_choose_best_peeling (&peeling_htab,
1704 loop_vinfo, &npeel,
1705 &body_cost_vec);
1706 if (!dr0 || !npeel)
1707 do_peeling = false;
1710 if (do_peeling)
1712 stmt = DR_STMT (dr0);
1713 stmt_info = vinfo_for_stmt (stmt);
1714 vectype = STMT_VINFO_VECTYPE (stmt_info);
1715 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1717 if (known_alignment_for_access_p (dr0))
1719 bool negative = tree_int_cst_compare (DR_STEP (dr0),
1720 size_zero_node) < 0;
1721 if (!npeel)
1723 /* Since it's known at compile time, compute the number of
1724 iterations in the peeled loop (the peeling factor) for use in
1725 updating DR_MISALIGNMENT values. The peeling factor is the
1726 vectorization factor minus the misalignment as an element
1727 count. */
1728 mis = DR_MISALIGNMENT (dr0);
1729 mis /= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr0))));
1730 npeel = ((negative ? mis - nelements : nelements - mis)
1731 & (nelements - 1));
1734 /* For interleaved data access every iteration accesses all the
1735 members of the group, therefore we divide the number of iterations
1736 by the group size. */
1737 stmt_info = vinfo_for_stmt (DR_STMT (dr0));
1738 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1739 npeel /= GROUP_SIZE (stmt_info);
1741 if (dump_enabled_p ())
1742 dump_printf_loc (MSG_NOTE, vect_location,
1743 "Try peeling by %d\n", npeel);
1746 /* Ensure that all data refs can be vectorized after the peel. */
1747 FOR_EACH_VEC_ELT (datarefs, i, dr)
1749 int save_misalignment;
1751 if (dr == dr0)
1752 continue;
1754 stmt = DR_STMT (dr);
1755 stmt_info = vinfo_for_stmt (stmt);
1756 /* For interleaving, only the alignment of the first access
1757 matters. */
1758 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1759 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1760 continue;
1762 /* Strided accesses perform only component accesses, alignment is
1763 irrelevant for them. */
1764 if (STMT_VINFO_STRIDED_P (stmt_info)
1765 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1766 continue;
1768 save_misalignment = DR_MISALIGNMENT (dr);
1769 vect_update_misalignment_for_peel (dr, dr0, npeel);
1770 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1771 SET_DR_MISALIGNMENT (dr, save_misalignment);
1773 if (!supportable_dr_alignment)
1775 do_peeling = false;
1776 break;
1780 if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
1782 stat = vect_verify_datarefs_alignment (loop_vinfo);
1783 if (!stat)
1784 do_peeling = false;
1785 else
1787 body_cost_vec.release ();
1788 return stat;
1792 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
1793 if (do_peeling)
1795 unsigned max_allowed_peel
1796 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
1797 if (max_allowed_peel != (unsigned)-1)
1799 unsigned max_peel = npeel;
1800 if (max_peel == 0)
1802 gimple *dr_stmt = DR_STMT (dr0);
1803 stmt_vec_info vinfo = vinfo_for_stmt (dr_stmt);
1804 tree vtype = STMT_VINFO_VECTYPE (vinfo);
1805 max_peel = TYPE_VECTOR_SUBPARTS (vtype) - 1;
1807 if (max_peel > max_allowed_peel)
1809 do_peeling = false;
1810 if (dump_enabled_p ())
1811 dump_printf_loc (MSG_NOTE, vect_location,
1812 "Disable peeling, max peels reached: %d\n", max_peel);
1817 /* Cost model #2 - if peeling may result in a remaining loop not
1818 iterating enough to be vectorized then do not peel. */
1819 if (do_peeling
1820 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1822 unsigned max_peel
1823 = npeel == 0 ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1 : npeel;
1824 if (LOOP_VINFO_INT_NITERS (loop_vinfo)
1825 < LOOP_VINFO_VECT_FACTOR (loop_vinfo) + max_peel)
1826 do_peeling = false;
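          /* For example (illustrative numbers only): with a vectorization
             factor of 4 and npeel == 0, max_peel is 3, so a loop known to
             iterate fewer than 4 + 3 == 7 times is not peeled.  */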
1829 if (do_peeling)
1831 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
1832 If the misalignment of DR_i is identical to that of dr0 then set
1833 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
1834 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
1835 by the peeling factor times the element size of DR_i (MOD the
1836 vectorization factor times the size). Otherwise, the
1837 misalignment of DR_i must be set to unknown. */
1838 FOR_EACH_VEC_ELT (datarefs, i, dr)
1839 if (dr != dr0)
1841 /* Strided accesses perform only component accesses, alignment
1842 is irrelevant for them. */
1843 stmt_info = vinfo_for_stmt (DR_STMT (dr));
1844 if (STMT_VINFO_STRIDED_P (stmt_info)
1845 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1846 continue;
1848 vect_update_misalignment_for_peel (dr, dr0, npeel);
1851 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
1852 if (npeel)
1853 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
1854 else
1855 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1856 = DR_MISALIGNMENT (dr0);
1857 SET_DR_MISALIGNMENT (dr0, 0);
1858 if (dump_enabled_p ())
1860 dump_printf_loc (MSG_NOTE, vect_location,
1861 "Alignment of access forced using peeling.\n");
1862 dump_printf_loc (MSG_NOTE, vect_location,
1863 "Peeling for alignment will be applied.\n");
1865 /* The inside-loop cost will be accounted for in vectorizable_load
1866 and vectorizable_store correctly with adjusted alignments.
1867 Drop the body_cost_vec on the floor here. */
1868 body_cost_vec.release ();
1870 stat = vect_verify_datarefs_alignment (loop_vinfo);
1871 gcc_assert (stat);
1872 return stat;
1876 body_cost_vec.release ();
1878 /* (2) Versioning to force alignment. */
1880 /* Try versioning if:
1881 1) optimize loop for speed
1882 2) there is at least one unsupported misaligned data ref with an unknown
1883 misalignment, and
1884 3) all misaligned data refs with a known misalignment are supported, and
1885 4) the number of runtime alignment checks is within reason. */
1887 do_versioning =
1888 optimize_loop_nest_for_speed_p (loop)
1889 && (!loop->inner); /* FORNOW */
1891 if (do_versioning)
1893 FOR_EACH_VEC_ELT (datarefs, i, dr)
1895 stmt = DR_STMT (dr);
1896 stmt_info = vinfo_for_stmt (stmt);
1898 /* For interleaving, only the alignment of the first access
1899 matters. */
1900 if (aligned_access_p (dr)
1901 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1902 && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
1903 continue;
1905 if (STMT_VINFO_STRIDED_P (stmt_info))
1907 /* Strided loads perform only component accesses, alignment is
1908 irrelevant for them. */
1909 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
1910 continue;
1911 do_versioning = false;
1912 break;
1915 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1917 if (!supportable_dr_alignment)
1919 gimple *stmt;
1920 int mask;
1921 tree vectype;
1923 if (known_alignment_for_access_p (dr)
1924 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
1925 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
1927 do_versioning = false;
1928 break;
1931 stmt = DR_STMT (dr);
1932 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
1933 gcc_assert (vectype);
1935 /* The rightmost bits of an aligned address must be zeros.
1936 Construct the mask needed for this test. For example,
1937 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
1938 mask must be 15 = 0xf. */
1939 mask = GET_MODE_SIZE (TYPE_MODE (vectype)) - 1;
1941 /* FORNOW: use the same mask to test all potentially unaligned
1942 references in the loop. The vectorizer currently supports
1943 a single vector size, see the reference to
1944 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
1945 vectorization factor is computed. */
1946 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
1947 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
1948 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
1949 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
1950 DR_STMT (dr));
1954 /* Versioning requires at least one misaligned data reference. */
1955 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
1956 do_versioning = false;
1957 else if (!do_versioning)
1958 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1961 if (do_versioning)
1963 vec<gimple *> may_misalign_stmts
1964 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
1965 gimple *stmt;
1967 /* It can now be assumed that the data references in the statements
1968 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
1969 of the loop being vectorized. */
1970 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
1972 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1973 dr = STMT_VINFO_DATA_REF (stmt_info);
1974 SET_DR_MISALIGNMENT (dr, 0);
1975 if (dump_enabled_p ())
1976 dump_printf_loc (MSG_NOTE, vect_location,
1977 "Alignment of access forced using versioning.\n");
1980 if (dump_enabled_p ())
1981 dump_printf_loc (MSG_NOTE, vect_location,
1982 "Versioning for alignment will be applied.\n");
1984 /* Peeling and versioning can't be done together at this time. */
1985 gcc_assert (! (do_peeling && do_versioning));
1987 stat = vect_verify_datarefs_alignment (loop_vinfo);
1988 gcc_assert (stat);
1989 return stat;
1992 /* This point is reached if neither peeling nor versioning is being done. */
1993 gcc_assert (! (do_peeling || do_versioning));
1995 stat = vect_verify_datarefs_alignment (loop_vinfo);
1996 return stat;
2000 /* Function vect_find_same_alignment_drs.
2002 Update group and alignment relations according to the chosen
2003 vectorization factor. */
2005 static void
2006 vect_find_same_alignment_drs (struct data_dependence_relation *ddr,
2007 loop_vec_info loop_vinfo)
2009 unsigned int i;
2010 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2011 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2012 struct data_reference *dra = DDR_A (ddr);
2013 struct data_reference *drb = DDR_B (ddr);
2014 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2015 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2016 int dra_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dra))));
2017 int drb_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (drb))));
2018 lambda_vector dist_v;
2019 unsigned int loop_depth;
2021 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
2022 return;
2024 if (dra == drb)
2025 return;
2027 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
2028 return;
2030 /* Loop-based vectorization and known data dependence. */
2031 if (DDR_NUM_DIST_VECTS (ddr) == 0)
2032 return;
2034 /* Data-dependence analysis reports a distance vector of zero
2035 for data-references that overlap only in the first iteration
2036 but have different sign step (see PR45764).
2037 So as a sanity check require equal DR_STEP. */
2038 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2039 return;
2041 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
2042 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
2044 int dist = dist_v[loop_depth];
2046 if (dump_enabled_p ())
2047 dump_printf_loc (MSG_NOTE, vect_location,
2048 "dependence distance = %d.\n", dist);
2050 /* Same loop iteration. */
2051 if (dist == 0
2052 || (dist % vectorization_factor == 0 && dra_size == drb_size))
2054 /* Two references with distance zero, or a multiple of the vectorization factor with equal element size, have the same alignment. */
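              /* For example (hypothetical): with a vectorization factor of 4
                 and 4-byte elements, accesses to a[i] and a[i+4] have a
                 dependence distance of 4; 4 % 4 == 0 and the sizes match, so
                 the two refs are recorded as having the same alignment.  */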
2055 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2056 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
2057 if (dump_enabled_p ())
2059 dump_printf_loc (MSG_NOTE, vect_location,
2060 "accesses have the same alignment.\n");
2061 dump_printf (MSG_NOTE,
2062 "dependence distance modulo vf == 0 between ");
2063 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2064 dump_printf (MSG_NOTE, " and ");
2065 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2066 dump_printf (MSG_NOTE, "\n");
2073 /* Function vect_analyze_data_refs_alignment
2075 Analyze the alignment of the data-references in the loop.
2076 Return FALSE if a data reference is found that cannot be vectorized. */
2078 bool
2079 vect_analyze_data_refs_alignment (loop_vec_info vinfo)
2081 if (dump_enabled_p ())
2082 dump_printf_loc (MSG_NOTE, vect_location,
2083 "=== vect_analyze_data_refs_alignment ===\n");
2085 /* Mark groups of data references with the same alignment using
2086 data dependence information. */
2087 vec<ddr_p> ddrs = vinfo->ddrs;
2088 struct data_dependence_relation *ddr;
2089 unsigned int i;
2091 FOR_EACH_VEC_ELT (ddrs, i, ddr)
2092 vect_find_same_alignment_drs (ddr, vinfo);
2094 vec<data_reference_p> datarefs = vinfo->datarefs;
2095 struct data_reference *dr;
2097 FOR_EACH_VEC_ELT (datarefs, i, dr)
2099 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
2100 if (STMT_VINFO_VECTORIZABLE (stmt_info)
2101 && !vect_compute_data_ref_alignment (dr))
2103 /* Strided accesses perform only component accesses, misalignment
2104 information is irrelevant for them. */
2105 if (STMT_VINFO_STRIDED_P (stmt_info)
2106 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2107 continue;
2109 if (dump_enabled_p ())
2110 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2111 "not vectorized: can't calculate alignment "
2112 "for data ref.\n");
2114 return false;
2118 return true;
2122 /* Analyze alignment of DRs of stmts in NODE. */
2124 static bool
2125 vect_slp_analyze_and_verify_node_alignment (slp_tree node)
2127 /* We vectorize from the first scalar stmt in the node unless
2128 the node is permuted in which case we start from the first
2129 element in the group. */
2130 gimple *first_stmt = SLP_TREE_SCALAR_STMTS (node)[0];
2131 data_reference_p first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2132 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2133 first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_stmt));
2135 data_reference_p dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2136 if (! vect_compute_data_ref_alignment (dr)
2137 /* For creating the data-ref pointer we need alignment of the
2138 first element anyway. */
2139 || (dr != first_dr
2140 && ! vect_compute_data_ref_alignment (first_dr))
2141 || ! verify_data_ref_alignment (dr))
2143 if (dump_enabled_p ())
2144 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2145 "not vectorized: bad data alignment in basic "
2146 "block.\n");
2147 return false;
2150 return true;
2153 /* Function vect_slp_analyze_and_verify_instance_alignment
2155 Analyze the alignment of the data-references in the SLP instance.
2156 Return FALSE if a data reference is found that cannot be vectorized. */
2158 bool
2159 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
2161 if (dump_enabled_p ())
2162 dump_printf_loc (MSG_NOTE, vect_location,
2163 "=== vect_slp_analyze_and_verify_instance_alignment ===\n");
2165 slp_tree node;
2166 unsigned i;
2167 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2168 if (! vect_slp_analyze_and_verify_node_alignment (node))
2169 return false;
2171 node = SLP_INSTANCE_TREE (instance);
2172 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]))
2173 && ! vect_slp_analyze_and_verify_node_alignment
2174 (SLP_INSTANCE_TREE (instance)))
2175 return false;
2177 return true;
2181 /* Analyze groups of accesses: check that DR belongs to a group of
2182 accesses of legal size, step, etc. Detect gaps, single element
2183 interleaving, and other special cases. Set grouped access info.
2184 Collect groups of strided stores for further use in SLP analysis.
2185 Worker for vect_analyze_group_access. */
2187 static bool
2188 vect_analyze_group_access_1 (struct data_reference *dr)
2190 tree step = DR_STEP (dr);
2191 tree scalar_type = TREE_TYPE (DR_REF (dr));
2192 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2193 gimple *stmt = DR_STMT (dr);
2194 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2195 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2196 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2197 HOST_WIDE_INT dr_step = -1;
2198 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2199 bool slp_impossible = false;
2201 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2202 size of the interleaving group (including gaps). */
2203 if (tree_fits_shwi_p (step))
2205 dr_step = tree_to_shwi (step);
2206 /* Check that STEP is a multiple of type size. Otherwise there is
2207 a non-element-sized gap at the end of the group which we
2208 cannot represent in GROUP_GAP or GROUP_SIZE.
2209 ??? As we can handle non-constant step fine here we should
2210 simply remove uses of GROUP_GAP between the last and first
2211 element and instead rely on DR_STEP. GROUP_SIZE then would
2212 simply not include that gap. */
2213 if ((dr_step % type_size) != 0)
2215 if (dump_enabled_p ())
2217 dump_printf_loc (MSG_NOTE, vect_location,
2218 "Step ");
2219 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2220 dump_printf (MSG_NOTE,
2221 " is not a multiple of the element size for ");
2222 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2223 dump_printf (MSG_NOTE, "\n");
2225 return false;
2227 groupsize = absu_hwi (dr_step) / type_size;
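          /* E.g. (illustrative): a 4-byte element accessed with a constant
             step of 16 bytes gives groupsize == 16 / 4 == 4, i.e. a group
             of four elements per iteration, gaps included.  */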
2229 else
2230 groupsize = 0;
2232 /* A non-consecutive access is possible only if it is part of an interleaving group. */
2233 if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2235 /* Check whether this DR is part of an interleaving group, and is a single
2236 element of the group that is accessed in the loop. */
2238 /* Gaps are supported only for loads. STEP must be a multiple of the type
2239 size. The size of the group must be a power of 2. */
2240 if (DR_IS_READ (dr)
2241 && (dr_step % type_size) == 0
2242 && groupsize > 0
2243 && exact_log2 (groupsize) != -1)
2245 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2246 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2247 GROUP_GAP (stmt_info) = groupsize - 1;
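          /* E.g. (illustrative): a load of a[4*i] with 4-byte elements has
             step 16, so groupsize == 4 (a power of two) and GROUP_GAP == 3;
             it is accepted here as single element interleaving.  */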
2248 if (dump_enabled_p ())
2250 dump_printf_loc (MSG_NOTE, vect_location,
2251 "Detected single element interleaving ");
2252 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2253 dump_printf (MSG_NOTE, " step ");
2254 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2255 dump_printf (MSG_NOTE, "\n");
2258 return true;
2261 if (dump_enabled_p ())
2263 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2264 "not consecutive access ");
2265 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2268 if (bb_vinfo)
2270 /* Mark the statement as unvectorizable. */
2271 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2272 return true;
2275 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2276 STMT_VINFO_STRIDED_P (stmt_info) = true;
2277 return true;
2280 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2282 /* First stmt in the interleaving chain. Check the chain. */
2283 gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2284 struct data_reference *data_ref = dr;
2285 unsigned int count = 1;
2286 tree prev_init = DR_INIT (data_ref);
2287 gimple *prev = stmt;
2288 HOST_WIDE_INT diff, gaps = 0;
2290 while (next)
2292 /* Skip same data-refs. In case that two or more stmts share
2293 data-ref (supported only for loads), we vectorize only the first
2294 stmt, and the rest get their vectorized loads from the first
2295 one. */
2296 if (!tree_int_cst_compare (DR_INIT (data_ref),
2297 DR_INIT (STMT_VINFO_DATA_REF (
2298 vinfo_for_stmt (next)))))
2300 if (DR_IS_WRITE (data_ref))
2302 if (dump_enabled_p ())
2303 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2304 "Two store stmts share the same dr.\n");
2305 return false;
2308 if (dump_enabled_p ())
2309 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2310 "Two or more load stmts share the same dr.\n");
2312 /* For load use the same data-ref load. */
2313 GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2315 prev = next;
2316 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2317 continue;
2320 prev = next;
2321 data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2323 /* All group members have the same STEP by construction. */
2324 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2326 /* Check that the distance between two accesses is equal to the type
2327 size. Otherwise, we have gaps. */
2328 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2329 - TREE_INT_CST_LOW (prev_init)) / type_size;
2330 if (diff != 1)
2332 /* FORNOW: SLP of accesses with gaps is not supported. */
2333 slp_impossible = true;
2334 if (DR_IS_WRITE (data_ref))
2336 if (dump_enabled_p ())
2337 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2338 "interleaved store with gaps\n");
2339 return false;
2342 gaps += diff - 1;
2345 last_accessed_element += diff;
2347 /* Store the gap from the previous member of the group. If there is no
2348 gap in the access, GROUP_GAP is always 1. */
2349 GROUP_GAP (vinfo_for_stmt (next)) = diff;
2351 prev_init = DR_INIT (data_ref);
2352 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2353 /* Count the number of data-refs in the chain. */
2354 count++;
2357 if (groupsize == 0)
2358 groupsize = count + gaps;
2360 if (groupsize > UINT_MAX)
2362 if (dump_enabled_p ())
2363 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2364 "group is too large\n");
2365 return false;
2368 /* Check that the size of the interleaving is equal to count for stores,
2369 i.e., that there are no gaps. */
2370 if (groupsize != count
2371 && !DR_IS_READ (dr))
2373 if (dump_enabled_p ())
2374 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2375 "interleaved store with gaps\n");
2376 return false;
2379 /* If there is a gap after the last load in the group it is the
2380 difference between the groupsize and the last accessed
2381 element.
2382 When there is no gap, this difference should be 0. */
2383 GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - last_accessed_element;
2385 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
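      /* E.g. (illustrative): two loads a[3*i] and a[3*i+1] with 4-byte
         elements and step 12 give groupsize == 3 and last_accessed_element
         == 2, so GROUP_GAP for the first stmt becomes 3 - 2 == 1, the one
         element left unread at the end of each group.  */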
2386 if (dump_enabled_p ())
2388 dump_printf_loc (MSG_NOTE, vect_location,
2389 "Detected interleaving ");
2390 if (DR_IS_READ (dr))
2391 dump_printf (MSG_NOTE, "load ");
2392 else
2393 dump_printf (MSG_NOTE, "store ");
2394 dump_printf (MSG_NOTE, "of size %u starting with ",
2395 (unsigned)groupsize);
2396 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
2397 if (GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
2398 dump_printf_loc (MSG_NOTE, vect_location,
2399 "There is a gap of %u elements after the group\n",
2400 GROUP_GAP (vinfo_for_stmt (stmt)));
2403 /* SLP: create an SLP data structure for every interleaving group of
2404 stores for further analysis in vect_analyze_slp. */
2405 if (DR_IS_WRITE (dr) && !slp_impossible)
2407 if (loop_vinfo)
2408 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2409 if (bb_vinfo)
2410 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2414 return true;
2417 /* Analyze groups of accesses: check that DR belongs to a group of
2418 accesses of legal size, step, etc. Detect gaps, single element
2419 interleaving, and other special cases. Set grouped access info.
2420 Collect groups of strided stores for further use in SLP analysis. */
2422 static bool
2423 vect_analyze_group_access (struct data_reference *dr)
2425 if (!vect_analyze_group_access_1 (dr))
2427 /* Dissolve the group if present. */
2428 gimple *next;
2429 gimple *stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dr)));
2430 while (stmt)
2432 stmt_vec_info vinfo = vinfo_for_stmt (stmt);
2433 next = GROUP_NEXT_ELEMENT (vinfo);
2434 GROUP_FIRST_ELEMENT (vinfo) = NULL;
2435 GROUP_NEXT_ELEMENT (vinfo) = NULL;
2436 stmt = next;
2438 return false;
2440 return true;
2443 /* Analyze the access pattern of the data-reference DR.
2444 In case of non-consecutive accesses call vect_analyze_group_access() to
2445 analyze groups of accesses. */
2447 static bool
2448 vect_analyze_data_ref_access (struct data_reference *dr)
2450 tree step = DR_STEP (dr);
2451 tree scalar_type = TREE_TYPE (DR_REF (dr));
2452 gimple *stmt = DR_STMT (dr);
2453 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2454 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2455 struct loop *loop = NULL;
2457 if (loop_vinfo)
2458 loop = LOOP_VINFO_LOOP (loop_vinfo);
2460 if (loop_vinfo && !step)
2462 if (dump_enabled_p ())
2463 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2464 "bad data-ref access in loop\n");
2465 return false;
2468 /* Allow loads with zero step in inner-loop vectorization. */
2469 if (loop_vinfo && integer_zerop (step))
2471 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2472 if (!nested_in_vect_loop_p (loop, stmt))
2473 return DR_IS_READ (dr);
2474 /* Allow references with zero step for outer loops marked
2475 with pragma omp simd only - it guarantees absence of
2476 loop-carried dependencies between inner loop iterations. */
2477 if (!loop->force_vectorize)
2479 if (dump_enabled_p ())
2480 dump_printf_loc (MSG_NOTE, vect_location,
2481 "zero step in inner loop of nest\n");
2482 return false;
2486 if (loop && nested_in_vect_loop_p (loop, stmt))
2488 /* Interleaved accesses are not yet supported within outer-loop
2489 vectorization for references in the inner-loop. */
2490 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2492 /* For the rest of the analysis we use the outer-loop step. */
2493 step = STMT_VINFO_DR_STEP (stmt_info);
2494 if (integer_zerop (step))
2496 if (dump_enabled_p ())
2497 dump_printf_loc (MSG_NOTE, vect_location,
2498 "zero step in outer loop.\n");
2499 return DR_IS_READ (dr);
2503 /* Consecutive? */
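  /* E.g. (illustrative): a 4-byte int accessed with DR_STEP 4 is
     consecutive; DR_STEP -4 also passes the test below and is handled as a
     reversed consecutive access.  */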
2504 if (TREE_CODE (step) == INTEGER_CST)
2506 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2507 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2508 || (dr_step < 0
2509 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2511 /* Mark that it is not interleaving. */
2512 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2513 return true;
2517 if (loop && nested_in_vect_loop_p (loop, stmt))
2519 if (dump_enabled_p ())
2520 dump_printf_loc (MSG_NOTE, vect_location,
2521 "grouped access in outer loop.\n");
2522 return false;
2526 /* Assume this is a DR handled by non-constant strided load case. */
2527 if (TREE_CODE (step) != INTEGER_CST)
2528 return (STMT_VINFO_STRIDED_P (stmt_info)
2529 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2530 || vect_analyze_group_access (dr)));
2532 /* Non-consecutive access - check whether it is part of an interleaving group. */
2533 return vect_analyze_group_access (dr);
2538 /* A helper function used in the comparator function to sort data
2539 references.  T1 and T2 are the two trees to be compared.
2540 The function returns -1, 0, or 1. */
2542 static int
2543 compare_tree (tree t1, tree t2)
2545 int i, cmp;
2546 enum tree_code code;
2547 char tclass;
2549 if (t1 == t2)
2550 return 0;
2551 if (t1 == NULL)
2552 return -1;
2553 if (t2 == NULL)
2554 return 1;
2556 STRIP_NOPS (t1);
2557 STRIP_NOPS (t2);
2559 if (TREE_CODE (t1) != TREE_CODE (t2))
2560 return TREE_CODE (t1) < TREE_CODE (t2) ? -1 : 1;
2562 code = TREE_CODE (t1);
2563 switch (code)
2565 /* For const values, we can just use hash values for comparisons. */
2566 case INTEGER_CST:
2567 case REAL_CST:
2568 case FIXED_CST:
2569 case STRING_CST:
2570 case COMPLEX_CST:
2571 case VECTOR_CST:
2573 hashval_t h1 = iterative_hash_expr (t1, 0);
2574 hashval_t h2 = iterative_hash_expr (t2, 0);
2575 if (h1 != h2)
2576 return h1 < h2 ? -1 : 1;
2577 break;
2580 case SSA_NAME:
2581 cmp = compare_tree (SSA_NAME_VAR (t1), SSA_NAME_VAR (t2));
2582 if (cmp != 0)
2583 return cmp;
2585 if (SSA_NAME_VERSION (t1) != SSA_NAME_VERSION (t2))
2586 return SSA_NAME_VERSION (t1) < SSA_NAME_VERSION (t2) ? -1 : 1;
2587 break;
2589 default:
2590 tclass = TREE_CODE_CLASS (code);
2592 /* For var-decl, we could compare their UIDs. */
2593 if (tclass == tcc_declaration)
2595 if (DECL_UID (t1) != DECL_UID (t2))
2596 return DECL_UID (t1) < DECL_UID (t2) ? -1 : 1;
2597 break;
2600 /* For expressions with operands, compare their operands recursively. */
2601 for (i = TREE_OPERAND_LENGTH (t1) - 1; i >= 0; --i)
2603 cmp = compare_tree (TREE_OPERAND (t1, i), TREE_OPERAND (t2, i));
2604 if (cmp != 0)
2605 return cmp;
2609 return 0;
2613 /* Compare two data references DRA and DRB so that they can be ordered
2614 into chunks suitable for grouping. */
2616 static int
2617 dr_group_sort_cmp (const void *dra_, const void *drb_)
2619 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2620 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2621 int cmp;
2623 /* Stabilize sort. */
2624 if (dra == drb)
2625 return 0;
2627 /* DRs in different loops never belong to the same group. */
2628 loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father;
2629 loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father;
2630 if (loopa != loopb)
2631 return loopa->num < loopb->num ? -1 : 1;
2633 /* Ordering of DRs according to base. */
2634 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0))
2636 cmp = compare_tree (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb));
2637 if (cmp != 0)
2638 return cmp;
2641 /* And according to DR_OFFSET. */
2642 if (!dr_equal_offsets_p (dra, drb))
2644 cmp = compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2645 if (cmp != 0)
2646 return cmp;
2649 /* Put reads before writes. */
2650 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2651 return DR_IS_READ (dra) ? -1 : 1;
2653 /* Then sort by access size. */
2654 if (!operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2655 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))), 0))
2657 cmp = compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2658 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2659 if (cmp != 0)
2660 return cmp;
2663 /* And by step. */
2664 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2666 cmp = compare_tree (DR_STEP (dra), DR_STEP (drb));
2667 if (cmp != 0)
2668 return cmp;
2671 /* Then sort by DR_INIT. In case of identical DRs, sort by stmt UID. */
2672 cmp = tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb));
2673 if (cmp == 0)
2674 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2675 return cmp;
2678 /* Function vect_analyze_data_ref_accesses.
2680 Analyze the access pattern of all the data references in the loop.
2682 FORNOW: the only access pattern that is considered vectorizable is a
2683 simple step 1 (consecutive) access.
2685 FORNOW: handle only arrays and pointer accesses. */
2687 bool
2688 vect_analyze_data_ref_accesses (vec_info *vinfo)
2690 unsigned int i;
2691 vec<data_reference_p> datarefs = vinfo->datarefs;
2692 struct data_reference *dr;
2694 if (dump_enabled_p ())
2695 dump_printf_loc (MSG_NOTE, vect_location,
2696 "=== vect_analyze_data_ref_accesses ===\n");
2698 if (datarefs.is_empty ())
2699 return true;
2701 /* Sort the array of datarefs to make building the interleaving chains
2702 linear. Don't modify the original vector's order; it is needed for
2703 determining what dependencies are reversed. */
2704 vec<data_reference_p> datarefs_copy = datarefs.copy ();
2705 datarefs_copy.qsort (dr_group_sort_cmp);
2707 /* Build the interleaving chains. */
2708 for (i = 0; i < datarefs_copy.length () - 1;)
2710 data_reference_p dra = datarefs_copy[i];
2711 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2712 stmt_vec_info lastinfo = NULL;
2713 for (i = i + 1; i < datarefs_copy.length (); ++i)
2715 data_reference_p drb = datarefs_copy[i];
2716 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2718 /* ??? Imperfect sorting (non-compatible types, non-modulo
2719 accesses, same accesses) can cause a group to be artificially
2720 split here as we don't just skip over those. If it really
2721 matters we can push those to a worklist and re-iterate
2722 over them. Then we can just skip ahead to the next DR here. */
2724 /* DRs in a different loop should not be put into the same
2725 interleaving group. */
2726 if (gimple_bb (DR_STMT (dra))->loop_father
2727 != gimple_bb (DR_STMT (drb))->loop_father)
2728 break;
2730 /* Check that the data-refs have the same first location (except init)
2731 and they are both either store or load (not load and store,
2732 not masked loads or stores). */
2733 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2734 || !operand_equal_p (DR_BASE_ADDRESS (dra),
2735 DR_BASE_ADDRESS (drb), 0)
2736 || !dr_equal_offsets_p (dra, drb)
2737 || !gimple_assign_single_p (DR_STMT (dra))
2738 || !gimple_assign_single_p (DR_STMT (drb)))
2739 break;
2741 /* Check that the data-refs have the same constant size. */
2742 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2743 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2744 if (!tree_fits_uhwi_p (sza)
2745 || !tree_fits_uhwi_p (szb)
2746 || !tree_int_cst_equal (sza, szb))
2747 break;
2749 /* Check that the data-refs have the same step. */
2750 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2751 break;
2753 /* Do not place the same access in the interleaving chain twice. */
2754 if (tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb)) == 0)
2755 break;
2757 /* Check the types are compatible.
2758 ??? We don't distinguish this during sorting. */
2759 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2760 TREE_TYPE (DR_REF (drb))))
2761 break;
2763 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
2764 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2765 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2766 gcc_assert (init_a <= init_b);
2768 /* If init_b == init_a + the size of the type * k, we have an
2769 interleaving, and DRA is accessed before DRB. */
2770 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
2771 if (type_size_a == 0
2772 || (init_b - init_a) % type_size_a != 0)
2773 break;
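          /* E.g. (illustrative): two 4-byte loads with DR_INITs 0 and 8
             satisfy (8 - 0) % 4 == 0 and may stay in one group, with a gap
             element in between; inits 0 and 6 would split the group here.  */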
2775 /* If we have a store, the accesses are adjacent. This splits
2776 groups into chunks we support (we don't support vectorization
2777 of stores with gaps). */
2778 if (!DR_IS_READ (dra)
2779 && (init_b - (HOST_WIDE_INT) TREE_INT_CST_LOW
2780 (DR_INIT (datarefs_copy[i-1]))
2781 != type_size_a))
2782 break;
2784 /* If the step is a non-zero compile-time constant, require it to be
2785 greater than the difference between the data-refs' inits; otherwise
2786 split the group here so that group sizes stay suitable. */
2787 if (tree_fits_shwi_p (DR_STEP (dra)))
2789 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
2790 if (step != 0 && step <= (init_b - init_a))
2791 break;
2794 if (dump_enabled_p ())
2796 dump_printf_loc (MSG_NOTE, vect_location,
2797 "Detected interleaving ");
2798 if (DR_IS_READ (dra))
2799 dump_printf (MSG_NOTE, "load ");
2800 else
2801 dump_printf (MSG_NOTE, "store ");
2802 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2803 dump_printf (MSG_NOTE, " and ");
2804 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2805 dump_printf (MSG_NOTE, "\n");
2808 /* Link the found element into the group list. */
2809 if (!GROUP_FIRST_ELEMENT (stmtinfo_a))
2811 GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
2812 lastinfo = stmtinfo_a;
2814 GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
2815 GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
2816 lastinfo = stmtinfo_b;
2820 FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
2821 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
2822 && !vect_analyze_data_ref_access (dr))
2824 if (dump_enabled_p ())
2825 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2826 "not vectorized: complicated access pattern.\n");
2828 if (is_a <bb_vec_info> (vinfo))
2830 /* Mark the statement as not vectorizable. */
2831 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2832 continue;
2834 else
2836 datarefs_copy.release ();
2837 return false;
2841 datarefs_copy.release ();
2842 return true;
2846 /* Operator == between two dr_with_seg_len objects.
2848 This equality operator is used to make sure two data refs
2849 are the same, so that we will consider combining the
2850 aliasing checks of those two pairs of data-dependent data
2851 refs. */
2853 static bool
2854 operator == (const dr_with_seg_len& d1,
2855 const dr_with_seg_len& d2)
2857 return operand_equal_p (DR_BASE_ADDRESS (d1.dr),
2858 DR_BASE_ADDRESS (d2.dr), 0)
2859 && compare_tree (DR_OFFSET (d1.dr), DR_OFFSET (d2.dr)) == 0
2860 && compare_tree (DR_INIT (d1.dr), DR_INIT (d2.dr)) == 0
2861 && compare_tree (d1.seg_len, d2.seg_len) == 0;
2864 /* Function comp_dr_with_seg_len_pair.
2866 Comparison function for sorting objects of dr_with_seg_len_pair_t
2867 so that we can combine aliasing checks in one scan. */
2869 static int
2870 comp_dr_with_seg_len_pair (const void *pa_, const void *pb_)
2872 const dr_with_seg_len_pair_t* pa = (const dr_with_seg_len_pair_t *) pa_;
2873 const dr_with_seg_len_pair_t* pb = (const dr_with_seg_len_pair_t *) pb_;
2874 const dr_with_seg_len &a1 = pa->first, &a2 = pa->second;
2875 const dr_with_seg_len &b1 = pb->first, &b2 = pb->second;
2877 /* For DR pairs (a, b) and (c, d), we only consider merging the alias checks
2878 if a and c have the same base address and step, and b and d have the same
2879 address and step. Therefore, if either a&c or b&d don't have the same address
2880 and step, we don't care about the order of those two pairs after sorting. */
2881 int comp_res;
2883 if ((comp_res = compare_tree (DR_BASE_ADDRESS (a1.dr),
2884 DR_BASE_ADDRESS (b1.dr))) != 0)
2885 return comp_res;
2886 if ((comp_res = compare_tree (DR_BASE_ADDRESS (a2.dr),
2887 DR_BASE_ADDRESS (b2.dr))) != 0)
2888 return comp_res;
2889 if ((comp_res = compare_tree (DR_STEP (a1.dr), DR_STEP (b1.dr))) != 0)
2890 return comp_res;
2891 if ((comp_res = compare_tree (DR_STEP (a2.dr), DR_STEP (b2.dr))) != 0)
2892 return comp_res;
2893 if ((comp_res = compare_tree (DR_OFFSET (a1.dr), DR_OFFSET (b1.dr))) != 0)
2894 return comp_res;
2895 if ((comp_res = compare_tree (DR_INIT (a1.dr), DR_INIT (b1.dr))) != 0)
2896 return comp_res;
2897 if ((comp_res = compare_tree (DR_OFFSET (a2.dr), DR_OFFSET (b2.dr))) != 0)
2898 return comp_res;
2899 if ((comp_res = compare_tree (DR_INIT (a2.dr), DR_INIT (b2.dr))) != 0)
2900 return comp_res;
2902 return 0;
2905 /* Function vect_vfa_segment_size.
2907 Create an expression that computes the size of segment
2908 that will be accessed for a data reference. The functions takes into
2909 account that realignment loads may access one more vector.
2911 Input:
2912 DR: The data reference.
2913 LENGTH_FACTOR: segment length to consider.
2915 Return an expression whose value is the size of segment which will be
2916 accessed by DR. */
2918 static tree
2919 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
2921 tree segment_length;
2923 if (integer_zerop (DR_STEP (dr)))
2924 segment_length = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
2925 else
2926 segment_length = size_binop (MULT_EXPR,
2927 fold_convert (sizetype, DR_STEP (dr)),
2928 fold_convert (sizetype, length_factor));
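  /* E.g. (illustrative): a constant 4-byte DR_STEP with LENGTH_FACTOR equal
     to a vectorization factor of 4 yields a 16-byte segment; the realignment
     case below then adds one extra vector size on top of that.  */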
2930 if (vect_supportable_dr_alignment (dr, false)
2931 == dr_explicit_realign_optimized)
2933 tree vector_size = TYPE_SIZE_UNIT
2934 (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))));
2936 segment_length = size_binop (PLUS_EXPR, segment_length, vector_size);
2938 return segment_length;
2941 /* Function vect_prune_runtime_alias_test_list.
2943 Prune a list of ddrs to be tested at run-time by versioning for alias.
2944 Merge several alias checks into one if possible.
2945 Return FALSE if the resulting list of ddrs is longer than allowed by
2946 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
2948 bool
2949 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
2951 vec<ddr_p> may_alias_ddrs =
2952 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
2953 vec<dr_with_seg_len_pair_t>& comp_alias_ddrs =
2954 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
2955 int vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2956 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
2958 ddr_p ddr;
2959 unsigned int i;
2960 tree length_factor;
2962 if (dump_enabled_p ())
2963 dump_printf_loc (MSG_NOTE, vect_location,
2964 "=== vect_prune_runtime_alias_test_list ===\n");
2966 if (may_alias_ddrs.is_empty ())
2967 return true;
2969 /* Basically, for each pair of dependent data refs store_ptr_0
2970 and load_ptr_0, we create an expression:
2972 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
2973 || (load_ptr_0 + load_segment_length_0) <= store_ptr_0))
2975 for aliasing checks. However, in some cases we can decrease
2976 the number of checks by combining two checks into one. For
2977 example, suppose we have another pair of data refs store_ptr_0
2978 and load_ptr_1, and if the following condition is satisfied:
2980 load_ptr_0 < load_ptr_1 &&
2981 load_ptr_1 - load_ptr_0 - load_segment_length_0 < store_segment_length_0
2983 (this condition means that, in each iteration of the vectorized loop,
2984 the accessed memory of store_ptr_0 cannot be between the memory
2985 of load_ptr_0 and load_ptr_1.)
2987 we can then use only the following expression to finish the
2988 aliasing checks between store_ptr_0 & load_ptr_0 and
2989 store_ptr_0 & load_ptr_1:
2991 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
2992 || (load_ptr_1 + load_segment_length_1 <= store_ptr_0))
2994 Note that we only consider the case where load_ptr_0 and load_ptr_1
2995 have the same base address. */
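  /* As a concrete (hypothetical) illustration: if load_ptr_0 is &a (byte
     offset 0), load_ptr_1 is &a + 16, and each load has a 16-byte segment,
     then the 16-byte distance does not exceed load_ptr_0's segment length,
     so the two checks collapse into one check of store_ptr_0 against the
     32-byte range starting at &a.  */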
2997 comp_alias_ddrs.create (may_alias_ddrs.length ());
2999 /* First, we collect all data ref pairs for aliasing checks. */
3000 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3002 int comp_res;
3003 struct data_reference *dr_a, *dr_b;
3004 gimple *dr_group_first_a, *dr_group_first_b;
3005 tree segment_length_a, segment_length_b;
3006 gimple *stmt_a, *stmt_b;
3008 dr_a = DDR_A (ddr);
3009 stmt_a = DR_STMT (DDR_A (ddr));
3010 dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
3011 if (dr_group_first_a)
3013 stmt_a = dr_group_first_a;
3014 dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
3017 dr_b = DDR_B (ddr);
3018 stmt_b = DR_STMT (DDR_B (ddr));
3019 dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
3020 if (dr_group_first_b)
3022 stmt_b = dr_group_first_b;
3023 dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
3026 if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
3027 length_factor = scalar_loop_iters;
3028 else
3029 length_factor = size_int (vect_factor);
3030 segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
3031 segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
3033 dr_with_seg_len_pair_t dr_with_seg_len_pair
3034 (dr_with_seg_len (dr_a, segment_length_a),
3035 dr_with_seg_len (dr_b, segment_length_b));
3037 /* Canonicalize pairs by sorting the two DR members. */
3038 comp_res = compare_tree (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b));
3039 if (comp_res > 0
3040 || (comp_res == 0
3041 && compare_tree (DR_OFFSET (dr_a), DR_OFFSET (dr_b)) > 0))
3042 std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
3044 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3047 /* Second, we sort the collected data ref pairs so that we can scan
3048 them once to combine all possible aliasing checks. */
3049 comp_alias_ddrs.qsort (comp_dr_with_seg_len_pair);
3051 /* Third, we scan the sorted dr pairs and check if we can combine
3052 alias checks of two neighboring dr pairs. */
3053 for (size_t i = 1; i < comp_alias_ddrs.length (); ++i)
3055 /* Deal with two ddrs (dr_a1, dr_b1) and (dr_a2, dr_b2). */
3056 dr_with_seg_len *dr_a1 = &comp_alias_ddrs[i-1].first,
3057 *dr_b1 = &comp_alias_ddrs[i-1].second,
3058 *dr_a2 = &comp_alias_ddrs[i].first,
3059 *dr_b2 = &comp_alias_ddrs[i].second;
3061 /* Remove duplicate data ref pairs. */
3062 if (*dr_a1 == *dr_a2 && *dr_b1 == *dr_b2)
3064 if (dump_enabled_p ())
3066 dump_printf_loc (MSG_NOTE, vect_location,
3067 "found equal ranges ");
3068 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3069 DR_REF (dr_a1->dr));
3070 dump_printf (MSG_NOTE, ", ");
3071 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3072 DR_REF (dr_b1->dr));
3073 dump_printf (MSG_NOTE, " and ");
3074 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3075 DR_REF (dr_a2->dr));
3076 dump_printf (MSG_NOTE, ", ");
3077 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3078 DR_REF (dr_b2->dr));
3079 dump_printf (MSG_NOTE, "\n");
3082 comp_alias_ddrs.ordered_remove (i--);
3083 continue;
3086 if (*dr_a1 == *dr_a2 || *dr_b1 == *dr_b2)
3088 /* We consider the case where DR_B1 and DR_B2 are the same memref,
3089 and DR_A1 and DR_A2 are two consecutive memrefs. */
3090 if (*dr_a1 == *dr_a2)
3092 std::swap (dr_a1, dr_b1);
3093 std::swap (dr_a2, dr_b2);
3096 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a1->dr),
3097 DR_BASE_ADDRESS (dr_a2->dr), 0)
3098 || !operand_equal_p (DR_OFFSET (dr_a1->dr),
3099 DR_OFFSET (dr_a2->dr), 0)
3100 || !tree_fits_shwi_p (DR_INIT (dr_a1->dr))
3101 || !tree_fits_shwi_p (DR_INIT (dr_a2->dr)))
3102 continue;
3104 /* Make sure dr_a1 starts left of dr_a2. */
3105 if (tree_int_cst_lt (DR_INIT (dr_a2->dr), DR_INIT (dr_a1->dr)))
3106 std::swap (*dr_a1, *dr_a2);
3108 bool do_remove = false;
3109 unsigned HOST_WIDE_INT diff
3110 = (tree_to_shwi (DR_INIT (dr_a2->dr))
3111 - tree_to_shwi (DR_INIT (dr_a1->dr)));
3113 /* If the left segment does not extend beyond the start of the
3114 right segment the new segment length is that of the right
3115 plus the segment distance. */
3116 if (tree_fits_uhwi_p (dr_a1->seg_len)
3117 && compare_tree_int (dr_a1->seg_len, diff) <= 0)
3119 dr_a1->seg_len = size_binop (PLUS_EXPR, dr_a2->seg_len,
3120 size_int (diff));
3121 do_remove = true;
3123 /* Generally the new segment length is the maximum of the
3124 left segment size and the right segment size plus the distance.
3125 ??? We can also build tree MAX_EXPR here but it's not clear this
3126 is profitable. */
3127 else if (tree_fits_uhwi_p (dr_a1->seg_len)
3128 && tree_fits_uhwi_p (dr_a2->seg_len))
3130 unsigned HOST_WIDE_INT seg_len_a1 = tree_to_uhwi (dr_a1->seg_len);
3131 unsigned HOST_WIDE_INT seg_len_a2 = tree_to_uhwi (dr_a2->seg_len);
3132 dr_a1->seg_len = size_int (MAX (seg_len_a1, diff + seg_len_a2));
3133 do_remove = true;
3135 /* Now we check if the following condition is satisfied:
3137 DIFF - SEGMENT_LENGTH_A < SEGMENT_LENGTH_B
3139 where DIFF = DR_A2_INIT - DR_A1_INIT. However,
3140 SEGMENT_LENGTH_A or SEGMENT_LENGTH_B may not be constant so we
3141 have to make a best estimation. We can get the minimum value
3142 of SEGMENT_LENGTH_B as a constant, represented by MIN_SEG_LEN_B,
3143 then either of the following two conditions can guarantee the
3144 one above:
3146 1: DIFF <= MIN_SEG_LEN_B
3147 2: DIFF - SEGMENT_LENGTH_A < MIN_SEG_LEN_B */
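          /* For instance (hypothetical values): with DIFF == 8 and a
             constant MIN_SEG_LEN_B == 16, condition 1 above holds
             (8 <= 16), so the two checks are merged by growing dr_a1's
             segment to dr_a2's segment length plus 8 bytes.  */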
3148 else
3150 unsigned HOST_WIDE_INT min_seg_len_b
3151 = (tree_fits_uhwi_p (dr_b1->seg_len)
3152 ? tree_to_uhwi (dr_b1->seg_len)
3153 : vect_factor);
3155 if (diff <= min_seg_len_b
3156 || (tree_fits_uhwi_p (dr_a1->seg_len)
3157 && diff - tree_to_uhwi (dr_a1->seg_len) < min_seg_len_b))
3159 dr_a1->seg_len = size_binop (PLUS_EXPR,
3160 dr_a2->seg_len, size_int (diff));
3161 do_remove = true;
3165 if (do_remove)
3167 if (dump_enabled_p ())
3169 dump_printf_loc (MSG_NOTE, vect_location,
3170 "merging ranges for ");
3171 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a1->dr));
3172 dump_printf (MSG_NOTE, ", ");
3173 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b1->dr));
3174 dump_printf (MSG_NOTE, " and ");
3175 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a2->dr));
3176 dump_printf (MSG_NOTE, ", ");
3177 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b2->dr));
3178 dump_printf (MSG_NOTE, "\n");
3180 comp_alias_ddrs.ordered_remove (i--);
3185 dump_printf_loc (MSG_NOTE, vect_location,
3186 "improved number of alias checks from %d to %d\n",
3187 may_alias_ddrs.length (), comp_alias_ddrs.length ());
3188 if ((int) comp_alias_ddrs.length () >
3189 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
3190 return false;
3192 return true;
3195 /* Return true if a non-affine read or write in STMT is suitable for a
3196 gather load or scatter store. Describe the operation in *INFO if so. */
3198 bool
3199 vect_check_gather_scatter (gimple *stmt, loop_vec_info loop_vinfo,
3200 gather_scatter_info *info)
3202 HOST_WIDE_INT scale = 1, pbitpos, pbitsize;
3203 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3204 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3205 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3206 tree offtype = NULL_TREE;
3207 tree decl, base, off;
3208 machine_mode pmode;
3209 int punsignedp, reversep, pvolatilep = 0;
3211 base = DR_REF (dr);
3212 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3213 see if we can use the def stmt of the address. */
3214 if (is_gimple_call (stmt)
3215 && gimple_call_internal_p (stmt)
3216 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
3217 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
3218 && TREE_CODE (base) == MEM_REF
3219 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3220 && integer_zerop (TREE_OPERAND (base, 1))
3221 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3223 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3224 if (is_gimple_assign (def_stmt)
3225 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3226 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3229 /* The gather and scatter builtins need address of the form
3230 loop_invariant + vector * {1, 2, 4, 8}, or
3232 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3233 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3234 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3235 multiplications and additions in it. To get a vector, we need
3236 a single SSA_NAME that will be defined in the loop and will
3237 contain everything that is not loop invariant and that can be
3238 vectorized. The following code attempts to find such a preexisting
3239 SSA_NAME OFF and put the loop invariants into a tree BASE
3240 that can be gimplified before the loop. */
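  /* E.g. (illustrative): for a gather-style read a[b[i]] with 4-byte
     elements, BASE typically ends up as the invariant address of a, OFF as
     the SSA_NAME holding b[i], and SCALE as 4, matching the
     loop_invariant + vector * 4 form above.  */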
3241 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3242 &punsignedp, &reversep, &pvolatilep);
3243 gcc_assert (base && (pbitpos % BITS_PER_UNIT) == 0 && !reversep);
3245 if (TREE_CODE (base) == MEM_REF)
3247 if (!integer_zerop (TREE_OPERAND (base, 1)))
3249 if (off == NULL_TREE)
3251 offset_int moff = mem_ref_offset (base);
3252 off = wide_int_to_tree (sizetype, moff);
3254 else
3255 off = size_binop (PLUS_EXPR, off,
3256 fold_convert (sizetype, TREE_OPERAND (base, 1)));
3258 base = TREE_OPERAND (base, 0);
3260 else
3261 base = build_fold_addr_expr (base);
3263 if (off == NULL_TREE)
3264 off = size_zero_node;
3266 /* If base is not loop invariant, then either off is 0, in which case we
3267 start with just the constant offset in the loop invariant BASE and
3268 continue with base as OFF, or we give up.
3269 We could handle the latter case by gimplifying the addition of base + off
3270 into some SSA_NAME and using that as off, but for now punt. */
3271 if (!expr_invariant_in_loop_p (loop, base))
3273 if (!integer_zerop (off))
3274 return false;
3275 off = base;
3276 base = size_int (pbitpos / BITS_PER_UNIT);
3278 /* Otherwise put base + constant offset into the loop invariant BASE
3279 and continue with OFF. */
3280 else
3282 base = fold_convert (sizetype, base);
3283 base = size_binop (PLUS_EXPR, base, size_int (pbitpos / BITS_PER_UNIT));
3286 /* OFF at this point may be either a SSA_NAME or some tree expression
3287 from get_inner_reference. Try to peel off loop invariants from it
3288 into BASE as long as possible. */
3289 STRIP_NOPS (off);
3290 while (offtype == NULL_TREE)
3292 enum tree_code code;
3293 tree op0, op1, add = NULL_TREE;
3295 if (TREE_CODE (off) == SSA_NAME)
3297 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3299 if (expr_invariant_in_loop_p (loop, off))
3300 return false;
3302 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3303 break;
3305 op0 = gimple_assign_rhs1 (def_stmt);
3306 code = gimple_assign_rhs_code (def_stmt);
3307 op1 = gimple_assign_rhs2 (def_stmt);
3309 else
3311 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3312 return false;
3313 code = TREE_CODE (off);
3314 extract_ops_from_tree (off, &code, &op0, &op1);
3316 switch (code)
3318 case POINTER_PLUS_EXPR:
3319 case PLUS_EXPR:
3320 if (expr_invariant_in_loop_p (loop, op0))
3322 add = op0;
3323 off = op1;
3324 do_add:
3325 add = fold_convert (sizetype, add);
3326 if (scale != 1)
3327 add = size_binop (MULT_EXPR, add, size_int (scale));
3328 base = size_binop (PLUS_EXPR, base, add);
3329 continue;
3331 if (expr_invariant_in_loop_p (loop, op1))
3333 add = op1;
3334 off = op0;
3335 goto do_add;
3337 break;
3338 case MINUS_EXPR:
3339 if (expr_invariant_in_loop_p (loop, op1))
3341 add = fold_convert (sizetype, op1);
3342 add = size_binop (MINUS_EXPR, size_zero_node, add);
3343 off = op0;
3344 goto do_add;
3346 break;
3347 case MULT_EXPR:
3348 if (scale == 1 && tree_fits_shwi_p (op1))
3350 scale = tree_to_shwi (op1);
3351 off = op0;
3352 continue;
3354 break;
3355 case SSA_NAME:
3356 off = op0;
3357 continue;
3358 CASE_CONVERT:
3359 if (!POINTER_TYPE_P (TREE_TYPE (op0))
3360 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3361 break;
3362 if (TYPE_PRECISION (TREE_TYPE (op0))
3363 == TYPE_PRECISION (TREE_TYPE (off)))
3365 off = op0;
3366 continue;
3368 if (TYPE_PRECISION (TREE_TYPE (op0))
3369 < TYPE_PRECISION (TREE_TYPE (off)))
3371 off = op0;
3372 offtype = TREE_TYPE (off);
3373 STRIP_NOPS (off);
3374 continue;
3376 break;
3377 default:
3378 break;
3380 break;
3383 /* If at the end OFF still isn't an SSA_NAME or isn't
3384 defined in the loop, punt. */
3385 if (TREE_CODE (off) != SSA_NAME
3386 || expr_invariant_in_loop_p (loop, off))
3387 return false;
3389 if (offtype == NULL_TREE)
3390 offtype = TREE_TYPE (off);
3392 if (DR_IS_READ (dr))
3393 decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info),
3394 offtype, scale);
3395 else
3396 decl = targetm.vectorize.builtin_scatter (STMT_VINFO_VECTYPE (stmt_info),
3397 offtype, scale);
3399 if (decl == NULL_TREE)
3400 return false;
3402 info->decl = decl;
3403 info->base = base;
3404 info->offset = off;
3405 info->offset_dt = vect_unknown_def_type;
3406 info->offset_vectype = NULL_TREE;
3407 info->scale = scale;
3408 return true;
3411 /* Function vect_analyze_data_refs.
3413 Find all the data references in the loop or basic block.
3415 The general structure of the analysis of data refs in the vectorizer is as
3416 follows:
3417 1- vect_analyze_data_refs(loop/bb): call
3418 compute_data_dependences_for_loop/bb to find and analyze all data-refs
3419 in the loop/bb and their dependences.
3420 2- vect_analyze_dependences(): apply dependence testing using ddrs.
3421 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
3422 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
3426 bool
3427 vect_analyze_data_refs (vec_info *vinfo, int *min_vf)
3429 struct loop *loop = NULL;
3430 unsigned int i;
3431 struct data_reference *dr;
3432 tree scalar_type;
3434 if (dump_enabled_p ())
3435 dump_printf_loc (MSG_NOTE, vect_location,
3436 "=== vect_analyze_data_refs ===\n");
3438 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3439 loop = LOOP_VINFO_LOOP (loop_vinfo);
3441 /* Go through the data-refs, check that the analysis succeeded. Update
3442 pointer from stmt_vec_info struct to DR and vectype. */
3444 vec<data_reference_p> datarefs = vinfo->datarefs;
3445 FOR_EACH_VEC_ELT (datarefs, i, dr)
3447 gimple *stmt;
3448 stmt_vec_info stmt_info;
3449 tree base, offset, init;
3450 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
3451 bool simd_lane_access = false;
3452 int vf;
3454 again:
3455 if (!dr || !DR_REF (dr))
3457 if (dump_enabled_p ())
3458 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3459 "not vectorized: unhandled data-ref\n");
3460 return false;
3463 stmt = DR_STMT (dr);
3464 stmt_info = vinfo_for_stmt (stmt);
3466 /* Discard clobbers from the dataref vector. We will remove
3467 clobber stmts during vectorization. */
3468 if (gimple_clobber_p (stmt))
3470 free_data_ref (dr);
3471 if (i == datarefs.length () - 1)
3473 datarefs.pop ();
3474 break;
3476 datarefs.ordered_remove (i);
3477 dr = datarefs[i];
3478 goto again;
3481 /* Check that analysis of the data-ref succeeded. */
3482 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
3483 || !DR_STEP (dr))
3485 bool maybe_gather
3486 = DR_IS_READ (dr)
3487 && !TREE_THIS_VOLATILE (DR_REF (dr))
3488 && targetm.vectorize.builtin_gather != NULL;
3489 bool maybe_scatter
3490 = DR_IS_WRITE (dr)
3491 && !TREE_THIS_VOLATILE (DR_REF (dr))
3492 && targetm.vectorize.builtin_scatter != NULL;
3493 bool maybe_simd_lane_access
3494 = is_a <loop_vec_info> (vinfo) && loop->simduid;
3496 /* If the target supports vector gather loads or scatter stores, or if
3497 this might be a SIMD lane access, see whether they can be used. */
3498 if (is_a <loop_vec_info> (vinfo)
3499 && (maybe_gather || maybe_scatter || maybe_simd_lane_access)
3500 && !nested_in_vect_loop_p (loop, stmt))
3502 struct data_reference *newdr
3503 = create_data_ref (NULL, loop_containing_stmt (stmt),
3504 DR_REF (dr), stmt, maybe_scatter ? false : true);
3505 gcc_assert (newdr != NULL && DR_REF (newdr));
3506 if (DR_BASE_ADDRESS (newdr)
3507 && DR_OFFSET (newdr)
3508 && DR_INIT (newdr)
3509 && DR_STEP (newdr)
3510 && integer_zerop (DR_STEP (newdr)))
3512 if (maybe_simd_lane_access)
3514 tree off = DR_OFFSET (newdr);
3515 STRIP_NOPS (off);
3516 if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
3517 && TREE_CODE (off) == MULT_EXPR
3518 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
3520 tree step = TREE_OPERAND (off, 1);
3521 off = TREE_OPERAND (off, 0);
3522 STRIP_NOPS (off);
3523 if (CONVERT_EXPR_P (off)
3524 && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
3525 0)))
3526 < TYPE_PRECISION (TREE_TYPE (off)))
3527 off = TREE_OPERAND (off, 0);
3528 if (TREE_CODE (off) == SSA_NAME)
3530 gimple *def = SSA_NAME_DEF_STMT (off);
3531 tree reft = TREE_TYPE (DR_REF (newdr));
3532 if (is_gimple_call (def)
3533 && gimple_call_internal_p (def)
3534 && (gimple_call_internal_fn (def)
3535 == IFN_GOMP_SIMD_LANE))
3537 tree arg = gimple_call_arg (def, 0);
3538 gcc_assert (TREE_CODE (arg) == SSA_NAME);
3539 arg = SSA_NAME_VAR (arg);
3540 if (arg == loop->simduid
3541 /* For now. */
3542 && tree_int_cst_equal
3543 (TYPE_SIZE_UNIT (reft),
3544 step))
3546 DR_OFFSET (newdr) = ssize_int (0);
3547 DR_STEP (newdr) = step;
3548 DR_ALIGNED_TO (newdr)
3549 = size_int (BIGGEST_ALIGNMENT);
3550 dr = newdr;
3551 simd_lane_access = true;
3557 if (!simd_lane_access && (maybe_gather || maybe_scatter))
3559 dr = newdr;
3560 if (maybe_gather)
3561 gatherscatter = GATHER;
3562 else
3563 gatherscatter = SCATTER;
3566 if (gatherscatter == SG_NONE && !simd_lane_access)
3567 free_data_ref (newdr);
3570 if (gatherscatter == SG_NONE && !simd_lane_access)
3572 if (dump_enabled_p ())
3574 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3575 "not vectorized: data ref analysis "
3576 "failed ");
3577 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3580 if (is_a <bb_vec_info> (vinfo))
3581 break;
3583 return false;
3587 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
3589 if (dump_enabled_p ())
3590 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3591 "not vectorized: base addr of dr is a "
3592 "constant\n");
3594 if (is_a <bb_vec_info> (vinfo))
3595 break;
3597 if (gatherscatter != SG_NONE || simd_lane_access)
3598 free_data_ref (dr);
3599 return false;
3602 if (TREE_THIS_VOLATILE (DR_REF (dr)))
3604 if (dump_enabled_p ())
3606 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3607 "not vectorized: volatile type ");
3608 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3611 if (is_a <bb_vec_info> (vinfo))
3612 break;
3614 return false;
3617 if (stmt_can_throw_internal (stmt))
3619 if (dump_enabled_p ())
3621 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3622 "not vectorized: statement can throw an "
3623 "exception ");
3624 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3627 if (is_a <bb_vec_info> (vinfo))
3628 break;
3630 if (gatherscatter != SG_NONE || simd_lane_access)
3631 free_data_ref (dr);
3632 return false;
3635 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
3636 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
3638 if (dump_enabled_p ())
3640 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3641 "not vectorized: statement is bitfield "
3642 "access ");
3643 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3646 if (is_a <bb_vec_info> (vinfo))
3647 break;
3649 if (gatherscatter != SG_NONE || simd_lane_access)
3650 free_data_ref (dr);
3651 return false;
3654 base = unshare_expr (DR_BASE_ADDRESS (dr));
3655 offset = unshare_expr (DR_OFFSET (dr));
3656 init = unshare_expr (DR_INIT (dr));
3658 if (is_gimple_call (stmt)
3659 && (!gimple_call_internal_p (stmt)
3660 || (gimple_call_internal_fn (stmt) != IFN_MASK_LOAD
3661 && gimple_call_internal_fn (stmt) != IFN_MASK_STORE)))
3663 if (dump_enabled_p ())
3665 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3666 "not vectorized: dr in a call ");
3667 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3670 if (is_a <bb_vec_info> (vinfo))
3671 break;
3673 if (gatherscatter != SG_NONE || simd_lane_access)
3674 free_data_ref (dr);
3675 return false;
3678 /* Update DR field in stmt_vec_info struct. */
3680 /* If the dataref is in an inner-loop of the loop that is considered for
3681 vectorization, we also want to analyze the access relative to
3682 the outer-loop (DR contains information only relative to the
3683 inner-most enclosing loop). We do that by building a reference to the
3684 first location accessed by the inner-loop, and analyzing it relative to
3685 the outer-loop. */
3686 if (loop && nested_in_vect_loop_p (loop, stmt))
3688 tree outer_step, outer_base, outer_init;
3689 HOST_WIDE_INT pbitsize, pbitpos;
3690 tree poffset;
3691 machine_mode pmode;
3692 int punsignedp, preversep, pvolatilep;
3693 affine_iv base_iv, offset_iv;
3694 tree dinit;
3696 /* Build a reference to the first location accessed by the
3697 inner-loop: *(BASE+INIT). (The first location is actually
3698 BASE+INIT+OFFSET, but we add OFFSET separately later). */
3699 tree inner_base = build_fold_indirect_ref
3700 (fold_build_pointer_plus (base, init));
3702 if (dump_enabled_p ())
3704 dump_printf_loc (MSG_NOTE, vect_location,
3705 "analyze in outer-loop: ");
3706 dump_generic_expr (MSG_NOTE, TDF_SLIM, inner_base);
3707 dump_printf (MSG_NOTE, "\n");
3710 outer_base = get_inner_reference (inner_base, &pbitsize, &pbitpos,
3711 &poffset, &pmode, &punsignedp,
3712 &preversep, &pvolatilep);
3713 gcc_assert (outer_base != NULL_TREE);
3715 if (pbitpos % BITS_PER_UNIT != 0)
3717 if (dump_enabled_p ())
3718 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3719 "failed: bit offset alignment.\n");
3720 return false;
3723 if (preversep)
3725 if (dump_enabled_p ())
3726 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3727 "failed: reverse storage order.\n");
3728 return false;
3731 outer_base = build_fold_addr_expr (outer_base);
3732 if (!simple_iv (loop, loop_containing_stmt (stmt), outer_base,
3733 &base_iv, false))
3735 if (dump_enabled_p ())
3736 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3737 "failed: evolution of base is not affine.\n");
3738 return false;
3741 if (offset)
3743 if (poffset)
3744 poffset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), offset,
3745 poffset);
3746 else
3747 poffset = offset;
3750 if (!poffset)
3752 offset_iv.base = ssize_int (0);
3753 offset_iv.step = ssize_int (0);
3755 else if (!simple_iv (loop, loop_containing_stmt (stmt), poffset,
3756 &offset_iv, false))
3758 if (dump_enabled_p ())
3759 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3760 "evolution of offset is not affine.\n");
3761 return false;
3764 outer_init = ssize_int (pbitpos / BITS_PER_UNIT);
3765 split_constant_offset (base_iv.base, &base_iv.base, &dinit);
3766 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3767 split_constant_offset (offset_iv.base, &offset_iv.base, &dinit);
3768 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3770 outer_step = size_binop (PLUS_EXPR,
3771 fold_convert (ssizetype, base_iv.step),
3772 fold_convert (ssizetype, offset_iv.step));
3774 STMT_VINFO_DR_STEP (stmt_info) = outer_step;
3775 /* FIXME: Use canonicalize_base_object_address (base_iv.base); */
3776 STMT_VINFO_DR_BASE_ADDRESS (stmt_info) = base_iv.base;
3777 STMT_VINFO_DR_INIT (stmt_info) = outer_init;
3778 STMT_VINFO_DR_OFFSET (stmt_info) =
3779 fold_convert (ssizetype, offset_iv.base);
3780 STMT_VINFO_DR_ALIGNED_TO (stmt_info) =
3781 size_int (highest_pow2_factor (offset_iv.base));
3783 if (dump_enabled_p ())
3785 dump_printf_loc (MSG_NOTE, vect_location,
3786 "\touter base_address: ");
3787 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3788 STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3789 dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
3790 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3791 STMT_VINFO_DR_OFFSET (stmt_info));
3792 dump_printf (MSG_NOTE,
3793 "\n\touter constant offset from base address: ");
3794 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3795 STMT_VINFO_DR_INIT (stmt_info));
3796 dump_printf (MSG_NOTE, "\n\touter step: ");
3797 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3798 STMT_VINFO_DR_STEP (stmt_info));
3799 dump_printf (MSG_NOTE, "\n\touter aligned to: ");
3800 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3801 STMT_VINFO_DR_ALIGNED_TO (stmt_info));
3802 dump_printf (MSG_NOTE, "\n");
3806 if (STMT_VINFO_DATA_REF (stmt_info))
3808 if (dump_enabled_p ())
3810 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3811 "not vectorized: more than one data ref "
3812 "in stmt: ");
3813 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3816 if (is_a <bb_vec_info> (vinfo))
3817 break;
3819 if (gatherscatter != SG_NONE || simd_lane_access)
3820 free_data_ref (dr);
3821 return false;
3824 STMT_VINFO_DATA_REF (stmt_info) = dr;
3825 if (simd_lane_access)
3827 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
3828 free_data_ref (datarefs[i]);
3829 datarefs[i] = dr;
3832 /* Set vectype for STMT. */
3833 scalar_type = TREE_TYPE (DR_REF (dr));
3834 STMT_VINFO_VECTYPE (stmt_info)
3835 = get_vectype_for_scalar_type (scalar_type);
3836 if (!STMT_VINFO_VECTYPE (stmt_info))
3838 if (dump_enabled_p ())
3840 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3841 "not vectorized: no vectype for stmt: ");
3842 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3843 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
3844 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
3845 scalar_type);
3846 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3849 if (is_a <bb_vec_info> (vinfo))
3851 /* No vector type is fine, the ref can still participate
3852 in dependence analysis, we just can't vectorize it. */
3853 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
3854 continue;
3857 if (gatherscatter != SG_NONE || simd_lane_access)
3859 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3860 if (gatherscatter != SG_NONE)
3861 free_data_ref (dr);
3863 return false;
3865 else
3867 if (dump_enabled_p ())
3869 dump_printf_loc (MSG_NOTE, vect_location,
3870 "got vectype for stmt: ");
3871 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3872 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3873 STMT_VINFO_VECTYPE (stmt_info));
3874 dump_printf (MSG_NOTE, "\n");
3878 /* Adjust the minimal vectorization factor according to the
3879 vector type. */
3880 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
3881 if (vf > *min_vf)
3882 *min_vf = vf;
3884 if (gatherscatter != SG_NONE)
3886 gather_scatter_info gs_info;
3887 if (!vect_check_gather_scatter (stmt, as_a <loop_vec_info> (vinfo),
3888 &gs_info)
3889 || !get_vectype_for_scalar_type (TREE_TYPE (gs_info.offset)))
3891 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3892 free_data_ref (dr);
3893 if (dump_enabled_p ())
3895 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3896 (gatherscatter == GATHER) ?
3897 "not vectorized: not suitable for gather "
3898 "load " :
3899 "not vectorized: not suitable for scatter "
3900 "store ");
3901 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3903 return false;
3906 free_data_ref (datarefs[i]);
3907 datarefs[i] = dr;
3908 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
3911 else if (is_a <loop_vec_info> (vinfo)
3912 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
3914 if (nested_in_vect_loop_p (loop, stmt))
3916 if (dump_enabled_p ())
3918 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3919 "not vectorized: not suitable for strided "
3920 "load ");
3921 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3923 return false;
3925 STMT_VINFO_STRIDED_P (stmt_info) = true;
3929 /* If we stopped analysis at the first dataref we could not analyze
3930 when trying to vectorize a basic-block, mark the rest of the datarefs
3931 as not vectorizable and truncate the vector of datarefs. That
3932 avoids spending useless time in analyzing their dependence. */
3933 if (i != datarefs.length ())
3935 gcc_assert (is_a <bb_vec_info> (vinfo));
3936 for (unsigned j = i; j < datarefs.length (); ++j)
3938 data_reference_p dr = datarefs[j];
3939 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
3940 free_data_ref (dr);
3942 datarefs.truncate (i);
3945 return true;
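/* A minimal standalone sketch (editor's illustration, not GCC code): scalar
   loops showing the access shapes the analysis above distinguishes.  The
   function and parameter names are hypothetical.  */

#include <stddef.h>

void
example_access_kinds (double *restrict out, const double *restrict in,
                      const int *restrict idx, size_t n, size_t stride)
{
  for (size_t i = 0; i < n; i++)
    out[i] = in[i];          /* contiguous: DR_STEP is a compile-time constant.  */

  for (size_t i = 0; i < n; i++)
    out[i] = in[idx[i]];     /* gather: per-element indices; usable only if the
                                target provides builtin_gather.  */

  for (size_t i = 0; i < n; i++)
    out[idx[i]] = in[i];     /* scatter: the store-side analogue (builtin_scatter).  */

  for (size_t i = 0; i < n; i++)
    out[i] = in[i * stride]; /* non-constant step: marked STMT_VINFO_STRIDED_P.  */
}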
3949 /* Function vect_get_new_vect_var.
3951 Returns a name for a new variable. The current naming scheme prepends
3952 a prefix determined by VAR_KIND ("vect", "vectp", "stmp" or "mask") to
3953 NAME, if NAME is provided, and otherwise uses the prefix alone as the
3954 variable name. */
3956 tree
3957 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
3959 const char *prefix;
3960 tree new_vect_var;
3962 switch (var_kind)
3964 case vect_simple_var:
3965 prefix = "vect";
3966 break;
3967 case vect_scalar_var:
3968 prefix = "stmp";
3969 break;
3970 case vect_mask_var:
3971 prefix = "mask";
3972 break;
3973 case vect_pointer_var:
3974 prefix = "vectp";
3975 break;
3976 default:
3977 gcc_unreachable ();
3980 if (name)
3982 char* tmp = concat (prefix, "_", name, NULL);
3983 new_vect_var = create_tmp_reg (type, tmp);
3984 free (tmp);
3986 else
3987 new_vect_var = create_tmp_reg (type, prefix);
3989 return new_vect_var;
3992 /* Like vect_get_new_vect_var but return an SSA name. */
3994 tree
3995 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
3997 const char *prefix;
3998 tree new_vect_var;
4000 switch (var_kind)
4002 case vect_simple_var:
4003 prefix = "vect";
4004 break;
4005 case vect_scalar_var:
4006 prefix = "stmp";
4007 break;
4008 case vect_pointer_var:
4009 prefix = "vectp";
4010 break;
4011 default:
4012 gcc_unreachable ();
4015 if (name)
4017 char* tmp = concat (prefix, "_", name, NULL);
4018 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4019 free (tmp);
4021 else
4022 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4024 return new_vect_var;
4027 /* Duplicate ptr info and set alignment/misalignment on NAME from DR. */
4029 static void
4030 vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr,
4031 stmt_vec_info stmt_info)
4033 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr));
4034 unsigned int align = TYPE_ALIGN_UNIT (STMT_VINFO_VECTYPE (stmt_info));
4035 int misalign = DR_MISALIGNMENT (dr);
4036 if (misalign == -1)
4037 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4038 else
4039 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name), align, misalign);
4042 /* Function vect_create_addr_base_for_vector_ref.
4044 Create an expression that computes the address of the first memory location
4045 that will be accessed for a data reference.
4047 Input:
4048 STMT: The statement containing the data reference.
4049 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4050 OFFSET: Optional. If supplied, it is added to the initial address.
4051 LOOP: Specify relative to which loop-nest should the address be computed.
4052 For example, when the dataref is in an inner-loop nested in an
4053 outer-loop that is now being vectorized, LOOP can be either the
4054 outer-loop, or the inner-loop. The first memory location accessed
4055 by the following dataref ('in' points to short):
4057 for (i=0; i<N; i++)
4058 for (j=0; j<M; j++)
4059 s += in[i+j]
4061 is as follows:
4062 if LOOP=i_loop: &in (relative to i_loop)
4063 if LOOP=j_loop: &in+i*2B (relative to j_loop)
4064 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
4065 initial address. Unlike OFFSET, which is the number of elements to
4066 be added, BYTE_OFFSET is measured in bytes.
4068 Output:
4069 1. Return an SSA_NAME whose value is the address of the memory location of
4070 the first vector of the data reference.
4071 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4072 these statement(s) which define the returned SSA_NAME.
4074 FORNOW: We are only handling array accesses with step 1. */
4076 tree
4077 vect_create_addr_base_for_vector_ref (gimple *stmt,
4078 gimple_seq *new_stmt_list,
4079 tree offset,
4080 struct loop *loop,
4081 tree byte_offset)
4083 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4084 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4085 tree data_ref_base;
4086 const char *base_name;
4087 tree addr_base;
4088 tree dest;
4089 gimple_seq seq = NULL;
4090 tree base_offset;
4091 tree init;
4092 tree vect_ptr_type;
4093 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
4094 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4096 if (loop_vinfo && loop && loop != (gimple_bb (stmt))->loop_father)
4098 struct loop *outer_loop = LOOP_VINFO_LOOP (loop_vinfo);
4100 gcc_assert (nested_in_vect_loop_p (outer_loop, stmt));
4102 data_ref_base = unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
4103 base_offset = unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info));
4104 init = unshare_expr (STMT_VINFO_DR_INIT (stmt_info));
4106 else
4108 data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr));
4109 base_offset = unshare_expr (DR_OFFSET (dr));
4110 init = unshare_expr (DR_INIT (dr));
4113 if (loop_vinfo)
4114 base_name = get_name (data_ref_base);
4115 else
4117 base_offset = ssize_int (0);
4118 init = ssize_int (0);
4119 base_name = get_name (DR_REF (dr));
4122 /* Create base_offset */
4123 base_offset = size_binop (PLUS_EXPR,
4124 fold_convert (sizetype, base_offset),
4125 fold_convert (sizetype, init));
4127 if (offset)
4129 offset = fold_build2 (MULT_EXPR, sizetype,
4130 fold_convert (sizetype, offset), step);
4131 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4132 base_offset, offset);
4134 if (byte_offset)
4136 byte_offset = fold_convert (sizetype, byte_offset);
4137 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4138 base_offset, byte_offset);
4141 /* base + base_offset */
4142 if (loop_vinfo)
4143 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4144 else
4146 addr_base = build1 (ADDR_EXPR,
4147 build_pointer_type (TREE_TYPE (DR_REF (dr))),
4148 unshare_expr (DR_REF (dr)));
4151 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4152 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4153 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4154 gimple_seq_add_seq (new_stmt_list, seq);
4156 if (DR_PTR_INFO (dr)
4157 && TREE_CODE (addr_base) == SSA_NAME
4158 && !SSA_NAME_PTR_INFO (addr_base))
4160 vect_duplicate_ssa_name_ptr_info (addr_base, dr, stmt_info);
4161 if (offset || byte_offset)
4162 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4165 if (dump_enabled_p ())
4167 dump_printf_loc (MSG_NOTE, vect_location, "created ");
4168 dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
4169 dump_printf (MSG_NOTE, "\n");
4172 return addr_base;
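/* A minimal standalone sketch (editor's illustration, not GCC code): the
   address arithmetic emitted above for the loop_vinfo case, written out
   directly.  OFFSET counts elements, BYTE_OFFSET counts bytes; all names
   here are hypothetical.  */

#include <stddef.h>

static inline char *
sketch_addr_base (char *dr_base_address, size_t dr_offset, size_t dr_init,
                  size_t elem_size, size_t offset, size_t byte_offset)
{
  /* base_offset = DR_OFFSET + DR_INIT, plus OFFSET scaled by the element
     size, plus BYTE_OFFSET, exactly as built with size_binop above.  */
  size_t base_offset = dr_offset + dr_init
                       + offset * elem_size
                       + byte_offset;
  return dr_base_address + base_offset;   /* addr_base = base + base_offset.  */
}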
4176 /* Function vect_create_data_ref_ptr.
4178 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4179 location accessed in the loop by STMT, along with the def-use update
4180 chain to appropriately advance the pointer through the loop iterations.
4181 Also set aliasing information for the pointer. This pointer is used by
4182 the callers to this function to create a memory reference expression for
4183 vector load/store access.
4185 Input:
4186 1. STMT: a stmt that references memory. Expected to be of the form
4187 GIMPLE_ASSIGN <name, data-ref> or
4188 GIMPLE_ASSIGN <data-ref, name>.
4189 2. AGGR_TYPE: the type of the reference, which should be either a vector
4190 or an array.
4191 3. AT_LOOP: the loop where the vector memref is to be created.
4192 4. OFFSET (optional): an offset to be added to the initial address accessed
4193 by the data-ref in STMT.
4194 5. BSI: location where the new stmts are to be placed if there is no loop
4195 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4196 pointing to the initial address.
4197 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4198 to the initial address accessed by the data-ref in STMT. This is
4199 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4200 in bytes.
4202 Output:
4203 1. Declare a new ptr to vector_type, and have it point to the base of the
4204 data reference (the initial address accessed by the data reference).
4205 For example, for vector of type V8HI, the following code is generated:
4207 v8hi *ap;
4208 ap = (v8hi *)initial_address;
4210 if OFFSET is not supplied:
4211 initial_address = &a[init];
4212 if OFFSET is supplied:
4213 initial_address = &a[init + OFFSET];
4214 if BYTE_OFFSET is supplied:
4215 initial_address = &a[init] + BYTE_OFFSET;
4217 Return the initial_address in INITIAL_ADDRESS.
4219 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4220 update the pointer in each iteration of the loop.
4222 Return the increment stmt that updates the pointer in PTR_INCR.
4224 3. Set INV_P to true if the access pattern of the data reference in the
4225 vectorized loop is invariant. Set it to false otherwise.
4227 4. Return the pointer. */
4229 tree
4230 vect_create_data_ref_ptr (gimple *stmt, tree aggr_type, struct loop *at_loop,
4231 tree offset, tree *initial_address,
4232 gimple_stmt_iterator *gsi, gimple **ptr_incr,
4233 bool only_init, bool *inv_p, tree byte_offset)
4235 const char *base_name;
4236 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4237 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4238 struct loop *loop = NULL;
4239 bool nested_in_vect_loop = false;
4240 struct loop *containing_loop = NULL;
4241 tree aggr_ptr_type;
4242 tree aggr_ptr;
4243 tree new_temp;
4244 gimple_seq new_stmt_list = NULL;
4245 edge pe = NULL;
4246 basic_block new_bb;
4247 tree aggr_ptr_init;
4248 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4249 tree aptr;
4250 gimple_stmt_iterator incr_gsi;
4251 bool insert_after;
4252 tree indx_before_incr, indx_after_incr;
4253 gimple *incr;
4254 tree step;
4255 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4257 gcc_assert (TREE_CODE (aggr_type) == ARRAY_TYPE
4258 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4260 if (loop_vinfo)
4262 loop = LOOP_VINFO_LOOP (loop_vinfo);
4263 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4264 containing_loop = (gimple_bb (stmt))->loop_father;
4265 pe = loop_preheader_edge (loop);
4267 else
4269 gcc_assert (bb_vinfo);
4270 only_init = true;
4271 *ptr_incr = NULL;
4274 /* Check the step (evolution) of the load in LOOP, and record
4275 whether it's invariant. */
4276 if (nested_in_vect_loop)
4277 step = STMT_VINFO_DR_STEP (stmt_info);
4278 else
4279 step = DR_STEP (STMT_VINFO_DATA_REF (stmt_info));
4281 if (integer_zerop (step))
4282 *inv_p = true;
4283 else
4284 *inv_p = false;
4286 /* Create an expression for the first address accessed by this load
4287 in LOOP. */
4288 base_name = get_name (DR_BASE_ADDRESS (dr));
4290 if (dump_enabled_p ())
4292 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4293 dump_printf_loc (MSG_NOTE, vect_location,
4294 "create %s-pointer variable to type: ",
4295 get_tree_code_name (TREE_CODE (aggr_type)));
4296 dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4297 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4298 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4299 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4300 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4301 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4302 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4303 else
4304 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4305 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4306 dump_printf (MSG_NOTE, "\n");
4309 /* (1) Create the new aggregate-pointer variable.
4310 Vector and array types inherit the alias set of their component
4311 type by default, so we need to use a ref-all pointer if the data
4312 reference does not conflict with the created aggregated data
4313 reference because it is not addressable. */
4314 bool need_ref_all = false;
4315 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4316 get_alias_set (DR_REF (dr))))
4317 need_ref_all = true;
4318 /* Likewise for any of the data references in the stmt group. */
4319 else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
4321 gimple *orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
4324 stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
4325 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4326 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4327 get_alias_set (DR_REF (sdr))))
4329 need_ref_all = true;
4330 break;
4332 orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);
4334 while (orig_stmt);
4336 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4337 need_ref_all);
4338 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4341 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4342 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4343 def-use update cycles for the pointer: one relative to the outer-loop
4344 (LOOP), which is what steps (2) and (3) below do. The other is relative
4345 to the inner-loop (which is the inner-most loop containing the dataref),
4346 and this is done by step (4) below.
4348 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4349 inner-most loop, and so steps (2),(3) work the same, and step (4) is
4350 redundant. Steps (2),(3) create the following:
4352 vp0 = &base_addr;
4353 LOOP: vp1 = phi(vp0,vp2)
4356 vp2 = vp1 + step
4357 goto LOOP
4359 If there is an inner-loop nested in loop, then step (4) will also be
4360 applied, and an additional update in the inner-loop will be created:
4362 vp0 = &base_addr;
4363 LOOP: vp1 = phi(vp0,vp2)
4365 inner: vp3 = phi(vp1,vp4)
4366 vp4 = vp3 + inner_step
4367 if () goto inner
4369 vp2 = vp1 + step
4370 if () goto LOOP */
4372 /* (2) Calculate the initial address of the aggregate-pointer, and set
4373 the aggregate-pointer to point to it before the loop. */
4375 /* Create: &(base[init_val+offset]) + byte_offset in the loop preheader. */
4377 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
4378 offset, loop, byte_offset);
4379 if (new_stmt_list)
4381 if (pe)
4383 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4384 gcc_assert (!new_bb);
4386 else
4387 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4390 *initial_address = new_temp;
4391 aggr_ptr_init = new_temp;
4393 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4394 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4395 inner-loop nested in LOOP (during outer-loop vectorization). */
4397 /* No update in loop is required. */
4398 if (only_init && (!loop_vinfo || at_loop == loop))
4399 aptr = aggr_ptr_init;
4400 else
4402 /* The step of the aggregate pointer is the type size. */
4403 tree iv_step = TYPE_SIZE_UNIT (aggr_type);
4404 /* One exception to the above is when the scalar step of the load in
4405 LOOP is zero. In this case the step here is also zero. */
4406 if (*inv_p)
4407 iv_step = size_zero_node;
4408 else if (tree_int_cst_sgn (step) == -1)
4409 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4411 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4413 create_iv (aggr_ptr_init,
4414 fold_convert (aggr_ptr_type, iv_step),
4415 aggr_ptr, loop, &incr_gsi, insert_after,
4416 &indx_before_incr, &indx_after_incr);
4417 incr = gsi_stmt (incr_gsi);
4418 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4420 /* Copy the points-to information if it exists. */
4421 if (DR_PTR_INFO (dr))
4423 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info);
4424 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info);
4426 if (ptr_incr)
4427 *ptr_incr = incr;
4429 aptr = indx_before_incr;
4432 if (!nested_in_vect_loop || only_init)
4433 return aptr;
4436 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4437 nested in LOOP, if it exists. */
4439 gcc_assert (nested_in_vect_loop);
4440 if (!only_init)
4442 standard_iv_increment_position (containing_loop, &incr_gsi,
4443 &insert_after);
4444 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4445 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4446 &indx_after_incr);
4447 incr = gsi_stmt (incr_gsi);
4448 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4450 /* Copy the points-to information if it exists. */
4451 if (DR_PTR_INFO (dr))
4453 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info);
4454 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info);
4456 if (ptr_incr)
4457 *ptr_incr = incr;
4459 return indx_before_incr;
4461 else
4462 gcc_unreachable ();
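/* A minimal standalone sketch (editor's illustration, not GCC code): the
   shape of the pointer induction variable created above, as plain C.  VF4
   stands in for the number of elements per vector; NITERS is assumed to be
   a multiple of VF4.  */

enum { VF4 = 4 };

float
sketch_data_ref_ptr (const float *base, unsigned niters)
{
  float sum = 0.f;
  const float *vp = base;                  /* (2) initial address, set in the preheader.  */
  for (unsigned i = 0; i < niters; i += VF4)
    {
      for (unsigned j = 0; j < VF4; j++)   /* stands in for one vector load from VP.  */
        sum += vp[j];
      vp += VF4;                           /* (3) the IV update: step == sizeof (vectype).  */
    }
  return sum;
}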
4466 /* Function bump_vector_ptr
4468 Increment a pointer (to a vector type) by vector-size. If requested,
4469 i.e. if PTR-INCR is given, then also connect the new increment stmt
4470 to the existing def-use update-chain of the pointer, by modifying
4471 the PTR_INCR as illustrated below:
4473 The pointer def-use update-chain before this function:
4474 DATAREF_PTR = phi (p_0, p_2)
4475 ....
4476 PTR_INCR: p_2 = DATAREF_PTR + step
4478 The pointer def-use update-chain after this function:
4479 DATAREF_PTR = phi (p_0, p_2)
4480 ....
4481 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4482 ....
4483 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4485 Input:
4486 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4487 in the loop.
4488 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4489 the loop. The increment amount across iterations is expected
4490 to be vector_size.
4491 BSI - location where the new update stmt is to be placed.
4492 STMT - the original scalar memory-access stmt that is being vectorized.
4493 BUMP - optional. The offset by which to bump the pointer. If not given,
4494 the offset is assumed to be vector_size.
4496 Output: Return NEW_DATAREF_PTR as illustrated above.
4500 tree
4501 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4502 gimple *stmt, tree bump)
4504 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4505 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4506 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4507 tree update = TYPE_SIZE_UNIT (vectype);
4508 gassign *incr_stmt;
4509 ssa_op_iter iter;
4510 use_operand_p use_p;
4511 tree new_dataref_ptr;
4513 if (bump)
4514 update = bump;
4516 if (TREE_CODE (dataref_ptr) == SSA_NAME)
4517 new_dataref_ptr = copy_ssa_name (dataref_ptr);
4518 else
4519 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4520 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4521 dataref_ptr, update);
4522 vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4524 /* Copy the points-to information if it exists. */
4525 if (DR_PTR_INFO (dr))
4527 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4528 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4531 if (!ptr_incr)
4532 return new_dataref_ptr;
4534 /* Update the vector-pointer's cross-iteration increment. */
4535 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4537 tree use = USE_FROM_PTR (use_p);
4539 if (use == dataref_ptr)
4540 SET_USE (use_p, new_dataref_ptr);
4541 else
4542 gcc_assert (tree_int_cst_compare (use, update) == 0);
4545 return new_dataref_ptr;
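/* A minimal standalone sketch (editor's illustration, not GCC code): bumping
   the dataref pointer mid-iteration, as plain C.  After the bump, the
   cross-iteration increment continues from the bumped pointer, matching the
   rewritten def-use chain pictured above.  NITERS is assumed to be a
   multiple of 2 * VF8.  */

enum { VF8 = 8 };

float
sketch_bump_vector_ptr (const float *p, unsigned niters)
{
  float sum = 0.f;
  for (unsigned i = 0; i < niters; i += 2 * VF8)
    {
      for (unsigned j = 0; j < VF8; j++)
        sum += p[j];                       /* first vector load from DATAREF_PTR.  */
      const float *new_p = p + VF8;        /* NEW_DATAREF_PTR = DATAREF_PTR + BUMP.  */
      for (unsigned j = 0; j < VF8; j++)
        sum += new_p[j];                   /* second vector load, from the bumped pointer.  */
      p = new_p + VF8;                     /* PTR_INCR now advances from NEW_DATAREF_PTR.  */
    }
  return sum;
}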
4549 /* Function vect_create_destination_var.
4551 Create a new temporary of type VECTYPE. */
4553 tree
4554 vect_create_destination_var (tree scalar_dest, tree vectype)
4556 tree vec_dest;
4557 const char *name;
4558 char *new_name;
4559 tree type;
4560 enum vect_var_kind kind;
4562 kind = vectype
4563 ? VECTOR_BOOLEAN_TYPE_P (vectype)
4564 ? vect_mask_var
4565 : vect_simple_var
4566 : vect_scalar_var;
4567 type = vectype ? vectype : TREE_TYPE (scalar_dest);
4569 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
4571 name = get_name (scalar_dest);
4572 if (name)
4573 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
4574 else
4575 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
4576 vec_dest = vect_get_new_vect_var (type, kind, new_name);
4577 free (new_name);
4579 return vec_dest;
4582 /* Function vect_grouped_store_supported.
4584 Returns TRUE if interleave high and interleave low permutations
4585 are supported, and FALSE otherwise. */
4587 bool
4588 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
4590 machine_mode mode = TYPE_MODE (vectype);
4592 /* vect_permute_store_chain requires the group size to be equal to 3 or
4593 be a power of two. */
4594 if (count != 3 && exact_log2 (count) == -1)
4596 if (dump_enabled_p ())
4597 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4598 "the size of the group of accesses"
4599 " is not a power of 2 or not equal to 3\n");
4600 return false;
4603 /* Check that the permutation is supported. */
4604 if (VECTOR_MODE_P (mode))
4606 unsigned int i, nelt = GET_MODE_NUNITS (mode);
4607 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4609 if (count == 3)
4611 unsigned int j0 = 0, j1 = 0, j2 = 0;
4612 unsigned int i, j;
4614 for (j = 0; j < 3; j++)
4616 int nelt0 = ((3 - j) * nelt) % 3;
4617 int nelt1 = ((3 - j) * nelt + 1) % 3;
4618 int nelt2 = ((3 - j) * nelt + 2) % 3;
4619 for (i = 0; i < nelt; i++)
4621 if (3 * i + nelt0 < nelt)
4622 sel[3 * i + nelt0] = j0++;
4623 if (3 * i + nelt1 < nelt)
4624 sel[3 * i + nelt1] = nelt + j1++;
4625 if (3 * i + nelt2 < nelt)
4626 sel[3 * i + nelt2] = 0;
4628 if (!can_vec_perm_p (mode, false, sel))
4630 if (dump_enabled_p ())
4631 dump_printf (MSG_MISSED_OPTIMIZATION,
4632 "permutation op not supported by target.\n");
4633 return false;
4636 for (i = 0; i < nelt; i++)
4638 if (3 * i + nelt0 < nelt)
4639 sel[3 * i + nelt0] = 3 * i + nelt0;
4640 if (3 * i + nelt1 < nelt)
4641 sel[3 * i + nelt1] = 3 * i + nelt1;
4642 if (3 * i + nelt2 < nelt)
4643 sel[3 * i + nelt2] = nelt + j2++;
4645 if (!can_vec_perm_p (mode, false, sel))
4647 if (dump_enabled_p ())
4648 dump_printf (MSG_MISSED_OPTIMIZATION,
4649 "permutation op not supported by target.\n");
4650 return false;
4653 return true;
4655 else
4657 /* If length is not equal to 3 then only power of 2 is supported. */
4658 gcc_assert (exact_log2 (count) != -1);
4660 for (i = 0; i < nelt / 2; i++)
4662 sel[i * 2] = i;
4663 sel[i * 2 + 1] = i + nelt;
4665 if (can_vec_perm_p (mode, false, sel))
4667 for (i = 0; i < nelt; i++)
4668 sel[i] += nelt / 2;
4669 if (can_vec_perm_p (mode, false, sel))
4670 return true;
4675 if (dump_enabled_p ())
4676 dump_printf (MSG_MISSED_OPTIMIZATION,
4677 "permutation op not supported by target.\n");
4678 return false;
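/* A minimal standalone sketch (editor's illustration, not GCC code): the
   interleave-high/low selectors checked above, printed for a hypothetical
   8-element vector mode.  Indices >= NELT8 select from the second input of
   the permutation.  */

#include <stdio.h>

void
sketch_store_selectors (void)
{
  enum { NELT8 = 8 };
  unsigned char sel[NELT8];

  for (unsigned i = 0; i < NELT8 / 2; i++)
    {
      sel[i * 2] = i;                /* element i of the first vector ...  */
      sel[i * 2 + 1] = i + NELT8;    /* ... next to element i of the second.  */
    }
  printf ("interleave_high:");       /* 0 8 1 9 2 10 3 11 */
  for (unsigned i = 0; i < NELT8; i++)
    printf (" %u", sel[i]);

  for (unsigned i = 0; i < NELT8; i++)
    sel[i] += NELT8 / 2;
  printf ("\ninterleave_low: ");     /* 4 12 5 13 6 14 7 15 */
  for (unsigned i = 0; i < NELT8; i++)
    printf (" %u", sel[i]);
  printf ("\n");
}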
4682 /* Return TRUE if vec_store_lanes is available for COUNT vectors of
4683 type VECTYPE. */
4685 bool
4686 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
4688 return vect_lanes_optab_supported_p ("vec_store_lanes",
4689 vec_store_lanes_optab,
4690 vectype, count);
4694 /* Function vect_permute_store_chain.
4696 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4697 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
4698 the data correctly for the stores. Return the final references for stores
4699 in RESULT_CHAIN.
4701 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4702 The input is 4 vectors each containing 8 elements. We assign a number to
4703 each element, the input sequence is:
4705 1st vec: 0 1 2 3 4 5 6 7
4706 2nd vec: 8 9 10 11 12 13 14 15
4707 3rd vec: 16 17 18 19 20 21 22 23
4708 4th vec: 24 25 26 27 28 29 30 31
4710 The output sequence should be:
4712 1st vec: 0 8 16 24 1 9 17 25
4713 2nd vec: 2 10 18 26 3 11 19 27
4714 3rd vec: 4 12 20 28 5 13 21 29
4715 4th vec: 6 14 22 30 7 15 23 31
4717 i.e., we interleave the contents of the four vectors in their order.
4719 We use interleave_high/low instructions to create such output. The input of
4720 each interleave_high/low operation is two vectors:
4721 1st vec 2nd vec
4722 0 1 2 3 4 5 6 7
4723 the even elements of the result vector are obtained left-to-right from the
4724 high/low elements of the first vector. The odd elements of the result are
4725 obtained left-to-right from the high/low elements of the second vector.
4726 The output of interleave_high will be: 0 4 1 5
4727 and of interleave_low: 2 6 3 7
4730 The permutation is done in log LENGTH stages. In each stage interleave_high
4731 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4732 where the first argument is taken from the first half of DR_CHAIN and the
4733 second argument from its second half.
4734 In our example,
4736 I1: interleave_high (1st vec, 3rd vec)
4737 I2: interleave_low (1st vec, 3rd vec)
4738 I3: interleave_high (2nd vec, 4th vec)
4739 I4: interleave_low (2nd vec, 4th vec)
4741 The output for the first stage is:
4743 I1: 0 16 1 17 2 18 3 19
4744 I2: 4 20 5 21 6 22 7 23
4745 I3: 8 24 9 25 10 26 11 27
4746 I4: 12 28 13 29 14 30 15 31
4748 The output of the second stage, i.e. the final result is:
4750 I1: 0 8 16 24 1 9 17 25
4751 I2: 2 10 18 26 3 11 19 27
4752 I3: 4 12 20 28 5 13 21 29
4753 I4: 6 14 22 30 7 15 23 31. */
4755 void
4756 vect_permute_store_chain (vec<tree> dr_chain,
4757 unsigned int length,
4758 gimple *stmt,
4759 gimple_stmt_iterator *gsi,
4760 vec<tree> *result_chain)
4762 tree vect1, vect2, high, low;
4763 gimple *perm_stmt;
4764 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4765 tree perm_mask_low, perm_mask_high;
4766 tree data_ref;
4767 tree perm3_mask_low, perm3_mask_high;
4768 unsigned int i, n, log_length = exact_log2 (length);
4769 unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype);
4770 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4772 result_chain->quick_grow (length);
4773 memcpy (result_chain->address (), dr_chain.address (),
4774 length * sizeof (tree));
4776 if (length == 3)
4778 unsigned int j0 = 0, j1 = 0, j2 = 0;
4780 for (j = 0; j < 3; j++)
4782 int nelt0 = ((3 - j) * nelt) % 3;
4783 int nelt1 = ((3 - j) * nelt + 1) % 3;
4784 int nelt2 = ((3 - j) * nelt + 2) % 3;
4786 for (i = 0; i < nelt; i++)
4788 if (3 * i + nelt0 < nelt)
4789 sel[3 * i + nelt0] = j0++;
4790 if (3 * i + nelt1 < nelt)
4791 sel[3 * i + nelt1] = nelt + j1++;
4792 if (3 * i + nelt2 < nelt)
4793 sel[3 * i + nelt2] = 0;
4795 perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
4797 for (i = 0; i < nelt; i++)
4799 if (3 * i + nelt0 < nelt)
4800 sel[3 * i + nelt0] = 3 * i + nelt0;
4801 if (3 * i + nelt1 < nelt)
4802 sel[3 * i + nelt1] = 3 * i + nelt1;
4803 if (3 * i + nelt2 < nelt)
4804 sel[3 * i + nelt2] = nelt + j2++;
4806 perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
4808 vect1 = dr_chain[0];
4809 vect2 = dr_chain[1];
4811 /* Create interleaving stmt:
4812 low = VEC_PERM_EXPR <vect1, vect2,
4813 {j, nelt, *, j + 1, nelt + j + 1, *,
4814 j + 2, nelt + j + 2, *, ...}> */
4815 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
4816 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4817 vect2, perm3_mask_low);
4818 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4820 vect1 = data_ref;
4821 vect2 = dr_chain[2];
4822 /* Create interleaving stmt:
4823 low = VEC_PERM_EXPR <vect1, vect2,
4824 {0, 1, nelt + j, 3, 4, nelt + j + 1,
4825 6, 7, nelt + j + 2, ...}> */
4826 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
4827 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4828 vect2, perm3_mask_high);
4829 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4830 (*result_chain)[j] = data_ref;
4833 else
4835 /* If length is not equal to 3 then only power of 2 is supported. */
4836 gcc_assert (exact_log2 (length) != -1);
4838 for (i = 0, n = nelt / 2; i < n; i++)
4840 sel[i * 2] = i;
4841 sel[i * 2 + 1] = i + nelt;
4843 perm_mask_high = vect_gen_perm_mask_checked (vectype, sel);
4845 for (i = 0; i < nelt; i++)
4846 sel[i] += nelt / 2;
4847 perm_mask_low = vect_gen_perm_mask_checked (vectype, sel);
4849 for (i = 0, n = log_length; i < n; i++)
4851 for (j = 0; j < length/2; j++)
4853 vect1 = dr_chain[j];
4854 vect2 = dr_chain[j+length/2];
4856 /* Create interleaving stmt:
4857 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
4858 ...}> */
4859 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
4860 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
4861 vect2, perm_mask_high);
4862 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4863 (*result_chain)[2*j] = high;
4865 /* Create interleaving stmt:
4866 low = VEC_PERM_EXPR <vect1, vect2,
4867 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
4868 ...}> */
4869 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
4870 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
4871 vect2, perm_mask_low);
4872 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4873 (*result_chain)[2*j+1] = low;
4875 memcpy (dr_chain.address (), result_chain->address (),
4876 length * sizeof (tree));
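/* A minimal standalone sketch (editor's illustration, not GCC code): the
   two-stage interleave above performed on plain int arrays, so the result
   chain from the comment can be reproduced.  LEN4 and NELT8S are
   hypothetical names.  */

#include <stdio.h>
#include <string.h>

enum { LEN4 = 4, NELT8S = 8 };

static void
sketch_interleave (const int *a, const int *b, int *high, int *low)
{
  for (int i = 0; i < NELT8S / 2; i++)
    {
      high[2 * i] = a[i];                  /* interleave_high pairs the first halves of A and B.  */
      high[2 * i + 1] = b[i];
      low[2 * i] = a[i + NELT8S / 2];      /* interleave_low pairs the second halves.  */
      low[2 * i + 1] = b[i + NELT8S / 2];
    }
}

void
sketch_permute_store_chain (void)
{
  int chain[LEN4][NELT8S], result[LEN4][NELT8S];
  for (int v = 0; v < LEN4; v++)
    for (int i = 0; i < NELT8S; i++)
      chain[v][i] = v * NELT8S + i;        /* 1st vec: 0..7, 2nd vec: 8..15, ...  */

  for (int stage = 0; stage < 2; stage++)  /* log2 (LEN4) stages.  */
    {
      for (int j = 0; j < LEN4 / 2; j++)
        sketch_interleave (chain[j], chain[j + LEN4 / 2],
                           result[2 * j], result[2 * j + 1]);
      memcpy (chain, result, sizeof (chain));
    }

  for (int v = 0; v < LEN4; v++)           /* prints 0 8 16 24 1 9 17 25, etc.  */
    {
      for (int i = 0; i < NELT8S; i++)
        printf (" %2d", chain[v][i]);
      printf ("\n");
    }
}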
4881 /* Function vect_setup_realignment
4883 This function is called when vectorizing an unaligned load using
4884 the dr_explicit_realign[_optimized] scheme.
4885 This function generates the following code at the loop prolog:
4887 p = initial_addr;
4888 x msq_init = *(floor(p)); # prolog load
4889 realignment_token = call target_builtin;
4890 loop:
4891 x msq = phi (msq_init, ---)
4893 The stmts marked with x are generated only for the case of
4894 dr_explicit_realign_optimized.
4896 The code above sets up a new (vector) pointer, pointing to the first
4897 location accessed by STMT, and a "floor-aligned" load using that pointer.
4898 It also generates code to compute the "realignment-token" (if the relevant
4899 target hook was defined), and creates a phi-node at the loop-header bb
4900 whose arguments are the result of the prolog-load (created by this
4901 function) and the result of a load that takes place in the loop (to be
4902 created by the caller to this function).
4904 For the case of dr_explicit_realign_optimized:
4905 The caller to this function uses the phi-result (msq) to create the
4906 realignment code inside the loop, and sets up the missing phi argument,
4907 as follows:
4908 loop:
4909 msq = phi (msq_init, lsq)
4910 lsq = *(floor(p')); # load in loop
4911 result = realign_load (msq, lsq, realignment_token);
4913 For the case of dr_explicit_realign:
4914 loop:
4915 msq = *(floor(p)); # load in loop
4916 p' = p + (VS-1);
4917 lsq = *(floor(p')); # load in loop
4918 result = realign_load (msq, lsq, realignment_token);
4920 Input:
4921 STMT - (scalar) load stmt to be vectorized. This load accesses
4922 a memory location that may be unaligned.
4923 BSI - place where new code is to be inserted.
4924 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
4925 is used.
4927 Output:
4928 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
4929 target hook, if defined.
4930 Return value - the result of the loop-header phi node. */
4932 tree
4933 vect_setup_realignment (gimple *stmt, gimple_stmt_iterator *gsi,
4934 tree *realignment_token,
4935 enum dr_alignment_support alignment_support_scheme,
4936 tree init_addr,
4937 struct loop **at_loop)
4939 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4940 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4941 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4942 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4943 struct loop *loop = NULL;
4944 edge pe = NULL;
4945 tree scalar_dest = gimple_assign_lhs (stmt);
4946 tree vec_dest;
4947 gimple *inc;
4948 tree ptr;
4949 tree data_ref;
4950 basic_block new_bb;
4951 tree msq_init = NULL_TREE;
4952 tree new_temp;
4953 gphi *phi_stmt;
4954 tree msq = NULL_TREE;
4955 gimple_seq stmts = NULL;
4956 bool inv_p;
4957 bool compute_in_loop = false;
4958 bool nested_in_vect_loop = false;
4959 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
4960 struct loop *loop_for_initial_load = NULL;
4962 if (loop_vinfo)
4964 loop = LOOP_VINFO_LOOP (loop_vinfo);
4965 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4968 gcc_assert (alignment_support_scheme == dr_explicit_realign
4969 || alignment_support_scheme == dr_explicit_realign_optimized);
4971 /* We need to generate three things:
4972 1. the misalignment computation
4973 2. the extra vector load (for the optimized realignment scheme).
4974 3. the phi node for the two vectors from which the realignment is
4975 done (for the optimized realignment scheme). */
4977 /* 1. Determine where to generate the misalignment computation.
4979 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
4980 calculation will be generated by this function, outside the loop (in the
4981 preheader). Otherwise, INIT_ADDR had already been computed for us by the
4982 caller, inside the loop.
4984 Background: If the misalignment remains fixed throughout the iterations of
4985 the loop, then both realignment schemes are applicable, and also the
4986 misalignment computation can be done outside LOOP. This is because we are
4987 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
4988 are a multiple of VS (the Vector Size), and therefore the misalignment in
4989 different vectorized LOOP iterations is always the same.
4990 The problem arises only if the memory access is in an inner-loop nested
4991 inside LOOP, which is now being vectorized using outer-loop vectorization.
4992 This is the only case when the misalignment of the memory access may not
4993 remain fixed throughout the iterations of the inner-loop (as explained in
4994 detail in vect_supportable_dr_alignment). In this case, not only is the
4995 optimized realignment scheme not applicable, but also the misalignment
4996 computation (and generation of the realignment token that is passed to
4997 REALIGN_LOAD) have to be done inside the loop.
4999 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5000 or not, which in turn determines if the misalignment is computed inside
5001 the inner-loop, or outside LOOP. */
5003 if (init_addr != NULL_TREE || !loop_vinfo)
5005 compute_in_loop = true;
5006 gcc_assert (alignment_support_scheme == dr_explicit_realign);
5010 /* 2. Determine where to generate the extra vector load.
5012 For the optimized realignment scheme, instead of generating two vector
5013 loads in each iteration, we generate a single extra vector load in the
5014 preheader of the loop, and in each iteration reuse the result of the
5015 vector load from the previous iteration. In case the memory access is in
5016 an inner-loop nested inside LOOP, which is now being vectorized using
5017 outer-loop vectorization, we need to determine whether this initial vector
5018 load should be generated at the preheader of the inner-loop, or can be
5019 generated at the preheader of LOOP. If the memory access has no evolution
5020 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5021 to be generated inside LOOP (in the preheader of the inner-loop). */
5023 if (nested_in_vect_loop)
5025 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5026 bool invariant_in_outerloop =
5027 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5028 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5030 else
5031 loop_for_initial_load = loop;
5032 if (at_loop)
5033 *at_loop = loop_for_initial_load;
5035 if (loop_for_initial_load)
5036 pe = loop_preheader_edge (loop_for_initial_load);
5038 /* 3. For the case of the optimized realignment, create the first vector
5039 load at the loop preheader. */
5041 if (alignment_support_scheme == dr_explicit_realign_optimized)
5043 /* Create msq_init = *(floor(p1)) in the loop preheader */
5044 gassign *new_stmt;
5046 gcc_assert (!compute_in_loop);
5047 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5048 ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
5049 NULL_TREE, &init_addr, NULL, &inc,
5050 true, &inv_p);
5051 if (TREE_CODE (ptr) == SSA_NAME)
5052 new_temp = copy_ssa_name (ptr);
5053 else
5054 new_temp = make_ssa_name (TREE_TYPE (ptr));
5055 new_stmt = gimple_build_assign
5056 (new_temp, BIT_AND_EXPR, ptr,
5057 build_int_cst (TREE_TYPE (ptr),
5058 -(HOST_WIDE_INT)TYPE_ALIGN_UNIT (vectype)));
5059 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5060 gcc_assert (!new_bb);
5061 data_ref
5062 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5063 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5064 new_stmt = gimple_build_assign (vec_dest, data_ref);
5065 new_temp = make_ssa_name (vec_dest, new_stmt);
5066 gimple_assign_set_lhs (new_stmt, new_temp);
5067 if (pe)
5069 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5070 gcc_assert (!new_bb);
5072 else
5073 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5075 msq_init = gimple_assign_lhs (new_stmt);
5078 /* 4. Create realignment token using a target builtin, if available.
5079 It is done either inside the containing loop, or before LOOP (as
5080 determined above). */
5082 if (targetm.vectorize.builtin_mask_for_load)
5084 gcall *new_stmt;
5085 tree builtin_decl;
5087 /* Compute INIT_ADDR - the initial address accessed by this memref. */
5088 if (!init_addr)
5090 /* Generate the INIT_ADDR computation outside LOOP. */
5091 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
5092 NULL_TREE, loop);
5093 if (loop)
5095 pe = loop_preheader_edge (loop);
5096 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5097 gcc_assert (!new_bb);
5099 else
5100 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5103 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5104 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5105 vec_dest =
5106 vect_create_destination_var (scalar_dest,
5107 gimple_call_return_type (new_stmt));
5108 new_temp = make_ssa_name (vec_dest, new_stmt);
5109 gimple_call_set_lhs (new_stmt, new_temp);
5111 if (compute_in_loop)
5112 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5113 else
5115 /* Generate the misalignment computation outside LOOP. */
5116 pe = loop_preheader_edge (loop);
5117 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5118 gcc_assert (!new_bb);
5121 *realignment_token = gimple_call_lhs (new_stmt);
5123 /* The result of the CALL_EXPR to this builtin is determined from
5124 the value of the parameter and no global variables are touched
5125 which makes the builtin a "const" function. Requiring the
5126 builtin to have the "const" attribute makes it unnecessary
5127 to call mark_call_clobbered. */
5128 gcc_assert (TREE_READONLY (builtin_decl));
5131 if (alignment_support_scheme == dr_explicit_realign)
5132 return msq;
5134 gcc_assert (!compute_in_loop);
5135 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5138 /* 5. Create msq = phi <msq_init, lsq> in loop */
5140 pe = loop_preheader_edge (containing_loop);
5141 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5142 msq = make_ssa_name (vec_dest);
5143 phi_stmt = create_phi_node (msq, containing_loop->header);
5144 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5146 return msq;
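/* A minimal standalone sketch (editor's illustration, not GCC code): the
   dr_explicit_realign idea set up above, using byte arrays instead of
   vector registers.  VS16 is a hypothetical vector size in bytes, and
   sketch_realign () stands in for REALIGN_LOAD with a byte-shift token.
   Like the real scheme, whole aligned blocks are loaded, so the underlying
   buffer is assumed to extend to the next alignment boundary.  */

#include <stdint.h>
#include <string.h>

enum { VS16 = 16 };

/* Combine the two aligned loads MSQ and LSQ into the VS16 bytes that start
   at the unaligned address; SHIFT is the misalignment in bytes.  */
static void
sketch_realign (const uint8_t *msq, const uint8_t *lsq, unsigned shift,
                uint8_t *out)
{
  for (unsigned i = 0; i < VS16; i++)
    out[i] = i + shift < VS16 ? msq[i + shift] : lsq[i + shift - VS16];
}

void
sketch_unaligned_load (const uint8_t *p, uint8_t *out)
{
  uintptr_t mask = (uintptr_t) (VS16 - 1);
  const uint8_t *floor_p = (const uint8_t *) ((uintptr_t) p & ~mask);
  const uint8_t *floor_p2 = (const uint8_t *) (((uintptr_t) p + VS16 - 1) & ~mask);
  uint8_t msq[VS16], lsq[VS16];
  memcpy (msq, floor_p, VS16);     /* msq = *(floor (p));           */
  memcpy (lsq, floor_p2, VS16);    /* lsq = *(floor (p + VS - 1));  */
  sketch_realign (msq, lsq, (uintptr_t) p & mask, out);
}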
5150 /* Function vect_grouped_load_supported.
5152 COUNT is the size of the load group (the number of statements plus the
5153 number of gaps). SINGLE_ELEMENT_P is true if there is actually
5154 only one statement, with a gap of COUNT - 1.
5156 Returns true if a suitable permute exists. */
5158 bool
5159 vect_grouped_load_supported (tree vectype, bool single_element_p,
5160 unsigned HOST_WIDE_INT count)
5162 machine_mode mode = TYPE_MODE (vectype);
5164 /* If this is single-element interleaving with an element distance
5165 that leaves unused vector loads around punt - we at least create
5166 very sub-optimal code in that case (and blow up memory,
5167 see PR65518). */
5168 if (single_element_p && count > TYPE_VECTOR_SUBPARTS (vectype))
5170 if (dump_enabled_p ())
5171 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5172 "single-element interleaving not supported "
5173 "for non-adjacent vector loads\n");
5174 return false;
5177 /* vect_permute_load_chain requires the group size to be equal to 3 or
5178 be a power of two. */
5179 if (count != 3 && exact_log2 (count) == -1)
5181 if (dump_enabled_p ())
5182 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5183 "the size of the group of accesses"
5184 " is not a power of 2 or not equal to 3\n");
5185 return false;
5188 /* Check that the permutation is supported. */
5189 if (VECTOR_MODE_P (mode))
5191 unsigned int i, j, nelt = GET_MODE_NUNITS (mode);
5192 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5194 if (count == 3)
5196 unsigned int k;
5197 for (k = 0; k < 3; k++)
5199 for (i = 0; i < nelt; i++)
5200 if (3 * i + k < 2 * nelt)
5201 sel[i] = 3 * i + k;
5202 else
5203 sel[i] = 0;
5204 if (!can_vec_perm_p (mode, false, sel))
5206 if (dump_enabled_p ())
5207 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5208 "shuffle of 3 loads is not supported by"
5209 " target\n");
5210 return false;
5212 for (i = 0, j = 0; i < nelt; i++)
5213 if (3 * i + k < 2 * nelt)
5214 sel[i] = i;
5215 else
5216 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5217 if (!can_vec_perm_p (mode, false, sel))
5219 if (dump_enabled_p ())
5220 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5221 "shuffle of 3 loads is not supported by"
5222 " target\n");
5223 return false;
5226 return true;
5228 else
5230 /* If length is not equal to 3 then only power of 2 is supported. */
5231 gcc_assert (exact_log2 (count) != -1);
5232 for (i = 0; i < nelt; i++)
5233 sel[i] = i * 2;
5234 if (can_vec_perm_p (mode, false, sel))
5236 for (i = 0; i < nelt; i++)
5237 sel[i] = i * 2 + 1;
5238 if (can_vec_perm_p (mode, false, sel))
5239 return true;
5244 if (dump_enabled_p ())
5245 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5246 "extract even/odd not supported by target\n");
5247 return false;
5250 /* Return TRUE if vec_load_lanes is available for COUNT vectors of
5251 type VECTYPE. */
5253 bool
5254 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
5256 return vect_lanes_optab_supported_p ("vec_load_lanes",
5257 vec_load_lanes_optab,
5258 vectype, count);
5261 /* Function vect_permute_load_chain.
5263 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5264 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5265 the input data correctly. Return the final references for loads in
5266 RESULT_CHAIN.
5268 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5269 The input is 4 vectors each containing 8 elements. We assign a number to each
5270 element, the input sequence is:
5272 1st vec: 0 1 2 3 4 5 6 7
5273 2nd vec: 8 9 10 11 12 13 14 15
5274 3rd vec: 16 17 18 19 20 21 22 23
5275 4th vec: 24 25 26 27 28 29 30 31
5277 The output sequence should be:
5279 1st vec: 0 4 8 12 16 20 24 28
5280 2nd vec: 1 5 9 13 17 21 25 29
5281 3rd vec: 2 6 10 14 18 22 26 30
5282 4th vec: 3 7 11 15 19 23 27 31
5284 i.e., the first output vector should contain the first elements of each
5285 interleaving group, etc.
5287 We use extract_even/odd instructions to create such output. The input of
5288 each extract_even/odd operation is two vectors
5289 1st vec 2nd vec
5290 0 1 2 3 4 5 6 7
5292 and the output is the vector of extracted even/odd elements. The output of
5293 extract_even will be: 0 2 4 6
5294 and of extract_odd: 1 3 5 7
5297 The permutation is done in log LENGTH stages. In each stage extract_even
5298 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5299 their order. In our example,
5301 E1: extract_even (1st vec, 2nd vec)
5302 E2: extract_odd (1st vec, 2nd vec)
5303 E3: extract_even (3rd vec, 4th vec)
5304 E4: extract_odd (3rd vec, 4th vec)
5306 The output for the first stage will be:
5308 E1: 0 2 4 6 8 10 12 14
5309 E2: 1 3 5 7 9 11 13 15
5310 E3: 16 18 20 22 24 26 28 30
5311 E4: 17 19 21 23 25 27 29 31
5313 In order to proceed and create the correct sequence for the next stage (or
5314 for the correct output, if the second stage is the last one, as in our
5315 example), we first put the output of extract_even operation and then the
5316 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5317 The input for the second stage is:
5319 1st vec (E1): 0 2 4 6 8 10 12 14
5320 2nd vec (E3): 16 18 20 22 24 26 28 30
5321 3rd vec (E2): 1 3 5 7 9 11 13 15
5322 4th vec (E4): 17 19 21 23 25 27 29 31
5324 The output of the second stage:
5326 E1: 0 4 8 12 16 20 24 28
5327 E2: 2 6 10 14 18 22 26 30
5328 E3: 1 5 9 13 17 21 25 29
5329 E4: 3 7 11 15 19 23 27 31
5331 And RESULT_CHAIN after reordering:
5333 1st vec (E1): 0 4 8 12 16 20 24 28
5334 2nd vec (E3): 1 5 9 13 17 21 25 29
5335 3rd vec (E2): 2 6 10 14 18 22 26 30
5336 4th vec (E4): 3 7 11 15 19 23 27 31. */
5338 static void
5339 vect_permute_load_chain (vec<tree> dr_chain,
5340 unsigned int length,
5341 gimple *stmt,
5342 gimple_stmt_iterator *gsi,
5343 vec<tree> *result_chain)
5345 tree data_ref, first_vect, second_vect;
5346 tree perm_mask_even, perm_mask_odd;
5347 tree perm3_mask_low, perm3_mask_high;
5348 gimple *perm_stmt;
5349 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5350 unsigned int i, j, log_length = exact_log2 (length);
5351 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5352 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5354 result_chain->quick_grow (length);
5355 memcpy (result_chain->address (), dr_chain.address (),
5356 length * sizeof (tree));
5358 if (length == 3)
5360 unsigned int k;
5362 for (k = 0; k < 3; k++)
5364 for (i = 0; i < nelt; i++)
5365 if (3 * i + k < 2 * nelt)
5366 sel[i] = 3 * i + k;
5367 else
5368 sel[i] = 0;
5369 perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
5371 for (i = 0, j = 0; i < nelt; i++)
5372 if (3 * i + k < 2 * nelt)
5373 sel[i] = i;
5374 else
5375 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5377 perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
5379 first_vect = dr_chain[0];
5380 second_vect = dr_chain[1];
5382 /* Create interleaving stmt (low part of):
5383 low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5384 ...}> */
5385 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5386 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5387 second_vect, perm3_mask_low);
5388 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5390 /* Create interleaving stmt (high part of):
5391 high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5392 ...}> */
5393 first_vect = data_ref;
5394 second_vect = dr_chain[2];
5395 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5396 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5397 second_vect, perm3_mask_high);
5398 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5399 (*result_chain)[k] = data_ref;
5402 else
5404 /* If length is not equal to 3 then only power of 2 is supported. */
5405 gcc_assert (exact_log2 (length) != -1);
5407 for (i = 0; i < nelt; ++i)
5408 sel[i] = i * 2;
5409 perm_mask_even = vect_gen_perm_mask_checked (vectype, sel);
5411 for (i = 0; i < nelt; ++i)
5412 sel[i] = i * 2 + 1;
5413 perm_mask_odd = vect_gen_perm_mask_checked (vectype, sel);
5415 for (i = 0; i < log_length; i++)
5417 for (j = 0; j < length; j += 2)
5419 first_vect = dr_chain[j];
5420 second_vect = dr_chain[j+1];
5422 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5423 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5424 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5425 first_vect, second_vect,
5426 perm_mask_even);
5427 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5428 (*result_chain)[j/2] = data_ref;
5430 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5431 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5432 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5433 first_vect, second_vect,
5434 perm_mask_odd);
5435 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5436 (*result_chain)[j/2+length/2] = data_ref;
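/* The results of this stage become the input of the next stage.  */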
5438 memcpy (dr_chain.address (), result_chain->address (),
5439 length * sizeof (tree));
5444 /* Function vect_shift_permute_load_chain.
5446 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate a
5447 sequence of stmts to reorder the input data accordingly.
5448 Return the final references for loads in RESULT_CHAIN.
5449 Return true if successful, false otherwise.
5451 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5452 The input is 3 vectors each containing 8 elements. We assign a
5453 number to each element; the input sequence is:
5455 1st vec: 0 1 2 3 4 5 6 7
5456 2nd vec: 8 9 10 11 12 13 14 15
5457 3rd vec: 16 17 18 19 20 21 22 23
5459 The output sequence should be:
5461 1st vec: 0 3 6 9 12 15 18 21
5462 2nd vec: 1 4 7 10 13 16 19 22
5463 3rd vec: 2 5 8 11 14 17 20 23
5465 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
5467 First we shuffle all 3 vectors to get the correct element order:
5469 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
5470 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
5471 3rd vec: (16 19 22) (17 20 23) (18 21)
5473 Next we unite and shift the vectors 3 times:
5475 1st step:
5476 shift right by 6 the concatenation of:
5477 "1st vec" and "2nd vec"
5478 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
5479 "2nd vec" and "3rd vec"
5480 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
5481 "3rd vec" and "1st vec"
5482 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
5483 | New vectors |
5485 So now the new vectors are:
5487 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
5488 2nd vec: (10 13) (16 19 22) (17 20 23)
5489 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
5491 2nd step:
5492 shift right by 5 the concatenation of:
5493 "1st vec" and "3rd vec"
5494 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
5495 "2nd vec" and "1st vec"
5496 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
5497 "3rd vec" and "2nd vec"
5498 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
5499 | New vectors |
5501 So now the new vectors are:
5503 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
5504 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
5505 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
5507 3rd step:
5508 shift right by 5 the concatenation of:
5509 "1st vec" and "1st vec"
5510 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
5511 shift right by 3 the concatenation of:
5512 "2nd vec" and "2nd vec"
5513 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
5514 | New vectors |
5516 So now all vectors are READY:
5517 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
5518 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
5519 3rd vec: ( 1 4 7) (10 13) (16 19 22)
5521 This algorithm is faster than the one in vect_permute_load_chain if:
5522 1. "shift of a concatenation" is faster than general permutation.
5523 This is usually the case.
5524 2. The TARGET machine can't execute vector instructions in parallel.
5525 This is because each step of the algorithm depends on the previous one.
5526 The algorithm in vect_permute_load_chain is much more parallel.
5528 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
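/* A minimal standalone sketch (plain C, hypothetical local names, kept under
   #if 0): it reproduces the selector vectors that the LENGTH == 3 code below
   builds for nelt == 8, so the shuffle/shift walkthrough above can be checked
   by hand.  */
#if 0
#include <stdio.h>

#define NELT 8

static void
print_sel (const char *name, const unsigned int *sel)
{
  printf ("%s:", name);
  for (unsigned int i = 0; i < NELT; i++)
    printf (" %u", sel[i]);
  printf ("\n");
}

int
main (void)
{
  unsigned int sel[NELT];
  unsigned int i, k = 0, l = 0;

  /* Per-vector shuffle putting the elements in 3-stride order:
     {0 3 6 1 4 7 2 5}.  */
  for (i = 0; i < NELT; i++)
    {
      if (3 * k + (l % 3) >= NELT)
        {
          k = 0;
          l += 3 - (NELT % 3);
        }
      sel[i] = 3 * k + (l % 3);
      k++;
    }
  print_sel ("perm3 ", sel);

  /* "Shift right by 6" of a two-vector concatenation: {6 7 8 9 10 11 12 13}.  */
  for (i = 0; i < NELT; i++)
    sel[i] = 2 * (NELT / 3) + (NELT % 3) + i;
  print_sel ("shift1", sel);

  /* "Shift right by 5": {5 6 7 8 9 10 11 12}.  */
  for (i = 0; i < NELT; i++)
    sel[i] = 2 * (NELT / 3) + 1 + i;
  print_sel ("shift2", sel);

  /* Final fix-up shifts: {3 4 5 6 7 8 9 10} and {5 6 7 8 9 10 11 12}.  */
  for (i = 0; i < NELT; i++)
    sel[i] = (NELT / 3) + (NELT % 3) / 2 + i;
  print_sel ("shift3", sel);
  for (i = 0; i < NELT; i++)
    sel[i] = 2 * (NELT / 3) + (NELT % 3) / 2 + i;
  print_sel ("shift4", sel);

  return 0;
}
#endif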
5531 static bool
5532 vect_shift_permute_load_chain (vec<tree> dr_chain,
5533 unsigned int length,
5534 gimple *stmt,
5535 gimple_stmt_iterator *gsi,
5536 vec<tree> *result_chain)
5538 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
5539 tree perm2_mask1, perm2_mask2, perm3_mask;
5540 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
5541 gimple *perm_stmt;
5543 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5544 unsigned int i;
5545 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5546 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5547 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5548 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5550 result_chain->quick_grow (length);
5551 memcpy (result_chain->address (), dr_chain.address (),
5552 length * sizeof (tree));
5554 if (exact_log2 (length) != -1 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 4)
5556 unsigned int j, log_length = exact_log2 (length);
5557 for (i = 0; i < nelt / 2; ++i)
5558 sel[i] = i * 2;
5559 for (i = 0; i < nelt / 2; ++i)
5560 sel[nelt / 2 + i] = i * 2 + 1;
5561 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5563 if (dump_enabled_p ())
5564 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5565 "shuffle of 2 fields structure is not \
5566 supported by target\n");
5567 return false;
5569 perm2_mask1 = vect_gen_perm_mask_checked (vectype, sel);
5571 for (i = 0; i < nelt / 2; ++i)
5572 sel[i] = i * 2 + 1;
5573 for (i = 0; i < nelt / 2; ++i)
5574 sel[nelt / 2 + i] = i * 2;
5575 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5577 if (dump_enabled_p ())
5578 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5579 "shuffle of 2 fields structure is not \
5580 supported by target\n");
5581 return false;
5583 perm2_mask2 = vect_gen_perm_mask_checked (vectype, sel);
5585 /* Generating permutation constant to shift all elements.
5586 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
5587 for (i = 0; i < nelt; i++)
5588 sel[i] = nelt / 2 + i;
5589 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5591 if (dump_enabled_p ())
5592 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5593 "shift permutation is not supported by target\n");
5594 return false;
5596 shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5598 /* Generating permutation constant to select elements from the two vectors.
5599 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
5600 for (i = 0; i < nelt / 2; i++)
5601 sel[i] = i;
5602 for (i = nelt / 2; i < nelt; i++)
5603 sel[i] = nelt + i;
5604 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5606 if (dump_enabled_p ())
5607 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5608 "select is not supported by target\n");
5609 return false;
5611 select_mask = vect_gen_perm_mask_checked (vectype, sel);
5613 for (i = 0; i < log_length; i++)
5615 for (j = 0; j < length; j += 2)
5617 first_vect = dr_chain[j];
5618 second_vect = dr_chain[j + 1];
5620 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5621 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5622 first_vect, first_vect,
5623 perm2_mask1);
5624 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5625 vect[0] = data_ref;
5627 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5628 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5629 second_vect, second_vect,
5630 perm2_mask2);
5631 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5632 vect[1] = data_ref;
5634 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
5635 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5636 vect[0], vect[1], shift1_mask);
5637 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5638 (*result_chain)[j/2 + length/2] = data_ref;
5640 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
5641 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5642 vect[0], vect[1], select_mask);
5643 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5644 (*result_chain)[j/2] = data_ref;
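/* Feed this stage's output back in as the next stage's input.  */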
5646 memcpy (dr_chain.address (), result_chain->address (),
5647 length * sizeof (tree));
5649 return true;
5651 if (length == 3 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 2)
5653 unsigned int k = 0, l = 0;
5655 /* Generating permutation constant to get all elements in the right order.
5656 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
5657 for (i = 0; i < nelt; i++)
5659 if (3 * k + (l % 3) >= nelt)
5661 k = 0;
5662 l += (3 - (nelt % 3));
5664 sel[i] = 3 * k + (l % 3);
5665 k++;
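/* For nelt == 8 the loop above wraps k back to 0 at i == 3 and i == 6,
   bumping l each time, which yields {0 3 6 1 4 7 2 5}.  */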
5667 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5669 if (dump_enabled_p ())
5670 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5671 "shuffle of 3 fields structure is not \
5672 supported by target\n");
5673 return false;
5675 perm3_mask = vect_gen_perm_mask_checked (vectype, sel);
5677 /* Generating permutation constant to shift all elements.
5678 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
5679 for (i = 0; i < nelt; i++)
5680 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
5681 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5683 if (dump_enabled_p ())
5684 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5685 "shift permutation is not supported by target\n");
5686 return false;
5688 shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5690 /* Generating permutation constant to shift all elements.
5691 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5692 for (i = 0; i < nelt; i++)
5693 sel[i] = 2 * (nelt / 3) + 1 + i;
5694 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5696 if (dump_enabled_p ())
5697 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5698 "shift permutation is not supported by target\n");
5699 return false;
5701 shift2_mask = vect_gen_perm_mask_checked (vectype, sel);
5703 /* Generating permutation constant to shift all elements.
5704 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
5705 for (i = 0; i < nelt; i++)
5706 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
5707 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5709 if (dump_enabled_p ())
5710 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5711 "shift permutation is not supported by target\n");
5712 return false;
5714 shift3_mask = vect_gen_perm_mask_checked (vectype, sel);
5716 /* Generating permutation constant to shift all elements.
5717 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5718 for (i = 0; i < nelt; i++)
5719 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
5720 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5722 if (dump_enabled_p ())
5723 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5724 "shift permutation is not supported by target\n");
5725 return false;
5727 shift4_mask = vect_gen_perm_mask_checked (vectype, sel);
5729 for (k = 0; k < 3; k++)
5731 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
5732 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5733 dr_chain[k], dr_chain[k],
5734 perm3_mask);
5735 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5736 vect[k] = data_ref;
5739 for (k = 0; k < 3; k++)
5741 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
5742 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5743 vect[k % 3], vect[(k + 1) % 3],
5744 shift1_mask);
5745 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5746 vect_shift[k] = data_ref;
5749 for (k = 0; k < 3; k++)
5751 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
5752 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5753 vect_shift[(4 - k) % 3],
5754 vect_shift[(3 - k) % 3],
5755 shift2_mask);
5756 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5757 vect[k] = data_ref;
5760 (*result_chain)[3 - (nelt % 3)] = vect[2];
5762 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
5763 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
5764 vect[0], shift3_mask);
5765 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5766 (*result_chain)[nelt % 3] = data_ref;
5768 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
5769 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
5770 vect[1], shift4_mask);
5771 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5772 (*result_chain)[0] = data_ref;
5773 return true;
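/* Neither a power-of-2 LENGTH with VF > 4 nor LENGTH == 3 with VF > 2;
   let the caller fall back to vect_permute_load_chain.  */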
5775 return false;
5778 /* Function vect_transform_grouped_load.
5780 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
5781 to perform their permutation and ascribe the resulting vectorized statements to
5782 the scalar statements.
5785 void
5786 vect_transform_grouped_load (gimple *stmt, vec<tree> dr_chain, int size,
5787 gimple_stmt_iterator *gsi)
5789 machine_mode mode;
5790 vec<tree> result_chain = vNULL;
5792 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
5793 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
5794 vectors, that are ready for vector computation. */
5795 result_chain.create (size);
5797 /* If the reassociation width for the vector type is 2 or greater, the target
5798 machine can execute 2 or more vector instructions in parallel.  Otherwise
5799 try to get the chain for the load group using vect_shift_permute_load_chain. */
5800 mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
5801 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
5802 || exact_log2 (size) != -1
5803 || !vect_shift_permute_load_chain (dr_chain, size, stmt,
5804 gsi, &result_chain))
5805 vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
5806 vect_record_grouped_load_vectors (stmt, result_chain);
5807 result_chain.release ();
5810 /* RESULT_CHAIN contains the output of a group of grouped loads that were
5811 generated as part of the vectorization of STMT. Assign the statement
5812 for each vector to the associated scalar statement. */
5814 void
5815 vect_record_grouped_load_vectors (gimple *stmt, vec<tree> result_chain)
5817 gimple *first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
5818 gimple *next_stmt, *new_stmt;
5819 unsigned int i, gap_count;
5820 tree tmp_data_ref;
5822 /* Put a permuted data-ref in the VECTORIZED_STMT field.
5823 Since we scan the chain starting from its first node, their order
5824 corresponds to the order of data-refs in RESULT_CHAIN.
5825 next_stmt = first_stmt;
5826 gap_count = 1;
5827 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
5829 if (!next_stmt)
5830 break;
5832 /* Skip the gaps. Loads created for the gaps will be removed by the dead
5833 code elimination pass later. No need to check for the first stmt in
5834 the group, since it always exists.
5835 GROUP_GAP is the number of steps in elements from the previous
5836 access (if there is no gap GROUP_GAP is 1). We skip loads that
5837 correspond to the gaps. */
5838 if (next_stmt != first_stmt
5839 && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
5841 gap_count++;
5842 continue;
5845 while (next_stmt)
5847 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
5848 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
5849 copies, and we put the new vector statement in the first available
5850 RELATED_STMT. */
5851 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
5852 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
5853 else
5855 if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5857 gimple *prev_stmt =
5858 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
5859 gimple *rel_stmt =
5860 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
5861 while (rel_stmt)
5863 prev_stmt = rel_stmt;
5864 rel_stmt =
5865 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
5868 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
5869 new_stmt;
5873 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
5874 gap_count = 1;
5875 /* If NEXT_STMT accesses the same DR as the previous statement,
5876 put the same TMP_DATA_REF as its vectorized statement; otherwise
5877 get the next data-ref from RESULT_CHAIN. */
5878 if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5879 break;
5884 /* Function vect_can_force_dr_alignment_p.
5886 Returns whether the alignment of a DECL can be forced to be aligned
5887 on ALIGNMENT bit boundary. */
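/* ALIGNMENT, MAX_OFILE_ALIGNMENT and MAX_STACK_ALIGNMENT are all expressed
   in bits, so the comparisons below compare like units.  */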
5889 bool
5890 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
5892 if (TREE_CODE (decl) != VAR_DECL)
5893 return false;
5895 if (decl_in_symtab_p (decl)
5896 && !symtab_node::get (decl)->can_increase_alignment_p ())
5897 return false;
5899 if (TREE_STATIC (decl))
5900 return (alignment <= MAX_OFILE_ALIGNMENT);
5901 else
5902 return (alignment <= MAX_STACK_ALIGNMENT);
5906 /* Return whether the data reference DR is supported with respect to its
5907 alignment.
5908 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even if
5909 it is aligned, i.e., check if it is possible to vectorize it with different
5910 alignment. */
5912 enum dr_alignment_support
5913 vect_supportable_dr_alignment (struct data_reference *dr,
5914 bool check_aligned_accesses)
5916 gimple *stmt = DR_STMT (dr);
5917 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5918 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5919 machine_mode mode = TYPE_MODE (vectype);
5920 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5921 struct loop *vect_loop = NULL;
5922 bool nested_in_vect_loop = false;
5924 if (aligned_access_p (dr) && !check_aligned_accesses)
5925 return dr_aligned;
5927 /* For now assume all conditional loads/stores support unaligned
5928 access without any special code. */
5929 if (is_gimple_call (stmt)
5930 && gimple_call_internal_p (stmt)
5931 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
5932 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
5933 return dr_unaligned_supported;
5935 if (loop_vinfo)
5937 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
5938 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
5941 /* Possibly unaligned access. */
5943 /* We can choose between using the implicit realignment scheme (generating
5944 a misaligned_move stmt) and the explicit realignment scheme (generating
5945 aligned loads with a REALIGN_LOAD). There are two variants to the
5946 explicit realignment scheme: optimized, and unoptimized.
5947 We can optimize the realignment only if the step between consecutive
5948 vector loads is equal to the vector size. Since the vector memory
5949 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
5950 is guaranteed that the misalignment amount remains the same throughout the
5951 execution of the vectorized loop. Therefore, we can create the
5952 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
5953 at the loop preheader.
5955 However, in the case of outer-loop vectorization, when vectorizing a
5956 memory access in the inner-loop nested within the LOOP that is now being
5957 vectorized, while it is guaranteed that the misalignment of the
5958 vectorized memory access will remain the same in different outer-loop
5959 iterations, it is *not* guaranteed that it will remain the same throughout
5960 the execution of the inner-loop. This is because the inner-loop advances
5961 with the original scalar step (and not in steps of VS). If the inner-loop
5962 step happens to be a multiple of VS, then the misalignment remains fixed
5963 and we can use the optimized realignment scheme. For example:
5965 for (i=0; i<N; i++)
5966 for (j=0; j<M; j++)
5967 s += a[i+j];
5969 When vectorizing the i-loop in the above example, the step between
5970 consecutive vector loads is 1, and so the misalignment does not remain
5971 fixed across the execution of the inner-loop, and the realignment cannot
5972 be optimized (as illustrated in the following pseudo vectorized loop):
5974 for (i=0; i<N; i+=4)
5975 for (j=0; j<M; j++){
5976 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
5977 // when j is {0,1,2,3,4,5,6,7,...} respectively.
5978 // (assuming that we start from an aligned address).
5981 We therefore have to use the unoptimized realignment scheme:
5983 for (i=0; i<N; i+=4)
5984 for (j=k; j<M; j+=4)
5985 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
5986 // that the misalignment of the initial address is
5987 // 0).
5989 The loop can then be vectorized as follows:
5991 for (k=0; k<4; k++){
5992 rt = get_realignment_token (&vp[k]);
5993 for (i=0; i<N; i+=4){
5994 v1 = vp[i+k];
5995 for (j=k; j<M; j+=4){
5996 v2 = vp[i+j+VS-1];
5997 va = REALIGN_LOAD <v1,v2,rt>;
5998 vs += va;
5999 v1 = v2;
6002 } */
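/* At this point the access is (possibly) misaligned, or the caller asked us
   to check the misaligned path anyway.  For reads the explicit realignment
   scheme may be used if the target provides vec_realign_load support;
   otherwise, for reads and writes alike, the answer depends on TYPE_USER_ALIGN
   and the target's support_vector_misalignment hook.  */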
6004 if (DR_IS_READ (dr))
6006 bool is_packed = false;
6007 tree type = (TREE_TYPE (DR_REF (dr)));
6009 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
6010 && (!targetm.vectorize.builtin_mask_for_load
6011 || targetm.vectorize.builtin_mask_for_load ()))
6013 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6015 /* If we are doing SLP then the accesses need not have the
6016 same alignment, instead it depends on the SLP group size. */
6017 if (loop_vinfo
6018 && STMT_SLP_TYPE (stmt_info)
6019 && (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6020 * GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)))
6021 % TYPE_VECTOR_SUBPARTS (vectype) != 0))
6023 else if (!loop_vinfo
6024 || (nested_in_vect_loop
6025 && (TREE_INT_CST_LOW (DR_STEP (dr))
6026 != GET_MODE_SIZE (TYPE_MODE (vectype)))))
6027 return dr_explicit_realign;
6028 else
6029 return dr_explicit_realign_optimized;
6031 if (!known_alignment_for_access_p (dr))
6032 is_packed = not_size_aligned (DR_REF (dr));
6034 if ((TYPE_USER_ALIGN (type) && !is_packed)
6035 || targetm.vectorize.
6036 support_vector_misalignment (mode, type,
6037 DR_MISALIGNMENT (dr), is_packed))
6038 /* Can't software pipeline the loads, but can at least do them. */
6039 return dr_unaligned_supported;
6041 else
6043 bool is_packed = false;
6044 tree type = (TREE_TYPE (DR_REF (dr)));
6046 if (!known_alignment_for_access_p (dr))
6047 is_packed = not_size_aligned (DR_REF (dr));
6049 if ((TYPE_USER_ALIGN (type) && !is_packed)
6050 || targetm.vectorize.
6051 support_vector_misalignment (mode, type,
6052 DR_MISALIGNMENT (dr), is_packed))
6053 return dr_unaligned_supported;
6056 /* Unsupported. */
6057 return dr_unaligned_unsupported;