gcc/tree-vect-data-refs.c
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2016 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "tm_p.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "cgraph.h"
35 #include "dumpfile.h"
36 #include "alias.h"
37 #include "fold-const.h"
38 #include "stor-layout.h"
39 #include "tree-eh.h"
40 #include "gimplify.h"
41 #include "gimple-iterator.h"
42 #include "gimplify-me.h"
43 #include "tree-ssa-loop-ivopts.h"
44 #include "tree-ssa-loop-manip.h"
45 #include "tree-ssa-loop.h"
46 #include "cfgloop.h"
47 #include "tree-scalar-evolution.h"
48 #include "tree-vectorizer.h"
49 #include "expr.h"
50 #include "builtins.h"
51 #include "params.h"
53 /* Return true if load- or store-lanes optab OPTAB is implemented for
54 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
56 static bool
57 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
58 tree vectype, unsigned HOST_WIDE_INT count)
60 machine_mode mode, array_mode;
61 bool limit_p;
63 mode = TYPE_MODE (vectype);
64 limit_p = !targetm.array_mode_supported_p (mode, count);
65 array_mode = mode_for_size (count * GET_MODE_BITSIZE (mode),
66 MODE_INT, limit_p);
68 if (array_mode == BLKmode)
70 if (dump_enabled_p ())
71 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
72 "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC "]\n",
73 GET_MODE_NAME (mode), count);
74 return false;
77 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
79 if (dump_enabled_p ())
80 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
81 "cannot use %s<%s><%s>\n", name,
82 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
83 return false;
86 if (dump_enabled_p ())
87 dump_printf_loc (MSG_NOTE, vect_location,
88 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
89 GET_MODE_NAME (mode));
91 return true;
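/* For example (a hedged illustration, not taken from this file): to use
   load-lanes for COUNT == 2 vectors of V4SImode, the check above needs a
   256-bit integer mode (OImode, where the target provides one) to act as
   the array mode, plus a vec_load_lanes handler converting between that
   mode and V4SImode; if either is missing the optab is rejected and the
   vectorizer typically falls back to separate loads and permutations.  */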
95 /* Return the smallest scalar type used in STMT.
96 This is used to determine the vectype of the stmt. We generally set the
97 vectype according to the type of the result (lhs). For stmts whose
98 result-type is different than the type of the arguments (e.g., demotion,
99 promotion), vectype will be reset appropriately (later). Note that we have
100 to visit the smallest datatype in this function, because that determines the
101 VF. If the smallest datatype in the loop is present only as the rhs of a
102 promotion operation - we'd miss it.
103 Such a case, where a variable of this datatype does not appear in the lhs
104 anywhere in the loop, can only occur if it's an invariant: e.g.:
105 'int_x = (int) short_inv', which we'd expect to have been optimized away by
106 invariant motion. However, we cannot rely on invariant motion to always
107 take invariants out of the loop, and so in the case of promotion we also
108 have to check the rhs.
109 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
110 types. */
112 tree
113 vect_get_smallest_scalar_type (gimple *stmt, HOST_WIDE_INT *lhs_size_unit,
114 HOST_WIDE_INT *rhs_size_unit)
116 tree scalar_type = gimple_expr_type (stmt);
117 HOST_WIDE_INT lhs, rhs;
119 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
121 if (is_gimple_assign (stmt)
122 && (gimple_assign_cast_p (stmt)
123 || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
124 || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
125 || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
127 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
129 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
130 if (rhs < lhs)
131 scalar_type = rhs_type;
134 *lhs_size_unit = lhs;
135 *rhs_size_unit = rhs;
136 return scalar_type;
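/* For example, for the widening statement 'int_x = (int) short_y' the
   result type is 4 bytes wide but the rhs type is only 2 bytes, so this
   function returns the short type with *LHS_SIZE_UNIT = 4 and
   *RHS_SIZE_UNIT = 2, and the vectorization factor is then based on the
   2-byte element.  */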
140 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
141 tested at run-time. Return TRUE if DDR was successfully inserted.
142 Return false if versioning is not supported. */
144 static bool
145 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
147 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
149 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
150 return false;
152 if (dump_enabled_p ())
154 dump_printf_loc (MSG_NOTE, vect_location,
155 "mark for run-time aliasing test between ");
156 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
157 dump_printf (MSG_NOTE, " and ");
158 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
159 dump_printf (MSG_NOTE, "\n");
162 if (optimize_loop_nest_for_size_p (loop))
164 if (dump_enabled_p ())
165 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
166 "versioning not supported when optimizing"
167 " for size.\n");
168 return false;
171 /* FORNOW: We don't support versioning with outer-loop vectorization. */
172 if (loop->inner)
174 if (dump_enabled_p ())
175 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
176 "versioning not yet supported for outer-loops.\n");
177 return false;
180 /* FORNOW: We don't support creating runtime alias tests for non-constant
181 step. */
182 if (TREE_CODE (DR_STEP (DDR_A (ddr))) != INTEGER_CST
183 || TREE_CODE (DR_STEP (DDR_B (ddr))) != INTEGER_CST)
185 if (dump_enabled_p ())
186 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
187 "versioning not yet supported for non-constant "
188 "step\n");
189 return false;
192 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
193 return true;
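/* The DDRs recorded here are later expanded into run-time checks that the
   two accessed segments do not overlap (conceptually
   'end_a <= start_b || end_b <= start_a'), guarding an aliased and a
   non-aliased version of the loop; this function only records the
   candidate and enforces the restrictions above.  */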
197 /* Function vect_analyze_data_ref_dependence.
199 Return TRUE if there (might) exist a dependence between a memory-reference
200 DRA and a memory-reference DRB.  Return FALSE if the dependence can
201 instead be checked at run time by versioning for alias.  Adjust *MAX_VF
202 according to the data dependence. */
204 static bool
205 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
206 loop_vec_info loop_vinfo, int *max_vf)
208 unsigned int i;
209 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
210 struct data_reference *dra = DDR_A (ddr);
211 struct data_reference *drb = DDR_B (ddr);
212 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
213 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
214 lambda_vector dist_v;
215 unsigned int loop_depth;
217 /* In loop analysis all data references should be vectorizable. */
218 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
219 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
220 gcc_unreachable ();
222 /* Independent data accesses. */
223 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
224 return false;
226 if (dra == drb
227 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
228 return false;
230 /* We do not have to consider dependences between accesses that belong
231 to the same group. */
232 if (GROUP_FIRST_ELEMENT (stmtinfo_a)
233 && GROUP_FIRST_ELEMENT (stmtinfo_a) == GROUP_FIRST_ELEMENT (stmtinfo_b))
234 return false;
236 /* Even if we have an anti-dependence then, as the vectorized loop covers at
237 least two scalar iterations, there is always also a true dependence.
238 As the vectorizer does not re-order loads and stores we can ignore
239 the anti-dependence if TBAA can disambiguate both DRs similar to the
240 case with known negative distance anti-dependences (positive
241 distance anti-dependences would violate TBAA constraints). */
242 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
243 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
244 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
245 get_alias_set (DR_REF (drb))))
246 return false;
248 /* Unknown data dependence. */
249 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
251 /* If user asserted safelen consecutive iterations can be
252 executed concurrently, assume independence. */
253 if (loop->safelen >= 2)
255 if (loop->safelen < *max_vf)
256 *max_vf = loop->safelen;
257 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
258 return false;
261 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
262 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
264 if (dump_enabled_p ())
266 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
267 "versioning for alias not supported for: "
268 "can't determine dependence between ");
269 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
270 DR_REF (dra));
271 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
272 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
273 DR_REF (drb));
274 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
276 return true;
279 if (dump_enabled_p ())
281 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
282 "versioning for alias required: "
283 "can't determine dependence between ");
284 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
285 DR_REF (dra));
286 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
287 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
288 DR_REF (drb));
289 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
292 /* Add to list of ddrs that need to be tested at run-time. */
293 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
296 /* Known data dependence. */
297 if (DDR_NUM_DIST_VECTS (ddr) == 0)
299 /* If user asserted safelen consecutive iterations can be
300 executed concurrently, assume independence. */
301 if (loop->safelen >= 2)
303 if (loop->safelen < *max_vf)
304 *max_vf = loop->safelen;
305 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
306 return false;
309 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
310 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
312 if (dump_enabled_p ())
314 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
315 "versioning for alias not supported for: "
316 "bad dist vector for ");
317 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
318 DR_REF (dra));
319 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
320 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
321 DR_REF (drb));
322 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
324 return true;
327 if (dump_enabled_p ())
329 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
330 "versioning for alias required: "
331 "bad dist vector for ");
332 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
333 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
334 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
335 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
337 /* Add to list of ddrs that need to be tested at run-time. */
338 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
341 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
342 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
344 int dist = dist_v[loop_depth];
346 if (dump_enabled_p ())
347 dump_printf_loc (MSG_NOTE, vect_location,
348 "dependence distance = %d.\n", dist);
350 if (dist == 0)
352 if (dump_enabled_p ())
354 dump_printf_loc (MSG_NOTE, vect_location,
355 "dependence distance == 0 between ");
356 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
357 dump_printf (MSG_NOTE, " and ");
358 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
359 dump_printf (MSG_NOTE, "\n");
362 /* When we perform grouped accesses and perform implicit CSE
363 by detecting equal accesses and doing disambiguation with
364 runtime alias tests like for
365 .. = a[i];
366 .. = a[i+1];
367 a[i] = ..;
368 a[i+1] = ..;
369 *p = ..;
370 .. = a[i];
371 .. = a[i+1];
372 where we will end up loading { a[i], a[i+1] } once, make
373 sure that inserting group loads before the first load and
374 stores after the last store will do the right thing.
375 Similar for groups like
376 a[i] = ...;
377 ... = a[i];
378 a[i+1] = ...;
379 where loads from the group interleave with the store. */
380 if (STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
381 || STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
383 gimple *earlier_stmt;
384 earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
385 if (DR_IS_WRITE
386 (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
388 if (dump_enabled_p ())
389 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
390 "READ_WRITE dependence in interleaving."
391 "\n");
392 return true;
396 continue;
399 if (dist > 0 && DDR_REVERSED_P (ddr))
401 /* If DDR_REVERSED_P the order of the data-refs in DDR was
402 reversed (to make distance vector positive), and the actual
403 distance is negative. */
404 if (dump_enabled_p ())
405 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
406 "dependence distance negative.\n");
407 /* Record a negative dependence distance to later limit the
408 amount of stmt copying / unrolling we can perform.
409 Only need to handle read-after-write dependence. */
410 if (DR_IS_READ (drb)
411 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
412 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
413 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
414 continue;
417 if (abs (dist) >= 2
418 && abs (dist) < *max_vf)
420 /* The dependence distance requires reduction of the maximal
421 vectorization factor. */
422 *max_vf = abs (dist);
423 if (dump_enabled_p ())
424 dump_printf_loc (MSG_NOTE, vect_location,
425 "adjusting maximal vectorization factor to %i\n",
426 *max_vf);
429 if (abs (dist) >= *max_vf)
431 /* Dependence distance does not create dependence, as far as
432 vectorization is concerned, in this case. */
433 if (dump_enabled_p ())
434 dump_printf_loc (MSG_NOTE, vect_location,
435 "dependence distance >= VF.\n");
436 continue;
439 if (dump_enabled_p ())
441 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
442 "not vectorized, possible dependence "
443 "between data-refs ");
444 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
445 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
446 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
447 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
450 return true;
453 return false;
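/* Worked example: for 'a[i+3] = a[i] + x' the dependence distance is 3.
   Starting from *MAX_VF == 8 the code above first lowers *MAX_VF to 3;
   since abs (dist) is then >= *MAX_VF the distance no longer constrains
   vectorization, so the loop remains vectorizable with a factor of at
   most 3 (in practice 2, as the factor is a power of two).  */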
456 /* Function vect_analyze_data_ref_dependences.
458 Examine all the data references in the loop, and make sure there do not
459 exist any data dependences between them. Set *MAX_VF according to
460 the maximum vectorization factor the data dependences allow. */
462 bool
463 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo, int *max_vf)
465 unsigned int i;
466 struct data_dependence_relation *ddr;
468 if (dump_enabled_p ())
469 dump_printf_loc (MSG_NOTE, vect_location,
470 "=== vect_analyze_data_ref_dependences ===\n");
472 LOOP_VINFO_DDRS (loop_vinfo)
473 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
474 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
475 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
476 if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
477 &LOOP_VINFO_DDRS (loop_vinfo),
478 LOOP_VINFO_LOOP_NEST (loop_vinfo), false))
479 return false;
481 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
482 if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
483 return false;
485 return true;
489 /* Function vect_slp_analyze_data_ref_dependence.
491 Return TRUE if there (might) exist a dependence between a memory-reference
492 DRA and a memory-reference DRB for statements in an SLP instance.
493 Unlike the loop case, no run-time alias versioning is attempted here,
494 so an unknown dependence makes the analysis fail. */
496 static bool
497 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
499 struct data_reference *dra = DDR_A (ddr);
500 struct data_reference *drb = DDR_B (ddr);
502 /* We need to check dependences of statements marked as unvectorizable
503 as well, they still can prohibit vectorization. */
505 /* Independent data accesses. */
506 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
507 return false;
509 if (dra == drb)
510 return false;
512 /* Read-read is OK. */
513 if (DR_IS_READ (dra) && DR_IS_READ (drb))
514 return false;
516 /* If dra and drb are part of the same interleaving chain consider
517 them independent. */
518 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
519 && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
520 == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
521 return false;
523 /* Unknown data dependence. */
524 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
526 if (dump_enabled_p ())
528 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
529 "can't determine dependence between ");
530 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
531 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
532 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
533 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
536 else if (dump_enabled_p ())
538 dump_printf_loc (MSG_NOTE, vect_location,
539 "determined dependence between ");
540 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
541 dump_printf (MSG_NOTE, " and ");
542 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
543 dump_printf (MSG_NOTE, "\n");
546 return true;
550 /* Analyze dependences involved in the transform of SLP NODE. STORES
551 contain the vector of scalar stores of this instance if we are
552 disambiguating the loads. */
554 static bool
555 vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
556 vec<gimple *> stores, gimple *last_store)
558 /* This walks over all stmts involved in the SLP load/store done
559 in NODE verifying we can sink them up to the last stmt in the
560 group. */
561 gimple *last_access = vect_find_last_scalar_stmt_in_slp (node);
562 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
564 gimple *access = SLP_TREE_SCALAR_STMTS (node)[k];
565 if (access == last_access)
566 continue;
567 data_reference *dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (access));
568 for (gimple_stmt_iterator gsi = gsi_for_stmt (access);
569 gsi_stmt (gsi) != last_access; gsi_next (&gsi))
571 gimple *stmt = gsi_stmt (gsi);
572 if (! gimple_vuse (stmt)
573 || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
574 continue;
576 /* If we couldn't record a (single) data reference for this
577 stmt we have to give up. */
578 /* ??? Here and below if dependence analysis fails we can resort
579 to the alias oracle which can handle more kinds of stmts. */
580 data_reference *dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
581 if (!dr_b)
582 return false;
584 /* If we run into a store of this same instance (we've just
585 marked those) then delay dependence checking until we run
586 into the last store because this is where it will have
587 been sunk to (and we verify if we can do that as well). */
588 if (gimple_visited_p (stmt))
590 if (stmt != last_store)
591 continue;
592 unsigned i;
593 gimple *store;
594 FOR_EACH_VEC_ELT (stores, i, store)
596 data_reference *store_dr
597 = STMT_VINFO_DATA_REF (vinfo_for_stmt (store));
598 ddr_p ddr = initialize_data_dependence_relation
599 (dr_a, store_dr, vNULL);
600 if (vect_slp_analyze_data_ref_dependence (ddr))
602 free_dependence_relation (ddr);
603 return false;
605 free_dependence_relation (ddr);
609 ddr_p ddr = initialize_data_dependence_relation (dr_a, dr_b, vNULL);
610 if (vect_slp_analyze_data_ref_dependence (ddr))
612 free_dependence_relation (ddr);
613 return false;
615 free_dependence_relation (ddr);
618 return true;
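/* For example, given an SLP load group interleaved with an unrelated store
     x0 = a[0];  *p = z;  x1 = a[1];
   the vector load of { a[0], a[1] } is emitted at the position of the last
   group member, so the walk above has to verify that sinking the load of
   a[0] past the intervening '*p = z' is safe.  */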
622 /* Function vect_slp_analyze_instance_dependence.
624 Examine all the data references in the SLP instance, and make sure there
625 do not exist any data dependences that would prevent vectorizing the
626 instance. */
628 bool
629 vect_slp_analyze_instance_dependence (slp_instance instance)
631 if (dump_enabled_p ())
632 dump_printf_loc (MSG_NOTE, vect_location,
633 "=== vect_slp_analyze_instance_dependence ===\n");
635 /* The stores of this instance are at the root of the SLP tree. */
636 slp_tree store = SLP_INSTANCE_TREE (instance);
637 if (! STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (store)[0])))
638 store = NULL;
640 /* Verify we can sink stores to the vectorized stmt insert location. */
641 gimple *last_store = NULL;
642 if (store)
644 if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
645 return false;
647 /* Mark stores in this instance and remember the last one. */
648 last_store = vect_find_last_scalar_stmt_in_slp (store);
649 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
650 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], true);
653 bool res = true;
655 /* Verify we can sink loads to the vectorized stmt insert location,
656 special-casing stores of this instance. */
657 slp_tree load;
658 unsigned int i;
659 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
660 if (! vect_slp_analyze_node_dependences (instance, load,
661 store
662 ? SLP_TREE_SCALAR_STMTS (store)
663 : vNULL, last_store))
665 res = false;
666 break;
669 /* Unset the visited flag. */
670 if (store)
671 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
672 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], false);
674 return res;
677 /* Function vect_compute_data_ref_alignment
679 Compute the misalignment of the data reference DR.
681 Output:
682 1. If during the misalignment computation it is found that the data reference
683 cannot be vectorized then false is returned.
684 2. DR_MISALIGNMENT (DR) is defined.
686 FOR NOW: No analysis is actually performed. Misalignment is calculated
687 only for trivial cases. TODO. */
689 bool
690 vect_compute_data_ref_alignment (struct data_reference *dr)
692 gimple *stmt = DR_STMT (dr);
693 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
694 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
695 struct loop *loop = NULL;
696 tree ref = DR_REF (dr);
697 tree vectype;
698 tree base, base_addr;
699 tree misalign = NULL_TREE;
700 tree aligned_to;
701 tree step;
702 unsigned HOST_WIDE_INT alignment;
704 if (dump_enabled_p ())
705 dump_printf_loc (MSG_NOTE, vect_location,
706 "vect_compute_data_ref_alignment:\n");
708 if (loop_vinfo)
709 loop = LOOP_VINFO_LOOP (loop_vinfo);
711 /* Initialize misalignment to unknown. */
712 SET_DR_MISALIGNMENT (dr, -1);
714 if (tree_fits_shwi_p (DR_STEP (dr)))
715 misalign = DR_INIT (dr);
716 aligned_to = DR_ALIGNED_TO (dr);
717 base_addr = DR_BASE_ADDRESS (dr);
718 vectype = STMT_VINFO_VECTYPE (stmt_info);
720 /* In case the dataref is in an inner-loop of the loop that is being
721 vectorized (LOOP), we use the base and misalignment information
722 relative to the outer-loop (LOOP). This is ok only if the misalignment
723 stays the same throughout the execution of the inner-loop, which is why
724 we have to check that the stride of the dataref in the inner-loop evenly
725 divides by the vector size. */
726 if (loop && nested_in_vect_loop_p (loop, stmt))
728 tree step = DR_STEP (dr);
730 if (tree_fits_shwi_p (step)
731 && tree_to_shwi (step) % GET_MODE_SIZE (TYPE_MODE (vectype)) == 0)
733 if (dump_enabled_p ())
734 dump_printf_loc (MSG_NOTE, vect_location,
735 "inner step divides the vector-size.\n");
736 misalign = STMT_VINFO_DR_INIT (stmt_info);
737 aligned_to = STMT_VINFO_DR_ALIGNED_TO (stmt_info);
738 base_addr = STMT_VINFO_DR_BASE_ADDRESS (stmt_info);
740 else
742 if (dump_enabled_p ())
743 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
744 "inner step doesn't divide the vector-size.\n");
745 misalign = NULL_TREE;
749 /* Similarly we can only use base and misalignment information relative to
750 an innermost loop if the misalignment stays the same throughout the
751 execution of the loop. As above, this is the case if the stride of
752 the dataref evenly divides by the vector size. */
753 else
755 tree step = DR_STEP (dr);
756 unsigned vf = loop ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) : 1;
758 if (tree_fits_shwi_p (step)
759 && ((tree_to_shwi (step) * vf)
760 % GET_MODE_SIZE (TYPE_MODE (vectype)) != 0))
762 if (dump_enabled_p ())
763 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
764 "step doesn't divide the vector-size.\n");
765 misalign = NULL_TREE;
769 /* To look at alignment of the base we have to preserve an inner MEM_REF
770 as that carries alignment information of the actual access. */
771 base = ref;
772 while (handled_component_p (base))
773 base = TREE_OPERAND (base, 0);
774 if (TREE_CODE (base) == MEM_REF)
775 base = build2 (MEM_REF, TREE_TYPE (base), base_addr,
776 build_int_cst (TREE_TYPE (TREE_OPERAND (base, 1)), 0));
777 unsigned int base_alignment = get_object_alignment (base);
779 if (base_alignment >= TYPE_ALIGN (TREE_TYPE (vectype)))
780 DR_VECT_AUX (dr)->base_element_aligned = true;
782 alignment = TYPE_ALIGN_UNIT (vectype);
784 if ((compare_tree_int (aligned_to, alignment) < 0)
785 || !misalign)
787 if (dump_enabled_p ())
789 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
790 "Unknown alignment for access: ");
791 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
792 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
794 return true;
797 if (base_alignment < TYPE_ALIGN (vectype))
799 /* Strip an inner MEM_REF to a bare decl if possible. */
800 if (TREE_CODE (base) == MEM_REF
801 && integer_zerop (TREE_OPERAND (base, 1))
802 && TREE_CODE (TREE_OPERAND (base, 0)) == ADDR_EXPR)
803 base = TREE_OPERAND (TREE_OPERAND (base, 0), 0);
805 if (!vect_can_force_dr_alignment_p (base, TYPE_ALIGN (vectype)))
807 if (dump_enabled_p ())
809 dump_printf_loc (MSG_NOTE, vect_location,
810 "can't force alignment of ref: ");
811 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
812 dump_printf (MSG_NOTE, "\n");
814 return true;
817 /* Force the alignment of the decl.
818 NOTE: This is the only change to the code we make during
819 the analysis phase, before deciding to vectorize the loop. */
820 if (dump_enabled_p ())
822 dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
823 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
824 dump_printf (MSG_NOTE, "\n");
827 DR_VECT_AUX (dr)->base_decl = base;
828 DR_VECT_AUX (dr)->base_misaligned = true;
829 DR_VECT_AUX (dr)->base_element_aligned = true;
832 if (loop && nested_in_vect_loop_p (loop, stmt))
833 step = STMT_VINFO_DR_STEP (stmt_info);
834 else
835 step = DR_STEP (dr);
836 /* If this is a backward running DR then first access in the larger
837 vectype actually is N-1 elements before the address in the DR.
838 Adjust misalign accordingly. */
839 if (tree_int_cst_sgn (step) < 0)
841 tree offset = ssize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
842 /* DR_STEP(dr) is the same as -TYPE_SIZE of the scalar type,
843 otherwise we wouldn't be here. */
844 offset = fold_build2 (MULT_EXPR, ssizetype, offset, step);
845 /* PLUS because STEP was negative. */
846 misalign = size_binop (PLUS_EXPR, misalign, offset);
849 SET_DR_MISALIGNMENT (dr,
850 wi::mod_floor (misalign, alignment, SIGNED).to_uhwi ());
852 if (dump_enabled_p ())
854 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
855 "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
856 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
857 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
860 return true;
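/* Example of the resulting value (hypothetical access): for 'a[i+3]' with
   4-byte elements, a 16-byte aligned base and a V4SImode vectype, DR_INIT
   is 12 and the vector alignment is 16 bytes, so the recorded misalignment
   is 12 mod 16 = 12 bytes; a negative step additionally offsets the
   misalignment by (nelements - 1) * step as handled above.  */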
864 /* Function vect_update_misalignment_for_peel
866 DR - the data reference whose misalignment is to be adjusted.
867 DR_PEEL - the data reference whose misalignment is being made
868 zero in the vector loop by the peel.
869 NPEEL - the number of iterations in the peel loop if the misalignment
870 of DR_PEEL is known at compile time. */
872 static void
873 vect_update_misalignment_for_peel (struct data_reference *dr,
874 struct data_reference *dr_peel, int npeel)
876 unsigned int i;
877 vec<dr_p> same_align_drs;
878 struct data_reference *current_dr;
879 int dr_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
880 int dr_peel_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr_peel))));
881 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
882 stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));
884 /* For interleaved data accesses the step in the loop must be multiplied by
885 the size of the interleaving group. */
886 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
887 dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
888 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
889 dr_peel_size *= GROUP_SIZE (peel_stmt_info);
891 /* It can be assumed that the data refs with the same alignment as dr_peel
892 are aligned in the vector loop. */
893 same_align_drs
894 = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
895 FOR_EACH_VEC_ELT (same_align_drs, i, current_dr)
897 if (current_dr != dr)
898 continue;
899 gcc_assert (DR_MISALIGNMENT (dr) / dr_size ==
900 DR_MISALIGNMENT (dr_peel) / dr_peel_size);
901 SET_DR_MISALIGNMENT (dr, 0);
902 return;
905 if (known_alignment_for_access_p (dr)
906 && known_alignment_for_access_p (dr_peel))
908 bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
909 int misal = DR_MISALIGNMENT (dr);
910 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
911 misal += negative ? -npeel * dr_size : npeel * dr_size;
912 misal &= (TYPE_ALIGN (vectype) / BITS_PER_UNIT) - 1;
913 SET_DR_MISALIGNMENT (dr, misal);
914 return;
917 if (dump_enabled_p ())
918 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment to -1.\n");
919 SET_DR_MISALIGNMENT (dr, -1);
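/* Numeric example for the compile-time known case above: a DR with a
   misalignment of 4 bytes, 4-byte elements and a V4SImode vectype that is
   peeled past by NPEEL == 2 iterations (positive step) ends up with
   misalignment (4 + 2 * 4) & 15 = 12 bytes.  */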
923 /* Function verify_data_ref_alignment
925 Return TRUE if DR can be handled with respect to alignment. */
927 static bool
928 verify_data_ref_alignment (data_reference_p dr)
930 enum dr_alignment_support supportable_dr_alignment
931 = vect_supportable_dr_alignment (dr, false);
932 if (!supportable_dr_alignment)
934 if (dump_enabled_p ())
936 if (DR_IS_READ (dr))
937 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
938 "not vectorized: unsupported unaligned load.");
939 else
940 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
941 "not vectorized: unsupported unaligned "
942 "store.");
944 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
945 DR_REF (dr));
946 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
948 return false;
951 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
952 dump_printf_loc (MSG_NOTE, vect_location,
953 "Vectorizing an unaligned access.\n");
955 return true;
958 /* Function vect_verify_datarefs_alignment
960 Return TRUE if all data references in the loop can be
961 handled with respect to alignment. */
963 bool
964 vect_verify_datarefs_alignment (loop_vec_info vinfo)
966 vec<data_reference_p> datarefs = vinfo->datarefs;
967 struct data_reference *dr;
968 unsigned int i;
970 FOR_EACH_VEC_ELT (datarefs, i, dr)
972 gimple *stmt = DR_STMT (dr);
973 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
975 if (!STMT_VINFO_RELEVANT_P (stmt_info))
976 continue;
978 /* For interleaving, only the alignment of the first access matters. */
979 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
980 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
981 continue;
983 /* Strided accesses perform only component accesses, alignment is
984 irrelevant for them. */
985 if (STMT_VINFO_STRIDED_P (stmt_info)
986 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
987 continue;
989 if (! verify_data_ref_alignment (dr))
990 return false;
993 return true;
996 /* Given a memory reference EXP, return whether its alignment is less
997 than its size. */
999 static bool
1000 not_size_aligned (tree exp)
1002 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1003 return true;
1005 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1006 > get_object_alignment (exp));
1009 /* Function vector_alignment_reachable_p
1011 Return true if vector alignment for DR is reachable by peeling
1012 a few loop iterations. Return false otherwise. */
1014 static bool
1015 vector_alignment_reachable_p (struct data_reference *dr)
1017 gimple *stmt = DR_STMT (dr);
1018 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1019 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1021 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1023 /* For interleaved access we peel only if the number of iterations in
1024 the prolog loop ({VF - misalignment}) is a multiple of the
1025 number of the interleaved accesses. */
1026 int elem_size, mis_in_elements;
1027 int nelements = TYPE_VECTOR_SUBPARTS (vectype);
1029 /* FORNOW: handle only known alignment. */
1030 if (!known_alignment_for_access_p (dr))
1031 return false;
1033 elem_size = GET_MODE_SIZE (TYPE_MODE (vectype)) / nelements;
1034 mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
1036 if ((nelements - mis_in_elements) % GROUP_SIZE (stmt_info))
1037 return false;
1040 /* If misalignment is known at the compile time then allow peeling
1041 only if natural alignment is reachable through peeling. */
1042 if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
1044 HOST_WIDE_INT elmsize =
1045 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1046 if (dump_enabled_p ())
1048 dump_printf_loc (MSG_NOTE, vect_location,
1049 "data size = " HOST_WIDE_INT_PRINT_DEC, elmsize);
1050 dump_printf (MSG_NOTE,
1051 ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
1053 if (DR_MISALIGNMENT (dr) % elmsize)
1055 if (dump_enabled_p ())
1056 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1057 "data size does not divide the misalignment.\n");
1058 return false;
1062 if (!known_alignment_for_access_p (dr))
1064 tree type = TREE_TYPE (DR_REF (dr));
1065 bool is_packed = not_size_aligned (DR_REF (dr));
1066 if (dump_enabled_p ())
1067 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1068 "Unknown misalignment, is_packed = %d\n",is_packed);
1069 if ((TYPE_USER_ALIGN (type) && !is_packed)
1070 || targetm.vectorize.vector_alignment_reachable (type, is_packed))
1071 return true;
1072 else
1073 return false;
1076 return true;
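/* Example for the interleaved case above: with a V4SImode vectype
   (nelements == 4) and a misalignment of 2 elements, a group of size 2
   can be aligned by peeling (4 - 2) iterations, whereas a group of size 3
   cannot, because (4 - 2) % 3 != 0.  */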
1080 /* Calculate the cost of the memory access represented by DR. */
1082 static void
1083 vect_get_data_access_cost (struct data_reference *dr,
1084 unsigned int *inside_cost,
1085 unsigned int *outside_cost,
1086 stmt_vector_for_cost *body_cost_vec)
1088 gimple *stmt = DR_STMT (dr);
1089 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1090 int nunits = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
1091 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1092 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1093 int ncopies = vf / nunits;
1095 if (DR_IS_READ (dr))
1096 vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1097 NULL, body_cost_vec, false);
1098 else
1099 vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
1101 if (dump_enabled_p ())
1102 dump_printf_loc (MSG_NOTE, vect_location,
1103 "vect_get_data_access_cost: inside_cost = %d, "
1104 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1108 typedef struct _vect_peel_info
1110 int npeel;
1111 struct data_reference *dr;
1112 unsigned int count;
1113 } *vect_peel_info;
1115 typedef struct _vect_peel_extended_info
1117 struct _vect_peel_info peel_info;
1118 unsigned int inside_cost;
1119 unsigned int outside_cost;
1120 stmt_vector_for_cost body_cost_vec;
1121 } *vect_peel_extended_info;
1124 /* Peeling hashtable helpers. */
1126 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1128 static inline hashval_t hash (const _vect_peel_info *);
1129 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1132 inline hashval_t
1133 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1135 return (hashval_t) peel_info->npeel;
1138 inline bool
1139 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1141 return (a->npeel == b->npeel);
1145 /* Insert DR into peeling hash table with NPEEL as key. */
1147 static void
1148 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1149 loop_vec_info loop_vinfo, struct data_reference *dr,
1150 int npeel)
1152 struct _vect_peel_info elem, *slot;
1153 _vect_peel_info **new_slot;
1154 bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1156 elem.npeel = npeel;
1157 slot = peeling_htab->find (&elem);
1158 if (slot)
1159 slot->count++;
1160 else
1162 slot = XNEW (struct _vect_peel_info);
1163 slot->npeel = npeel;
1164 slot->dr = dr;
1165 slot->count = 1;
1166 new_slot = peeling_htab->find_slot (slot, INSERT);
1167 *new_slot = slot;
1170 if (!supportable_dr_alignment
1171 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1172 slot->count += VECT_MAX_COST;
1176 /* Traverse peeling hash table to find peeling option that aligns maximum
1177 number of data accesses. */
1180 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1181 _vect_peel_extended_info *max)
1183 vect_peel_info elem = *slot;
1185 if (elem->count > max->peel_info.count
1186 || (elem->count == max->peel_info.count
1187 && max->peel_info.npeel > elem->npeel))
1189 max->peel_info.npeel = elem->npeel;
1190 max->peel_info.count = elem->count;
1191 max->peel_info.dr = elem->dr;
1194 return 1;
1198 /* Traverse peeling hash table and calculate cost for each peeling option.
1199 Find the one with the lowest cost. */
1202 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1203 _vect_peel_extended_info *min)
1205 vect_peel_info elem = *slot;
1206 int save_misalignment, dummy;
1207 unsigned int inside_cost = 0, outside_cost = 0, i;
1208 gimple *stmt = DR_STMT (elem->dr);
1209 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1210 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1211 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1212 struct data_reference *dr;
1213 stmt_vector_for_cost prologue_cost_vec, body_cost_vec, epilogue_cost_vec;
1215 prologue_cost_vec.create (2);
1216 body_cost_vec.create (2);
1217 epilogue_cost_vec.create (2);
1219 FOR_EACH_VEC_ELT (datarefs, i, dr)
1221 stmt = DR_STMT (dr);
1222 stmt_info = vinfo_for_stmt (stmt);
1223 /* For interleaving, only the alignment of the first access
1224 matters. */
1225 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1226 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1227 continue;
1229 /* Strided accesses perform only component accesses, alignment is
1230 irrelevant for them. */
1231 if (STMT_VINFO_STRIDED_P (stmt_info)
1232 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1233 continue;
1235 save_misalignment = DR_MISALIGNMENT (dr);
1236 vect_update_misalignment_for_peel (dr, elem->dr, elem->npeel);
1237 vect_get_data_access_cost (dr, &inside_cost, &outside_cost,
1238 &body_cost_vec);
1239 SET_DR_MISALIGNMENT (dr, save_misalignment);
1242 outside_cost += vect_get_known_peeling_cost
1243 (loop_vinfo, elem->npeel, &dummy,
1244 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1245 &prologue_cost_vec, &epilogue_cost_vec);
1247 /* Prologue and epilogue costs are added to the target model later.
1248 These costs depend only on the scalar iteration cost, the
1249 number of peeling iterations finally chosen, and the number of
1250 misaligned statements. So discard the information found here. */
1251 prologue_cost_vec.release ();
1252 epilogue_cost_vec.release ();
1254 if (inside_cost < min->inside_cost
1255 || (inside_cost == min->inside_cost && outside_cost < min->outside_cost))
1257 min->inside_cost = inside_cost;
1258 min->outside_cost = outside_cost;
1259 min->body_cost_vec.release ();
1260 min->body_cost_vec = body_cost_vec;
1261 min->peel_info.dr = elem->dr;
1262 min->peel_info.npeel = elem->npeel;
1264 else
1265 body_cost_vec.release ();
1267 return 1;
1271 /* Choose best peeling option by traversing peeling hash table and either
1272 choosing an option with the lowest cost (if cost model is enabled) or the
1273 option that aligns as many accesses as possible. */
1275 static struct data_reference *
1276 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1277 loop_vec_info loop_vinfo,
1278 unsigned int *npeel,
1279 stmt_vector_for_cost *body_cost_vec)
1281 struct _vect_peel_extended_info res;
1283 res.peel_info.dr = NULL;
1284 res.body_cost_vec = stmt_vector_for_cost ();
1286 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1288 res.inside_cost = INT_MAX;
1289 res.outside_cost = INT_MAX;
1290 peeling_htab->traverse <_vect_peel_extended_info *,
1291 vect_peeling_hash_get_lowest_cost> (&res);
1293 else
1295 res.peel_info.count = 0;
1296 peeling_htab->traverse <_vect_peel_extended_info *,
1297 vect_peeling_hash_get_most_frequent> (&res);
1300 *npeel = res.peel_info.npeel;
1301 *body_cost_vec = res.body_cost_vec;
1302 return res.peel_info.dr;
1306 /* Function vect_enhance_data_refs_alignment
1308 This pass will use loop versioning and loop peeling in order to enhance
1309 the alignment of data references in the loop.
1311 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1312 original loop is to be vectorized. Any other loops that are created by
1313 the transformations performed in this pass - are not supposed to be
1314 vectorized. This restriction will be relaxed.
1316 This pass will require a cost model to guide it whether to apply peeling
1317 or versioning or a combination of the two. For example, the scheme that
1318 Intel uses when given a loop with several memory accesses is as follows:
1319 choose one memory access ('p') whose alignment you want to force by doing
1320 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1321 other accesses are not necessarily aligned, or (2) use loop versioning to
1322 generate one loop in which all accesses are aligned, and another loop in
1323 which only 'p' is necessarily aligned.
1325 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1326 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1327 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1329 Devising a cost model is the most critical aspect of this work. It will
1330 guide us on which access to peel for, whether to use loop versioning, how
1331 many versions to create, etc. The cost model will probably consist of
1332 generic considerations as well as target specific considerations (on
1333 powerpc for example, misaligned stores are more painful than misaligned
1334 loads).
1336 Here are the general steps involved in alignment enhancements:
1338 -- original loop, before alignment analysis:
1339 for (i=0; i<N; i++){
1340 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1341 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1344 -- After vect_compute_data_refs_alignment:
1345 for (i=0; i<N; i++){
1346 x = q[i]; # DR_MISALIGNMENT(q) = 3
1347 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1350 -- Possibility 1: we do loop versioning:
1351 if (p is aligned) {
1352 for (i=0; i<N; i++){ # loop 1A
1353 x = q[i]; # DR_MISALIGNMENT(q) = 3
1354 p[i] = y; # DR_MISALIGNMENT(p) = 0
1357 else {
1358 for (i=0; i<N; i++){ # loop 1B
1359 x = q[i]; # DR_MISALIGNMENT(q) = 3
1360 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1364 -- Possibility 2: we do loop peeling:
1365 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1366 x = q[i];
1367 p[i] = y;
1369 for (i = 3; i < N; i++){ # loop 2A
1370 x = q[i]; # DR_MISALIGNMENT(q) = 0
1371 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1374 -- Possibility 3: combination of loop peeling and versioning:
1375 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1376 x = q[i];
1377 p[i] = y;
1379 if (p is aligned) {
1380 for (i = 3; i<N; i++){ # loop 3A
1381 x = q[i]; # DR_MISALIGNMENT(q) = 0
1382 p[i] = y; # DR_MISALIGNMENT(p) = 0
1385 else {
1386 for (i = 3; i<N; i++){ # loop 3B
1387 x = q[i]; # DR_MISALIGNMENT(q) = 0
1388 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1392 These loops are later passed to loop_transform to be vectorized. The
1393 vectorizer will use the alignment information to guide the transformation
1394 (whether to generate regular loads/stores, or with special handling for
1395 misalignment). */
1397 bool
1398 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1400 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1401 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1402 enum dr_alignment_support supportable_dr_alignment;
1403 struct data_reference *dr0 = NULL, *first_store = NULL;
1404 struct data_reference *dr;
1405 unsigned int i, j;
1406 bool do_peeling = false;
1407 bool do_versioning = false;
1408 bool stat;
1409 gimple *stmt;
1410 stmt_vec_info stmt_info;
1411 unsigned int npeel = 0;
1412 bool all_misalignments_unknown = true;
1413 unsigned int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1414 unsigned possible_npeel_number = 1;
1415 tree vectype;
1416 unsigned int nelements, mis, same_align_drs_max = 0;
1417 stmt_vector_for_cost body_cost_vec = stmt_vector_for_cost ();
1418 hash_table<peel_info_hasher> peeling_htab (1);
1420 if (dump_enabled_p ())
1421 dump_printf_loc (MSG_NOTE, vect_location,
1422 "=== vect_enhance_data_refs_alignment ===\n");
1424 /* Reset data so we can safely be called multiple times. */
1425 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1426 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1428 /* While cost model enhancements are expected in the future, the high level
1429 view of the code at this time is as follows:
1431 A) If there is a misaligned access then see if peeling to align
1432 this access can make all data references satisfy
1433 vect_supportable_dr_alignment. If so, update data structures
1434 as needed and return true.
1436 B) If peeling wasn't possible and there is a data reference with an
1437 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1438 then see if loop versioning checks can be used to make all data
1439 references satisfy vect_supportable_dr_alignment. If so, update
1440 data structures as needed and return true.
1442 C) If neither peeling nor versioning were successful then return false if
1443 any data reference does not satisfy vect_supportable_dr_alignment.
1445 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1447 Note, Possibility 3 above (which is peeling and versioning together) is not
1448 being done at this time. */
1450 /* (1) Peeling to force alignment. */
1452 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1453 Considerations:
1454 + How many accesses will become aligned due to the peeling
1455 - How many accesses will become unaligned due to the peeling,
1456 and the cost of misaligned accesses.
1457 - The cost of peeling (the extra runtime checks, the increase
1458 in code size). */
1460 FOR_EACH_VEC_ELT (datarefs, i, dr)
1462 stmt = DR_STMT (dr);
1463 stmt_info = vinfo_for_stmt (stmt);
1465 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1466 continue;
1468 /* For interleaving, only the alignment of the first access
1469 matters. */
1470 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1471 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1472 continue;
1474 /* For invariant accesses there is nothing to enhance. */
1475 if (integer_zerop (DR_STEP (dr)))
1476 continue;
1478 /* Strided accesses perform only component accesses, alignment is
1479 irrelevant for them. */
1480 if (STMT_VINFO_STRIDED_P (stmt_info)
1481 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1482 continue;
1484 supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1485 do_peeling = vector_alignment_reachable_p (dr);
1486 if (do_peeling)
1488 if (known_alignment_for_access_p (dr))
1490 unsigned int npeel_tmp;
1491 bool negative = tree_int_cst_compare (DR_STEP (dr),
1492 size_zero_node) < 0;
1494 /* Save info about DR in the hash table. */
1495 vectype = STMT_VINFO_VECTYPE (stmt_info);
1496 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1497 mis = DR_MISALIGNMENT (dr) / GET_MODE_SIZE (TYPE_MODE (
1498 TREE_TYPE (DR_REF (dr))));
1499 npeel_tmp = (negative
1500 ? (mis - nelements) : (nelements - mis))
1501 & (nelements - 1);
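/* For instance, with a V4SImode vectype (nelements == 4, 4-byte elements)
   and a misalignment of 8 bytes, mis is 2 elements and, for a
   non-negative step, npeel_tmp = (4 - 2) & 3 = 2 scalar iterations.  */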
1503 /* For multiple types, it is possible that the bigger type access
1504 will have more than one peeling option. E.g., a loop with two
1505 types: one of size (vector size / 4), and the other one of
1506 size (vector size / 8). The vectorization factor will be 8. If both
1507 accesses are misaligned by 3, the first one needs one scalar
1508 iteration to be aligned, and the second one needs 5. But the
1509 first one will be aligned also by peeling 5 scalar
1510 iterations, and in that case both accesses will be aligned.
1511 Hence, except for the immediate peeling amount, we also want
1512 to try to add full vector size, while we don't exceed
1513 vectorization factor.
1514 We do this automatically for the cost model, since we calculate the cost
1515 for every peeling option. */
1516 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1518 if (STMT_SLP_TYPE (stmt_info))
1519 possible_npeel_number
1520 = (vf * GROUP_SIZE (stmt_info)) / nelements;
1521 else
1522 possible_npeel_number = vf / nelements;
1525 /* Handle the aligned case. We may decide to align some other
1526 access, making DR unaligned. */
1527 if (DR_MISALIGNMENT (dr) == 0)
1529 npeel_tmp = 0;
1530 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1531 possible_npeel_number++;
1534 for (j = 0; j < possible_npeel_number; j++)
1536 vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1537 dr, npeel_tmp);
1538 npeel_tmp += nelements;
1541 all_misalignments_unknown = false;
1542 /* Data-ref that was chosen for the case that all the
1543 misalignments are unknown is not relevant anymore, since we
1544 have a data-ref with known alignment. */
1545 dr0 = NULL;
1547 else
1549 /* If we don't know any misalignment values, we prefer
1550 peeling for data-ref that has the maximum number of data-refs
1551 with the same alignment, unless the target prefers to align
1552 stores over loads. */
1553 if (all_misalignments_unknown)
1555 unsigned same_align_drs
1556 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1557 if (!dr0
1558 || same_align_drs_max < same_align_drs)
1560 same_align_drs_max = same_align_drs;
1561 dr0 = dr;
1563 /* For data-refs with the same number of related
1564 accesses prefer the one where the misalign
1565 computation will be invariant in the outermost loop. */
1566 else if (same_align_drs_max == same_align_drs)
1568 struct loop *ivloop0, *ivloop;
1569 ivloop0 = outermost_invariant_loop_for_expr
1570 (loop, DR_BASE_ADDRESS (dr0));
1571 ivloop = outermost_invariant_loop_for_expr
1572 (loop, DR_BASE_ADDRESS (dr));
1573 if ((ivloop && !ivloop0)
1574 || (ivloop && ivloop0
1575 && flow_loop_nested_p (ivloop, ivloop0)))
1576 dr0 = dr;
1579 if (!first_store && DR_IS_WRITE (dr))
1580 first_store = dr;
1583 /* If there are both known and unknown misaligned accesses in the
1584 loop, we choose peeling amount according to the known
1585 accesses. */
1586 if (!supportable_dr_alignment)
1588 dr0 = dr;
1589 if (!first_store && DR_IS_WRITE (dr))
1590 first_store = dr;
1594 else
1596 if (!aligned_access_p (dr))
1598 if (dump_enabled_p ())
1599 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1600 "vector alignment may not be reachable\n");
1601 break;
1606 /* Check if we can possibly peel the loop. */
1607 if (!vect_can_advance_ivs_p (loop_vinfo)
1608 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1609 || loop->inner)
1610 do_peeling = false;
1612 if (do_peeling
1613 && all_misalignments_unknown
1614 && vect_supportable_dr_alignment (dr0, false))
1616 /* Check if the target prefers stores over loads, i.e., if
1617 misaligned stores are more expensive than misaligned loads (taking
1618 drs with same alignment into account). */
1619 if (first_store && DR_IS_READ (dr0))
1621 unsigned int load_inside_cost = 0, load_outside_cost = 0;
1622 unsigned int store_inside_cost = 0, store_outside_cost = 0;
1623 unsigned int load_inside_penalty = 0, load_outside_penalty = 0;
1624 unsigned int store_inside_penalty = 0, store_outside_penalty = 0;
1625 stmt_vector_for_cost dummy;
1626 dummy.create (2);
1628 vect_get_data_access_cost (dr0, &load_inside_cost, &load_outside_cost,
1629 &dummy);
1630 vect_get_data_access_cost (first_store, &store_inside_cost,
1631 &store_outside_cost, &dummy);
1633 dummy.release ();
1635 /* Calculate the penalty for leaving FIRST_STORE unaligned (by
1636 aligning the load DR0). */
1637 load_inside_penalty = store_inside_cost;
1638 load_outside_penalty = store_outside_cost;
1639 for (i = 0;
1640 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1641 DR_STMT (first_store))).iterate (i, &dr);
1642 i++)
1643 if (DR_IS_READ (dr))
1645 load_inside_penalty += load_inside_cost;
1646 load_outside_penalty += load_outside_cost;
1648 else
1650 load_inside_penalty += store_inside_cost;
1651 load_outside_penalty += store_outside_cost;
1654 /* Calculate the penalty for leaving DR0 unaligned (by
1655 aligning the FIRST_STORE). */
1656 store_inside_penalty = load_inside_cost;
1657 store_outside_penalty = load_outside_cost;
1658 for (i = 0;
1659 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1660 DR_STMT (dr0))).iterate (i, &dr);
1661 i++)
1662 if (DR_IS_READ (dr))
1664 store_inside_penalty += load_inside_cost;
1665 store_outside_penalty += load_outside_cost;
1667 else
1669 store_inside_penalty += store_inside_cost;
1670 store_outside_penalty += store_outside_cost;
1673 if (load_inside_penalty > store_inside_penalty
1674 || (load_inside_penalty == store_inside_penalty
1675 && load_outside_penalty > store_outside_penalty))
1676 dr0 = first_store;
1679 /* In case there are only loads with different unknown misalignments, use
1680 peeling only if it may help to align other accesses in the loop or
1681 if it may help to improve load bandwidth when we'd end up using
1682 unaligned loads. */
1683 tree dr0_vt = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr0)));
1684 if (!first_store
1685 && !STMT_VINFO_SAME_ALIGN_REFS (
1686 vinfo_for_stmt (DR_STMT (dr0))).length ()
1687 && (vect_supportable_dr_alignment (dr0, false)
1688 != dr_unaligned_supported
1689 || (builtin_vectorization_cost (vector_load, dr0_vt, 0)
1690 == builtin_vectorization_cost (unaligned_load, dr0_vt, -1))))
1691 do_peeling = false;
1694 if (do_peeling && !dr0)
1696 /* Peeling is possible, but no data access strictly requires alignment
1697 to be supported, so we try to choose the best possible peeling. */
1699 /* We should get here only if there are drs with known misalignment. */
1700 gcc_assert (!all_misalignments_unknown);
1702 /* Choose the best peeling from the hash table. */
1703 dr0 = vect_peeling_hash_choose_best_peeling (&peeling_htab,
1704 loop_vinfo, &npeel,
1705 &body_cost_vec);
1706 if (!dr0 || !npeel)
1707 do_peeling = false;
1710 if (do_peeling)
1712 stmt = DR_STMT (dr0);
1713 stmt_info = vinfo_for_stmt (stmt);
1714 vectype = STMT_VINFO_VECTYPE (stmt_info);
1715 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1717 if (known_alignment_for_access_p (dr0))
1719 bool negative = tree_int_cst_compare (DR_STEP (dr0),
1720 size_zero_node) < 0;
1721 if (!npeel)
1723 /* Since it's known at compile time, compute the number of
1724 iterations in the peeled loop (the peeling factor) for use in
1725 updating DR_MISALIGNMENT values. The peeling factor is the
1726 vectorization factor minus the misalignment as an element
1727 count. */
1728 mis = DR_MISALIGNMENT (dr0);
1729 mis /= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr0))));
1730 npeel = ((negative ? mis - nelements : nelements - mis)
1731 & (nelements - 1));
1734 /* For interleaved data access every iteration accesses all the
1735 members of the group, therefore we divide the number of iterations
1736 by the group size. */
1737 stmt_info = vinfo_for_stmt (DR_STMT (dr0));
1738 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1739 npeel /= GROUP_SIZE (stmt_info);
1741 if (dump_enabled_p ())
1742 dump_printf_loc (MSG_NOTE, vect_location,
1743 "Try peeling by %d\n", npeel);
1746 /* Ensure that all data refs can be vectorized after the peel. */
1747 FOR_EACH_VEC_ELT (datarefs, i, dr)
1749 int save_misalignment;
1751 if (dr == dr0)
1752 continue;
1754 stmt = DR_STMT (dr);
1755 stmt_info = vinfo_for_stmt (stmt);
1756 /* For interleaving, only the alignment of the first access
1757 matters. */
1758 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1759 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1760 continue;
1762 /* Strided accesses perform only component accesses, alignment is
1763 irrelevant for them. */
1764 if (STMT_VINFO_STRIDED_P (stmt_info)
1765 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1766 continue;
1768 save_misalignment = DR_MISALIGNMENT (dr);
1769 vect_update_misalignment_for_peel (dr, dr0, npeel);
1770 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1771 SET_DR_MISALIGNMENT (dr, save_misalignment);
1773 if (!supportable_dr_alignment)
1775 do_peeling = false;
1776 break;
1780 if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
1782 stat = vect_verify_datarefs_alignment (loop_vinfo);
1783 if (!stat)
1784 do_peeling = false;
1785 else
1787 body_cost_vec.release ();
1788 return stat;
1792 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
1793 if (do_peeling)
1795 unsigned max_allowed_peel
1796 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
1797 if (max_allowed_peel != (unsigned)-1)
1799 unsigned max_peel = npeel;
1800 if (max_peel == 0)
1802 gimple *dr_stmt = DR_STMT (dr0);
1803 stmt_vec_info vinfo = vinfo_for_stmt (dr_stmt);
1804 tree vtype = STMT_VINFO_VECTYPE (vinfo);
1805 max_peel = TYPE_VECTOR_SUBPARTS (vtype) - 1;
1807 if (max_peel > max_allowed_peel)
1809 do_peeling = false;
1810 if (dump_enabled_p ())
1811 dump_printf_loc (MSG_NOTE, vect_location,
1812 "Disable peeling, max peels reached: %d\n", max_peel);
1817 /* Cost model #2 - if peeling may result in a remaining loop not
1818 iterating enough to be vectorized then do not peel. */
1819 if (do_peeling
1820 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1822 unsigned max_peel
1823 = npeel == 0 ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1 : npeel;
1824 if (LOOP_VINFO_INT_NITERS (loop_vinfo)
1825 < LOOP_VINFO_VECT_FACTOR (loop_vinfo) + max_peel)
1826 do_peeling = false;
1829 if (do_peeling)
1831 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
1832 If the misalignment of DR_i is identical to that of dr0 then set
1833 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
1834 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
1835 by the peeling factor times the element size of DR_i (MOD the
1836 vectorization factor times the size). Otherwise, the
1837 misalignment of DR_i must be set to unknown. */
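/* Worked example (illustrative only) for the known-misalignment case:
   with npeel == 2, a DR_i of 4-byte elements and a 16-byte vector size,
   a known misalignment of 12 bytes becomes (12 + 2 * 4) % 16 == 4.  */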
1838 FOR_EACH_VEC_ELT (datarefs, i, dr)
1839 if (dr != dr0)
1841 /* Strided accesses perform only component accesses, alignment
1842 is irrelevant for them. */
1843 stmt_info = vinfo_for_stmt (DR_STMT (dr));
1844 if (STMT_VINFO_STRIDED_P (stmt_info)
1845 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1846 continue;
1848 vect_update_misalignment_for_peel (dr, dr0, npeel);
1851 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
1852 if (npeel)
1853 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
1854 else
1855 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1856 = DR_MISALIGNMENT (dr0);
1857 SET_DR_MISALIGNMENT (dr0, 0);
1858 if (dump_enabled_p ())
1860 dump_printf_loc (MSG_NOTE, vect_location,
1861 "Alignment of access forced using peeling.\n");
1862 dump_printf_loc (MSG_NOTE, vect_location,
1863 "Peeling for alignment will be applied.\n");
1865 /* The inside-loop cost will be accounted for in vectorizable_load
1866 and vectorizable_store correctly with adjusted alignments.
1867 Drop the body_cost_vec on the floor here. */
1868 body_cost_vec.release ();
1870 stat = vect_verify_datarefs_alignment (loop_vinfo);
1871 gcc_assert (stat);
1872 return stat;
1876 body_cost_vec.release ();
1878 /* (2) Versioning to force alignment. */
1880 /* Try versioning if:
1881 1) optimize loop for speed
1882 2) there is at least one unsupported misaligned data ref with an unknown
1883 misalignment, and
1884 3) all misaligned data refs with a known misalignment are supported, and
1885 4) the number of runtime alignment checks is within reason. */
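/* Sketch of the resulting transformation (illustrative, not emitted
   verbatim here): the loop is duplicated and guarded by a runtime test
   derived from LOOP_VINFO_PTR_MASK, conceptually

     if (((uintptr_t) p & 15) == 0)
       ... vectorized loop, P assumed aligned ...
     else
       ... original scalar loop ...

   where P is a hypothetical potentially-misaligned pointer and 15 is the
   mask for a 16-byte vector mode.  */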
1887 do_versioning =
1888 optimize_loop_nest_for_speed_p (loop)
1889 && (!loop->inner); /* FORNOW */
1891 if (do_versioning)
1893 FOR_EACH_VEC_ELT (datarefs, i, dr)
1895 stmt = DR_STMT (dr);
1896 stmt_info = vinfo_for_stmt (stmt);
1898 /* For interleaving, only the alignment of the first access
1899 matters. */
1900 if (aligned_access_p (dr)
1901 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1902 && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
1903 continue;
1905 if (STMT_VINFO_STRIDED_P (stmt_info))
1907 /* Strided loads perform only component accesses, alignment is
1908 irrelevant for them. */
1909 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
1910 continue;
1911 do_versioning = false;
1912 break;
1915 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1917 if (!supportable_dr_alignment)
1919 gimple *stmt;
1920 int mask;
1921 tree vectype;
1923 if (known_alignment_for_access_p (dr)
1924 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
1925 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
1927 do_versioning = false;
1928 break;
1931 stmt = DR_STMT (dr);
1932 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
1933 gcc_assert (vectype);
1935 /* The rightmost bits of an aligned address must be zeros.
1936 Construct the mask needed for this test. For example,
1937 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
1938 mask must be 15 = 0xf. */
1939 mask = GET_MODE_SIZE (TYPE_MODE (vectype)) - 1;
1941 /* FORNOW: use the same mask to test all potentially unaligned
1942 references in the loop. The vectorizer currently supports
1943 a single vector size, see the reference to
1944 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
1945 vectorization factor is computed. */
1946 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
1947 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
1948 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
1949 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
1950 DR_STMT (dr));
1954 /* Versioning requires at least one misaligned data reference. */
1955 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
1956 do_versioning = false;
1957 else if (!do_versioning)
1958 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1961 if (do_versioning)
1963 vec<gimple *> may_misalign_stmts
1964 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
1965 gimple *stmt;
1967 /* It can now be assumed that the data references in the statements
1968 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
1969 of the loop being vectorized. */
1970 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
1972 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1973 dr = STMT_VINFO_DATA_REF (stmt_info);
1974 SET_DR_MISALIGNMENT (dr, 0);
1975 if (dump_enabled_p ())
1976 dump_printf_loc (MSG_NOTE, vect_location,
1977 "Alignment of access forced using versioning.\n");
1980 if (dump_enabled_p ())
1981 dump_printf_loc (MSG_NOTE, vect_location,
1982 "Versioning for alignment will be applied.\n");
1984 /* Peeling and versioning can't be done together at this time. */
1985 gcc_assert (! (do_peeling && do_versioning));
1987 stat = vect_verify_datarefs_alignment (loop_vinfo);
1988 gcc_assert (stat);
1989 return stat;
1992 /* This point is reached if neither peeling nor versioning is being done. */
1993 gcc_assert (! (do_peeling || do_versioning));
1995 stat = vect_verify_datarefs_alignment (loop_vinfo);
1996 return stat;
2000 /* Function vect_find_same_alignment_drs.
2002 Update group and alignment relations according to the chosen
2003 vectorization factor. */
2005 static void
2006 vect_find_same_alignment_drs (struct data_dependence_relation *ddr,
2007 loop_vec_info loop_vinfo)
2009 unsigned int i;
2010 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2011 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2012 struct data_reference *dra = DDR_A (ddr);
2013 struct data_reference *drb = DDR_B (ddr);
2014 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2015 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2016 int dra_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dra))));
2017 int drb_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (drb))));
2018 lambda_vector dist_v;
2019 unsigned int loop_depth;
2021 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
2022 return;
2024 if (dra == drb)
2025 return;
2027 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
2028 return;
2030 /* Loop-based vectorization and known data dependence. */
2031 if (DDR_NUM_DIST_VECTS (ddr) == 0)
2032 return;
2034 /* Data-dependence analysis reports a distance vector of zero
2035 for data-references that overlap only in the first iteration
2036 but have different sign step (see PR45764).
2037 So as a sanity check require equal DR_STEP. */
2038 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2039 return;
2041 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
2042 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
2044 int dist = dist_v[loop_depth];
2046 if (dump_enabled_p ())
2047 dump_printf_loc (MSG_NOTE, vect_location,
2048 "dependence distance = %d.\n", dist);
2050 /* Same loop iteration. */
2051 if (dist == 0
2052 || (dist % vectorization_factor == 0 && dra_size == drb_size))
2054 /* Two references with distance zero have the same alignment. */
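/* Illustrative example (not from the original source): for 4-byte ints
   and vectorization_factor == 4, the refs a[i] and a[i + 4] in
   "a[i + 4] = a[i] + 1;" have dependence distance 4, i.e. 0 modulo the
   VF, and are 16 bytes apart, so they get the same-alignment relation.  */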
2055 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2056 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
2057 if (dump_enabled_p ())
2059 dump_printf_loc (MSG_NOTE, vect_location,
2060 "accesses have the same alignment.\n");
2061 dump_printf (MSG_NOTE,
2062 "dependence distance modulo vf == 0 between ");
2063 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2064 dump_printf (MSG_NOTE, " and ");
2065 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2066 dump_printf (MSG_NOTE, "\n");
2073 /* Function vect_analyze_data_refs_alignment
2075 Analyze the alignment of the data-references in the loop.
2076 Return FALSE if a data reference is found that cannot be vectorized. */
2078 bool
2079 vect_analyze_data_refs_alignment (loop_vec_info vinfo)
2081 if (dump_enabled_p ())
2082 dump_printf_loc (MSG_NOTE, vect_location,
2083 "=== vect_analyze_data_refs_alignment ===\n");
2085 /* Mark groups of data references with same alignment using
2086 data dependence information. */
2087 vec<ddr_p> ddrs = vinfo->ddrs;
2088 struct data_dependence_relation *ddr;
2089 unsigned int i;
2091 FOR_EACH_VEC_ELT (ddrs, i, ddr)
2092 vect_find_same_alignment_drs (ddr, vinfo);
2094 vec<data_reference_p> datarefs = vinfo->datarefs;
2095 struct data_reference *dr;
2097 FOR_EACH_VEC_ELT (datarefs, i, dr)
2099 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
2100 if (STMT_VINFO_VECTORIZABLE (stmt_info)
2101 && !vect_compute_data_ref_alignment (dr))
2103 /* Strided accesses perform only component accesses, misalignment
2104 information is irrelevant for them. */
2105 if (STMT_VINFO_STRIDED_P (stmt_info)
2106 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2107 continue;
2109 if (dump_enabled_p ())
2110 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2111 "not vectorized: can't calculate alignment "
2112 "for data ref.\n");
2114 return false;
2118 return true;
2122 /* Analyze alignment of DRs of stmts in NODE. */
2124 static bool
2125 vect_slp_analyze_and_verify_node_alignment (slp_tree node)
2127 /* We vectorize from the first scalar stmt in the node unless
2128 the node is permuted in which case we start from the first
2129 element in the group. */
2130 gimple *first_stmt = SLP_TREE_SCALAR_STMTS (node)[0];
2131 data_reference_p first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2132 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2133 first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_stmt));
2135 data_reference_p dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2136 if (! vect_compute_data_ref_alignment (dr)
2137 /* For creating the data-ref pointer we need alignment of the
2138 first element anyway. */
2139 || (dr != first_dr
2140 && ! vect_compute_data_ref_alignment (first_dr))
2141 || ! verify_data_ref_alignment (dr))
2143 if (dump_enabled_p ())
2144 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2145 "not vectorized: bad data alignment in basic "
2146 "block.\n");
2147 return false;
2150 return true;
2153 /* Function vect_slp_analyze_and_verify_instance_alignment
2155 Analyze the alignment of the data-references in the SLP instance.
2156 Return FALSE if a data reference is found that cannot be vectorized. */
2158 bool
2159 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
2161 if (dump_enabled_p ())
2162 dump_printf_loc (MSG_NOTE, vect_location,
2163 "=== vect_slp_analyze_and_verify_instance_alignment ===\n");
2165 slp_tree node;
2166 unsigned i;
2167 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2168 if (! vect_slp_analyze_and_verify_node_alignment (node))
2169 return false;
2171 node = SLP_INSTANCE_TREE (instance);
2172 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]))
2173 && ! vect_slp_analyze_and_verify_node_alignment
2174 (SLP_INSTANCE_TREE (instance)))
2175 return false;
2177 return true;
2181 /* Analyze groups of accesses: check that DR belongs to a group of
2182 accesses of legal size, step, etc. Detect gaps, single element
2183 interleaving, and other special cases. Set grouped access info.
2184 Collect groups of strided stores for further use in SLP analysis.
2185 Worker for vect_analyze_group_access. */
2187 static bool
2188 vect_analyze_group_access_1 (struct data_reference *dr)
2190 tree step = DR_STEP (dr);
2191 tree scalar_type = TREE_TYPE (DR_REF (dr));
2192 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2193 gimple *stmt = DR_STMT (dr);
2194 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2195 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2196 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2197 HOST_WIDE_INT dr_step = -1;
2198 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2199 bool slp_impossible = false;
2201 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2202 size of the interleaving group (including gaps). */
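/* For instance (illustrative only), for a hypothetical array A accessed as

     for (i = 0; i < n; i++)
       {
         x += a[2*i];
         y += a[2*i + 1];
       }

   each DR advances by 2 * sizeof (*a) per iteration, so with 8-byte
   elements dr_step is 16 and groupsize is 2.  */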
2203 if (tree_fits_shwi_p (step))
2205 dr_step = tree_to_shwi (step);
2206 /* Check that STEP is a multiple of type size. Otherwise there is
2207 a non-element-sized gap at the end of the group which we
2208 cannot represent in GROUP_GAP or GROUP_SIZE.
2209 ??? As we can handle non-constant step fine here we should
2210 simply remove uses of GROUP_GAP between the last and first
2211 element and instead rely on DR_STEP. GROUP_SIZE then would
2212 simply not include that gap. */
2213 if ((dr_step % type_size) != 0)
2215 if (dump_enabled_p ())
2217 dump_printf_loc (MSG_NOTE, vect_location,
2218 "Step ");
2219 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2220 dump_printf (MSG_NOTE,
2221 " is not a multiple of the element size for ");
2222 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2223 dump_printf (MSG_NOTE, "\n");
2225 return false;
2227 groupsize = absu_hwi (dr_step) / type_size;
2229 else
2230 groupsize = 0;
2232 /* A non-consecutive access is possible only if it is part of an interleaving group. */
2233 if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2235 /* Check if this DR is a part of interleaving, and is a single
2236 element of the group that is accessed in the loop. */
2238 /* Gaps are supported only for loads. STEP must be a multiple of the type
2239 size. The size of the group must be a power of 2. */
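/* Example (not from the original source): the load in

     for (i = 0; i < n; i++)
       sum += a[4*i];

   is such a single-element group: STEP is 4 * sizeof (*a), groupsize is 4
   (a power of two) and GROUP_GAP becomes 3, the number of unused elements
   between consecutive accesses.  */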
2240 if (DR_IS_READ (dr)
2241 && (dr_step % type_size) == 0
2242 && groupsize > 0
2243 && exact_log2 (groupsize) != -1)
2245 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2246 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2247 GROUP_GAP (stmt_info) = groupsize - 1;
2248 if (dump_enabled_p ())
2250 dump_printf_loc (MSG_NOTE, vect_location,
2251 "Detected single element interleaving ");
2252 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2253 dump_printf (MSG_NOTE, " step ");
2254 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2255 dump_printf (MSG_NOTE, "\n");
2258 return true;
2261 if (dump_enabled_p ())
2263 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2264 "not consecutive access ");
2265 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2268 if (bb_vinfo)
2270 /* Mark the statement as unvectorizable. */
2271 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2272 return true;
2275 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2276 STMT_VINFO_STRIDED_P (stmt_info) = true;
2277 return true;
2280 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2282 /* First stmt in the interleaving chain. Check the chain. */
2283 gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2284 struct data_reference *data_ref = dr;
2285 unsigned int count = 1;
2286 tree prev_init = DR_INIT (data_ref);
2287 gimple *prev = stmt;
2288 HOST_WIDE_INT diff, gaps = 0;
2290 while (next)
2292 /* Skip same data-refs.  If two or more stmts share a
2293 data-ref (supported only for loads), we vectorize only the first
2294 stmt, and the rest get their vectorized loads from the first
2295 one. */
2296 if (!tree_int_cst_compare (DR_INIT (data_ref),
2297 DR_INIT (STMT_VINFO_DATA_REF (
2298 vinfo_for_stmt (next)))))
2300 if (DR_IS_WRITE (data_ref))
2302 if (dump_enabled_p ())
2303 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2304 "Two store stmts share the same dr.\n");
2305 return false;
2308 if (dump_enabled_p ())
2309 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2310 "Two or more load stmts share the same dr.\n");
2312 /* For load use the same data-ref load. */
2313 GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2315 prev = next;
2316 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2317 continue;
2320 prev = next;
2321 data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2323 /* All group members have the same STEP by construction. */
2324 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2326 /* Check that the distance between two accesses is equal to the type
2327 size. Otherwise, we have gaps. */
2328 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2329 - TREE_INT_CST_LOW (prev_init)) / type_size;
2330 if (diff != 1)
2332 /* FORNOW: SLP of accesses with gaps is not supported. */
2333 slp_impossible = true;
2334 if (DR_IS_WRITE (data_ref))
2336 if (dump_enabled_p ())
2337 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2338 "interleaved store with gaps\n");
2339 return false;
2342 gaps += diff - 1;
2345 last_accessed_element += diff;
2347 /* Store the gap from the previous member of the group. If there is no
2348 gap in the access, GROUP_GAP is always 1. */
2349 GROUP_GAP (vinfo_for_stmt (next)) = diff;
2351 prev_init = DR_INIT (data_ref);
2352 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2353 /* Count the number of data-refs in the chain. */
2354 count++;
2357 if (groupsize == 0)
2358 groupsize = count + gaps;
2360 if (groupsize > UINT_MAX)
2362 if (dump_enabled_p ())
2363 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2364 "group is too large\n");
2365 return false;
2368 /* Check that the size of the interleaving is equal to count for stores,
2369 i.e., that there are no gaps. */
2370 if (groupsize != count
2371 && !DR_IS_READ (dr))
2373 if (dump_enabled_p ())
2374 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2375 "interleaved store with gaps\n");
2376 return false;
2379 /* If there is a gap after the last load in the group it is the
2380 difference between the groupsize and the last accessed
2381 element.
2382 When there is no gap, this difference should be 0. */
2383 GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - last_accessed_element;
2385 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2386 if (dump_enabled_p ())
2388 dump_printf_loc (MSG_NOTE, vect_location,
2389 "Detected interleaving ");
2390 if (DR_IS_READ (dr))
2391 dump_printf (MSG_NOTE, "load ");
2392 else
2393 dump_printf (MSG_NOTE, "store ");
2394 dump_printf (MSG_NOTE, "of size %u starting with ",
2395 (unsigned)groupsize);
2396 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
2397 if (GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
2398 dump_printf_loc (MSG_NOTE, vect_location,
2399 "There is a gap of %u elements after the group\n",
2400 GROUP_GAP (vinfo_for_stmt (stmt)));
2403 /* SLP: create an SLP data structure for every interleaving group of
2404 stores for further analysis in vect_analyze_slp. */
2405 if (DR_IS_WRITE (dr) && !slp_impossible)
2407 if (loop_vinfo)
2408 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2409 if (bb_vinfo)
2410 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2414 return true;
2417 /* Analyze groups of accesses: check that DR belongs to a group of
2418 accesses of legal size, step, etc. Detect gaps, single element
2419 interleaving, and other special cases. Set grouped access info.
2420 Collect groups of strided stores for further use in SLP analysis. */
2422 static bool
2423 vect_analyze_group_access (struct data_reference *dr)
2425 if (!vect_analyze_group_access_1 (dr))
2427 /* Dissolve the group if present. */
2428 gimple *next;
2429 gimple *stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dr)));
2430 while (stmt)
2432 stmt_vec_info vinfo = vinfo_for_stmt (stmt);
2433 next = GROUP_NEXT_ELEMENT (vinfo);
2434 GROUP_FIRST_ELEMENT (vinfo) = NULL;
2435 GROUP_NEXT_ELEMENT (vinfo) = NULL;
2436 stmt = next;
2438 return false;
2440 return true;
2443 /* Analyze the access pattern of the data-reference DR.
2444 In case of non-consecutive accesses call vect_analyze_group_access() to
2445 analyze groups of accesses. */
2447 static bool
2448 vect_analyze_data_ref_access (struct data_reference *dr)
2450 tree step = DR_STEP (dr);
2451 tree scalar_type = TREE_TYPE (DR_REF (dr));
2452 gimple *stmt = DR_STMT (dr);
2453 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2454 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2455 struct loop *loop = NULL;
2457 if (loop_vinfo)
2458 loop = LOOP_VINFO_LOOP (loop_vinfo);
2460 if (loop_vinfo && !step)
2462 if (dump_enabled_p ())
2463 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2464 "bad data-ref access in loop\n");
2465 return false;
2468 /* Allow loads with zero step in inner-loop vectorization. */
2469 if (loop_vinfo && integer_zerop (step))
2471 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2472 if (!nested_in_vect_loop_p (loop, stmt))
2473 return DR_IS_READ (dr);
2474 /* Allow references with zero step for outer loops marked
2475 with pragma omp simd only - it guarantees absence of
2476 loop-carried dependencies between inner loop iterations. */
2477 if (!loop->force_vectorize)
2479 if (dump_enabled_p ())
2480 dump_printf_loc (MSG_NOTE, vect_location,
2481 "zero step in inner loop of nest\n");
2482 return false;
2486 if (loop && nested_in_vect_loop_p (loop, stmt))
2488 /* Interleaved accesses are not yet supported within outer-loop
2489 vectorization for references in the inner-loop. */
2490 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2492 /* For the rest of the analysis we use the outer-loop step. */
2493 step = STMT_VINFO_DR_STEP (stmt_info);
2494 if (integer_zerop (step))
2496 if (dump_enabled_p ())
2497 dump_printf_loc (MSG_NOTE, vect_location,
2498 "zero step in outer loop.\n");
2499 return DR_IS_READ (dr);
2503 /* Consecutive? */
2504 if (TREE_CODE (step) == INTEGER_CST)
2506 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2507 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2508 || (dr_step < 0
2509 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2511 /* Mark that it is not interleaving. */
2512 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2513 return true;
2517 if (loop && nested_in_vect_loop_p (loop, stmt))
2519 if (dump_enabled_p ())
2520 dump_printf_loc (MSG_NOTE, vect_location,
2521 "grouped access in outer loop.\n");
2522 return false;
2526 /* Assume this is a DR handled by non-constant strided load case. */
2527 if (TREE_CODE (step) != INTEGER_CST)
2528 return (STMT_VINFO_STRIDED_P (stmt_info)
2529 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2530 || vect_analyze_group_access (dr)));
2532 /* Not consecutive access - check if it's a part of interleaving group. */
2533 return vect_analyze_group_access (dr);
2538 /* A helper function used in the comparator function to sort data
2539 references. T1 and T2 are two data references to be compared.
2540 The function returns -1, 0, or 1. */
2542 static int
2543 compare_tree (tree t1, tree t2)
2545 int i, cmp;
2546 enum tree_code code;
2547 char tclass;
2549 if (t1 == t2)
2550 return 0;
2551 if (t1 == NULL)
2552 return -1;
2553 if (t2 == NULL)
2554 return 1;
2556 STRIP_NOPS (t1);
2557 STRIP_NOPS (t2);
2559 if (TREE_CODE (t1) != TREE_CODE (t2))
2560 return TREE_CODE (t1) < TREE_CODE (t2) ? -1 : 1;
2562 code = TREE_CODE (t1);
2563 switch (code)
2565 /* For const values, we can just use hash values for comparisons. */
2566 case INTEGER_CST:
2567 case REAL_CST:
2568 case FIXED_CST:
2569 case STRING_CST:
2570 case COMPLEX_CST:
2571 case VECTOR_CST:
2573 hashval_t h1 = iterative_hash_expr (t1, 0);
2574 hashval_t h2 = iterative_hash_expr (t2, 0);
2575 if (h1 != h2)
2576 return h1 < h2 ? -1 : 1;
2577 break;
2580 case SSA_NAME:
2581 cmp = compare_tree (SSA_NAME_VAR (t1), SSA_NAME_VAR (t2));
2582 if (cmp != 0)
2583 return cmp;
2585 if (SSA_NAME_VERSION (t1) != SSA_NAME_VERSION (t2))
2586 return SSA_NAME_VERSION (t1) < SSA_NAME_VERSION (t2) ? -1 : 1;
2587 break;
2589 default:
2590 tclass = TREE_CODE_CLASS (code);
2592 /* For var-decl, we could compare their UIDs. */
2593 if (tclass == tcc_declaration)
2595 if (DECL_UID (t1) != DECL_UID (t2))
2596 return DECL_UID (t1) < DECL_UID (t2) ? -1 : 1;
2597 break;
2600 /* For expressions with operands, compare their operands recursively. */
2601 for (i = TREE_OPERAND_LENGTH (t1) - 1; i >= 0; --i)
2603 cmp = compare_tree (TREE_OPERAND (t1, i), TREE_OPERAND (t2, i));
2604 if (cmp != 0)
2605 return cmp;
2609 return 0;
2613 /* Compare two data-references DRA and DRB to group them into chunks
2614 suitable for grouping. */
2616 static int
2617 dr_group_sort_cmp (const void *dra_, const void *drb_)
2619 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2620 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2621 int cmp;
2623 /* Stabilize sort. */
2624 if (dra == drb)
2625 return 0;
2627 /* DRs in different loops never belong to the same group. */
2628 loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father;
2629 loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father;
2630 if (loopa != loopb)
2631 return loopa->num < loopb->num ? -1 : 1;
2633 /* Ordering of DRs according to base. */
2634 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0))
2636 cmp = compare_tree (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb));
2637 if (cmp != 0)
2638 return cmp;
2641 /* And according to DR_OFFSET. */
2642 if (!dr_equal_offsets_p (dra, drb))
2644 cmp = compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2645 if (cmp != 0)
2646 return cmp;
2649 /* Put reads before writes. */
2650 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2651 return DR_IS_READ (dra) ? -1 : 1;
2653 /* Then sort by access size. */
2654 if (!operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2655 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))), 0))
2657 cmp = compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2658 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2659 if (cmp != 0)
2660 return cmp;
2663 /* And then by step. */
2664 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2666 cmp = compare_tree (DR_STEP (dra), DR_STEP (drb));
2667 if (cmp != 0)
2668 return cmp;
2671 /* Then sort by DR_INIT.  In case of identical DRs sort by stmt UID. */
2672 cmp = tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb));
2673 if (cmp == 0)
2674 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2675 return cmp;
2678 /* Function vect_analyze_data_ref_accesses.
2680 Analyze the access pattern of all the data references in the loop.
2682 FORNOW: the only access pattern that is considered vectorizable is a
2683 simple step 1 (consecutive) access.
2685 FORNOW: handle only arrays and pointer accesses. */
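/* Illustration (not part of the original source): the chain building
   below links, e.g., the two stores in

     for (i = 0; i < n; i++)
       {
         p[2*i]     = x;
         p[2*i + 1] = y;
       }

   into one interleaving group: same base and step, inits differing by the
   element size, and both plain single assignments.  */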
2687 bool
2688 vect_analyze_data_ref_accesses (vec_info *vinfo)
2690 unsigned int i;
2691 vec<data_reference_p> datarefs = vinfo->datarefs;
2692 struct data_reference *dr;
2694 if (dump_enabled_p ())
2695 dump_printf_loc (MSG_NOTE, vect_location,
2696 "=== vect_analyze_data_ref_accesses ===\n");
2698 if (datarefs.is_empty ())
2699 return true;
2701 /* Sort the array of datarefs to make building the interleaving chains
2702 linear. Don't modify the original vector's order, it is needed for
2703 determining what dependencies are reversed. */
2704 vec<data_reference_p> datarefs_copy = datarefs.copy ();
2705 datarefs_copy.qsort (dr_group_sort_cmp);
2707 /* Build the interleaving chains. */
2708 for (i = 0; i < datarefs_copy.length () - 1;)
2710 data_reference_p dra = datarefs_copy[i];
2711 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2712 stmt_vec_info lastinfo = NULL;
2713 for (i = i + 1; i < datarefs_copy.length (); ++i)
2715 data_reference_p drb = datarefs_copy[i];
2716 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2718 /* ??? Imperfect sorting (non-compatible types, non-modulo
2719 accesses, same accesses) can lead to a group being artificially
2720 split here as we don't just skip over those. If it really
2721 matters we can push those to a worklist and re-iterate
2722 over them.  Then we can just skip ahead to the next DR here. */
2724 /* DRs in a different loop should not be put into the same
2725 interleaving group. */
2726 if (gimple_bb (DR_STMT (dra))->loop_father
2727 != gimple_bb (DR_STMT (drb))->loop_father)
2728 break;
2730 /* Check that the data-refs have same first location (except init)
2731 and they are both either store or load (not load and store,
2732 not masked loads or stores). */
2733 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2734 || !operand_equal_p (DR_BASE_ADDRESS (dra),
2735 DR_BASE_ADDRESS (drb), 0)
2736 || !dr_equal_offsets_p (dra, drb)
2737 || !gimple_assign_single_p (DR_STMT (dra))
2738 || !gimple_assign_single_p (DR_STMT (drb)))
2739 break;
2741 /* Check that the data-refs have the same constant size. */
2742 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2743 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2744 if (!tree_fits_uhwi_p (sza)
2745 || !tree_fits_uhwi_p (szb)
2746 || !tree_int_cst_equal (sza, szb))
2747 break;
2749 /* Check that the data-refs have the same step. */
2750 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2751 break;
2753 /* Do not place the same access in the interleaving chain twice. */
2754 if (tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb)) == 0)
2755 break;
2757 /* Check the types are compatible.
2758 ??? We don't distinguish this during sorting. */
2759 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2760 TREE_TYPE (DR_REF (drb))))
2761 break;
2763 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
2764 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2765 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2766 gcc_assert (init_a <= init_b);
2768 /* If init_b == init_a + the size of the type * k, we have an
2769 interleaving, and DRA is accessed before DRB. */
2770 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
2771 if (type_size_a == 0
2772 || (init_b - init_a) % type_size_a != 0)
2773 break;
2775 /* If we have a store, the accesses are adjacent. This splits
2776 groups into chunks we support (we don't support vectorization
2777 of stores with gaps). */
2778 if (!DR_IS_READ (dra)
2779 && (init_b - (HOST_WIDE_INT) TREE_INT_CST_LOW
2780 (DR_INIT (datarefs_copy[i-1]))
2781 != type_size_a))
2782 break;
2784 /* If the step (if not zero or non-constant) is greater than the
2785 difference between data-refs' inits this splits groups into
2786 suitable sizes. */
2787 if (tree_fits_shwi_p (DR_STEP (dra)))
2789 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
2790 if (step != 0 && step <= (init_b - init_a))
2791 break;
2794 if (dump_enabled_p ())
2796 dump_printf_loc (MSG_NOTE, vect_location,
2797 "Detected interleaving ");
2798 if (DR_IS_READ (dra))
2799 dump_printf (MSG_NOTE, "load ");
2800 else
2801 dump_printf (MSG_NOTE, "store ");
2802 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2803 dump_printf (MSG_NOTE, " and ");
2804 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2805 dump_printf (MSG_NOTE, "\n");
2808 /* Link the found element into the group list. */
2809 if (!GROUP_FIRST_ELEMENT (stmtinfo_a))
2811 GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
2812 lastinfo = stmtinfo_a;
2814 GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
2815 GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
2816 lastinfo = stmtinfo_b;
2820 FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
2821 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
2822 && !vect_analyze_data_ref_access (dr))
2824 if (dump_enabled_p ())
2825 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2826 "not vectorized: complicated access pattern.\n");
2828 if (is_a <bb_vec_info> (vinfo))
2830 /* Mark the statement as not vectorizable. */
2831 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2832 continue;
2834 else
2836 datarefs_copy.release ();
2837 return false;
2841 datarefs_copy.release ();
2842 return true;
2846 /* Operator == between two dr_with_seg_len objects.
2848 This equality operator is used to make sure two data refs
2849 are the same, so that we will consider combining the
2850 aliasing checks of those two pairs of data-dependent data
2851 refs. */
2853 static bool
2854 operator == (const dr_with_seg_len& d1,
2855 const dr_with_seg_len& d2)
2857 return operand_equal_p (DR_BASE_ADDRESS (d1.dr),
2858 DR_BASE_ADDRESS (d2.dr), 0)
2859 && compare_tree (DR_OFFSET (d1.dr), DR_OFFSET (d2.dr)) == 0
2860 && compare_tree (DR_INIT (d1.dr), DR_INIT (d2.dr)) == 0
2861 && compare_tree (d1.seg_len, d2.seg_len) == 0;
2864 /* Function comp_dr_with_seg_len_pair.
2866 Comparison function for sorting objects of dr_with_seg_len_pair_t
2867 so that we can combine aliasing checks in one scan. */
2869 static int
2870 comp_dr_with_seg_len_pair (const void *pa_, const void *pb_)
2872 const dr_with_seg_len_pair_t* pa = (const dr_with_seg_len_pair_t *) pa_;
2873 const dr_with_seg_len_pair_t* pb = (const dr_with_seg_len_pair_t *) pb_;
2874 const dr_with_seg_len &a1 = pa->first, &a2 = pa->second;
2875 const dr_with_seg_len &b1 = pb->first, &b2 = pb->second;
2877 /* For DR pairs (a, b) and (c, d), we only consider merging the alias checks
2878 if a and c have the same basic address and step, and b and d have the same
2879 address and step.  Therefore, if a&c or b&d do not have the same address
2880 and step, we do not care about the order of those two pairs after sorting. */
2881 int comp_res;
2883 if ((comp_res = compare_tree (DR_BASE_ADDRESS (a1.dr),
2884 DR_BASE_ADDRESS (b1.dr))) != 0)
2885 return comp_res;
2886 if ((comp_res = compare_tree (DR_BASE_ADDRESS (a2.dr),
2887 DR_BASE_ADDRESS (b2.dr))) != 0)
2888 return comp_res;
2889 if ((comp_res = compare_tree (DR_STEP (a1.dr), DR_STEP (b1.dr))) != 0)
2890 return comp_res;
2891 if ((comp_res = compare_tree (DR_STEP (a2.dr), DR_STEP (b2.dr))) != 0)
2892 return comp_res;
2893 if ((comp_res = compare_tree (DR_OFFSET (a1.dr), DR_OFFSET (b1.dr))) != 0)
2894 return comp_res;
2895 if ((comp_res = compare_tree (DR_INIT (a1.dr), DR_INIT (b1.dr))) != 0)
2896 return comp_res;
2897 if ((comp_res = compare_tree (DR_OFFSET (a2.dr), DR_OFFSET (b2.dr))) != 0)
2898 return comp_res;
2899 if ((comp_res = compare_tree (DR_INIT (a2.dr), DR_INIT (b2.dr))) != 0)
2900 return comp_res;
2902 return 0;
2905 /* Function vect_vfa_segment_size.
2907 Create an expression that computes the size of segment
2908 that will be accessed for a data reference.  The function takes into
2909 account that realignment loads may access one more vector.
2911 Input:
2912 DR: The data reference.
2913 LENGTH_FACTOR: segment length to consider.
2915 Return an expression whose value is the size of segment which will be
2916 accessed by DR. */
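/* For example (illustrative only): a DR with a 4-byte DR_STEP and a
   LENGTH_FACTOR equal to a vectorization factor of 4 gets a 16-byte
   segment; the vector size in bytes is added on top when the access uses
   the optimized explicit-realignment scheme.  */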
2918 static tree
2919 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
2921 tree segment_length;
2923 if (integer_zerop (DR_STEP (dr)))
2924 segment_length = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
2925 else
2926 segment_length = size_binop (MULT_EXPR,
2927 fold_convert (sizetype, DR_STEP (dr)),
2928 fold_convert (sizetype, length_factor));
2930 if (vect_supportable_dr_alignment (dr, false)
2931 == dr_explicit_realign_optimized)
2933 tree vector_size = TYPE_SIZE_UNIT
2934 (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))));
2936 segment_length = size_binop (PLUS_EXPR, segment_length, vector_size);
2938 return segment_length;
2941 /* Function vect_no_alias_p.
2943 Given data references A and B with equal base and offset, the alias
2944 relation can be decided at compilation time: return TRUE if they do
2945 not alias each other; return FALSE otherwise.  SEGMENT_LENGTH_A
2946 and SEGMENT_LENGTH_B are the memory lengths accessed by A and B
2947 respectively. */
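/* Worked example (not from the original source, positive steps assumed):
   with DR_INIT (a) == 0, SEGMENT_LENGTH_A == 16, DR_INIT (b) == 32 and
   SEGMENT_LENGTH_B == 16, the segments [0, 16) and [32, 48) do not
   overlap, so the function returns true.  */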
2949 static bool
2950 vect_no_alias_p (struct data_reference *a, struct data_reference *b,
2951 tree segment_length_a, tree segment_length_b)
2953 gcc_assert (TREE_CODE (DR_INIT (a)) == INTEGER_CST
2954 && TREE_CODE (DR_INIT (b)) == INTEGER_CST);
2955 if (tree_int_cst_equal (DR_INIT (a), DR_INIT (b)))
2956 return false;
2958 tree seg_a_min = DR_INIT (a);
2959 tree seg_a_max = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_a_min),
2960 seg_a_min, segment_length_a);
2961 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
2962 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
2963 [a, a+12) */
2964 if (tree_int_cst_compare (DR_STEP (a), size_zero_node) < 0)
2966 tree unit_size = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (a)));
2967 seg_a_min = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_a_max),
2968 seg_a_max, unit_size);
2969 seg_a_max = fold_build2 (PLUS_EXPR, TREE_TYPE (DR_INIT (a)),
2970 DR_INIT (a), unit_size);
2972 tree seg_b_min = DR_INIT (b);
2973 tree seg_b_max = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_b_min),
2974 seg_b_min, segment_length_b);
2975 if (tree_int_cst_compare (DR_STEP (b), size_zero_node) < 0)
2977 tree unit_size = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (b)));
2978 seg_b_min = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_b_max),
2979 seg_b_max, unit_size);
2980 seg_b_max = fold_build2 (PLUS_EXPR, TREE_TYPE (DR_INIT (b)),
2981 DR_INIT (b), unit_size);
2984 if (tree_int_cst_le (seg_a_max, seg_b_min)
2985 || tree_int_cst_le (seg_b_max, seg_a_min))
2986 return true;
2988 return false;
2991 /* Function vect_prune_runtime_alias_test_list.
2993 Prune a list of ddrs to be tested at run-time by versioning for alias.
2994 Merge several alias checks into one if possible.
2995 Return FALSE if the resulting list of ddrs is longer than allowed by
2996 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
2998 bool
2999 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3001 vec<ddr_p> may_alias_ddrs =
3002 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3003 vec<dr_with_seg_len_pair_t>& comp_alias_ddrs =
3004 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3005 int vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3006 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3008 ddr_p ddr;
3009 unsigned int i;
3010 tree length_factor;
3012 if (dump_enabled_p ())
3013 dump_printf_loc (MSG_NOTE, vect_location,
3014 "=== vect_prune_runtime_alias_test_list ===\n");
3016 if (may_alias_ddrs.is_empty ())
3017 return true;
3019 /* Basically, for each pair of dependent data refs store_ptr_0
3020 and load_ptr_0, we create an expression:
3022 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
3023 || ((load_ptr_0 + load_segment_length_0) <= store_ptr_0)
3025 for aliasing checks. However, in some cases we can decrease
3026 the number of checks by combining two checks into one. For
3027 example, suppose we have another pair of data refs store_ptr_0
3028 and load_ptr_1, and if the following condition is satisfied:
3030 load_ptr_0 < load_ptr_1 &&
3031 load_ptr_1 - load_ptr_0 - load_segment_length_0 < store_segment_length_0
3033 (this condition means, in each iteration of vectorized loop,
3034 the accessed memory of store_ptr_0 cannot be between the memory
3035 of load_ptr_0 and load_ptr_1.)
3037 we then can use only the following expression to finish the
3038 aliasing checks between store_ptr_0 & load_ptr_0 and
3039 store_ptr_0 & load_ptr_1:
3041 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
3042 || ((load_ptr_1 + load_segment_length_1) <= store_ptr_0)
3044 Note that we only consider that load_ptr_0 and load_ptr_1 have the
3045 same basic address. */
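/* Concrete instance (illustrative only): for pointer parameters A and B in

     for (i = 0; i < n; i++)
       a[i] = b[i] + b[i + 4];

   both pairs (a[i], b[i]) and (a[i], b[i + 4]) need runtime checks; with
   4-byte ints and VF == 4 the two B references share a base and are 16
   bytes apart, so the checks are merged into a single test of the A
   segment against a widened B segment.  */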
3047 comp_alias_ddrs.create (may_alias_ddrs.length ());
3049 /* First, we collect all data ref pairs for aliasing checks. */
3050 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3052 int comp_res;
3053 struct data_reference *dr_a, *dr_b;
3054 gimple *dr_group_first_a, *dr_group_first_b;
3055 tree segment_length_a, segment_length_b;
3056 gimple *stmt_a, *stmt_b;
3058 dr_a = DDR_A (ddr);
3059 stmt_a = DR_STMT (DDR_A (ddr));
3060 dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
3061 if (dr_group_first_a)
3063 stmt_a = dr_group_first_a;
3064 dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
3067 dr_b = DDR_B (ddr);
3068 stmt_b = DR_STMT (DDR_B (ddr));
3069 dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
3070 if (dr_group_first_b)
3072 stmt_b = dr_group_first_b;
3073 dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
3076 if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
3077 length_factor = scalar_loop_iters;
3078 else
3079 length_factor = size_int (vect_factor);
3080 segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
3081 segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
3083 comp_res = compare_tree (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b));
3084 if (comp_res == 0)
3085 comp_res = compare_tree (DR_OFFSET (dr_a), DR_OFFSET (dr_b));
3087 /* Alias is known at compilation time. */
3088 if (comp_res == 0
3089 && TREE_CODE (DR_STEP (dr_a)) == INTEGER_CST
3090 && TREE_CODE (DR_STEP (dr_b)) == INTEGER_CST
3091 && TREE_CODE (segment_length_a) == INTEGER_CST
3092 && TREE_CODE (segment_length_b) == INTEGER_CST)
3094 if (vect_no_alias_p (dr_a, dr_b, segment_length_a, segment_length_b))
3095 continue;
3097 if (dump_enabled_p ())
3098 dump_printf_loc (MSG_NOTE, vect_location,
3099 "not vectorized: compilation time alias.\n");
3101 return false;
3104 dr_with_seg_len_pair_t dr_with_seg_len_pair
3105 (dr_with_seg_len (dr_a, segment_length_a),
3106 dr_with_seg_len (dr_b, segment_length_b));
3108 /* Canonicalize pairs by sorting the two DR members. */
3109 if (comp_res > 0)
3110 std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
3112 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3115 /* Second, we sort the collected data ref pairs so that we can scan
3116 them once to combine all possible aliasing checks. */
3117 comp_alias_ddrs.qsort (comp_dr_with_seg_len_pair);
3119 /* Third, we scan the sorted dr pairs and check if we can combine
3120 alias checks of two neighboring dr pairs. */
3121 for (size_t i = 1; i < comp_alias_ddrs.length (); ++i)
3123 /* Deal with two ddrs (dr_a1, dr_b1) and (dr_a2, dr_b2). */
3124 dr_with_seg_len *dr_a1 = &comp_alias_ddrs[i-1].first,
3125 *dr_b1 = &comp_alias_ddrs[i-1].second,
3126 *dr_a2 = &comp_alias_ddrs[i].first,
3127 *dr_b2 = &comp_alias_ddrs[i].second;
3129 /* Remove duplicate data ref pairs. */
3130 if (*dr_a1 == *dr_a2 && *dr_b1 == *dr_b2)
3132 if (dump_enabled_p ())
3134 dump_printf_loc (MSG_NOTE, vect_location,
3135 "found equal ranges ");
3136 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3137 DR_REF (dr_a1->dr));
3138 dump_printf (MSG_NOTE, ", ");
3139 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3140 DR_REF (dr_b1->dr));
3141 dump_printf (MSG_NOTE, " and ");
3142 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3143 DR_REF (dr_a2->dr));
3144 dump_printf (MSG_NOTE, ", ");
3145 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3146 DR_REF (dr_b2->dr));
3147 dump_printf (MSG_NOTE, "\n");
3150 comp_alias_ddrs.ordered_remove (i--);
3151 continue;
3154 if (*dr_a1 == *dr_a2 || *dr_b1 == *dr_b2)
3156 /* We consider the case that DR_B1 and DR_B2 are same memrefs,
3157 and DR_A1 and DR_A2 are two consecutive memrefs. */
3158 if (*dr_a1 == *dr_a2)
3160 std::swap (dr_a1, dr_b1);
3161 std::swap (dr_a2, dr_b2);
3164 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a1->dr),
3165 DR_BASE_ADDRESS (dr_a2->dr), 0)
3166 || !operand_equal_p (DR_OFFSET (dr_a1->dr),
3167 DR_OFFSET (dr_a2->dr), 0)
3168 || !tree_fits_shwi_p (DR_INIT (dr_a1->dr))
3169 || !tree_fits_shwi_p (DR_INIT (dr_a2->dr)))
3170 continue;
3172 /* Make sure dr_a1 starts left of dr_a2. */
3173 if (tree_int_cst_lt (DR_INIT (dr_a2->dr), DR_INIT (dr_a1->dr)))
3174 std::swap (*dr_a1, *dr_a2);
3176 bool do_remove = false;
3177 unsigned HOST_WIDE_INT diff
3178 = (tree_to_shwi (DR_INIT (dr_a2->dr))
3179 - tree_to_shwi (DR_INIT (dr_a1->dr)));
3181 /* If the left segment does not extend beyond the start of the
3182 right segment the new segment length is that of the right
3183 plus the segment distance. */
3184 if (tree_fits_uhwi_p (dr_a1->seg_len)
3185 && compare_tree_int (dr_a1->seg_len, diff) <= 0)
3187 dr_a1->seg_len = size_binop (PLUS_EXPR, dr_a2->seg_len,
3188 size_int (diff));
3189 do_remove = true;
3191 /* Generally the new segment length is the maximum of the
3192 left segment size and the right segment size plus the distance.
3193 ??? We can also build tree MAX_EXPR here but it's not clear this
3194 is profitable. */
3195 else if (tree_fits_uhwi_p (dr_a1->seg_len)
3196 && tree_fits_uhwi_p (dr_a2->seg_len))
3198 unsigned HOST_WIDE_INT seg_len_a1 = tree_to_uhwi (dr_a1->seg_len);
3199 unsigned HOST_WIDE_INT seg_len_a2 = tree_to_uhwi (dr_a2->seg_len);
3200 dr_a1->seg_len = size_int (MAX (seg_len_a1, diff + seg_len_a2));
3201 do_remove = true;
3203 /* Now we check if the following condition is satisfied:
3205 DIFF - SEGMENT_LENGTH_A < SEGMENT_LENGTH_B
3207 where DIFF = DR_A2_INIT - DR_A1_INIT. However,
3208 SEGMENT_LENGTH_A or SEGMENT_LENGTH_B may not be constant so we
3209 have to make a best estimation. We can get the minimum value
3210 of SEGMENT_LENGTH_B as a constant, represented by MIN_SEG_LEN_B,
3211 then either of the following two conditions can guarantee the
3212 one above:
3214 1: DIFF <= MIN_SEG_LEN_B
3215 2: DIFF - SEGMENT_LENGTH_A < MIN_SEG_LEN_B */
3216 else
3218 unsigned HOST_WIDE_INT min_seg_len_b
3219 = (tree_fits_uhwi_p (dr_b1->seg_len)
3220 ? tree_to_uhwi (dr_b1->seg_len)
3221 : vect_factor);
3223 if (diff <= min_seg_len_b
3224 || (tree_fits_uhwi_p (dr_a1->seg_len)
3225 && diff - tree_to_uhwi (dr_a1->seg_len) < min_seg_len_b))
3227 dr_a1->seg_len = size_binop (PLUS_EXPR,
3228 dr_a2->seg_len, size_int (diff));
3229 do_remove = true;
3233 if (do_remove)
3235 if (dump_enabled_p ())
3237 dump_printf_loc (MSG_NOTE, vect_location,
3238 "merging ranges for ");
3239 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a1->dr));
3240 dump_printf (MSG_NOTE, ", ");
3241 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b1->dr));
3242 dump_printf (MSG_NOTE, " and ");
3243 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a2->dr));
3244 dump_printf (MSG_NOTE, ", ");
3245 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b2->dr));
3246 dump_printf (MSG_NOTE, "\n");
3248 comp_alias_ddrs.ordered_remove (i--);
3253 dump_printf_loc (MSG_NOTE, vect_location,
3254 "improved number of alias checks from %d to %d\n",
3255 may_alias_ddrs.length (), comp_alias_ddrs.length ());
3256 if ((int) comp_alias_ddrs.length () >
3257 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
3259 if (dump_enabled_p ())
3260 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3261 "number of versioning for alias "
3262 "run-time tests exceeds %d "
3263 "(--param vect-max-version-for-alias-checks)\n",
3264 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
3265 return false;
3268 /* All alias checks have been resolved at compilation time. */
3269 if (!comp_alias_ddrs.length ())
3270 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).truncate (0);
3272 return true;
3275 /* Return true if a non-affine read or write in STMT is suitable for a
3276 gather load or scatter store. Describe the operation in *INFO if so. */
3278 bool
3279 vect_check_gather_scatter (gimple *stmt, loop_vec_info loop_vinfo,
3280 gather_scatter_info *info)
3282 HOST_WIDE_INT scale = 1, pbitpos, pbitsize;
3283 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3284 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3285 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3286 tree offtype = NULL_TREE;
3287 tree decl, base, off;
3288 machine_mode pmode;
3289 int punsignedp, reversep, pvolatilep = 0;
3291 base = DR_REF (dr);
3292 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3293 see if we can use the def stmt of the address. */
3294 if (is_gimple_call (stmt)
3295 && gimple_call_internal_p (stmt)
3296 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
3297 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
3298 && TREE_CODE (base) == MEM_REF
3299 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3300 && integer_zerop (TREE_OPERAND (base, 1))
3301 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3303 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3304 if (is_gimple_assign (def_stmt)
3305 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3306 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3309 /* The gather and scatter builtins need address of the form
3310 loop_invariant + vector * {1, 2, 4, 8}
3312 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3313 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3314 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3315 multiplications and additions in it. To get a vector, we need
3316 a single SSA_NAME that will be defined in the loop and will
3317 contain everything that is not loop invariant and that can be
3318 vectorized.  The following code attempts to find such a preexisting
3319 SSA_NAME OFF and put the loop invariants into a tree BASE
3320 that can be gimplified before the loop. */
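/* For illustration (not from the original source): in

     for (i = 0; i < n; i++)
       a[i] = b[idx[i]];

   the load b[idx[i]] is a gather candidate; the code below aims to end up
   with BASE == &b (loop invariant), OFF == the SSA_NAME holding the
   loaded idx value, and SCALE == sizeof (*b).  */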
3321 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3322 &punsignedp, &reversep, &pvolatilep);
3323 gcc_assert (base && (pbitpos % BITS_PER_UNIT) == 0 && !reversep);
3325 if (TREE_CODE (base) == MEM_REF)
3327 if (!integer_zerop (TREE_OPERAND (base, 1)))
3329 if (off == NULL_TREE)
3331 offset_int moff = mem_ref_offset (base);
3332 off = wide_int_to_tree (sizetype, moff);
3334 else
3335 off = size_binop (PLUS_EXPR, off,
3336 fold_convert (sizetype, TREE_OPERAND (base, 1)));
3338 base = TREE_OPERAND (base, 0);
3340 else
3341 base = build_fold_addr_expr (base);
3343 if (off == NULL_TREE)
3344 off = size_zero_node;
3346 /* If base is not loop invariant, then if off is 0 we start with just
3347 the constant offset in the loop-invariant BASE and continue with base
3348 as OFF; otherwise we give up.
3349 We could handle that case by gimplifying the addition of base + off
3350 into some SSA_NAME and use that as off, but for now punt. */
3351 if (!expr_invariant_in_loop_p (loop, base))
3353 if (!integer_zerop (off))
3354 return false;
3355 off = base;
3356 base = size_int (pbitpos / BITS_PER_UNIT);
3358 /* Otherwise put base + constant offset into the loop invariant BASE
3359 and continue with OFF. */
3360 else
3362 base = fold_convert (sizetype, base);
3363 base = size_binop (PLUS_EXPR, base, size_int (pbitpos / BITS_PER_UNIT));
3366 /* OFF at this point may be either a SSA_NAME or some tree expression
3367 from get_inner_reference. Try to peel off loop invariants from it
3368 into BASE as long as possible. */
3369 STRIP_NOPS (off);
3370 while (offtype == NULL_TREE)
3372 enum tree_code code;
3373 tree op0, op1, add = NULL_TREE;
3375 if (TREE_CODE (off) == SSA_NAME)
3377 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3379 if (expr_invariant_in_loop_p (loop, off))
3380 return false;
3382 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3383 break;
3385 op0 = gimple_assign_rhs1 (def_stmt);
3386 code = gimple_assign_rhs_code (def_stmt);
3387 op1 = gimple_assign_rhs2 (def_stmt);
3389 else
3391 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3392 return false;
3393 code = TREE_CODE (off);
3394 extract_ops_from_tree (off, &code, &op0, &op1);
3396 switch (code)
3398 case POINTER_PLUS_EXPR:
3399 case PLUS_EXPR:
3400 if (expr_invariant_in_loop_p (loop, op0))
3402 add = op0;
3403 off = op1;
3404 do_add:
3405 add = fold_convert (sizetype, add);
3406 if (scale != 1)
3407 add = size_binop (MULT_EXPR, add, size_int (scale));
3408 base = size_binop (PLUS_EXPR, base, add);
3409 continue;
3411 if (expr_invariant_in_loop_p (loop, op1))
3413 add = op1;
3414 off = op0;
3415 goto do_add;
3417 break;
3418 case MINUS_EXPR:
3419 if (expr_invariant_in_loop_p (loop, op1))
3421 add = fold_convert (sizetype, op1);
3422 add = size_binop (MINUS_EXPR, size_zero_node, add);
3423 off = op0;
3424 goto do_add;
3426 break;
3427 case MULT_EXPR:
3428 if (scale == 1 && tree_fits_shwi_p (op1))
3430 scale = tree_to_shwi (op1);
3431 off = op0;
3432 continue;
3434 break;
3435 case SSA_NAME:
3436 off = op0;
3437 continue;
3438 CASE_CONVERT:
3439 if (!POINTER_TYPE_P (TREE_TYPE (op0))
3440 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3441 break;
3442 if (TYPE_PRECISION (TREE_TYPE (op0))
3443 == TYPE_PRECISION (TREE_TYPE (off)))
3445 off = op0;
3446 continue;
3448 if (TYPE_PRECISION (TREE_TYPE (op0))
3449 < TYPE_PRECISION (TREE_TYPE (off)))
3451 off = op0;
3452 offtype = TREE_TYPE (off);
3453 STRIP_NOPS (off);
3454 continue;
3456 break;
3457 default:
3458 break;
3460 break;
3463 /* If at the end OFF still isn't a SSA_NAME or isn't
3464 defined in the loop, punt. */
3465 if (TREE_CODE (off) != SSA_NAME
3466 || expr_invariant_in_loop_p (loop, off))
3467 return false;
3469 if (offtype == NULL_TREE)
3470 offtype = TREE_TYPE (off);
3472 if (DR_IS_READ (dr))
3473 decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info),
3474 offtype, scale);
3475 else
3476 decl = targetm.vectorize.builtin_scatter (STMT_VINFO_VECTYPE (stmt_info),
3477 offtype, scale);
3479 if (decl == NULL_TREE)
3480 return false;
3482 info->decl = decl;
3483 info->base = base;
3484 info->offset = off;
3485 info->offset_dt = vect_unknown_def_type;
3486 info->offset_vectype = NULL_TREE;
3487 info->scale = scale;
3488 return true;
3491 /* Function vect_analyze_data_refs.
3493 Find all the data references in the loop or basic block.
3495 The general structure of the analysis of data refs in the vectorizer is as
3496 follows:
3497 1- vect_analyze_data_refs(loop/bb): call
3498 compute_data_dependences_for_loop/bb to find and analyze all data-refs
3499 in the loop/bb and their dependences.
3500 2- vect_analyze_dependences(): apply dependence testing using ddrs.
3501 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
3502 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
3506 bool
3507 vect_analyze_data_refs (vec_info *vinfo, int *min_vf)
3509 struct loop *loop = NULL;
3510 unsigned int i;
3511 struct data_reference *dr;
3512 tree scalar_type;
3514 if (dump_enabled_p ())
3515 dump_printf_loc (MSG_NOTE, vect_location,
3516 "=== vect_analyze_data_refs ===\n");
3518 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3519 loop = LOOP_VINFO_LOOP (loop_vinfo);
3521 /* Go through the data-refs, check that the analysis succeeded. Update
3522 pointer from stmt_vec_info struct to DR and vectype. */
3524 vec<data_reference_p> datarefs = vinfo->datarefs;
3525 FOR_EACH_VEC_ELT (datarefs, i, dr)
3527 gimple *stmt;
3528 stmt_vec_info stmt_info;
3529 tree base, offset, init;
3530 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
3531 bool simd_lane_access = false;
3532 int vf;
3534 again:
3535 if (!dr || !DR_REF (dr))
3537 if (dump_enabled_p ())
3538 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3539 "not vectorized: unhandled data-ref\n");
3540 return false;
3543 stmt = DR_STMT (dr);
3544 stmt_info = vinfo_for_stmt (stmt);
3546 /* Discard clobbers from the dataref vector. We will remove
3547 clobber stmts during vectorization. */
3548 if (gimple_clobber_p (stmt))
3550 free_data_ref (dr);
3551 if (i == datarefs.length () - 1)
3553 datarefs.pop ();
3554 break;
3556 datarefs.ordered_remove (i);
3557 dr = datarefs[i];
3558 goto again;
3561 /* Check that analysis of the data-ref succeeded. */
3562 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
3563 || !DR_STEP (dr))
3565 bool maybe_gather
3566 = DR_IS_READ (dr)
3567 && !TREE_THIS_VOLATILE (DR_REF (dr))
3568 && targetm.vectorize.builtin_gather != NULL;
3569 bool maybe_scatter
3570 = DR_IS_WRITE (dr)
3571 && !TREE_THIS_VOLATILE (DR_REF (dr))
3572 && targetm.vectorize.builtin_scatter != NULL;
3573 bool maybe_simd_lane_access
3574 = is_a <loop_vec_info> (vinfo) && loop->simduid;
3576 /* If target supports vector gather loads or scatter stores, or if
3577 this might be a SIMD lane access, see whether they can be used. */
3578 if (is_a <loop_vec_info> (vinfo)
3579 && (maybe_gather || maybe_scatter || maybe_simd_lane_access)
3580 && !nested_in_vect_loop_p (loop, stmt))
3582 struct data_reference *newdr
3583 = create_data_ref (NULL, loop_containing_stmt (stmt),
3584 DR_REF (dr), stmt, maybe_scatter ? false : true);
3585 gcc_assert (newdr != NULL && DR_REF (newdr));
3586 if (DR_BASE_ADDRESS (newdr)
3587 && DR_OFFSET (newdr)
3588 && DR_INIT (newdr)
3589 && DR_STEP (newdr)
3590 && integer_zerop (DR_STEP (newdr)))
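/* DR_STEP of the new dataref is zero, so the access can still be useful
either as a SIMD lane access (an offset of the form
GOMP_SIMD_LANE (simduid) * element_size, recognized just below) or as a
gather/scatter candidate, which vect_check_gather_scatter verifies in
detail later. */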
3592 if (maybe_simd_lane_access)
3594 tree off = DR_OFFSET (newdr);
3595 STRIP_NOPS (off);
3596 if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
3597 && TREE_CODE (off) == MULT_EXPR
3598 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
3600 tree step = TREE_OPERAND (off, 1);
3601 off = TREE_OPERAND (off, 0);
3602 STRIP_NOPS (off);
3603 if (CONVERT_EXPR_P (off)
3604 && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
3605 0)))
3606 < TYPE_PRECISION (TREE_TYPE (off)))
3607 off = TREE_OPERAND (off, 0);
3608 if (TREE_CODE (off) == SSA_NAME)
3610 gimple *def = SSA_NAME_DEF_STMT (off);
3611 tree reft = TREE_TYPE (DR_REF (newdr));
3612 if (is_gimple_call (def)
3613 && gimple_call_internal_p (def)
3614 && (gimple_call_internal_fn (def)
3615 == IFN_GOMP_SIMD_LANE))
3617 tree arg = gimple_call_arg (def, 0);
3618 gcc_assert (TREE_CODE (arg) == SSA_NAME);
3619 arg = SSA_NAME_VAR (arg);
3620 if (arg == loop->simduid
3621 /* For now. */
3622 && tree_int_cst_equal
3623 (TYPE_SIZE_UNIT (reft),
3624 step))
3626 DR_OFFSET (newdr) = ssize_int (0);
3627 DR_STEP (newdr) = step;
3628 DR_ALIGNED_TO (newdr)
3629 = size_int (BIGGEST_ALIGNMENT);
3630 dr = newdr;
3631 simd_lane_access = true;
3637 if (!simd_lane_access && (maybe_gather || maybe_scatter))
3639 dr = newdr;
3640 if (maybe_gather)
3641 gatherscatter = GATHER;
3642 else
3643 gatherscatter = SCATTER;
3646 if (gatherscatter == SG_NONE && !simd_lane_access)
3647 free_data_ref (newdr);
3650 if (gatherscatter == SG_NONE && !simd_lane_access)
3652 if (dump_enabled_p ())
3654 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3655 "not vectorized: data ref analysis "
3656 "failed ");
3657 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3660 if (is_a <bb_vec_info> (vinfo))
3661 break;
3663 return false;
3667 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
3669 if (dump_enabled_p ())
3670 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3671 "not vectorized: base addr of dr is a "
3672 "constant\n");
3674 if (is_a <bb_vec_info> (vinfo))
3675 break;
3677 if (gatherscatter != SG_NONE || simd_lane_access)
3678 free_data_ref (dr);
3679 return false;
3682 if (TREE_THIS_VOLATILE (DR_REF (dr)))
3684 if (dump_enabled_p ())
3686 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3687 "not vectorized: volatile type ");
3688 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3691 if (is_a <bb_vec_info> (vinfo))
3692 break;
3694 return false;
3697 if (stmt_can_throw_internal (stmt))
3699 if (dump_enabled_p ())
3701 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3702 "not vectorized: statement can throw an "
3703 "exception ");
3704 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3707 if (is_a <bb_vec_info> (vinfo))
3708 break;
3710 if (gatherscatter != SG_NONE || simd_lane_access)
3711 free_data_ref (dr);
3712 return false;
3715 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
3716 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
3718 if (dump_enabled_p ())
3720 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3721 "not vectorized: statement is bitfield "
3722 "access ");
3723 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3726 if (is_a <bb_vec_info> (vinfo))
3727 break;
3729 if (gatherscatter != SG_NONE || simd_lane_access)
3730 free_data_ref (dr);
3731 return false;
3734 base = unshare_expr (DR_BASE_ADDRESS (dr));
3735 offset = unshare_expr (DR_OFFSET (dr));
3736 init = unshare_expr (DR_INIT (dr));
3738 if (is_gimple_call (stmt)
3739 && (!gimple_call_internal_p (stmt)
3740 || (gimple_call_internal_fn (stmt) != IFN_MASK_LOAD
3741 && gimple_call_internal_fn (stmt) != IFN_MASK_STORE)))
3743 if (dump_enabled_p ())
3745 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3746 "not vectorized: dr in a call ");
3747 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3750 if (is_a <bb_vec_info> (vinfo))
3751 break;
3753 if (gatherscatter != SG_NONE || simd_lane_access)
3754 free_data_ref (dr);
3755 return false;
3758 /* Update DR field in stmt_vec_info struct. */
3760 /* If the dataref is in an inner-loop of the loop that is considered for
3761 vectorization, we also want to analyze the access relative to
3762 the outer-loop (DR contains information only relative to the
3763 inner-most enclosing loop). We do that by building a reference to the
3764 first location accessed by the inner-loop, and analyze it relative to
3765 the outer-loop. */
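/* For instance, with the in[i+j] example used further below (i being the
outer loop and 'in' pointing to short), the first location accessed by
the inner j-loop is in[i], i.e. &in + i*2B, so relative to the outer
loop the access gets a constant offset of 0 and a step of 2 bytes per
outer i iteration. */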
3766 if (loop && nested_in_vect_loop_p (loop, stmt))
3768 tree outer_step, outer_base, outer_init;
3769 HOST_WIDE_INT pbitsize, pbitpos;
3770 tree poffset;
3771 machine_mode pmode;
3772 int punsignedp, preversep, pvolatilep;
3773 affine_iv base_iv, offset_iv;
3774 tree dinit;
3776 /* Build a reference to the first location accessed by the
3777 inner-loop: *(BASE+INIT). (The first location is actually
3778 BASE+INIT+OFFSET, but we add OFFSET separately later). */
3779 tree inner_base = build_fold_indirect_ref
3780 (fold_build_pointer_plus (base, init));
3782 if (dump_enabled_p ())
3784 dump_printf_loc (MSG_NOTE, vect_location,
3785 "analyze in outer-loop: ");
3786 dump_generic_expr (MSG_NOTE, TDF_SLIM, inner_base);
3787 dump_printf (MSG_NOTE, "\n");
3790 outer_base = get_inner_reference (inner_base, &pbitsize, &pbitpos,
3791 &poffset, &pmode, &punsignedp,
3792 &preversep, &pvolatilep);
3793 gcc_assert (outer_base != NULL_TREE);
3795 if (pbitpos % BITS_PER_UNIT != 0)
3797 if (dump_enabled_p ())
3798 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3799 "failed: bit offset alignment.\n");
3800 return false;
3803 if (preversep)
3805 if (dump_enabled_p ())
3806 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3807 "failed: reverse storage order.\n");
3808 return false;
3811 outer_base = build_fold_addr_expr (outer_base);
3812 if (!simple_iv (loop, loop_containing_stmt (stmt), outer_base,
3813 &base_iv, false))
3815 if (dump_enabled_p ())
3816 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3817 "failed: evolution of base is not affine.\n");
3818 return false;
3821 if (offset)
3823 if (poffset)
3824 poffset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), offset,
3825 poffset);
3826 else
3827 poffset = offset;
3830 if (!poffset)
3832 offset_iv.base = ssize_int (0);
3833 offset_iv.step = ssize_int (0);
3835 else if (!simple_iv (loop, loop_containing_stmt (stmt), poffset,
3836 &offset_iv, false))
3838 if (dump_enabled_p ())
3839 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3840 "evolution of offset is not affine.\n");
3841 return false;
3844 outer_init = ssize_int (pbitpos / BITS_PER_UNIT);
3845 split_constant_offset (base_iv.base, &base_iv.base, &dinit);
3846 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3847 split_constant_offset (offset_iv.base, &offset_iv.base, &dinit);
3848 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3850 outer_step = size_binop (PLUS_EXPR,
3851 fold_convert (ssizetype, base_iv.step),
3852 fold_convert (ssizetype, offset_iv.step));
3854 STMT_VINFO_DR_STEP (stmt_info) = outer_step;
3855 /* FIXME: Use canonicalize_base_object_address (base_iv.base); */
3856 STMT_VINFO_DR_BASE_ADDRESS (stmt_info) = base_iv.base;
3857 STMT_VINFO_DR_INIT (stmt_info) = outer_init;
3858 STMT_VINFO_DR_OFFSET (stmt_info) =
3859 fold_convert (ssizetype, offset_iv.base);
3860 STMT_VINFO_DR_ALIGNED_TO (stmt_info) =
3861 size_int (highest_pow2_factor (offset_iv.base));
3863 if (dump_enabled_p ())
3865 dump_printf_loc (MSG_NOTE, vect_location,
3866 "\touter base_address: ");
3867 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3868 STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3869 dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
3870 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3871 STMT_VINFO_DR_OFFSET (stmt_info));
3872 dump_printf (MSG_NOTE,
3873 "\n\touter constant offset from base address: ");
3874 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3875 STMT_VINFO_DR_INIT (stmt_info));
3876 dump_printf (MSG_NOTE, "\n\touter step: ");
3877 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3878 STMT_VINFO_DR_STEP (stmt_info));
3879 dump_printf (MSG_NOTE, "\n\touter aligned to: ");
3880 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3881 STMT_VINFO_DR_ALIGNED_TO (stmt_info));
3882 dump_printf (MSG_NOTE, "\n");
3886 if (STMT_VINFO_DATA_REF (stmt_info))
3888 if (dump_enabled_p ())
3890 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3891 "not vectorized: more than one data ref "
3892 "in stmt: ");
3893 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3896 if (is_a <bb_vec_info> (vinfo))
3897 break;
3899 if (gatherscatter != SG_NONE || simd_lane_access)
3900 free_data_ref (dr);
3901 return false;
3904 STMT_VINFO_DATA_REF (stmt_info) = dr;
3905 if (simd_lane_access)
3907 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
3908 free_data_ref (datarefs[i]);
3909 datarefs[i] = dr;
3912 /* Set vectype for STMT. */
3913 scalar_type = TREE_TYPE (DR_REF (dr));
3914 STMT_VINFO_VECTYPE (stmt_info)
3915 = get_vectype_for_scalar_type (scalar_type);
3916 if (!STMT_VINFO_VECTYPE (stmt_info))
3918 if (dump_enabled_p ())
3920 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3921 "not vectorized: no vectype for stmt: ");
3922 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3923 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
3924 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
3925 scalar_type);
3926 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3929 if (is_a <bb_vec_info> (vinfo))
3931 /* No vector type is fine, the ref can still participate
3932 in dependence analysis, we just can't vectorize it. */
3933 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
3934 continue;
3937 if (gatherscatter != SG_NONE || simd_lane_access)
3939 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3940 if (gatherscatter != SG_NONE)
3941 free_data_ref (dr);
3943 return false;
3945 else
3947 if (dump_enabled_p ())
3949 dump_printf_loc (MSG_NOTE, vect_location,
3950 "got vectype for stmt: ");
3951 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3952 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3953 STMT_VINFO_VECTYPE (stmt_info));
3954 dump_printf (MSG_NOTE, "\n");
3958 /* Adjust the minimal vectorization factor according to the
3959 vector type. */
3960 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
3961 if (vf > *min_vf)
3962 *min_vf = vf;
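/* For example, with 16-byte vectors a char access gets a V16QI vectype,
forcing *min_vf, and hence the vectorization factor, to be at least 16. */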
3964 if (gatherscatter != SG_NONE)
3966 gather_scatter_info gs_info;
3967 if (!vect_check_gather_scatter (stmt, as_a <loop_vec_info> (vinfo),
3968 &gs_info)
3969 || !get_vectype_for_scalar_type (TREE_TYPE (gs_info.offset)))
3971 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3972 free_data_ref (dr);
3973 if (dump_enabled_p ())
3975 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3976 (gatherscatter == GATHER) ?
3977 "not vectorized: not suitable for gather "
3978 "load " :
3979 "not vectorized: not suitable for scatter "
3980 "store ");
3981 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3983 return false;
3986 free_data_ref (datarefs[i]);
3987 datarefs[i] = dr;
3988 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
3991 else if (is_a <loop_vec_info> (vinfo)
3992 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
3994 if (nested_in_vect_loop_p (loop, stmt))
3996 if (dump_enabled_p ())
3998 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3999 "not vectorized: not suitable for strided "
4000 "load ");
4001 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4003 return false;
4005 STMT_VINFO_STRIDED_P (stmt_info) = true;
4009 /* If we stopped the analysis at the first dataref we could not analyze
4010 while trying to vectorize a basic block, mark the rest of the datarefs
4011 as not vectorizable and truncate the vector of datarefs. That
4012 avoids spending useless time analyzing their dependences. */
4013 if (i != datarefs.length ())
4015 gcc_assert (is_a <bb_vec_info> (vinfo));
4016 for (unsigned j = i; j < datarefs.length (); ++j)
4018 data_reference_p dr = datarefs[j];
4019 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
4020 free_data_ref (dr);
4022 datarefs.truncate (i);
4025 return true;
4029 /* Function vect_get_new_vect_var.
4031 Returns a name for a new variable. The current naming scheme prepends
4032 the prefix "vect", "vectp", "stmp" or "mask" (depending on the value of
4033 VAR_KIND), separated by an underscore, to NAME when NAME is provided,
4034 and otherwise uses the prefix alone. */
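/* For example (a hypothetical call): vect_get_new_vect_var (ptr_type,
   vect_pointer_var, "in") creates a temporary whose name is based on
   "vectp_in", while a NULL NAME yields just "vectp".  */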
4036 tree
4037 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
4039 const char *prefix;
4040 tree new_vect_var;
4042 switch (var_kind)
4044 case vect_simple_var:
4045 prefix = "vect";
4046 break;
4047 case vect_scalar_var:
4048 prefix = "stmp";
4049 break;
4050 case vect_mask_var:
4051 prefix = "mask";
4052 break;
4053 case vect_pointer_var:
4054 prefix = "vectp";
4055 break;
4056 default:
4057 gcc_unreachable ();
4060 if (name)
4062 char* tmp = concat (prefix, "_", name, NULL);
4063 new_vect_var = create_tmp_reg (type, tmp);
4064 free (tmp);
4066 else
4067 new_vect_var = create_tmp_reg (type, prefix);
4069 return new_vect_var;
4072 /* Like vect_get_new_vect_var but return an SSA name. */
4074 tree
4075 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
4077 const char *prefix;
4078 tree new_vect_var;
4080 switch (var_kind)
4082 case vect_simple_var:
4083 prefix = "vect";
4084 break;
4085 case vect_scalar_var:
4086 prefix = "stmp";
4087 break;
4088 case vect_pointer_var:
4089 prefix = "vectp";
4090 break;
4091 default:
4092 gcc_unreachable ();
4095 if (name)
4097 char* tmp = concat (prefix, "_", name, NULL);
4098 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4099 free (tmp);
4101 else
4102 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4104 return new_vect_var;
4107 /* Duplicate ptr info and set alignment/misalignment on NAME from DR. */
4109 static void
4110 vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr,
4111 stmt_vec_info stmt_info)
4113 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr));
4114 unsigned int align = TYPE_ALIGN_UNIT (STMT_VINFO_VECTYPE (stmt_info));
4115 int misalign = DR_MISALIGNMENT (dr);
4116 if (misalign == -1)
4117 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4118 else
4119 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name), align, misalign);
4122 /* Function vect_create_addr_base_for_vector_ref.
4124 Create an expression that computes the address of the first memory location
4125 that will be accessed for a data reference.
4127 Input:
4128 STMT: The statement containing the data reference.
4129 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4130 OFFSET: Optional. If supplied, it is added to the initial address.
4131 LOOP: Specify the loop-nest relative to which the address should be computed.
4132 For example, when the dataref is in an inner-loop nested in an
4133 outer-loop that is now being vectorized, LOOP can be either the
4134 outer-loop, or the inner-loop. The first memory location accessed
4135 by the following dataref ('in' points to short):
4137 for (i=0; i<N; i++)
4138 for (j=0; j<M; j++)
4139 s += in[i+j]
4141 is as follows:
4142 if LOOP=i_loop: &in (relative to i_loop)
4143 if LOOP=j_loop: &in+i*2B (relative to j_loop)
4144 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
4145 initial address. Unlike OFFSET, which is the number of elements to
4146 be added, BYTE_OFFSET is measured in bytes.
4148 Output:
4149 1. Return an SSA_NAME whose value is the address of the memory location of
4150 the first vector of the data reference.
4151 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4152 these statement(s) which define the returned SSA_NAME.
4154 FORNOW: We are only handling array accesses with step 1. */
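/* To illustrate the difference with hypothetical values: with 'in' pointing
   to short as above, OFFSET == 3 advances the address by 3 * 2 == 6 bytes,
   whereas BYTE_OFFSET == 3 advances it by exactly 3 bytes.  */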
4156 tree
4157 vect_create_addr_base_for_vector_ref (gimple *stmt,
4158 gimple_seq *new_stmt_list,
4159 tree offset,
4160 struct loop *loop,
4161 tree byte_offset)
4163 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4164 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4165 tree data_ref_base;
4166 const char *base_name;
4167 tree addr_base;
4168 tree dest;
4169 gimple_seq seq = NULL;
4170 tree base_offset;
4171 tree init;
4172 tree vect_ptr_type;
4173 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
4174 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4176 if (loop_vinfo && loop && loop != (gimple_bb (stmt))->loop_father)
4178 struct loop *outer_loop = LOOP_VINFO_LOOP (loop_vinfo);
4180 gcc_assert (nested_in_vect_loop_p (outer_loop, stmt));
4182 data_ref_base = unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
4183 base_offset = unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info));
4184 init = unshare_expr (STMT_VINFO_DR_INIT (stmt_info));
4186 else
4188 data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr));
4189 base_offset = unshare_expr (DR_OFFSET (dr));
4190 init = unshare_expr (DR_INIT (dr));
4193 if (loop_vinfo)
4194 base_name = get_name (data_ref_base);
4195 else
4197 base_offset = ssize_int (0);
4198 init = ssize_int (0);
4199 base_name = get_name (DR_REF (dr));
4202 /* Create base_offset */
4203 base_offset = size_binop (PLUS_EXPR,
4204 fold_convert (sizetype, base_offset),
4205 fold_convert (sizetype, init));
4207 if (offset)
4209 offset = fold_build2 (MULT_EXPR, sizetype,
4210 fold_convert (sizetype, offset), step);
4211 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4212 base_offset, offset);
4214 if (byte_offset)
4216 byte_offset = fold_convert (sizetype, byte_offset);
4217 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4218 base_offset, byte_offset);
4221 /* base + base_offset */
4222 if (loop_vinfo)
4223 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4224 else
4226 addr_base = build1 (ADDR_EXPR,
4227 build_pointer_type (TREE_TYPE (DR_REF (dr))),
4228 unshare_expr (DR_REF (dr)));
4231 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4232 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4233 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4234 gimple_seq_add_seq (new_stmt_list, seq);
4236 if (DR_PTR_INFO (dr)
4237 && TREE_CODE (addr_base) == SSA_NAME
4238 && !SSA_NAME_PTR_INFO (addr_base))
4240 vect_duplicate_ssa_name_ptr_info (addr_base, dr, stmt_info);
4241 if (offset || byte_offset)
4242 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4245 if (dump_enabled_p ())
4247 dump_printf_loc (MSG_NOTE, vect_location, "created ");
4248 dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
4249 dump_printf (MSG_NOTE, "\n");
4252 return addr_base;
4256 /* Function vect_create_data_ref_ptr.
4258 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4259 location accessed in the loop by STMT, along with the def-use update
4260 chain to appropriately advance the pointer through the loop iterations.
4261 Also set aliasing information for the pointer. This pointer is used by
4262 the callers to this function to create a memory reference expression for
4263 vector load/store access.
4265 Input:
4266 1. STMT: a stmt that references memory. Expected to be of the form
4267 GIMPLE_ASSIGN <name, data-ref> or
4268 GIMPLE_ASSIGN <data-ref, name>.
4269 2. AGGR_TYPE: the type of the reference, which should be either a vector
4270 or an array.
4271 3. AT_LOOP: the loop where the vector memref is to be created.
4272 4. OFFSET (optional): an offset to be added to the initial address accessed
4273 by the data-ref in STMT.
4274 5. BSI: location where the new stmts are to be placed if there is no loop
4275 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4276 pointing to the initial address.
4277 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4278 to the initial address accessed by the data-ref in STMT. This is
4279 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4280 in bytes.
4282 Output:
4283 1. Declare a new ptr to vector_type, and have it point to the base of the
4284 data reference (the initial address accessed by the data reference).
4285 For example, for vector of type V8HI, the following code is generated:
4287 v8hi *ap;
4288 ap = (v8hi *)initial_address;
4290 if OFFSET is not supplied:
4291 initial_address = &a[init];
4292 if OFFSET is supplied:
4293 initial_address = &a[init + OFFSET];
4294 if BYTE_OFFSET is supplied:
4295 initial_address = &a[init] + BYTE_OFFSET;
4297 Return the initial_address in INITIAL_ADDRESS.
4299 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4300 update the pointer in each iteration of the loop.
4302 Return the increment stmt that updates the pointer in PTR_INCR.
4304 3. Set INV_P to true if the access pattern of the data reference in the
4305 vectorized loop is invariant. Set it to false otherwise.
4307 4. Return the pointer. */
4309 tree
4310 vect_create_data_ref_ptr (gimple *stmt, tree aggr_type, struct loop *at_loop,
4311 tree offset, tree *initial_address,
4312 gimple_stmt_iterator *gsi, gimple **ptr_incr,
4313 bool only_init, bool *inv_p, tree byte_offset)
4315 const char *base_name;
4316 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4317 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4318 struct loop *loop = NULL;
4319 bool nested_in_vect_loop = false;
4320 struct loop *containing_loop = NULL;
4321 tree aggr_ptr_type;
4322 tree aggr_ptr;
4323 tree new_temp;
4324 gimple_seq new_stmt_list = NULL;
4325 edge pe = NULL;
4326 basic_block new_bb;
4327 tree aggr_ptr_init;
4328 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4329 tree aptr;
4330 gimple_stmt_iterator incr_gsi;
4331 bool insert_after;
4332 tree indx_before_incr, indx_after_incr;
4333 gimple *incr;
4334 tree step;
4335 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4337 gcc_assert (TREE_CODE (aggr_type) == ARRAY_TYPE
4338 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4340 if (loop_vinfo)
4342 loop = LOOP_VINFO_LOOP (loop_vinfo);
4343 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4344 containing_loop = (gimple_bb (stmt))->loop_father;
4345 pe = loop_preheader_edge (loop);
4347 else
4349 gcc_assert (bb_vinfo);
4350 only_init = true;
4351 *ptr_incr = NULL;
4354 /* Check the step (evolution) of the load in LOOP, and record
4355 whether it's invariant. */
4356 if (nested_in_vect_loop)
4357 step = STMT_VINFO_DR_STEP (stmt_info);
4358 else
4359 step = DR_STEP (STMT_VINFO_DATA_REF (stmt_info));
4361 if (integer_zerop (step))
4362 *inv_p = true;
4363 else
4364 *inv_p = false;
4366 /* Create an expression for the first address accessed by this load
4367 in LOOP. */
4368 base_name = get_name (DR_BASE_ADDRESS (dr));
4370 if (dump_enabled_p ())
4372 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4373 dump_printf_loc (MSG_NOTE, vect_location,
4374 "create %s-pointer variable to type: ",
4375 get_tree_code_name (TREE_CODE (aggr_type)));
4376 dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4377 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4378 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4379 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4380 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4381 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4382 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4383 else
4384 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4385 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4386 dump_printf (MSG_NOTE, "\n");
4389 /* (1) Create the new aggregate-pointer variable.
4390 Vector and array types inherit the alias set of their component
4391 type by default so we need to use a ref-all pointer if the data
4392 reference does not conflict with the created aggregated data
4393 reference because it is not addressable. */
4394 bool need_ref_all = false;
4395 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4396 get_alias_set (DR_REF (dr))))
4397 need_ref_all = true;
4398 /* Likewise for any of the data references in the stmt group. */
4399 else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
4401 gimple *orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
4404 stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
4405 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4406 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4407 get_alias_set (DR_REF (sdr))))
4409 need_ref_all = true;
4410 break;
4412 orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);
4414 while (orig_stmt);
4416 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4417 need_ref_all);
4418 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4421 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4422 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4423 def-use update cycles for the pointer: one relative to the outer-loop
4424 (LOOP), which is what steps (2) and (3) below do. The other is relative
4425 to the inner-loop (which is the inner-most loop containing the dataref),
4426 and this is done by step (4) below.
4428 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4429 inner-most loop, and so steps (2),(3) work the same, and step (4) is
4430 redundant. Steps (2),(3) create the following:
4432 vp0 = &base_addr;
4433 LOOP: vp1 = phi(vp0,vp2)
4436 vp2 = vp1 + step
4437 goto LOOP
4439 If there is an inner-loop nested in loop, then step (4) will also be
4440 applied, and an additional update in the inner-loop will be created:
4442 vp0 = &base_addr;
4443 LOOP: vp1 = phi(vp0,vp2)
4445 inner: vp3 = phi(vp1,vp4)
4446 vp4 = vp3 + inner_step
4447 if () goto inner
4449 vp2 = vp1 + step
4450 if () goto LOOP */
4452 /* (2) Calculate the initial address of the aggregate-pointer, and set
4453 the aggregate-pointer to point to it before the loop. */
4455 /* Create: &(base[init_val+offset]) + byte_offset in the loop preheader. */
4457 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
4458 offset, loop, byte_offset);
4459 if (new_stmt_list)
4461 if (pe)
4463 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4464 gcc_assert (!new_bb);
4466 else
4467 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4470 *initial_address = new_temp;
4471 aggr_ptr_init = new_temp;
4473 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4474 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4475 inner-loop nested in LOOP (during outer-loop vectorization). */
4477 /* No update in loop is required. */
4478 if (only_init && (!loop_vinfo || at_loop == loop))
4479 aptr = aggr_ptr_init;
4480 else
4482 /* The step of the aggregate pointer is the type size. */
4483 tree iv_step = TYPE_SIZE_UNIT (aggr_type);
4484 /* One exception to the above is when the scalar step of the load in
4485 LOOP is zero. In this case the step here is also zero. */
4486 if (*inv_p)
4487 iv_step = size_zero_node;
4488 else if (tree_int_cst_sgn (step) == -1)
4489 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4491 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4493 create_iv (aggr_ptr_init,
4494 fold_convert (aggr_ptr_type, iv_step),
4495 aggr_ptr, loop, &incr_gsi, insert_after,
4496 &indx_before_incr, &indx_after_incr);
4497 incr = gsi_stmt (incr_gsi);
4498 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4500 /* Copy the points-to information if it exists. */
4501 if (DR_PTR_INFO (dr))
4503 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info);
4504 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info);
4506 if (ptr_incr)
4507 *ptr_incr = incr;
4509 aptr = indx_before_incr;
4512 if (!nested_in_vect_loop || only_init)
4513 return aptr;
4516 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4517 nested in LOOP, if exists. */
4519 gcc_assert (nested_in_vect_loop);
4520 if (!only_init)
4522 standard_iv_increment_position (containing_loop, &incr_gsi,
4523 &insert_after);
4524 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4525 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4526 &indx_after_incr);
4527 incr = gsi_stmt (incr_gsi);
4528 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4530 /* Copy the points-to information if it exists. */
4531 if (DR_PTR_INFO (dr))
4533 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info);
4534 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info);
4536 if (ptr_incr)
4537 *ptr_incr = incr;
4539 return indx_before_incr;
4541 else
4542 gcc_unreachable ();
4546 /* Function bump_vector_ptr
4548 Increment a pointer (to a vector type) by vector-size. If requested,
4549 i.e. if PTR-INCR is given, then also connect the new increment stmt
4550 to the existing def-use update-chain of the pointer, by modifying
4551 the PTR_INCR as illustrated below:
4553 The pointer def-use update-chain before this function:
4554 DATAREF_PTR = phi (p_0, p_2)
4555 ....
4556 PTR_INCR: p_2 = DATAREF_PTR + step
4558 The pointer def-use update-chain after this function:
4559 DATAREF_PTR = phi (p_0, p_2)
4560 ....
4561 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4562 ....
4563 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4565 Input:
4566 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4567 in the loop.
4568 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4569 the loop. The increment amount across iterations is expected
4570 to be vector_size.
4571 BSI - location where the new update stmt is to be placed.
4572 STMT - the original scalar memory-access stmt that is being vectorized.
4573 BUMP - optional. The offset by which to bump the pointer. If not given,
4574 the offset is assumed to be vector_size.
4576 Output: Return NEW_DATAREF_PTR as illustrated above.
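/* As an illustration: for a V8HI vector type TYPE_SIZE_UNIT is 16, so when
   no explicit BUMP is given the pointer is advanced by 16 bytes.  */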
4580 tree
4581 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4582 gimple *stmt, tree bump)
4584 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4585 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4586 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4587 tree update = TYPE_SIZE_UNIT (vectype);
4588 gassign *incr_stmt;
4589 ssa_op_iter iter;
4590 use_operand_p use_p;
4591 tree new_dataref_ptr;
4593 if (bump)
4594 update = bump;
4596 if (TREE_CODE (dataref_ptr) == SSA_NAME)
4597 new_dataref_ptr = copy_ssa_name (dataref_ptr);
4598 else
4599 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4600 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4601 dataref_ptr, update);
4602 vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4604 /* Copy the points-to information if it exists. */
4605 if (DR_PTR_INFO (dr))
4607 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4608 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4611 if (!ptr_incr)
4612 return new_dataref_ptr;
4614 /* Update the vector-pointer's cross-iteration increment. */
4615 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4617 tree use = USE_FROM_PTR (use_p);
4619 if (use == dataref_ptr)
4620 SET_USE (use_p, new_dataref_ptr);
4621 else
4622 gcc_assert (tree_int_cst_compare (use, update) == 0);
4625 return new_dataref_ptr;
4629 /* Function vect_create_destination_var.
4631 Create a new temporary of type VECTYPE. */
4633 tree
4634 vect_create_destination_var (tree scalar_dest, tree vectype)
4636 tree vec_dest;
4637 const char *name;
4638 char *new_name;
4639 tree type;
4640 enum vect_var_kind kind;
4642 kind = vectype
4643 ? VECTOR_BOOLEAN_TYPE_P (vectype)
4644 ? vect_mask_var
4645 : vect_simple_var
4646 : vect_scalar_var;
4647 type = vectype ? vectype : TREE_TYPE (scalar_dest);
4649 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
4651 name = get_name (scalar_dest);
4652 if (name)
4653 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
4654 else
4655 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
4656 vec_dest = vect_get_new_vect_var (type, kind, new_name);
4657 free (new_name);
4659 return vec_dest;
4662 /* Function vect_grouped_store_supported.
4664 Returns TRUE if interleave high and interleave low permutations
4665 are supported, and FALSE otherwise. */
4667 bool
4668 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
4670 machine_mode mode = TYPE_MODE (vectype);
4672 /* vect_permute_store_chain requires the group size to be equal to 3 or
4673 be a power of two. */
4674 if (count != 3 && exact_log2 (count) == -1)
4676 if (dump_enabled_p ())
4677 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4678 "the size of the group of accesses"
4679 " is not a power of 2 or not eqaul to 3\n");
4680 return false;
4683 /* Check that the permutation is supported. */
4684 if (VECTOR_MODE_P (mode))
4686 unsigned int i, nelt = GET_MODE_NUNITS (mode);
4687 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4689 if (count == 3)
4691 unsigned int j0 = 0, j1 = 0, j2 = 0;
4692 unsigned int i, j;
4694 for (j = 0; j < 3; j++)
4696 int nelt0 = ((3 - j) * nelt) % 3;
4697 int nelt1 = ((3 - j) * nelt + 1) % 3;
4698 int nelt2 = ((3 - j) * nelt + 2) % 3;
4699 for (i = 0; i < nelt; i++)
4701 if (3 * i + nelt0 < nelt)
4702 sel[3 * i + nelt0] = j0++;
4703 if (3 * i + nelt1 < nelt)
4704 sel[3 * i + nelt1] = nelt + j1++;
4705 if (3 * i + nelt2 < nelt)
4706 sel[3 * i + nelt2] = 0;
4708 if (!can_vec_perm_p (mode, false, sel))
4710 if (dump_enabled_p ())
4711 dump_printf (MSG_MISSED_OPTIMIZATION,
4712 "permutaion op not supported by target.\n");
4713 return false;
4716 for (i = 0; i < nelt; i++)
4718 if (3 * i + nelt0 < nelt)
4719 sel[3 * i + nelt0] = 3 * i + nelt0;
4720 if (3 * i + nelt1 < nelt)
4721 sel[3 * i + nelt1] = 3 * i + nelt1;
4722 if (3 * i + nelt2 < nelt)
4723 sel[3 * i + nelt2] = nelt + j2++;
4725 if (!can_vec_perm_p (mode, false, sel))
4727 if (dump_enabled_p ())
4728 dump_printf (MSG_MISSED_OPTIMIZATION,
4729 "permutaion op not supported by target.\n");
4730 return false;
4733 return true;
4735 else
4737 /* If count is not equal to 3 then only a power of 2 is supported. */
4738 gcc_assert (exact_log2 (count) != -1);
4740 for (i = 0; i < nelt / 2; i++)
4742 sel[i * 2] = i;
4743 sel[i * 2 + 1] = i + nelt;
4745 if (can_vec_perm_p (mode, false, sel))
4747 for (i = 0; i < nelt; i++)
4748 sel[i] += nelt / 2;
4749 if (can_vec_perm_p (mode, false, sel))
4750 return true;
4755 if (dump_enabled_p ())
4756 dump_printf (MSG_MISSED_OPTIMIZATION,
4757 "permutaion op not supported by target.\n");
4758 return false;
4762 /* Return TRUE if vec_store_lanes is available for COUNT vectors of
4763 type VECTYPE. */
4765 bool
4766 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
4768 return vect_lanes_optab_supported_p ("vec_store_lanes",
4769 vec_store_lanes_optab,
4770 vectype, count);
4774 /* Function vect_permute_store_chain.
4776 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4777 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
4778 the data correctly for the stores. Return the final references for stores
4779 in RESULT_CHAIN.
4781 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4782 The input is 4 vectors each containing 8 elements. We assign a number to
4783 each element, the input sequence is:
4785 1st vec: 0 1 2 3 4 5 6 7
4786 2nd vec: 8 9 10 11 12 13 14 15
4787 3rd vec: 16 17 18 19 20 21 22 23
4788 4th vec: 24 25 26 27 28 29 30 31
4790 The output sequence should be:
4792 1st vec: 0 8 16 24 1 9 17 25
4793 2nd vec: 2 10 18 26 3 11 19 27
4794 3rd vec: 4 12 20 28 5 13 21 29
4795 4th vec: 6 14 22 30 7 15 23 31
4797 i.e., we interleave the contents of the four vectors in their order.
4799 We use interleave_high/low instructions to create such output. The input of
4800 each interleave_high/low operation is two vectors:
4801 1st vec 2nd vec
4802 0 1 2 3 4 5 6 7
4803 the even elements of the result vector are obtained left-to-right from the
4804 high/low elements of the first vector. The odd elements of the result are
4805 obtained left-to-right from the high/low elements of the second vector.
4806 The output of interleave_high will be: 0 4 1 5
4807 and of interleave_low: 2 6 3 7
4810 The permutation is done in log LENGTH stages. In each stage interleave_high
4811 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4812 where the first argument is taken from the first half of DR_CHAIN and the
4813 second argument from its second half.
4814 In our example,
4816 I1: interleave_high (1st vec, 3rd vec)
4817 I2: interleave_low (1st vec, 3rd vec)
4818 I3: interleave_high (2nd vec, 4th vec)
4819 I4: interleave_low (2nd vec, 4th vec)
4821 The output for the first stage is:
4823 I1: 0 16 1 17 2 18 3 19
4824 I2: 4 20 5 21 6 22 7 23
4825 I3: 8 24 9 25 10 26 11 27
4826 I4: 12 28 13 29 14 30 15 31
4828 The output of the second stage, i.e. the final result is:
4830 I1: 0 8 16 24 1 9 17 25
4831 I2: 2 10 18 26 3 11 19 27
4832 I3: 4 12 20 28 5 13 21 29
4833 I4: 6 14 22 30 7 15 23 31. */
4835 void
4836 vect_permute_store_chain (vec<tree> dr_chain,
4837 unsigned int length,
4838 gimple *stmt,
4839 gimple_stmt_iterator *gsi,
4840 vec<tree> *result_chain)
4842 tree vect1, vect2, high, low;
4843 gimple *perm_stmt;
4844 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4845 tree perm_mask_low, perm_mask_high;
4846 tree data_ref;
4847 tree perm3_mask_low, perm3_mask_high;
4848 unsigned int i, n, log_length = exact_log2 (length);
4849 unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype);
4850 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4852 result_chain->quick_grow (length);
4853 memcpy (result_chain->address (), dr_chain.address (),
4854 length * sizeof (tree));
4856 if (length == 3)
4858 unsigned int j0 = 0, j1 = 0, j2 = 0;
4860 for (j = 0; j < 3; j++)
4862 int nelt0 = ((3 - j) * nelt) % 3;
4863 int nelt1 = ((3 - j) * nelt + 1) % 3;
4864 int nelt2 = ((3 - j) * nelt + 2) % 3;
4866 for (i = 0; i < nelt; i++)
4868 if (3 * i + nelt0 < nelt)
4869 sel[3 * i + nelt0] = j0++;
4870 if (3 * i + nelt1 < nelt)
4871 sel[3 * i + nelt1] = nelt + j1++;
4872 if (3 * i + nelt2 < nelt)
4873 sel[3 * i + nelt2] = 0;
4875 perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
4877 for (i = 0; i < nelt; i++)
4879 if (3 * i + nelt0 < nelt)
4880 sel[3 * i + nelt0] = 3 * i + nelt0;
4881 if (3 * i + nelt1 < nelt)
4882 sel[3 * i + nelt1] = 3 * i + nelt1;
4883 if (3 * i + nelt2 < nelt)
4884 sel[3 * i + nelt2] = nelt + j2++;
4886 perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
4888 vect1 = dr_chain[0];
4889 vect2 = dr_chain[1];
4891 /* Create interleaving stmt:
4892 low = VEC_PERM_EXPR <vect1, vect2,
4893 {j, nelt, *, j + 1, nelt + j + 1, *,
4894 j + 2, nelt + j + 2, *, ...}> */
4895 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
4896 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4897 vect2, perm3_mask_low);
4898 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4900 vect1 = data_ref;
4901 vect2 = dr_chain[2];
4902 /* Create interleaving stmt:
4903 low = VEC_PERM_EXPR <vect1, vect2,
4904 {0, 1, nelt + j, 3, 4, nelt + j + 1,
4905 6, 7, nelt + j + 2, ...}> */
4906 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
4907 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4908 vect2, perm3_mask_high);
4909 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4910 (*result_chain)[j] = data_ref;
4913 else
4915 /* If length is not equal to 3 then only a power of 2 is supported. */
4916 gcc_assert (exact_log2 (length) != -1);
4918 for (i = 0, n = nelt / 2; i < n; i++)
4920 sel[i * 2] = i;
4921 sel[i * 2 + 1] = i + nelt;
4923 perm_mask_high = vect_gen_perm_mask_checked (vectype, sel);
4925 for (i = 0; i < nelt; i++)
4926 sel[i] += nelt / 2;
4927 perm_mask_low = vect_gen_perm_mask_checked (vectype, sel);
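/* For example, with nelt == 8 the masks are
   perm_mask_high == { 0, 8, 1, 9, 2, 10, 3, 11 } and
   perm_mask_low == { 4, 12, 5, 13, 6, 14, 7, 15 }, i.e. the first
   interleaves the leading halves of the two input vectors and the second
   interleaves their trailing halves. */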
4929 for (i = 0, n = log_length; i < n; i++)
4931 for (j = 0; j < length/2; j++)
4933 vect1 = dr_chain[j];
4934 vect2 = dr_chain[j+length/2];
4936 /* Create interleaving stmt:
4937 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
4938 ...}> */
4939 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
4940 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
4941 vect2, perm_mask_high);
4942 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4943 (*result_chain)[2*j] = high;
4945 /* Create interleaving stmt:
4946 low = VEC_PERM_EXPR <vect1, vect2,
4947 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
4948 ...}> */
4949 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
4950 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
4951 vect2, perm_mask_low);
4952 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4953 (*result_chain)[2*j+1] = low;
4955 memcpy (dr_chain.address (), result_chain->address (),
4956 length * sizeof (tree));
4961 /* Function vect_setup_realignment
4963 This function is called when vectorizing an unaligned load using
4964 the dr_explicit_realign[_optimized] scheme.
4965 This function generates the following code at the loop prolog:
4967 p = initial_addr;
4968 x msq_init = *(floor(p)); # prolog load
4969 realignment_token = call target_builtin;
4970 loop:
4971 x msq = phi (msq_init, ---)
4973 The stmts marked with x are generated only for the case of
4974 dr_explicit_realign_optimized.
4976 The code above sets up a new (vector) pointer, pointing to the first
4977 location accessed by STMT, and a "floor-aligned" load using that pointer.
4978 It also generates code to compute the "realignment-token" (if the relevant
4979 target hook was defined), and creates a phi-node at the loop-header bb
4980 whose arguments are the result of the prolog-load (created by this
4981 function) and the result of a load that takes place in the loop (to be
4982 created by the caller to this function).
4984 For the case of dr_explicit_realign_optimized:
4985 The caller to this function uses the phi-result (msq) to create the
4986 realignment code inside the loop, and sets up the missing phi argument,
4987 as follows:
4988 loop:
4989 msq = phi (msq_init, lsq)
4990 lsq = *(floor(p')); # load in loop
4991 result = realign_load (msq, lsq, realignment_token);
4993 For the case of dr_explicit_realign:
4994 loop:
4995 msq = *(floor(p)); # load in loop
4996 p' = p + (VS-1);
4997 lsq = *(floor(p')); # load in loop
4998 result = realign_load (msq, lsq, realignment_token);
5000 Input:
5001 STMT - (scalar) load stmt to be vectorized. This load accesses
5002 a memory location that may be unaligned.
5003 BSI - place where new code is to be inserted.
5004 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5005 is used.
5007 Output:
5008 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5009 target hook, if defined.
5010 Return value - the result of the loop-header phi node. */
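/* A rough sketch with hypothetical numbers: with a 16-byte vector size and
   p misaligned by 4 bytes, floor(p) is p & -16; msq then holds bytes 0..15
   of the aligned area, lsq holds bytes 16..31, and realign_load, steered by
   the realignment token derived from the misalignment, combines bytes 4..19
   into the desired unaligned vector.  */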
5012 tree
5013 vect_setup_realignment (gimple *stmt, gimple_stmt_iterator *gsi,
5014 tree *realignment_token,
5015 enum dr_alignment_support alignment_support_scheme,
5016 tree init_addr,
5017 struct loop **at_loop)
5019 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5020 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5021 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5022 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
5023 struct loop *loop = NULL;
5024 edge pe = NULL;
5025 tree scalar_dest = gimple_assign_lhs (stmt);
5026 tree vec_dest;
5027 gimple *inc;
5028 tree ptr;
5029 tree data_ref;
5030 basic_block new_bb;
5031 tree msq_init = NULL_TREE;
5032 tree new_temp;
5033 gphi *phi_stmt;
5034 tree msq = NULL_TREE;
5035 gimple_seq stmts = NULL;
5036 bool inv_p;
5037 bool compute_in_loop = false;
5038 bool nested_in_vect_loop = false;
5039 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
5040 struct loop *loop_for_initial_load = NULL;
5042 if (loop_vinfo)
5044 loop = LOOP_VINFO_LOOP (loop_vinfo);
5045 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
5048 gcc_assert (alignment_support_scheme == dr_explicit_realign
5049 || alignment_support_scheme == dr_explicit_realign_optimized);
5051 /* We need to generate three things:
5052 1. the misalignment computation
5053 2. the extra vector load (for the optimized realignment scheme).
5054 3. the phi node for the two vectors from which the realignment is
5055 done (for the optimized realignment scheme). */
5057 /* 1. Determine where to generate the misalignment computation.
5059 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5060 calculation will be generated by this function, outside the loop (in the
5061 preheader). Otherwise, INIT_ADDR had already been computed for us by the
5062 caller, inside the loop.
5064 Background: If the misalignment remains fixed throughout the iterations of
5065 the loop, then both realignment schemes are applicable, and also the
5066 misalignment computation can be done outside LOOP. This is because we are
5067 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5068 are a multiple of VS (the Vector Size), and therefore the misalignment in
5069 different vectorized LOOP iterations is always the same.
5070 The problem arises only if the memory access is in an inner-loop nested
5071 inside LOOP, which is now being vectorized using outer-loop vectorization.
5072 This is the only case when the misalignment of the memory access may not
5073 remain fixed throughout the iterations of the inner-loop (as explained in
5074 detail in vect_supportable_dr_alignment). In this case, not only is the
5075 optimized realignment scheme not applicable, but also the misalignment
5076 computation (and generation of the realignment token that is passed to
5077 REALIGN_LOAD) have to be done inside the loop.
5079 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5080 or not, which in turn determines if the misalignment is computed inside
5081 the inner-loop, or outside LOOP. */
5083 if (init_addr != NULL_TREE || !loop_vinfo)
5085 compute_in_loop = true;
5086 gcc_assert (alignment_support_scheme == dr_explicit_realign);
5090 /* 2. Determine where to generate the extra vector load.
5092 For the optimized realignment scheme, instead of generating two vector
5093 loads in each iteration, we generate a single extra vector load in the
5094 preheader of the loop, and in each iteration reuse the result of the
5095 vector load from the previous iteration. In case the memory access is in
5096 an inner-loop nested inside LOOP, which is now being vectorized using
5097 outer-loop vectorization, we need to determine whether this initial vector
5098 load should be generated at the preheader of the inner-loop, or can be
5099 generated at the preheader of LOOP. If the memory access has no evolution
5100 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5101 to be generated inside LOOP (in the preheader of the inner-loop). */
5103 if (nested_in_vect_loop)
5105 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5106 bool invariant_in_outerloop =
5107 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5108 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5110 else
5111 loop_for_initial_load = loop;
5112 if (at_loop)
5113 *at_loop = loop_for_initial_load;
5115 if (loop_for_initial_load)
5116 pe = loop_preheader_edge (loop_for_initial_load);
5118 /* 3. For the case of the optimized realignment, create the first vector
5119 load at the loop preheader. */
5121 if (alignment_support_scheme == dr_explicit_realign_optimized)
5123 /* Create msq_init = *(floor(p1)) in the loop preheader */
5124 gassign *new_stmt;
5126 gcc_assert (!compute_in_loop);
5127 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5128 ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
5129 NULL_TREE, &init_addr, NULL, &inc,
5130 true, &inv_p);
5131 if (TREE_CODE (ptr) == SSA_NAME)
5132 new_temp = copy_ssa_name (ptr);
5133 else
5134 new_temp = make_ssa_name (TREE_TYPE (ptr));
5135 new_stmt = gimple_build_assign
5136 (new_temp, BIT_AND_EXPR, ptr,
5137 build_int_cst (TREE_TYPE (ptr),
5138 -(HOST_WIDE_INT)TYPE_ALIGN_UNIT (vectype)));
5139 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5140 gcc_assert (!new_bb);
5141 data_ref
5142 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5143 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5144 new_stmt = gimple_build_assign (vec_dest, data_ref);
5145 new_temp = make_ssa_name (vec_dest, new_stmt);
5146 gimple_assign_set_lhs (new_stmt, new_temp);
5147 if (pe)
5149 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5150 gcc_assert (!new_bb);
5152 else
5153 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5155 msq_init = gimple_assign_lhs (new_stmt);
5158 /* 4. Create realignment token using a target builtin, if available.
5159 It is done either inside the containing loop, or before LOOP (as
5160 determined above). */
5162 if (targetm.vectorize.builtin_mask_for_load)
5164 gcall *new_stmt;
5165 tree builtin_decl;
5167 /* Compute INIT_ADDR - the initial address accessed by this memref. */
5168 if (!init_addr)
5170 /* Generate the INIT_ADDR computation outside LOOP. */
5171 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
5172 NULL_TREE, loop);
5173 if (loop)
5175 pe = loop_preheader_edge (loop);
5176 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5177 gcc_assert (!new_bb);
5179 else
5180 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5183 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5184 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5185 vec_dest =
5186 vect_create_destination_var (scalar_dest,
5187 gimple_call_return_type (new_stmt));
5188 new_temp = make_ssa_name (vec_dest, new_stmt);
5189 gimple_call_set_lhs (new_stmt, new_temp);
5191 if (compute_in_loop)
5192 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5193 else
5195 /* Generate the misalignment computation outside LOOP. */
5196 pe = loop_preheader_edge (loop);
5197 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5198 gcc_assert (!new_bb);
5201 *realignment_token = gimple_call_lhs (new_stmt);
5203 /* The result of the CALL_EXPR to this builtin is determined from
5204 the value of the parameter and no global variables are touched
5205 which makes the builtin a "const" function. Requiring the
5206 builtin to have the "const" attribute makes it unnecessary
5207 to call mark_call_clobbered. */
5208 gcc_assert (TREE_READONLY (builtin_decl));
5211 if (alignment_support_scheme == dr_explicit_realign)
5212 return msq;
5214 gcc_assert (!compute_in_loop);
5215 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5218 /* 5. Create msq = phi <msq_init, lsq> in loop */
5220 pe = loop_preheader_edge (containing_loop);
5221 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5222 msq = make_ssa_name (vec_dest);
5223 phi_stmt = create_phi_node (msq, containing_loop->header);
5224 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5226 return msq;
5230 /* Function vect_grouped_load_supported.
5232 COUNT is the size of the load group (the number of statements plus the
5233 number of gaps). SINGLE_ELEMENT_P is true if there is actually
5234 only one statement, with a gap of COUNT - 1.
5236 Returns true if a suitable permute exists. */
5238 bool
5239 vect_grouped_load_supported (tree vectype, bool single_element_p,
5240 unsigned HOST_WIDE_INT count)
5242 machine_mode mode = TYPE_MODE (vectype);
5244 /* If this is single-element interleaving with an element distance
5245 that leaves unused vector loads around, punt - we would otherwise create
5246 very sub-optimal code in that case (and blow up memory,
5247 see PR65518). */
5248 if (single_element_p && count > TYPE_VECTOR_SUBPARTS (vectype))
5250 if (dump_enabled_p ())
5251 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5252 "single-element interleaving not supported "
5253 "for not adjacent vector loads\n");
5254 return false;
5257 /* vect_permute_load_chain requires the group size to be equal to 3 or
5258 be a power of two. */
5259 if (count != 3 && exact_log2 (count) == -1)
5261 if (dump_enabled_p ())
5262 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5263 "the size of the group of accesses"
5264 " is not a power of 2 or not equal to 3\n");
5265 return false;
5268 /* Check that the permutation is supported. */
5269 if (VECTOR_MODE_P (mode))
5271 unsigned int i, j, nelt = GET_MODE_NUNITS (mode);
5272 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5274 if (count == 3)
5276 unsigned int k;
5277 for (k = 0; k < 3; k++)
5279 for (i = 0; i < nelt; i++)
5280 if (3 * i + k < 2 * nelt)
5281 sel[i] = 3 * i + k;
5282 else
5283 sel[i] = 0;
5284 if (!can_vec_perm_p (mode, false, sel))
5286 if (dump_enabled_p ())
5287 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5288 "shuffle of 3 loads is not supported by"
5289 " target\n");
5290 return false;
5292 for (i = 0, j = 0; i < nelt; i++)
5293 if (3 * i + k < 2 * nelt)
5294 sel[i] = i;
5295 else
5296 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5297 if (!can_vec_perm_p (mode, false, sel))
5299 if (dump_enabled_p ())
5300 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5301 "shuffle of 3 loads is not supported by"
5302 " target\n");
5303 return false;
5306 return true;
5308 else
5310 /* If count is not equal to 3 then only a power of 2 is supported. */
5311 gcc_assert (exact_log2 (count) != -1);
5312 for (i = 0; i < nelt; i++)
5313 sel[i] = i * 2;
5314 if (can_vec_perm_p (mode, false, sel))
5316 for (i = 0; i < nelt; i++)
5317 sel[i] = i * 2 + 1;
5318 if (can_vec_perm_p (mode, false, sel))
5319 return true;
5324 if (dump_enabled_p ())
5325 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5326 "extract even/odd not supported by target\n");
5327 return false;
5330 /* Return TRUE if vec_load_lanes is available for COUNT vectors of
5331 type VECTYPE. */
5333 bool
5334 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
5336 return vect_lanes_optab_supported_p ("vec_load_lanes",
5337 vec_load_lanes_optab,
5338 vectype, count);
5341 /* Function vect_permute_load_chain.
5343 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5344 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5345 the input data correctly. Return the final references for loads in
5346 RESULT_CHAIN.
5348 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5349 The input is 4 vectors each containing 8 elements. We assign a number to each
5350 element, the input sequence is:
5352 1st vec: 0 1 2 3 4 5 6 7
5353 2nd vec: 8 9 10 11 12 13 14 15
5354 3rd vec: 16 17 18 19 20 21 22 23
5355 4th vec: 24 25 26 27 28 29 30 31
5357 The output sequence should be:
5359 1st vec: 0 4 8 12 16 20 24 28
5360 2nd vec: 1 5 9 13 17 21 25 29
5361 3rd vec: 2 6 10 14 18 22 26 30
5362 4th vec: 3 7 11 15 19 23 27 31
5364 i.e., the first output vector should contain the first elements of each
5365 interleaving group, etc.
5367 We use extract_even/odd instructions to create such output. The input of
5368 each extract_even/odd operation is two vectors
5369 1st vec 2nd vec
5370 0 1 2 3 4 5 6 7
5372 and the output is the vector of extracted even/odd elements. The output of
5373 extract_even will be: 0 2 4 6
5374 and of extract_odd: 1 3 5 7
5377 The permutation is done in log LENGTH stages. In each stage extract_even
5378 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5379 their order. In our example,
5381 E1: extract_even (1st vec, 2nd vec)
5382 E2: extract_odd (1st vec, 2nd vec)
5383 E3: extract_even (3rd vec, 4th vec)
5384 E4: extract_odd (3rd vec, 4th vec)
5386 The output for the first stage will be:
5388 E1: 0 2 4 6 8 10 12 14
5389 E2: 1 3 5 7 9 11 13 15
5390 E3: 16 18 20 22 24 26 28 30
5391 E4: 17 19 21 23 25 27 29 31
5393 In order to proceed and create the correct sequence for the next stage (or
5394 for the correct output, if the second stage is the last one, as in our
5395 example), we first put the output of extract_even operation and then the
5396 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5397 The input for the second stage is:
5399 1st vec (E1): 0 2 4 6 8 10 12 14
5400 2nd vec (E3): 16 18 20 22 24 26 28 30
5401 3rd vec (E2): 1 3 5 7 9 11 13 15
5402 4th vec (E4): 17 19 21 23 25 27 29 31
5404 The output of the second stage:
5406 E1: 0 4 8 12 16 20 24 28
5407 E2: 2 6 10 14 18 22 26 30
5408 E3: 1 5 9 13 17 21 25 29
5409 E4: 3 7 11 15 19 23 27 31
5411 And RESULT_CHAIN after reordering:
5413 1st vec (E1): 0 4 8 12 16 20 24 28
5414 2nd vec (E3): 1 5 9 13 17 21 25 29
5415 3rd vec (E2): 2 6 10 14 18 22 26 30
5416 4th vec (E4): 3 7 11 15 19 23 27 31. */
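/* Illustrative sketch (not part of GCC, kept under "#if 0"): a standalone
   scalar model of the even/odd permutation described above, using plain
   arrays of element indices instead of vector trees.  It reproduces the
   4-vector, 8-element example from the comment.  The macro and helper names
   (NELT, LENGTH, demo_vec_perm) are hypothetical and exist only for this
   sketch; compile it separately as a C99 program to experiment.  */
#if 0
#include <stdio.h>
#include <string.h>

#define NELT 8		/* Elements per vector.  */
#define LENGTH 4	/* Number of vectors; a power of 2 here.  */

/* Model of VEC_PERM_EXPR <a, b, sel>, where SEL indexes the 2*NELT-element
   concatenation of A and B.  */
static void
demo_vec_perm (const int *a, const int *b, const int *sel, int *out)
{
  for (int i = 0; i < NELT; i++)
    out[i] = sel[i] < NELT ? a[sel[i]] : b[sel[i] - NELT];
}

int
main (void)
{
  int chain[LENGTH][NELT], result[LENGTH][NELT];
  int sel_even[NELT], sel_odd[NELT];

  /* Input: vector J holds elements J*NELT .. J*NELT+NELT-1.  */
  for (int j = 0; j < LENGTH; j++)
    for (int i = 0; i < NELT; i++)
      chain[j][i] = j * NELT + i;

  /* The same selectors vect_permute_load_chain builds.  */
  for (int i = 0; i < NELT; i++)
    {
      sel_even[i] = i * 2;
      sel_odd[i] = i * 2 + 1;
    }

  /* log2 (LENGTH) == 2 stages of extract even/odd; evens go in the first
     half of the result chain, odds in the second half.  */
  for (int stage = 0; stage < 2; stage++)
    {
      for (int j = 0; j < LENGTH; j += 2)
	{
	  demo_vec_perm (chain[j], chain[j + 1], sel_even, result[j / 2]);
	  demo_vec_perm (chain[j], chain[j + 1], sel_odd,
			 result[j / 2 + LENGTH / 2]);
	}
      memcpy (chain, result, sizeof (chain));
    }

  /* Prints 0 4 8 12 16 20 24 28, then 1 5 9 ..., 2 6 10 ..., 3 7 11 ...,
     matching RESULT_CHAIN in the comment above.  */
  for (int j = 0; j < LENGTH; j++)
    {
      for (int i = 0; i < NELT; i++)
	printf ("%d ", chain[j][i]);
      printf ("\n");
    }
  return 0;
}
#endif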
5418 static void
5419 vect_permute_load_chain (vec<tree> dr_chain,
5420 unsigned int length,
5421 gimple *stmt,
5422 gimple_stmt_iterator *gsi,
5423 vec<tree> *result_chain)
5425 tree data_ref, first_vect, second_vect;
5426 tree perm_mask_even, perm_mask_odd;
5427 tree perm3_mask_low, perm3_mask_high;
5428 gimple *perm_stmt;
5429 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5430 unsigned int i, j, log_length = exact_log2 (length);
5431 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5432 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5434 result_chain->quick_grow (length);
5435 memcpy (result_chain->address (), dr_chain.address (),
5436 length * sizeof (tree));
5438 if (length == 3)
5440 unsigned int k;
5442 for (k = 0; k < 3; k++)
5444 for (i = 0; i < nelt; i++)
5445 if (3 * i + k < 2 * nelt)
5446 sel[i] = 3 * i + k;
5447 else
5448 sel[i] = 0;
5449 perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
5451 for (i = 0, j = 0; i < nelt; i++)
5452 if (3 * i + k < 2 * nelt)
5453 sel[i] = i;
5454 else
5455 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5457 perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
5459 first_vect = dr_chain[0];
5460 second_vect = dr_chain[1];
5462 /* Create interleaving stmt (low part of):
5463 low = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5464 ...}> */
5465 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5466 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5467 second_vect, perm3_mask_low);
5468 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5470 /* Create interleaving stmt (high part of):
5471 high = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5472 ...}> */
5473 first_vect = data_ref;
5474 second_vect = dr_chain[2];
5475 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5476 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5477 second_vect, perm3_mask_high);
5478 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5479 (*result_chain)[k] = data_ref;
5482 else
5484 /* If length is not equal to 3, then only a power of 2 is supported. */
5485 gcc_assert (exact_log2 (length) != -1);
5487 for (i = 0; i < nelt; ++i)
5488 sel[i] = i * 2;
5489 perm_mask_even = vect_gen_perm_mask_checked (vectype, sel);
5491 for (i = 0; i < nelt; ++i)
5492 sel[i] = i * 2 + 1;
5493 perm_mask_odd = vect_gen_perm_mask_checked (vectype, sel);
5495 for (i = 0; i < log_length; i++)
5497 for (j = 0; j < length; j += 2)
5499 first_vect = dr_chain[j];
5500 second_vect = dr_chain[j+1];
5502 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5503 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5504 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5505 first_vect, second_vect,
5506 perm_mask_even);
5507 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5508 (*result_chain)[j/2] = data_ref;
5510 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5511 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5512 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5513 first_vect, second_vect,
5514 perm_mask_odd);
5515 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5516 (*result_chain)[j/2+length/2] = data_ref;
5518 memcpy (dr_chain.address (), result_chain->address (),
5519 length * sizeof (tree));
5524 /* Function vect_shift_permute_load_chain.
5526 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
5527 a sequence of stmts to reorder the input data accordingly.
5528 Return the final references for loads in RESULT_CHAIN.
5529 Return true if successful, false otherwise.
5531 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5532 The input is 3 vectors each containing 8 elements. We assign a
5533 number to each element; the input sequence is:
5535 1st vec: 0 1 2 3 4 5 6 7
5536 2nd vec: 8 9 10 11 12 13 14 15
5537 3rd vec: 16 17 18 19 20 21 22 23
5539 The output sequence should be:
5541 1st vec: 0 3 6 9 12 15 18 21
5542 2nd vec: 1 4 7 10 13 16 19 22
5543 3rd vec: 2 5 8 11 14 17 20 23
5545 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
5547 First we shuffle all 3 vectors to get correct elements order:
5549 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
5550 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
5551 3rd vec: (16 19 22) (17 20 23) (18 21)
5553 Next we concatenate and shift the vectors 3 times:
5555 1st step:
5556 shift right by 6 the concatenation of:
5557 "1st vec" and "2nd vec"
5558 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
5559 "2nd vec" and "3rd vec"
5560 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
5561 "3rd vec" and "1st vec"
5562 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
5563 | New vectors |
5565 So that now new vectors are:
5567 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
5568 2nd vec: (10 13) (16 19 22) (17 20 23)
5569 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
5571 2nd step:
5572 shift right by 5 the concatenation of:
5573 "1st vec" and "3rd vec"
5574 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
5575 "2nd vec" and "1st vec"
5576 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
5577 "3rd vec" and "2nd vec"
5578 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
5579 | New vectors |
5581 So that now new vectors are:
5583 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
5584 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
5585 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
5587 3rd step:
5588 shift right by 5 the concatenation of:
5589 "1st vec" and "1st vec"
5590 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
5591 shift right by 3 the concatenation of:
5592 "2nd vec" and "2nd vec"
5593 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
5594 | New vectors |
5596 So that now all vectors are READY:
5597 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
5598 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
5599 3rd vec: ( 1 4 7) (10 13) (16 19 22)
5601 This algorithm is faster than the one in vect_permute_load_chain if:
5602 1. "shift of a concatenation" is faster than general permutation.
5603 This is usually so.
5604 2. The TARGET machine can't execute vector instructions in parallel.
5605 This is because each step of the algorithm depends on the previous one.
5606 The algorithm in vect_permute_load_chain is much more parallel.
5608 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF. */
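/* Illustrative sketch (not part of GCC, kept under "#if 0"): a standalone
   scalar model of the shuffle-and-shift reordering above for LENGTH == 3
   with 8 elements per vector.  Element values stand in for loaded data;
   the names (NELT, demo_vec_perm) are hypothetical and exist only for this
   sketch.  Compiled as a C99 program it prints the three READY vectors
   0 3 6 ..., 1 4 7 ..., 2 5 8 ... from the comment.  */
#if 0
#include <stdio.h>

#define NELT 8		/* Elements per vector.  */

/* Model of VEC_PERM_EXPR <a, b, sel> over the concatenation of A and B.  */
static void
demo_vec_perm (const int *a, const int *b, const int *sel, int *out)
{
  for (int i = 0; i < NELT; i++)
    out[i] = sel[i] < NELT ? a[sel[i]] : b[sel[i] - NELT];
}

int
main (void)
{
  int dr[3][NELT], v[3][NELT], vs[3][NELT], res[3][NELT];
  int perm3[NELT], shift1[NELT], shift2[NELT], shift3[NELT], shift4[NELT];
  int k = 0, l = 0;

  /* Input: vector J holds elements J*NELT .. J*NELT+NELT-1.  */
  for (int j = 0; j < 3; j++)
    for (int i = 0; i < NELT; i++)
      dr[j][i] = j * NELT + i;

  /* The shuffle selector, {0 3 6 1 4 7 2 5} for NELT == 8, built exactly
     as in vect_shift_permute_load_chain.  */
  for (int i = 0; i < NELT; i++)
    {
      if (3 * k + (l % 3) >= NELT)
	{
	  k = 0;
	  l += 3 - (NELT % 3);
	}
      perm3[i] = 3 * k + (l % 3);
      k++;
    }

  /* The four shift selectors: {6..13}, {5..12}, {3..10} and {5..12}.  */
  for (int i = 0; i < NELT; i++)
    {
      shift1[i] = 2 * (NELT / 3) + (NELT % 3) + i;
      shift2[i] = 2 * (NELT / 3) + 1 + i;
      shift3[i] = NELT / 3 + (NELT % 3) / 2 + i;
      shift4[i] = 2 * (NELT / 3) + (NELT % 3) / 2 + i;
    }

  /* Shuffle each input vector, then apply the two rounds of shifts.  */
  for (k = 0; k < 3; k++)
    demo_vec_perm (dr[k], dr[k], perm3, v[k]);
  for (k = 0; k < 3; k++)
    demo_vec_perm (v[k % 3], v[(k + 1) % 3], shift1, vs[k]);
  for (k = 0; k < 3; k++)
    demo_vec_perm (vs[(4 - k) % 3], vs[(3 - k) % 3], shift2, v[k]);

  /* Final placement into the result chain, as done by the function below.  */
  for (int i = 0; i < NELT; i++)
    res[3 - NELT % 3][i] = v[2][i];
  demo_vec_perm (v[0], v[0], shift3, res[NELT % 3]);
  demo_vec_perm (v[1], v[1], shift4, res[0]);

  for (int j = 0; j < 3; j++)
    {
      for (int i = 0; i < NELT; i++)
	printf ("%d ", res[j][i]);
      printf ("\n");
    }
  return 0;
}
#endif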
5611 static bool
5612 vect_shift_permute_load_chain (vec<tree> dr_chain,
5613 unsigned int length,
5614 gimple *stmt,
5615 gimple_stmt_iterator *gsi,
5616 vec<tree> *result_chain)
5618 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
5619 tree perm2_mask1, perm2_mask2, perm3_mask;
5620 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
5621 gimple *perm_stmt;
5623 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5624 unsigned int i;
5625 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5626 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5627 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5628 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5630 result_chain->quick_grow (length);
5631 memcpy (result_chain->address (), dr_chain.address (),
5632 length * sizeof (tree));
5634 if (exact_log2 (length) != -1 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 4)
5636 unsigned int j, log_length = exact_log2 (length);
5637 for (i = 0; i < nelt / 2; ++i)
5638 sel[i] = i * 2;
5639 for (i = 0; i < nelt / 2; ++i)
5640 sel[nelt / 2 + i] = i * 2 + 1;
5641 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5643 if (dump_enabled_p ())
5644 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5645 "shuffle of 2 fields structure is not \
5646 supported by target\n");
5647 return false;
5649 perm2_mask1 = vect_gen_perm_mask_checked (vectype, sel);
5651 for (i = 0; i < nelt / 2; ++i)
5652 sel[i] = i * 2 + 1;
5653 for (i = 0; i < nelt / 2; ++i)
5654 sel[nelt / 2 + i] = i * 2;
5655 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5657 if (dump_enabled_p ())
5658 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5659 "shuffle of 2 fields structure is not \
5660 supported by target\n");
5661 return false;
5663 perm2_mask2 = vect_gen_perm_mask_checked (vectype, sel);
5665 /* Generating permutation constant to shift all elements.
5666 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
5667 for (i = 0; i < nelt; i++)
5668 sel[i] = nelt / 2 + i;
5669 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5671 if (dump_enabled_p ())
5672 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5673 "shift permutation is not supported by target\n");
5674 return false;
5676 shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5678 /* Generating permutation constant to select a vector from two vectors.
5679 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
5680 for (i = 0; i < nelt / 2; i++)
5681 sel[i] = i;
5682 for (i = nelt / 2; i < nelt; i++)
5683 sel[i] = nelt + i;
5684 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5686 if (dump_enabled_p ())
5687 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5688 "select is not supported by target\n");
5689 return false;
5691 select_mask = vect_gen_perm_mask_checked (vectype, sel);
5693 for (i = 0; i < log_length; i++)
5695 for (j = 0; j < length; j += 2)
5697 first_vect = dr_chain[j];
5698 second_vect = dr_chain[j + 1];
5700 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5701 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5702 first_vect, first_vect,
5703 perm2_mask1);
5704 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5705 vect[0] = data_ref;
5707 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5708 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5709 second_vect, second_vect,
5710 perm2_mask2);
5711 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5712 vect[1] = data_ref;
5714 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
5715 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5716 vect[0], vect[1], shift1_mask);
5717 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5718 (*result_chain)[j/2 + length/2] = data_ref;
5720 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
5721 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5722 vect[0], vect[1], select_mask);
5723 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5724 (*result_chain)[j/2] = data_ref;
5726 memcpy (dr_chain.address (), result_chain->address (),
5727 length * sizeof (tree));
5729 return true;
5731 if (length == 3 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 2)
5733 unsigned int k = 0, l = 0;
5735 /* Generating permutation constant to get all elements in right order.
5736 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
5737 for (i = 0; i < nelt; i++)
5739 if (3 * k + (l % 3) >= nelt)
5741 k = 0;
5742 l += (3 - (nelt % 3));
5744 sel[i] = 3 * k + (l % 3);
5745 k++;
5747 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5749 if (dump_enabled_p ())
5750 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5751 "shuffle of 3 fields structure is not \
5752 supported by target\n");
5753 return false;
5755 perm3_mask = vect_gen_perm_mask_checked (vectype, sel);
5757 /* Generating permutation constant to shift all elements.
5758 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
5759 for (i = 0; i < nelt; i++)
5760 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
5761 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5763 if (dump_enabled_p ())
5764 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5765 "shift permutation is not supported by target\n");
5766 return false;
5768 shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5770 /* Generating permutation constant to shift all elements.
5771 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5772 for (i = 0; i < nelt; i++)
5773 sel[i] = 2 * (nelt / 3) + 1 + i;
5774 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5776 if (dump_enabled_p ())
5777 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5778 "shift permutation is not supported by target\n");
5779 return false;
5781 shift2_mask = vect_gen_perm_mask_checked (vectype, sel);
5783 /* Generating permutation constant to shift all elements.
5784 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
5785 for (i = 0; i < nelt; i++)
5786 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
5787 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5789 if (dump_enabled_p ())
5790 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5791 "shift permutation is not supported by target\n");
5792 return false;
5794 shift3_mask = vect_gen_perm_mask_checked (vectype, sel);
5796 /* Generating permutation constant to shift all elements.
5797 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5798 for (i = 0; i < nelt; i++)
5799 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
5800 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5802 if (dump_enabled_p ())
5803 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5804 "shift permutation is not supported by target\n");
5805 return false;
5807 shift4_mask = vect_gen_perm_mask_checked (vectype, sel);
5809 for (k = 0; k < 3; k++)
5811 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
5812 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5813 dr_chain[k], dr_chain[k],
5814 perm3_mask);
5815 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5816 vect[k] = data_ref;
5819 for (k = 0; k < 3; k++)
5821 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
5822 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5823 vect[k % 3], vect[(k + 1) % 3],
5824 shift1_mask);
5825 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5826 vect_shift[k] = data_ref;
5829 for (k = 0; k < 3; k++)
5831 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
5832 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5833 vect_shift[(4 - k) % 3],
5834 vect_shift[(3 - k) % 3],
5835 shift2_mask);
5836 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5837 vect[k] = data_ref;
5840 (*result_chain)[3 - (nelt % 3)] = vect[2];
5842 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
5843 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
5844 vect[0], shift3_mask);
5845 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5846 (*result_chain)[nelt % 3] = data_ref;
5848 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
5849 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
5850 vect[1], shift4_mask);
5851 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5852 (*result_chain)[0] = data_ref;
5853 return true;
5855 return false;
5858 /* Function vect_transform_grouped_load.
5860 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
5861 to perform their permutation and ascribe the resulting vectorized statements to
5862 the scalar statements.
5865 void
5866 vect_transform_grouped_load (gimple *stmt, vec<tree> dr_chain, int size,
5867 gimple_stmt_iterator *gsi)
5869 machine_mode mode;
5870 vec<tree> result_chain = vNULL;
5872 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
5873 RESULT_CHAIN is the output of vect_permute_load_chain; it contains permuted
5874 vectors that are ready for vector computation. */
5875 result_chain.create (size);
5877 /* If the reassociation width for the vector type is 2 or greater, the target
5878 machine can execute 2 or more vector instructions in parallel. Otherwise try
5879 to get the chain for the load group using vect_shift_permute_load_chain. */
5880 mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
5881 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
5882 || exact_log2 (size) != -1
5883 || !vect_shift_permute_load_chain (dr_chain, size, stmt,
5884 gsi, &result_chain))
5885 vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
5886 vect_record_grouped_load_vectors (stmt, result_chain);
5887 result_chain.release ();
5890 /* RESULT_CHAIN contains the output of a group of grouped loads that were
5891 generated as part of the vectorization of STMT. Assign the statement
5892 for each vector to the associated scalar statement. */
5894 void
5895 vect_record_grouped_load_vectors (gimple *stmt, vec<tree> result_chain)
5897 gimple *first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
5898 gimple *next_stmt, *new_stmt;
5899 unsigned int i, gap_count;
5900 tree tmp_data_ref;
5902 /* Put a permuted data-ref in the VECTORIZED_STMT field.
5903 Since we scan the chain starting from its first node, their order
5904 corresponds to the order of data-refs in RESULT_CHAIN. */
5905 next_stmt = first_stmt;
5906 gap_count = 1;
5907 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
5909 if (!next_stmt)
5910 break;
5912 /* Skip the gaps. Loads created for the gaps will be removed by the dead
5913 code elimination pass later. No need to check for the first stmt in
5914 the group, since it always exists.
5915 GROUP_GAP is the number of steps in elements from the previous
5916 access (if there is no gap GROUP_GAP is 1). We skip loads that
5917 correspond to the gaps. */
5918 if (next_stmt != first_stmt
5919 && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
5921 gap_count++;
5922 continue;
5925 while (next_stmt)
5927 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
5928 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
5929 copies, and we put the new vector statement in the first available
5930 RELATED_STMT. */
5931 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
5932 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
5933 else
5935 if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5937 gimple *prev_stmt =
5938 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
5939 gimple *rel_stmt =
5940 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
5941 while (rel_stmt)
5943 prev_stmt = rel_stmt;
5944 rel_stmt =
5945 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
5948 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
5949 new_stmt;
5953 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
5954 gap_count = 1;
5955 /* If NEXT_STMT accesses the same DR as the previous statement,
5956 put the same TMP_DATA_REF as its vectorized statement; otherwise
5957 get the next data-ref from RESULT_CHAIN. */
5958 if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5959 break;
5964 /* Function vect_can_force_dr_alignment_p.
5966 Returns whether the alignment of DECL can be forced to be aligned
5967 on an ALIGNMENT-bit boundary. */
5969 bool
5970 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
5972 if (TREE_CODE (decl) != VAR_DECL)
5973 return false;
5975 if (decl_in_symtab_p (decl)
5976 && !symtab_node::get (decl)->can_increase_alignment_p ())
5977 return false;
5979 if (TREE_STATIC (decl))
5980 return (alignment <= MAX_OFILE_ALIGNMENT);
5981 else
5982 return (alignment <= MAX_STACK_ALIGNMENT);
5986 /* Return whether the data reference DR is supported with respect to its
5987 alignment.
5988 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even if
5989 it is aligned, i.e., check if it is possible to vectorize it with different
5990 alignment. */
5992 enum dr_alignment_support
5993 vect_supportable_dr_alignment (struct data_reference *dr,
5994 bool check_aligned_accesses)
5996 gimple *stmt = DR_STMT (dr);
5997 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5998 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5999 machine_mode mode = TYPE_MODE (vectype);
6000 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6001 struct loop *vect_loop = NULL;
6002 bool nested_in_vect_loop = false;
6004 if (aligned_access_p (dr) && !check_aligned_accesses)
6005 return dr_aligned;
6007 /* For now assume all conditional loads/stores support unaligned
6008 access without any special code. */
6009 if (is_gimple_call (stmt)
6010 && gimple_call_internal_p (stmt)
6011 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
6012 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
6013 return dr_unaligned_supported;
6015 if (loop_vinfo)
6017 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
6018 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
6021 /* Possibly unaligned access. */
6023 /* We can choose between using the implicit realignment scheme (generating
6024 a misaligned_move stmt) and the explicit realignment scheme (generating
6025 aligned loads with a REALIGN_LOAD). There are two variants of the
6026 explicit realignment scheme: optimized and unoptimized.
6027 We can optimize the realignment only if the step between consecutive
6028 vector loads is equal to the vector size. Since the vector memory
6029 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
6030 is guaranteed that the misalignment amount remains the same throughout the
6031 execution of the vectorized loop. Therefore, we can create the
6032 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
6033 at the loop preheader.
6035 However, in the case of outer-loop vectorization, when vectorizing a
6036 memory access in the inner-loop nested within the LOOP that is now being
6037 vectorized, while it is guaranteed that the misalignment of the
6038 vectorized memory access will remain the same in different outer-loop
6039 iterations, it is *not* guaranteed that it will remain the same throughout
6040 the execution of the inner-loop. This is because the inner-loop advances
6041 with the original scalar step (and not in steps of VS). If the inner-loop
6042 step happens to be a multiple of VS, then the misalignment remains fixed
6043 and we can use the optimized realignment scheme. For example:
6045 for (i=0; i<N; i++)
6046 for (j=0; j<M; j++)
6047 s += a[i+j];
6049 When vectorizing the i-loop in the above example, the step between
6050 consecutive vector loads is 1, and so the misalignment does not remain
6051 fixed across the execution of the inner-loop, and the realignment cannot
6052 be optimized (as illustrated in the following pseudo vectorized loop):
6054 for (i=0; i<N; i+=4)
6055 for (j=0; j<M; j++){
6056 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
6057 // when j is {0,1,2,3,4,5,6,7,...} respectively.
6058 // (assuming that we start from an aligned address).
6061 We therefore have to use the unoptimized realignment scheme:
6063 for (i=0; i<N; i+=4)
6064 for (j=k; j<M; j+=4)
6065 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
6066 // that the misalignment of the initial address is
6067 // 0).
6069 The loop can then be vectorized as follows:
6071 for (k=0; k<4; k++){
6072 rt = get_realignment_token (&vp[k]);
6073 for (i=0; i<N; i+=4){
6074 v1 = vp[i+k];
6075 for (j=k; j<M; j+=4){
6076 v2 = vp[i+j+VS-1];
6077 va = REALIGN_LOAD <v1,v2,rt>;
6078 vs += va;
6079 v1 = v2;
6082 } */
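/* Illustrative sketch (not part of GCC, kept under "#if 0"): a standalone
   check of the misalignment pattern described in the comment above.  With a
   vector of VS = 4 elements, advancing by the scalar step (the inner loop)
   cycles the misalignment 0,1,2,3,..., while advancing by VS keeps it fixed,
   which is why only the latter case admits the optimized realignment
   scheme.  */
#if 0
#include <stdio.h>

int
main (void)
{
  const int vs = 4;	/* Elements per vector.  */

  /* Inner-loop stepping: misalignment of &vp[i+j] for j = 0, 1, 2, ...  */
  for (int j = 0; j < 8; j++)
    printf ("%d ", j % vs);	/* Prints 0 1 2 3 0 1 2 3.  */
  printf ("\n");

  /* Outer-loop stepping: misalignment of &vp[i+j] for i = 0, VS, 2*VS, ...  */
  for (int i = 0; i < 8 * vs; i += vs)
    printf ("%d ", i % vs);	/* Prints 0 0 0 0 0 0 0 0.  */
  printf ("\n");
  return 0;
}
#endif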
6084 if (DR_IS_READ (dr))
6086 bool is_packed = false;
6087 tree type = (TREE_TYPE (DR_REF (dr)));
6089 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
6090 && (!targetm.vectorize.builtin_mask_for_load
6091 || targetm.vectorize.builtin_mask_for_load ()))
6093 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6095 /* If we are doing SLP then the accesses need not have the
6096 same alignment; instead it depends on the SLP group size. */
6097 if (loop_vinfo
6098 && STMT_SLP_TYPE (stmt_info)
6099 && (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6100 * GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)))
6101 % TYPE_VECTOR_SUBPARTS (vectype) != 0))
6103 else if (!loop_vinfo
6104 || (nested_in_vect_loop
6105 && (TREE_INT_CST_LOW (DR_STEP (dr))
6106 != GET_MODE_SIZE (TYPE_MODE (vectype)))))
6107 return dr_explicit_realign;
6108 else
6109 return dr_explicit_realign_optimized;
6111 if (!known_alignment_for_access_p (dr))
6112 is_packed = not_size_aligned (DR_REF (dr));
6114 if ((TYPE_USER_ALIGN (type) && !is_packed)
6115 || targetm.vectorize.
6116 support_vector_misalignment (mode, type,
6117 DR_MISALIGNMENT (dr), is_packed))
6118 /* Can't software pipeline the loads, but can at least do them. */
6119 return dr_unaligned_supported;
6121 else
6123 bool is_packed = false;
6124 tree type = (TREE_TYPE (DR_REF (dr)));
6126 if (!known_alignment_for_access_p (dr))
6127 is_packed = not_size_aligned (DR_REF (dr));
6129 if ((TYPE_USER_ALIGN (type) && !is_packed)
6130 || targetm.vectorize.
6131 support_vector_misalignment (mode, type,
6132 DR_MISALIGNMENT (dr), is_packed))
6133 return dr_unaligned_supported;
6136 /* Unsupported. */
6137 return dr_unaligned_unsupported;